## Parsing XML

https://docs.python.org/3/library/xml.etree.elementtree.html

In [3]:
xml = """<?xml version="1.0"?>
<data>
    <country name="Liechtenstein">
        <rank>1</rank>
        <year>2008</year>
        <gdppc>141100</gdppc>
        <neighbor name="Austria" direction="E"/>
        <neighbor name="Switzerland" direction="W"/>
    </country>
    <country name="Singapore">
        <rank>4</rank>
        <year>2011</year>
        <gdppc>59900</gdppc>
        <neighbor name="Malaysia" direction="N"/>
    </country>
    <country name="Panama">
        <rank>68</rank>
        <year>2011</year>
        <gdppc>13600</gdppc>
        <neighbor name="Costa Rica" direction="W"/>
        <neighbor name="Colombia" direction="E"/>
    </country>
</data>"""

In [5]:
import xml.etree.ElementTree as ET

tree = ET.parse('country_data.xml')  # or ET.fromstring(xml)
root = tree.getroot()

In [10]:
type(root), root.tag, root.attrib

(xml.etree.ElementTree.Element, 'data', {})

In [8]:
for child in root:
    print(child.tag, child.attrib)

country {'name': 'Liechtenstein'}
country {'name': 'Singapore'}
country {'name': 'Panama'}


In [9]:
root[0][1].text

'2008'

## Finding interesting elements¶

Element.iter() 는 재귀적으로 하위 트리까지 모두 탐색한다.

In [11]:
for neighbor in root.iter('neighbor'):
    print(neighbor.attrib)

{'name': 'Austria', 'direction': 'E'}
{'name': 'Switzerland', 'direction': 'W'}
{'name': 'Malaysia', 'direction': 'N'}
{'name': 'Costa Rica', 'direction': 'W'}
{'name': 'Colombia', 'direction': 'E'}


In [12]:
root[0][1].text

'2008'

텍스트 정보가 있다면 표시함

In [13]:
root[0].text

'\n        '

In [15]:
root.childNodes

AttributeError: 'xml.etree.ElementTree.Element' object has no attribute 'childNodes'

In [16]:
type(root)

xml.etree.ElementTree.Element

## DOM

In [71]:
from xml.dom.minidom import parse, parseString

In [136]:
dom = parse(open('book.xml'))

In [36]:
type(dom)

xml.dom.minidom.Document

In [37]:
dom.childNodes[0].nodeName

'booklist'

In [38]:
booklist = dom.childNodes

In [39]:
booklist[0].nodeName

'booklist'

## 노드 타입

ELEMENT_CODE = 1

ATTRIBUTE_NODE = 2

TEXT_NODE = 3

CDATA_SECTION_NODE = 4

ENTITY_NODE = 5

DOCUMENT_TYPE_NODE = 9
...



최상위 루트는 DOCUMETN 타입니다. 루트의 자손에 booklist 가 있다.

booklist 의 자손에는 여러 책이 들어 있다.

이 책들에 대한 노드는 다음과 같다.

books = dom.childNodes[0].childNodes

책의 노트 타입은 ELEMENT 다.

In [73]:
dom.nodeName, dom.nodeType

('#document', 9)

In [81]:
books = dom.childNodes[0].childNodes
books[1].nodeName, books[1].nodeType

('book', 1)

In [76]:
for book in books:
    print(book.nodeType, book.nodeName)
    if book.nodeType == 3:
        pass
        #print('\t', book.nodeValue)

3 #text
1 book
3 #text
1 book
3 #text
1 book
3 #text


루트로부터 두 단계 아래인 book 까지는 child 노드를 두번 사용하고 노드 타입이 3인 엘리먼트를 선택해서 도달할 수 있었다.

book 엘리먼트가 가지고 있는 정보를 탐색해보겠다.

In [115]:
for book in books:
    if book.nodeName == 'book':
        for item in book.childNodes:
            if item.nodeName == 'title':
                print(item.tagName, ' : ',item.firstChild.nodeValue)
        

title  :  The Very Hungry Caterpillar Pop-Up Book
title  :  The Shack
title  :  You Can Negotiate Anything


In [142]:
print(dom.getElementsByTagName("title")[0].childNodes[0].data)
print(books[1].getElementsByTagName("title")[0].childNodes[0].data)

                
        
        

The Very Hungry Caterpillar Pop-Up Book
The Very Hungry Caterpillar Pop-Up Book


In [103]:
books[1].childNodes[1].firstChild.nodeValue

'The Very Hungry Caterpillar Pop-Up Book'

In [147]:
books[1].attributes, type(books[1].attributes)

(<xml.dom.minidom.NamedNodeMap at 0x49d77c8>, xml.dom.minidom.NamedNodeMap)

In [156]:
from pm.pprint import show_attr
show_attr(books[1].attributes)

TypeError: 'bool' object is not callable

In [149]:
books[1].attributes.values

<bound method NamedNodeMap.values of <xml.dom.minidom.NamedNodeMap object at 0x0000000005856E48>>

## DOM 

https://docs.python.org/3.0/library/xml.dom.minidom.html

In [122]:
dom3 = parseString("<myxml>Some data</myxml>")

dom3.documentElement.firstChild.nodeValue, dom3.documentElement.firstChild.data

('Some data', 'Some data')

In [118]:
dom3.documentElement.tagName, dom3.documentElement.nodeName

('myxml', 'myxml')

'Some data'

## DOM Example

In [117]:
import xml.dom.minidom

document = """\
<slideshow>
<title>Demo slideshow</title>
<slide><title>Slide title</title>
<point>This is a demo</point>
<point>Of a program for processing slides</point>
</slide>

<slide><title>Another demo slide</title>
<point>It is important</point>
<point>To have more than</point>
<point>one slide</point>
</slide>
</slideshow>
"""

dom = xml.dom.minidom.parseString(document)

def getText(nodelist):
    rc = ""
    for node in nodelist:
        if node.nodeType == node.TEXT_NODE:
            rc = rc + node.data
    return rc

def handleSlideshow(slideshow):
    print("<html>")
    handleSlideshowTitle(slideshow.getElementsByTagName("title")[0])
    slides = slideshow.getElementsByTagName("slide")
    handleToc(slides)
    handleSlides(slides)
    print("</html>")

def handleSlides(slides):
    for slide in slides:
        handleSlide(slide)

def handleSlide(slide):
    handleSlideTitle(slide.getElementsByTagName("title")[0])
    handlePoints(slide.getElementsByTagName("point"))

def handleSlideshowTitle(title):
    print("<title>%s</title>" % getText(title.childNodes))

def handleSlideTitle(title):
    print("<h2>%s</h2>" % getText(title.childNodes))

def handlePoints(points):
    print("<ul>")
    for point in points:
        handlePoint(point)
    print("</ul>")

def handlePoint(point):
    print("<li>%s</li>" % getText(point.childNodes))

def handleToc(slides):
    for slide in slides:
        title = slide.getElementsByTagName("title")[0]
        print("<p>%s</p>" % getText(title.childNodes))

handleSlideshow(dom)

<html>
<title>Demo slideshow</title>
<p>Slide title</p>
<p>Another demo slide</p>
<h2>Slide title</h2>
<ul>
<li>This is a demo</li>
<li>Of a program for processing slides</li>
</ul>
<h2>Another demo slide</h2>
<ul>
<li>It is important</li>
<li>To have more than</li>
<li>one slide</li>
</ul>
</html>


## 도서관리목록 디버깅

In [179]:
dom = parse(open('book.xml'))

In [180]:
title_elements = dom.getElementsByTagName('title')

In [181]:
for elem in title_elements:
    print(elem.nodeType)
    print(elem.firstChild.data)

1
The Very Hungry Caterpillar Pop-Up Book
1
The Shack
1
You Can Negotiate Anything


In [182]:
author_elements = dom.getElementsByTagName('author')
for elem in author_elements:
    print(elem.nodeType)
    print(elem.firstChild.data)

1


AttributeError: 'NoneType' object has no attribute 'data'

## DOM 에 내용 추가하기

In [188]:
dom = parse(open('book.xml'))
newBook = dom.createElement('book')
newBook.setAttribute('ISBN','111111111111111111111111111111')

titleElem = dom.createElement('title')
titleNode = dom.createTextNode('ninja go')
titleElem.appendChild(titleNode)

newBook.appendChild(titleElem)


dom.firstChild.appendChild(newBook)

print(dom.toxml())


<?xml version="1.0" ?><booklist cnt="3">
 <book ISBN="0399250395">
       <title>The Very Hungry Caterpillar Pop-Up Book</title>
       <author name="Eric Carle"/>
 <author name="Keith Finch"/>
 <publisher> Philomel Books</publisher>
 <description> Celebrating the 40th anniverary of one of the most popular children's books ever created</description>
 </book>
 <book ISBN="0964729237">
    <title lang="english">The Shack</title>
 </book>
 <book ISBN="0553281097">
    <title>You Can Negotiate Anything</title>
    <author name="Herb Cohen"/>
    <category cid="12">Negotiate and narrative skill</category>
 </book>
<book ISBN="111111111111111111111111111111"><title>ninja go</title></book></booklist>
