In [43]:
# import data by reading from a file:
import xml.etree.ElementTree as ET
tree = ET.parse('running-example.xml')
# tree = ET.ElementTree(file = 'running-example.xes')
root = tree.getroot()

In [75]:
# As an Element, root has a tag and a dictionary of attributes:
print(root.tag)
print(root.attrib)

{http://code.deckfour.org/xes}log
{'xes.creator': 'Fluxicon Nitro', 'xes.version': '1.0'}


In [76]:
# It also has children nodes over which we can iterate:
for child in root:
    print(child.tag)

{http://code.deckfour.org/xes}extension
{http://code.deckfour.org/xes}extension
{http://code.deckfour.org/xes}extension
{http://code.deckfour.org/xes}global
{http://code.deckfour.org/xes}global
{http://code.deckfour.org/xes}classifier
{http://code.deckfour.org/xes}classifier
{http://code.deckfour.org/xes}string
{http://code.deckfour.org/xes}trace
{http://code.deckfour.org/xes}trace
{http://code.deckfour.org/xes}trace
{http://code.deckfour.org/xes}trace
{http://code.deckfour.org/xes}trace
{http://code.deckfour.org/xes}trace


In [77]:
for child in root:
    print(child.attrib)

{'prefix': 'concept', 'name': 'Concept', 'uri': 'http://code.deckfour.org/xes/concept.xesext'}
{'prefix': 'time', 'name': 'Time', 'uri': 'http://code.deckfour.org/xes/time.xesext'}
{'prefix': 'org', 'name': 'Organizational', 'uri': 'http://code.deckfour.org/xes/org.xesext'}
{'scope': 'trace'}
{'scope': 'event'}
{'name': 'Activity', 'keys': 'Activity'}
{'name': 'activity classifier', 'keys': 'Activity'}
{'value': 'Fluxicon Nitro', 'key': 'creator'}
{}
{}
{}
{}
{}
{}


In [86]:
# access specific child nodes by index:
root[8][0].text # <...>text<...> 此文件没有

In [28]:
root[0].tag

'{http://code.deckfour.org/xes}extension'

In [10]:
root[8][1]

<Element '{http://code.deckfour.org/xes}string' at 0x7f9e783a96d8>

In [12]:
root[8][2][2]

<Element '{http://code.deckfour.org/xes}date' at 0x7f9e783a9868>

In [48]:
# finding interesting elements:###############
for elem in root.iter('date'):
    print(elem.tag, elem.attrib)

In [45]:
# 在 log 下面找到所有标签为 trace 的元素:################
for elem in tree.iterfind('log/trace'):
    print(elem.tag, elem.attrib)

In [46]:
# 就像任何 Element 一样，根节点可以找到自己的子结点：
for children in root[8][2]:
    print(children.tag, children.attrib)

{http://code.deckfour.org/xes}string {'key': 'concept:name', 'value': 'register request'}
{http://code.deckfour.org/xes}string {'key': 'org:resource', 'value': 'Pete'}
{http://code.deckfour.org/xes}date {'key': 'time:timestamp', 'value': '2010-12-30T14:32:00.000+01:00'}
{http://code.deckfour.org/xes}string {'key': 'Activity', 'value': 'register request'}
{http://code.deckfour.org/xes}string {'key': 'Resource', 'value': 'Pete'}
{http://code.deckfour.org/xes}string {'key': 'Costs', 'value': '50'}


In [47]:
# iter 方法可以对子结点进行深度优先遍历
for elem in tree.iter():
    print(elem.tag, elem.attrib)

{http://code.deckfour.org/xes}log {'xes.version': '1.0', 'xes.creator': 'Fluxicon Nitro'}
{http://code.deckfour.org/xes}extension {'uri': 'http://code.deckfour.org/xes/concept.xesext', 'prefix': 'concept', 'name': 'Concept'}
{http://code.deckfour.org/xes}extension {'uri': 'http://code.deckfour.org/xes/time.xesext', 'prefix': 'time', 'name': 'Time'}
{http://code.deckfour.org/xes}extension {'uri': 'http://code.deckfour.org/xes/org.xesext', 'prefix': 'org', 'name': 'Organizational'}
{http://code.deckfour.org/xes}global {'scope': 'trace'}
{http://code.deckfour.org/xes}string {'key': 'concept:name', 'value': 'name'}
{http://code.deckfour.org/xes}global {'scope': 'event'}
{http://code.deckfour.org/xes}string {'key': 'concept:name', 'value': 'name'}
{http://code.deckfour.org/xes}string {'key': 'org:resource', 'value': 'resource'}
{http://code.deckfour.org/xes}date {'key': 'time:timestamp', 'value': '2011-04-13T14:02:31.199+02:00'}
{http://code.deckfour.org/xes}string {'key': 'Activity', 'valu

Element 有一些关于寻找的方法可以接受 XPath 作为参数。 
find 返回第一个匹配的子元素， 
findall 以列表的形式返回所有匹配的子元素， 
iterfind 为所有匹配项提供迭代器。

In [38]:
# 给出如何找到所有的 movie 元素，用一个指定 title 的状态：
for elem in tree.iterfind('movie[@title="Enemy Behind"]'):
    print(elem.tag, elem.attrib)

movie {'title': 'Enemy Behind'}


In [4]:
# 读取xml文档，修改，再将结果写回xml文档
from xml.etree.ElementTree import parse, Element
# 第一步是以通常的方式来解析这个文档
doc = parse('movies.xml') 
root = doc.getroot()
root

<Element 'collection' at 0x7f380824b1d8>

## 建立xml文档

In [42]:
# 修改文档可以使用 Element 对象的方法：
root = tree.getroot()
# 删除了根元素的第三个子结点
del root[3]
# 为第一个子结点增加新状态
root[0].set('foo', 'bar')
for subelem in root:
    print(subelem.tag, subelem.attrib)

movie {'foo': 'bar', 'title': 'Enemy Behind'}
movie {'title': 'Transformers'}
movie {'title': 'Trigun'}


In [43]:
# 这个树可以写回到文件中
import sys
tree.write(sys.stdout) # ET.dump can also serve this purpose

<collection shelf="New Arrivals">
<movie foo="bar" title="Enemy Behind">
   <type>War, Thriller</type>
   <format>DVD</format>
   <year>2003</year>
   <rating>PG</rating>
   <stars>10</stars>
   <description>Talk about a US-Japan war</description>
</movie>
<movie title="Transformers">
   <type>Anime, Science Fiction</type>
   <format>DVD</format>
   <year>1989</year>
   <rating>R</rating>
   <stars>8</stars>
   <description>A schientific fiction</description>
</movie>
   <movie title="Trigun">
   <type>Anime, Action</type>
   <format>DVD</format>
   <episodes>4</episodes>
   <rating>PG</rating>
   <stars>10</stars>
   <description>Vash the Stampede!</description>
</movie>
</collection>

In [44]:
# 建立一个全新的元素。ET 模块提供了 SubElement 函数来简化过程：
a = ET.Element('elem')
a_1 = ET.SubElement(a, 'child1')
a_1.text = 'some text'
a_2 = ET.SubElement(a, 'child2')
b = ET.Element('elem_b')
root = ET.Element('root')
root.extend((a, b))
tree = ET.ElementTree(root)
tree.write(sys.stdout)

<root><elem><child1>some text</child1><child2 /></elem><elem_b /></root>

## 使用 iterparse 来处理 XML 流

In [51]:
import xml.etree.ElementTree as ET
#加载并解析XML文件
tree = ET.ElementTree(file = 'movies.xml') # file = 一定不能忘
count = 0
for elem in tree.iter(tag = 'format'):
    if elem.text == 'DVD':
        count += 1
print(count)

3


## dict to xml

In [13]:
# dict to xml 你想使用一个Python字典存储数据，并将它转换成XML格式。
from xml.etree.ElementTree import Element
def dict_to_xml(tag, d):
#Turn a simple dict of key/value pairs into XML
    elem = Element(tag)
    for key, val in d.items():
        child = Element(key)
        child.text = str(val)
        elem.append(child)
    return elem

s = { 'name': 'GOOG', 'shares': 100, 'price':490.1 }
e = dict_to_xml('stock', s)
e # 转换结果是一个 Element 实例。

<Element 'stock' at 0x7f0360497228>

## DOM(Document Object Model)

一个 DOM 的解析器在解析一个 XML 文档时，一次性读取整个文档，把文档中所有元素保存在内存中的一个树结构里，之后你可以利用DOM 提供的不同的函数来读取或修改文档的内容和结构，也可以把修改过的内容写入xml文件。
python中用xml.dom.minidom来解析xml文件:

In [5]:
import xml.dom.minidom
from xml.dom.minidom import parse
# 使用minidom解析器打开xml文档
DOMTree = xml.dom.minidom.parse('movies.xml')
collection = DOMTree.documentElement
if collection.hasAttribute('shelf'):
    print('Root element : %s' % collection.getAttribute('shelf'))
    
# 在集合中获取所有电影
movies = collection.getElementsByTagName('movie')

# 打印每部电影的详细信息
for movie in movies:
    print('**Movie**')
    if movie.hasAttribute('title'):
        print('Title: %s' % movie.getAttribute('title'))
        
    type = movie.getElementsByTagName('type')[0]
    print('Type %s' % type.childNodes[0].data)
    format = movie.getElementsByTagName('rating')[0]
    print ("Format: %s" % format.childNodes[0].data)
    rating = movie.getElementsByTagName('rating')[0]
    print ("Rating: %s" % rating.childNodes[0].data)
    description = movie.getElementsByTagName('description')[0]
    print ("Description: %s" % description.childNodes[0].data)

Root element : New Arrivals
**Movie**
Title: Enemy Behind
Type War, Thriller
Format: PG
Rating: PG
Description: Talk about a US-Japan war
**Movie**
Title: Transformers
Type Anime, Science Fiction
Format: R
Rating: R
Description: A schientific fiction
**Movie**
Title: Trigun
Type Anime, Action
Format: PG
Rating: PG
Description: Vash the Stampede!
**Movie**
Title: Ishtar
Type Comedy
Format: PG
Rating: PG
Description: Viewable boredom
