# XML
- SAX = Simple API for XML
    - never loads whole file into memory -> useful with very large xml files
    - can only read, not change
- DOM = Document Object Model
    - Change in xml document -> DOM needed
    - Works with DOM Tree -> Hierarchical structure of document as Tree
        - Element is part of the dom tree
- Minidom
    - Minimal implementation of the DOM module
    - DOM: whole document is parsed into a DOM tree and loaded into memory
- BeautifulSoup
    - Scraping library
    - find(), find_all(), parent, children, previous_siblings, next_siblings
- xml.etree
    - tag / attribute / text
    - set() / iter() / write()
    - SubElement() to create new elements
- xmltodict
    - parse() and unparse()
    

In [6]:
# SAX with own handler
# Handler: Works with the file
# Parser: Translates the file
import xml.sax

class GroupHandler(xml.sax.ContentHandler):
    """SAX content handler that prints the id, title and price of each book.

    For every ``book`` start tag a header line and the ``id`` attribute are
    printed. The text of ``title`` and ``price`` elements is collected and
    printed when the element closes; the last value seen is also kept in
    ``self.title`` / ``self.price``.
    """

    # Elements whose character data is collected and printed.
    _TEXT_FIELDS = ('title', 'price')

    def __init__(self):
        super().__init__()
        # Name of the element currently being read ("" between elements).
        self.current = ""
        # Last complete title / price text seen.
        self.title = ""
        self.price = ""
        # Character-data pieces of the element currently being read.
        self._chunks = []

    def startElement(self, name, attrs):
        """Remember the element name and announce each new book.

        Raises:
            KeyError: if a ``book`` element has no ``id`` attribute.
        """
        self.current = name
        self._chunks = []
        if name == 'book':
            print("----BOOK----")
            print(f"ID is {attrs['id']}")

    def characters(self, content):
        """Collect character data for title/price elements.

        A parser may report the text of a single element in several calls
        (for example around entity references), so the pieces are
        accumulated here and only joined in ``endElement``.
        """
        if self.current in self._TEXT_FIELDS:
            self._chunks.append(content)

    def endElement(self, name):
        """Print the collected text of a title/price element and reset state."""
        text = "".join(self._chunks)
        if self.current == 'title':
            self.title = text
            print("Title: {}".format(self.title))
        elif self.current == 'price':
            self.price = text
            print("Price: {}".format(self.price))
        # Clearing the current name makes the whitespace between elements
        # be ignored by characters().
        self.current = ""
        self._chunks = []

# Create the parser, then attach a fresh handler that receives its events.
parser = xml.sax.make_parser()
handler = GroupHandler()
parser.setContentHandler(handler)
# Run the parser over the file; the handler's callbacks do the printing.
parser.parse('testfile.xml')

----BOOK----
ID is bk101
Title: XML Developer's Guide
Price: 44.95
----BOOK----
ID is bk102
Title: Midnight Rain
Price: 5.95
----BOOK----
ID is bk103
Title: Maeve Ascendant
Price: 5.95
----BOOK----
ID is bk104
Title: Oberon's Legacy
Price: 5.95
----BOOK----
ID is bk105
Title: The Sundered Grail
Price: 5.95
----BOOK----
ID is bk106
Title: Lover Birds
Price: 4.95
----BOOK----
ID is bk107
Title: Splish Splash
Price: 4.95
----BOOK----
ID is bk108
Title: Creepy Crawlies
Price: 4.95
----BOOK----
ID is bk109
Title: Paradox Lost
Price: 6.95
----BOOK----
ID is bk110
Title: Microsoft .NET: The Programming Bible
Price: 36.95
----BOOK----
ID is bk111
Title: MSXML3: A Comprehensive Guide
Price: 36.95
----BOOK----
ID is bk112
Title: Visual Studio 7: A Comprehensive Guide
Price: 49.95


In [21]:
import xml.dom.minidom

# minidom reads the complete file and keeps the resulting DOM tree in memory
domtree = xml.dom.minidom.parse('testfile.xml')

# The document element is the root of the tree
group = domtree.documentElement
print(group)

# All <book> elements below the root
books = group.getElementsByTagName('book')
print(books)

for book in books:
    # The id attribute is only printed when the element actually carries one
    if book.hasAttribute('id'):
        print("ID: {}".format(book.getAttribute('id')))

    # For each field: first matching child element -> its first child node -> .data
    title_node = book.getElementsByTagName('title')[0].childNodes[0]
    print(f"Title: {title_node.data}")
    genre_node = book.getElementsByTagName('genre')[0].childNodes[0]
    print(f"Genre: {genre_node.data}")
    price_node = book.getElementsByTagName('price')[0].childNodes[0]
    print(f"Price: {price_node.data}")



<DOM Element: catalog at 0x11483b5e0>
[<DOM Element: book at 0x11483b040>, <DOM Element: book at 0x11483bdc0>, <DOM Element: book at 0x114dc7790>, <DOM Element: book at 0x114dc7700>, <DOM Element: book at 0x114dc7ca0>, <DOM Element: book at 0x114d0b1f0>, <DOM Element: book at 0x114d0bc10>, <DOM Element: book at 0x114d0bca0>, <DOM Element: book at 0x11494c670>, <DOM Element: book at 0x11494c0d0>, <DOM Element: book at 0x11494c1f0>, <DOM Element: book at 0x11494c160>]
ID: bk101
Title: XML Developer's Guide
Genre: Computer
Price: 44.95
ID: bk102
Title: Midnight Rain
Genre: Fantasy
Price: 5.95
ID: bk103
Title: Maeve Ascendant
Genre: Fantasy
Price: 5.95
ID: bk104
Title: Oberon's Legacy
Genre: Fantasy
Price: 5.95
ID: bk105
Title: The Sundered Grail
Genre: Fantasy
Price: 5.95
ID: bk106
Title: Lover Birds
Genre: Romance
Price: 4.95
ID: bk107
Title: Splish Splash
Genre: Romance
Price: 4.95
ID: bk108
Title: Creepy Crawlies
Genre: Horror
Price: 4.95
ID: bk109
Title: Paradox Lost
Genre: Science Fi

In [25]:
# Minidom 2 - Change Values
# Uses `domtree`, `group` and `books` from the minidom parsing step.

# Change in domtree: overwrite the text of the third book's title ...
books[2].getElementsByTagName('title')[0].childNodes[0].nodeValue = 'New book'
print(books[2].getElementsByTagName('title')[0].childNodes[0].data)
# ... and replace the id attribute of the first book
books[0].setAttribute('id', '101')
print("ID: {}".format(books[0].getAttribute('id')))

# Add element to domtree: <book id="bk999"><title>Marios book</title></book>
new_book = domtree.createElement('book')
new_book.setAttribute('id', 'bk999')
title = domtree.createElement('title')
title.appendChild(domtree.createTextNode('Marios book'))

new_book.appendChild(title)

group.appendChild(new_book)

# Write the modified tree to a new file. The context manager closes (and
# flushes) the file handle; UTF-8 is used because the written XML header
# names no other encoding.
with open('testfile2.xml', 'w', encoding='utf-8') as xml_out:
    domtree.writexml(xml_out)

New book
ID: 101


In [5]:
from bs4 import BeautifulSoup

# Read the whole file into one string
with open('testfile.xml', 'r') as file:
    content = file.read()

# Parse the string with BeautifulSoup's 'xml' parser
bs_xml_content = BeautifulSoup(content, 'xml')

# find() returns the first match, find_all() returns every match
first_book = bs_xml_content.find('book')
first_author = bs_xml_content.find('author')
books = bs_xml_content.find_all('book')

# Filter based on attributes
specific_book = bs_xml_content.find('book', attrs={"id": "bk102"})
# print(first_book)
# print(first_author)
# print(books)
print(specific_book)

# Iterating over the tag prints each of its direct children in turn
for item in specific_book:
    print(item)

<book id="bk102">
<author>Ralls, Kim</author>
<title>Midnight Rain</title>
<genre>Fantasy</genre>
<price>5.95</price>
<publish_date>2000-12-16</publish_date>
<description>A former architect battles corporate zombies, 
      an evil sorceress, and her own childhood to become queen 
      of the world.</description>
</book>


<author>Ralls, Kim</author>


<title>Midnight Rain</title>


<genre>Fantasy</genre>


<price>5.95</price>


<publish_date>2000-12-16</publish_date>


<description>A former architect battles corporate zombies, 
      an evil sorceress, and her own childhood to become queen 
      of the world.</description>




In [17]:
# Parsing + Finding Elements

import xml.etree.ElementTree as ET 

# Parse the file into an ElementTree and fetch its root element
mytree = ET.parse('testfile.xml')
# mytree = ET.fromstring(data)
myroot = mytree.getroot()
print(myroot)
print(myroot.tag)

# Iterating over an element visits its direct children;
# each child's attributes are available as a dictionary
for child in myroot:
    print(child.tag, child.attrib)

# Text of every child of the first book
for entry in myroot[0]:
    print(entry.text)

# Find all authors + prices
for book in myroot.iterfind('book'):
    author, price = book.find('author').text, book.find('price').text
    print(author, price)

<Element 'catalog' at 0x10df56680>
catalog
book {'id': 'bk101'}
book {'id': 'bk102'}
book {'id': 'bk103'}
book {'id': 'bk104'}
book {'id': 'bk105'}
book {'id': 'bk106'}
book {'id': 'bk107'}
book {'id': 'bk108'}
book {'id': 'bk109'}
book {'id': 'bk110'}
book {'id': 'bk111'}
book {'id': 'bk112'}
Gambardella, Matthew
XML Developer's Guide
Computer
44.95
2000-10-01
An in-depth look at creating applications 
      with XML.
Gambardella, Matthew 44.95
Ralls, Kim 5.95
Corets, Eva 5.95
Corets, Eva 5.95
Corets, Eva 5.95
Randall, Cynthia 4.95
Thurman, Paula 4.95
Knorr, Stefan 4.95
Kress, Peter 6.95
O'Brien, Tim 36.95
O'Brien, Tim 36.95
Galos, Mike 49.95


In [18]:
# Modify XML files
# Uses `myroot` and `mytree` from the ElementTree parsing step.
for description in myroot.iter('description'):
    # Element.text is None for an empty element; fall back to '' so the
    # literal string "None" is never written into the document.
    description.text = (description.text or '') + ' tzzz'
    # Mark the element as changed with an extra attribute
    description.set('updated', 'yes')

mytree.write('testfile3.xml')

In [20]:
# Adding Elements
# Append a new, empty <speciality> child to the first book
ET.SubElement(myroot[0], 'speciality')

# Give every <speciality> element found in the tree the same text
speciality_text = 'Some speciality'
for element in myroot.iter('speciality'):
    element.text = speciality_text

mytree.write('testfile3.xml')

In [23]:
# Removing attributes
# myroot[0][0].attrib.pop('age')

# Remove element: drop the second child of the first book
first_book = myroot[0]
first_book.remove(first_book[1])
mytree.write('testfile4.xml')

In [38]:
import xmltodict
import pprint

# Read the complete XML file into a single string
with open('testfilepalo.xml', 'r') as file:
    content = file.read()

# Convert the XML text into nested dictionaries
palocfg = xmltodict.parse(content, process_namespaces=True)

pprint.pprint(palocfg)

        ('cve',
                                                                                                                                                           OrderedDict([('member',
                                                                                                                                                                         'any')])),
                                                                                                                                                          ('vendor-id',
                                                                                                                                                           OrderedDict([('member',
                                                                                                                                                                         'any')])),
                                                                                                  

In [41]:
# Alternative (not used here): dicttoxml + minidom pretty printing
#from xml.dom.minidom import parseString
#from dicttoxml import dicttoxml
#xml = dicttoxml(palocfg)
#dom = parseString(xml)
#print(dom.toprettyxml())

# Turn the dictionary (`palocfg` from the parsing step) back into XML text
out = xmltodict.unparse(palocfg, pretty=True)
#print(type(out))

# Open in 'w' mode, not 'a': appending would add another complete document
# on every run and leave a file that is no longer well-formed XML.
# UTF-8 matches the encoding named in the header unparse() emits by default.
with open('testfilepaloout.xml', 'w', encoding='utf-8') as file:
    file.write(out)

<class 'str'>
