In [2]:
# This is the package we will be using for interacting with xml
from lxml import etree
import csv

In [3]:
# All XML objects have a root element
root = etree.Element("root")

# Lets define a function that makes it a bit easier to visualize the tree
def print_tree(root):
    print(etree.tostring(root, pretty_print=True))

# We can use the function like this:
print_tree(root)

b'<root/>\n'


In [4]:
# We can add elements to the root by using SubElement
etree.SubElement(root, "child1")
etree.SubElement(root, "child2")
etree.SubElement(root, "child3")

print_tree(root)

b'<root>\n  <child1/>\n  <child2/>\n  <child3/>\n</root>\n'


In [5]:
# We can iterate over all the children of a node by using a for loop

for child in root:
    print(child) # This will print the object, i.e <Element child1 at 0x1d5c17f0080
    print(child.tag) # This will print the child's name (for instance "child1")

    

<Element child1 at 0x7fdd67c90040>
child1
<Element child2 at 0x7fdd67c90100>
child2
<Element child3 at 0x7fdd67c90480>
child3


In [6]:
# We can also use the lxml package to read xml files into a tree object
# Download this file https://www.w3schools.com/xml/note.xml and place it in the same folder as this notebook 

# First we have to open the file and get the xml as a string
def xml_from_file(filename):
    with open(filename, "r", encoding="utf-8") as f:
        xmlstring = f.read()
        return etree.fromstring(xmlstring.encode())

# Lets try to see if we can read a simple xml string
root = xml_from_file("note.xml")
# We can see that we are able to print its children
for child in root:
    print(child)


<Element to at 0x7fdd67cb3fc0>
<Element from at 0x7fdd67cb3980>
<Element heading at 0x7fdd67c90480>
<Element body at 0x7fdd67c90040>


In [7]:
# We can also access the elements using its tags:
for child in root:
    print(child.tag, child.text)

to Tove
from Jani
heading Reminder
body Don't forget me this weekend!


In [8]:
# If you want a specific field for all children, you can use iter
for child in root.iter("to"):
    print(child.tag, child.text)

to Tove


In [9]:
# ... but this was just a reaally small xml file
# Lets try with a bigger one: https://www.w3schools.com/xml/plant_catalog.xml 
# We can reuse the functions we created above
root = xml_from_file("plant_catalog.xml")
for child in root[0:2]: # Print the first two children
    print(child.tag, child.getchildren())



PLANT [<Element COMMON at 0x7fdd67cb3600>, <Element BOTANICAL at 0x7fdd66c20a00>, <Element ZONE at 0x7fdd66c207c0>, <Element LIGHT at 0x7fdd66c20700>, <Element PRICE at 0x7fdd66c20740>, <Element AVAILABILITY at 0x7fdd66c20b80>]
PLANT [<Element COMMON at 0x7fdd66c20a80>, <Element BOTANICAL at 0x7fdd66c20f80>, <Element ZONE at 0x7fdd66c20a00>, <Element LIGHT at 0x7fdd66c20980>, <Element PRICE at 0x7fdd66c207c0>, <Element AVAILABILITY at 0x7fdd66c20700>]


In [10]:
# Lets get the common names of all plants:
for child in root.iter("COMMON"):
    print(child.text)

Bloodroot
Columbine
Marsh Marigold
Cowslip
Dutchman's-Breeches
Ginger, Wild
Hepatica
Liverleaf
Jack-In-The-Pulpit
Mayapple
Phlox, Woodland
Phlox, Blue
Spring-Beauty
Trillium
Wake Robin
Violet, Dog-Tooth
Trout Lily
Adder's-Tongue
Anemone
Grecian Windflower
Bee Balm
Bergamot
Black-Eyed Susan
Buttercup
Crowfoot
Butterfly Weed
Cinquefoil
Primrose
Gentian
Blue Gentian
Jacob's Ladder
Greek Valerian
California Poppy
Shooting Star
Snakeroot
Cardinal Flower


In [11]:
# From the output above we can see that this xml is deeper - it has children that has children!
# Lets try to print a single child:

for child in root[0]: #Select only the first child of root and print its children
    print(child.tag, child.text)


COMMON Bloodroot
BOTANICAL Sanguinaria canadensis
ZONE 4
LIGHT Mostly Shady
PRICE $2.44
AVAILABILITY 031599


In [12]:
# Now that we know what a single plant looks like, we can try to loop through every child and map it to a dict
# We will end up with a list of dicts, which is perfect for our csv.DictWriter! 

rows = []
for plant in root:
    plantdict = {} # Create a new plant dict for each row
    for attributes in plant:
        # print(attributes.tag, attributes.text)
        plantdict[attributes.tag] = attributes.text
        rows.append(plantdict)
print(rows[0:3])# Print some of the rows as a demo


[{'COMMON': 'Bloodroot', 'BOTANICAL': 'Sanguinaria canadensis', 'ZONE': '4', 'LIGHT': 'Mostly Shady', 'PRICE': '$2.44', 'AVAILABILITY': '031599'}, {'COMMON': 'Bloodroot', 'BOTANICAL': 'Sanguinaria canadensis', 'ZONE': '4', 'LIGHT': 'Mostly Shady', 'PRICE': '$2.44', 'AVAILABILITY': '031599'}, {'COMMON': 'Bloodroot', 'BOTANICAL': 'Sanguinaria canadensis', 'ZONE': '4', 'LIGHT': 'Mostly Shady', 'PRICE': '$2.44', 'AVAILABILITY': '031599'}]


In [13]:
# Now that we have a list of dicts we can use it to write to a csv

import csv 

with open("plants.csv", "w") as f:
    writer = csv.DictWriter(f, fieldnames=["COMMON", "BOTANICAL", "PRICE", "LIGHT", "AVAILABILITY", "ZONE"])
    writer.writeheader()
    writer.writerows(rows)

## Exercise

1. Load [this](https://www.w3schools.com/xml/simple.xml) CSV and play around with it. Try to print some of the elements to get a feel of how this XML looks.


In [14]:
root = xml_from_file('simple.xml')
print(root)
for child in root:
    print(child)
    for i in child:
        print(i)


<Element breakfast_menu at 0x7fdd67ce7540>
<Element food at 0x7fdd66878740>
<Element name at 0x7fdd66878680>
<Element price at 0x7fdd66878240>
<Element description at 0x7fdd668789c0>
<Element calories at 0x7fdd66878680>
<Element food at 0x7fdd668782c0>
<Element name at 0x7fdd668789c0>
<Element price at 0x7fdd67cdc680>
<Element description at 0x7fdd66878680>
<Element calories at 0x7fdd668789c0>
<Element food at 0x7fdd66c2a600>
<Element name at 0x7fdd66878680>
<Element price at 0x7fdd67cdc680>
<Element description at 0x7fdd668789c0>
<Element calories at 0x7fdd66878680>
<Element food at 0x7fdd66878740>
<Element name at 0x7fdd668789c0>
<Element price at 0x7fdd67cdc680>
<Element description at 0x7fdd66878680>
<Element calories at 0x7fdd668789c0>
<Element food at 0x7fdd668782c0>
<Element name at 0x7fdd66878c80>
<Element price at 0x7fdd66878680>
<Element description at 0x7fdd668789c0>
<Element calories at 0x7fdd66878c80>


2. Iterate over all the children of the root node and print the Food name of each menu item. 


In [15]:
for child in root.iter('name'):
    print(child.text)

Belgian Waffles
Strawberry Belgian Waffles
Berry-Berry Belgian Waffles
French Toast
Homestyle Breakfast


3. Create a function that accepts input from the user (using input()) and finds the Food item that has the same name as the one the user specified. E.g if the user writes "French Toast", the function should return 
```
name: French Toast
price: $4.50
description: Thick slices made from our homemade sourdough bread
calories: 600
```
If there is no item with the same name, it should return
```
Sorry we don't have that food!
```


In [16]:
input_food = input('Enter food: ')
found = None
for c,food in enumerate(root.iter('name')):
    if input_food == food.text:
        found = True
        for i in root[c].getchildren():
            print(f'''{i.tag}: {i.text}''')
if not found:
    print('Sorry we dont have that food')

Sorry we dont have that food


4. Iterate through all the items to find the item with the highest price.
Hint: `"$4.50".strip('$')` removes the dollar part of the string, so that you can convert it to float for comparison


In [17]:
max = 0
for child in root.iter('price'):
    if float(child.text.strip('$')) > max:
        max = float(child.text.strip('$'))
print(f'''${max}''')

$8.95


5. Write all the Food that costs over $6 on the menu to a CSV with columns `name, price, description,calories`

In [43]:
dic = {}
nic = {}
ls = []
max = 6
for c,child in enumerate(root.iter('price')):
    if float(child.text.strip('$')) > max:
        for i in root[c].getchildren():
            ls.append((i.tag))
            dic[i.tag] = i.text
        nic[c] = dic
        dic = {}

for i in nic:
    nic[i]['price'] = f'''{float(nic[i]['price'])*10} NOK'''

with open('menu.csv', 'w') as f:
    writer = csv.DictWriter(f, fieldnames=set(ls))
    writer.writeheader()
    for key,value in nic.items():
        writer.writerow(value)
