# What is XML?

XML stands for "Extensible Markup Language". It is mainly used in webpages, where the data has a specific structure and is understood dynamically by the XML framework. 

In [1]:
from xml.etree import ElementTree
import pandas as pd
import xml.etree.cElementTree as et


In [2]:
tree = ElementTree.parse("country_data.xml")
ElementTree.dump(tree)

<data>
  <country name="Liechtenstein">
    <rank>1</rank>
    <year>2008</year>
    <gdppc>141100</gdppc>
    <neighbor direction="E" name="Austria" />
    <neighbor direction="W" name="Switzerland" />
  </country>
  <country name="Singapore">
    <rank>4</rank>
    <year>2011</year>
    <gdppc>59900</gdppc>
    <neighbor direction="N" name="Malaysia" />
  </country>
  <country name="Panama">
    <rank>68</rank>
    <year>2011</year>
    <gdppc>13600</gdppc>
    <neighbor direction="W" name="Costa Rica" />
    <neighbor direction="E" name="Colombia" />
  </country>
</data>


From what the above Xml file, you see that

    <data> is the single root element: it contains all the other elements, such as <country name>, or <neighbor>, which are the child elements or subelements. As you can see, these elements are nested.


In [3]:

def getvalueofnode(node):
    """ return node text or None """
    return node.text if node is not None else None
 
 
def main():
    """ main """
    parsed_xml = et.parse("country_data.xml")
    dfcols = ['name','rank', 'year', 'gdppc','neighbor_name'] 
    df_xml = pd.DataFrame(columns=dfcols)
 
    for node in parsed_xml.getroot():
        name = node.attrib.get('name')
        rank = node.find('rank')
        year = node.find('year')
        gdppc = node.find('gdppc')
        neighbor_name = node.attrib.get('neighbor direction')
 
        df_xml = df_xml.append(
            pd.Series([name,getvalueofnode(rank), getvalueofnode(year),
                       getvalueofnode(gdppc),neighbor_name], index=dfcols),
            ignore_index=True)
 
    print(df_xml)
 
main()

            name rank  year   gdppc neighbor_name
0  Liechtenstein    1  2008  141100          None
1      Singapore    4  2011   59900          None
2         Panama   68  2011   13600          None


In [4]:
import xml.etree.ElementTree as ET
tree = ET.parse('country_data.xml')
root = tree.getroot()

# root has a tg

In [9]:
root.tag

'data'

# root has a dictionary of attributes

In [10]:
root.attrib

{}

# we can iterate children nodes of a root

In [13]:
for child in root :
    print(child.tag,child.attrib)

country {'name': 'Liechtenstein'}
country {'name': 'Singapore'}
country {'name': 'Panama'}


# We can expand the use of the iter() function to help with finding particular elements of interest. root.iter() will list all subelements under the root that match the element specified. Here, you will list all attributes of the neighbor element in the tree:

In [16]:
for neighbor in root.iter('neighbor'):
    print (neighbor.attrib)

{'name': 'Austria', 'direction': 'E'}
{'name': 'Switzerland', 'direction': 'W'}
{'name': 'Malaysia', 'direction': 'N'}
{'name': 'Costa Rica', 'direction': 'W'}
{'name': 'Colombia', 'direction': 'E'}


# It is helpful to know all the elements in the entire tree. One useful function for doing that is root.iter()

In [17]:
[elem.tag for elem in root.iter()]

['data',
 'country',
 'rank',
 'year',
 'gdppc',
 'neighbor',
 'neighbor',
 'country',
 'rank',
 'year',
 'gdppc',
 'neighbor',
 'country',
 'rank',
 'year',
 'gdppc',
 'neighbor',
 'neighbor']

This gives a general notion for how many elements you have, but it does not show the attributes or levels in the tree.

# Here, we search the tree for rank that came out in 2008:

In [23]:
for country in root.findall("./country/rank/gdppc/[year='2008']"):
    print(country.attrib)