In [None]:
import xml.etree.ElementTree as ET

In [None]:
# Read and parse the XML file on disk into an ElementTree object
tree = ET.parse(source='data.xml')

In [None]:
# Show which class the parsed object is an instance of
print(tree.__class__)

In [None]:
# Get the root (top-level) element of the parsed tree
root = tree.getroot()
# A bare name on the last line of a cell makes the notebook display it
root

In [None]:
# root represents the top element of the file. Print its tag, its attributes
# and its number of direct children, one value per line.
print(root.tag, root.attrib, len(root), sep='\n')


In [None]:
# We can see that the length of this element is 3.
# This means that it has 3 children. We can access these children the same
# way as elements in a list.

# First child of the root
country1 = root[0]

# First child of the child
rank = country1[0]

# What is the tag of the grandchild
print(rank.tag)

# What is the text inside this grandchild
print(rank.text)

# What are the attributes of the last element?
# Index -1 always selects the last child, however many children there are;
# a hard-coded positive index is only correct for one specific child count.
print(country1[-1].attrib)


In [None]:
# To pull the information out of every child we have to iterate over the
# tree. There are a couple of ways to do that.

# Visit every direct child of the root whose tag is 'country'
for country in root.iterfind('country'):
    # the name is stored as an attribute of the country element
    name = country.get('name')
    # the rank is the text of the country's 'rank' child element
    rank = country.find('rank').text
    print(name, rank)

In [None]:
# We can also look for grandchildren directly if we know their tag:
# root.iter('neighbor') is used here to reach the 'neighbor' elements
# without first looping over the countries that contain them.

for neighbor in root.iter('neighbor'):
    # print the attribute mapping of each matching element
    print(neighbor.attrib)

In [None]:
# Here are some tips and tricks on how to work with root.findall().
# A notebook cell only displays the value of its last bare expression, so
# each query is printed explicitly to make every result visible.

xpath_examples = [
    # Top-level elements
    ".",
    # All 'neighbor' grand-children of 'country' children of the top-level elements
    "./country/neighbor",
    # elements with name='Singapore' that have a 'year' child
    ".//year/..[@name='Singapore']",
    # 'year' elements that are children of elements with name='Singapore'
    ".//*[@name='Singapore']/year",
    # All 'neighbor' elements that are the second child of their parent
    ".//neighbor[2]",
]

for xpath in xpath_examples:
    # Show the expression next to the list of elements it matched
    print(xpath, '->', root.findall(xpath))

In [None]:
# Extract the name, rank, year and gdppc from the countries and create a Pandas DataFrame.

import xml.etree.ElementTree as ET
import pandas as pd

tree = ET.parse('data.xml')  # Load from file
root = tree.getroot()

# One list per output column; the lists are filled in parallel, one entry
# per country, so row i of the DataFrame describes the i-th country.
my_dict = {'name': [],
           'rank': [],
           'year': [],
           'gdppc': []}

# Child elements whose text becomes a column of the same name
child_fields = ('rank', 'year', 'gdppc')

for country in root:
    # The country name is an attribute of the element itself
    my_dict['name'].append(country.attrib['name'])

    # Look every field up by tag instead of by position: positional access
    # (country[0], country[1], ...) silently puts values in the wrong column
    # if the children appear in a different order. findtext() yields None
    # when a child is missing, so the columns always stay the same length.
    for field in child_fields:
        my_dict[field].append(country.findtext(field))

# Values are kept as the strings read from the XML (no numeric conversion)
df = pd.DataFrame(my_dict)
df

In [None]:
# Because all children of the root are countries, `for country in root:` is equivalent to `for country in root.findall('country'):`.