# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [17]:
from xml.etree import ElementTree as ET

import pandas as pd
import numpy as np

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [5]:
document_tree = ET.parse( './data/mondial_database_less.xml' )
document_tree

<xml.etree.ElementTree.ElementTree at 0x1c6d44f3cc0>

In [6]:
# print names of all countries
for child in document_tree.getroot():
    print (child.find('name').text)

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [8]:
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
    print ('* ' + element.find('name').text + ':'),
    capitals_string = ''
    for subelement in element.getiterator('city'):
        capitals_string += subelement.find('name').text + ', '
    print (capitals_string[:-2])

* Albania:
Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece:
Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia:
Skopje, Kumanovo
* Serbia:
Beograd, Novi Sad, Niš
* Montenegro:
Podgorica
* Kosovo:
Prishtine
* Andorra:
Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [45]:
document = ET.parse( './data/mondial_database.xml' )
root = document.getroot()
root.attrib
for child in root:
    for children in child:
        print(children)
    print("\n")

<Element 'name' at 0x000001C6D6AF9728>
<Element 'population' at 0x000001C6D6AF9908>
<Element 'population' at 0x000001C6D6AF94F8>
<Element 'population' at 0x000001C6D5FD3228>
<Element 'population' at 0x000001C6D5FD3318>
<Element 'population' at 0x000001C6D5FD32C8>
<Element 'population' at 0x000001C6D5FD30E8>
<Element 'population' at 0x000001C6D5FD3188>
<Element 'population' at 0x000001C6D5FD31D8>
<Element 'population' at 0x000001C6D5FD3278>
<Element 'population_growth' at 0x000001C6D8BEFDB8>
<Element 'infant_mortality' at 0x000001C6D6AD76D8>
<Element 'gdp_total' at 0x000001C6D6AD7868>
<Element 'gdp_agri' at 0x000001C6D6AD7598>
<Element 'gdp_ind' at 0x000001C6D6AD7A98>
<Element 'gdp_serv' at 0x000001C6D6B00818>
<Element 'inflation' at 0x000001C6D6B00B88>
<Element 'unemployment' at 0x000001C6D6B009A8>
<Element 'indep_date' at 0x000001C6D6B009F8>
<Element 'government' at 0x000001C6D6B00278>
<Element 'encompassed' at 0x000001C6D6B00408>
<Element 'ethnicgroup' at 0x000001C6D6B00458>
<Element

In [32]:
# 10 countries with the lowest infant mortality rates
data = {'country':[], 'infant_mortality':[]}

for element in document.iterfind('country'):
    data['country'].append(element.find('name').text)
    mortality = element.find('infant_mortality')
    if mortality is not None:
        data['infant_mortality'].append(mortality.text)
    else:
        data['infant_mortality'].append('')
        
df = pd.DataFrame(data)
df['infant_mortality']=pd.to_numeric(df['infant_mortality'],errors='coerce')
df.sort_values( by ='infant_mortality').head(10)

Unnamed: 0,country,infant_mortality
38,Monaco,1.81
98,Japan,2.13
117,Bermuda,2.48
36,Norway,2.48
106,Singapore,2.53
37,Sweden,2.6
10,Czech Republic,2.63
78,Hong Kong,2.73
79,Macao,3.13
44,Iceland,3.15


In [25]:
#10 cities with the largest population

In [34]:
df2 = pd.DataFrame(columns=['city','population'])
for element in document.iterfind('country'):
    for subelement in element.getiterator('city'):
        if subelement.find('population') is not None:
            df3=pd.DataFrame([[subelement.find('name').text,subelement.find('population').text]], columns=['city','population'])
            df2 = df.append(df3)

df['population']=pd.to_numeric(df['population'],errors='coerce')
df.sort_values(by='population',ascending=False).head(10)

Unnamed: 0,city,population
0,Seoul,10229262
0,Mumbai,9925891
0,São Paulo,9412894
0,Jakarta,8259266
0,Shanghai,8205598
0,Ciudad de México,8092449
0,Moskva,8010954
0,Tokyo,7843000
0,Beijing,7362426
0,Delhi,7206704


In [35]:
#10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)

In [38]:
ethnic = pd.DataFrame(columns=['country','ethnicgroup','population','percentage'])
for element in document.iterfind('country'):
    for subelement in element.getiterator('ethnicgroup'):
        country=element.find('name').text
        ethnicgroup=subelement.text
        population=element.find('population').text
        percentage=subelement.attrib['percentage']
        
        df5=pd.DataFrame([[country,ethnicgroup,population,percentage]], columns=['country','ethnicgroup','population','percentage'])
        ethnic = ethnic.append(df5)

ethnic['population']=pd.to_numeric(ethnic['population'],errors='coerce')   
ethnic['percentage']=pd.to_numeric(ethnic['percentage'],errors='coerce') 
ethnic['ethnic_population']=ethnic['population']*ethnic['percentage']/100
ethnic.groupby(['ethnicgroup']).sum().sort_values(by='ethnic_population',ascending=False).head(10)

Unnamed: 0_level_0,population,percentage,ethnic_population
ethnicgroup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Han Chinese,543776080,91.5,497555100.0
European,362717873,970.82,192865800.0
Indo-Aryan,238396327,72.0,171645400.0
Russian,202263854,224.1,92758440.0
African,357529690,1868.55,86329370.0
Japanese,82199470,99.4,81706270.0
German,145710759,165.6,66232190.0
Dravidian,238396327,25.0,59599080.0
English,50616012,83.6,42314990.0
Mestizo,67185932,870.7,35542330.0


In [54]:
#name and country of a) longest river, b) largest lake and c) airport at highest elevation

#longest river
longest_river_country=None
longest_river_name=None
longest_river_length=0
for node in document.iterfind('river'):
    if node.find('length') is not None:
        if  longest_river_length < float(node.find('length').text):
            longest_river_country=node.attrib['country']
            longest_river_name=node.attrib['id']
            longest_river_length=float(node.find('length').text)

print (longest_river_name, longest_river_country, str(longest_river_length))

river-Amazonas CO BR PE 6448.0


In [57]:
#largest lake
largest_lake_country=None
largest_lake=None
largest_lake_size=0
for node in document.iterfind('lake'):
    if node.find('area') is not None:
        if  largest_lake_size < float(node.find('area').text):
            largest_lake_country=node.attrib['country']
            largest_lake=node.attrib['id']
            largest_lake_size=float(node.find('area').text)

print (largest_lake, largest_lake_country, str(largest_lake_size))

lake-KaspischesMeer R AZ KAZ IR TM 386400.0


In [75]:
#airport at highest elevation
highest_airport_country=None
highest_airport=None
highest_airport_elevation=0
for node in document.iterfind('airport'):
    if node.find('elevation').text is not None:
        if  highest_airport_elevation < float(node.find('elevation').text):
            highest_airport_country = node.attrib['country']
            highest_airport = node.find('name').text
            highest_airport_elevation = float(node.find('elevation').text)

print (highest_airport, highest_airport_country, str(highest_airport_elevation))

El Alto Intl BOL 4063.0
