# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [4]:
import numpy as np
import pandas as pd
from xml.etree import ElementTree as ET

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [5]:
document_tree = ET.parse( './data/mondial_database_less.xml' )

In [6]:
# print names of all countries
for child in document_tree.getroot():
    print child.find('name').text

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [7]:
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
    print '* ' + element.find('name').text + ':',
    capitals_string = ''
    for subelement in element.getiterator('city'):
        capitals_string += subelement.find('name').text + ', '
    print capitals_string[:-2]

* Albania: Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece: Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia: Skopje, Kumanovo
* Serbia: Beograd, Novi Sad, Niš
* Montenegro: Podgorica
* Kosovo: Prishtine
* Andorra: Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [8]:
document = ET.parse( './data/mondial_database.xml' )

In [9]:
country_list_inf = []
country_list = []
for element in document.getroot():
    for subelement in element.getiterator('infant_mortality'):
        country_list.append( element.find('name').text )
        country_list_inf.append( np.float(subelement.text))
        #print element.find('name').text+' : '+subelement.text+','
        #country_list.append(element.find('name').text)
        #country_inf_mortality = np.hstack([country_inf_mortality,np.float(subelement.text)])

In [10]:
cim = {'Country':country_list, 'Infant_mortality':country_list_inf}
df = pd.DataFrame(data=cim)
df.head()

Unnamed: 0,Country,Infant_mortality
0,Albania,13.19
1,Greece,4.78
2,Macedonia,7.9
3,Serbia,6.16
4,Andorra,3.69


In [11]:
df.sort_values('Infant_mortality',ascending=True).head(10)
# The top 10 countries with the lowest mortality rates

Unnamed: 0,Country,Infant_mortality
36,Monaco,1.81
90,Japan,2.13
109,Bermuda,2.48
34,Norway,2.48
98,Singapore,2.53
35,Sweden,2.6
8,Czech Republic,2.63
72,Hong Kong,2.73
73,Macao,3.13
39,Iceland,3.15


## 10 cities with the largest population

In [130]:
city_list = []
census_year_list = []
population_list = []

for element in document.getroot():
    #print element.find('name').text
    for subelement in element.getiterator('city'):
        #print subelement.find('name').text
        for subsubelement in subelement.getiterator('population'):
            if (subsubelement.get('measured')=='census'):
                city_list.append(subelement.find('name').text)
                census_year_list.append(np.float(subsubelement.get('year')))
                population_list.append(np.float(subsubelement.text))


In [132]:
city_year_population = {'City':city_list,'Year':census_year_list,'Population':population_list}

In [161]:
df2 = pd.DataFrame(data=city_year_population)
df3 = df2.sort_values('Population',ascending=False)

In [170]:
df3.head()

Unnamed: 0,City,Population,Year
2190,Shanghai,22315474,2010
2189,Shanghai,15758892,2000
2669,Delhi,12877470,2001
2599,Mumbai,12442373,2011
2598,Mumbai,11914398,2001


In [204]:
df3.set_index(['City','Year']).head(12)

Unnamed: 0_level_0,Unnamed: 1_level_0,Population
City,Year,Unnamed: 2_level_1
Shanghai,2010,22315474
Shanghai,2000,15758892
Delhi,2001,12877470
Mumbai,2011,12442373
Mumbai,2001,11914398
Beijing,2010,11716620
Moskva,2010,11612885
São Paulo,2010,11152344
Tianjin,2010,11090314
Guangzhou,2010,11071424


## name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [1]:
document = ET.parse( './data/mondial_database.xml' )
airport_list = []
elevation_list = []
country_list = []
for element in document.getroot():
    for subelement in element.getiterator('airport'):
        #print subelement.find('name').text
        for subsubelement in subelement.getiterator('elevation'):
            airport_list.append(subelement.find('name').text)
            elevation_list.append(subsubelement.text )
            print subelement.find('country')

NameError: name 'ET' is not defined

In [252]:
airport_elevation = {'Airport':airport_list,'Elevation':elevation_list}

In [253]:
df4 = pd.DataFrame(data=airport_elevation)
df4['Elevation'] = df4['Elevation'].astype(float)

In [254]:
df4.sort_values('Elevation',ascending=False).head(10)

Unnamed: 0,Airport,Elevation
80,El Alto Intl,4063
219,Lhasa-Gonggar,4005
241,Yushu Batang,3963
813,Juliaca,3827
815,Teniente Alejandro Velasco Astete Intl,3311
82,Juana Azurduy De Padilla,2905
334,Mariscal Sucre Intl,2813
805,Coronel Fap Alfredo Mendivil Duarte,2719
807,Mayor General FAP Armando Revoredo Iglesias Ai...,2677
692,Licenciado Adolfo Lopez Mateos Intl,2581
