# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [1]:
import numpy as np
import pandas as pd
from xml.etree import ElementTree as ET

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [2]:
document_tree = ET.parse( './data/mondial_database_less.xml' )

In [3]:
# print names of all countries
for child in document_tree.getroot():
    print child.find('name').text

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [4]:
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
    print '* ' + element.find('name').text + ':',
    capitals_string = ''
    for subelement in element.getiterator('city'):
        capitals_string += subelement.find('name').text + ', '
    print capitals_string[:-2]

* Albania: Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece: Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia: Skopje, Kumanovo
* Serbia: Beograd, Novi Sad, Niš
* Montenegro: Podgorica
* Kosovo: Prishtine
* Andorra: Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [5]:
document = ET.parse( './data/mondial_database.xml' )
root = document.getroot()

In [6]:
# solution to problem 1: top 10 countries with the lowest infant mortality

countrynames = []
infant_mortalities = []

for country in root.findall('country'):
    try:
        countrynames.append(country.find('name').text)
    except:
        countrynames.append(np.nan)
    try:
        infant_mortalities.append(float(country.find('infant_mortality').text))
    except:
        infant_mortalities.append(np.nan)

s = pd.Series(data=infant_mortalities, index=countrynames, dtype=float, name='infant_mortality')
s.sort_values(ascending=True).head(10)

Monaco            1.81
Japan             2.13
Bermuda           2.48
Norway            2.48
Singapore         2.53
Sweden            2.60
Czech Republic    2.63
Hong Kong         2.73
Macao             3.13
Iceland           3.15
Name: infant_mortality, dtype: float64

In [7]:
# solution to problem 2: top 10 cities with the largest population 
# cities seem to have multiple population values 
# using the most recent of all of the population values for this problem 

citynames = []
populations = []
for country in root.findall('country'):
    for city in country.findall('city'):
        
        # figure out the most recent population of the city and add it to the lists of citynames and populations
        most_recent_year = 0
        most_recent_population = 0
        for population in city.findall('population'):
            if float(population.attrib['year']) > most_recent_year:
                most_recent_year = float(population.attrib['year'])
                most_recent_population = float(population.text)
        populations.append(most_recent_population)
        citynames.append(city.find('name').text)
        
# process the data to answer the question 
s = pd.Series(data=populations, index=citynames, name='city_populations')
s.sort_values(ascending=False).head(10)

Seoul              9708483.0
Al Qahirah         8471859.0
Bangkok            7506700.0
Hong Kong          7055071.0
Ho Chi Minh        5968384.0
Singapore          5076700.0
Al Iskandariyah    4123869.0
New Taipei         3939305.0
Busan              3403135.0
Pyongyang          3255288.0
Name: city_populations, dtype: float64

In [8]:
# solution to problem 3: top 10 ethnic groups with the largest overall population 

ethnic_groups = []

for country in root.findall('country'):
    
    # figure out the most recent population of the country
    most_recent_year = 0
    most_recent_population = 0
    for population in country.findall('population'):
        if float(population.attrib['year']) > most_recent_year:
            most_recent_year = float(population.attrib['year'])
            most_recent_population = float(population.text)

    # figure out the ethnic groups and their populations in this coutry and append it to ethnic_groups list
    mylist = [[x.text, float(x.attrib['percentage']) * most_recent_population / 100.] for x in country.findall('ethnicgroup')]
    for group in mylist:
        ethnic_groups.append(group)
        
# process the data to answer the question
ethnic_groups = pd.DataFrame(ethnic_groups, columns=['ethnicgroup', 'population'])
ethnic_groups.groupby('ethnicgroup').agg(np.sum).sort_values(by='population', ascending=False).head(10)

Unnamed: 0_level_0,population
ethnicgroup,Unnamed: 1_level_1
Han Chinese,1245059000.0
Indo-Aryan,871815600.0
European,494872200.0
African,318325100.0
Dravidian,302713700.0
Mestizo,157734400.0
Bengali,146776900.0
Russian,131857000.0
Japanese,126534200.0
Malay,121993600.0


In [9]:
# solution to problem 4: longest river, largest lake and airport at the highest elevation 
# printing top five instead of just the top candidate 

mydict = {'river' : 'length', 'lake' : 'area', 'airport' : 'elevation'}

def top_five(mytuple):
    '''figure out the top five mytuple[1]s of mytuple[0]s'''
    names = []
    countries = []
    y = []
    for thing in root.findall(mytuple[0]):
        try:
            names.append(thing.find('name').text)
        except:
            names.append(np.nan)
        try:
            countries.append(thing.attrib['country'])
        except:
            countries.append(np.nan)
        try:
            y.append(float(thing.find(mytuple[1]).text))
        except:
            y.append(np.nan)
    mydata = {mytuple[0] : names, mytuple[1] : y, 'countries': countries}
    df = pd.DataFrame(mydata)
    return df.sort_values(by=mytuple[1] ,ascending=False).head().set_index(mytuple[0])

for mytuple in mydict.iteritems():
    print 'top five ' + mytuple[0] + ' ' + mytuple[1] + 's:\n'
    print top_five(mytuple)
    print '\n'

top five airport elevations:

                                       countries  elevation
airport                                                    
El Alto Intl                                 BOL     4063.0
Lhasa-Gonggar                                 CN     4005.0
Yushu Batang                                  CN     3963.0
Juliaca                                       PE     3827.0
Teniente Alejandro Velasco Astete Intl        PE     3311.0


top five river lengths:

         countries  length
river                     
Amazonas  CO BR PE  6448.0
Jangtse         CN  6380.0
Hwangho         CN  4845.0
Lena             R  4400.0
Zaire      RCB ZRE  4374.0


top five lake areas:

                   area       countries
lake                                   
Caspian Sea    386400.0  R AZ KAZ IR TM
Lake Superior   82103.0         CDN USA
Lake Victoria   68870.0     EAT EAK EAU
Lake Huron      59600.0         CDN USA
Lake Michigan   57800.0             USA


