# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [2]:
from xml.etree import ElementTree as ET

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [3]:
document_tree = ET.parse( './data/mondial_database_less.xml' )

In [4]:
# print names of all countries
for child in document_tree.getroot():
    print child.find('name').text

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [5]:
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
    print '* ' + element.find('name').text + ':',
    capitals_string = ''
    for subelement in element.getiterator('city'):
        capitals_string += subelement.find('name').text + ', '
    print capitals_string[:-2]

* Albania: Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece: Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia: Skopje, Kumanovo
* Serbia: Beograd, Novi Sad, Niš
* Montenegro: Podgorica
* Kosovo: Prishtine
* Andorra: Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [6]:
document = ET.parse( './data/mondial_database.xml' )

# 10 countries with the lowest infant mortality rates

In [7]:
infant_dict = {}
for element in document.iterfind('country'):
    if element.find('infant_mortality') == None:
        infant_dict[element.find('name').text] = ''
    else: 
        infant_dict[element.find('name').text] = float(element.find('infant_mortality').text)

In [8]:
for w in sorted(infant_dict, key=infant_dict.get, reverse=False)[0:10]:
    print w, infant_dict[w] #country with lowest infant mortality

Monaco 1.81
Japan 2.13
Bermuda 2.48
Norway 2.48
Singapore 2.53
Sweden 2.6
Czech Republic 2.63
Hong Kong 2.73
Macao 3.13
Iceland 3.15


# 10 cities with the largest population

In [9]:
population_city_dict = {}
for city in document.findall("./country/city"):
    name = city.findall('name')[-1]
    population = city.find('population')
    if population == None:
        print "No population for the city"
    else:
        last_population = city.findall('population')[-1]
        population_city_dict[name.text] = float(last_population.text)

    
for city in document.findall("./country/province/city"):
    name = city.findall('name')[-1]
    population = city.find('population')
    if population == None:
        print "No population for the city"
    else:
        last_population = city.findall('population')[-1]
        population_city_dict[name.text] = float(last_population.text)

No population for the city
No population for the city
No population for the city
No population for the city
No population for the city
No population for the city
No population for the city
No population for the city
No population for the city
No population for the city
No population for the city
No population for the city
No population for the city
No population for the city
No population for the city
No population for the city
No population for the city
No population for the city
No population for the city
No population for the city
No population for the city
No population for the city
No population for the city
No population for the city
No population for the city
No population for the city
No population for the city
No population for the city
No population for the city
No population for the city
No population for the city
No population for the city
No population for the city
No population for the city
No population for the city
No population for the city
No population for the city
N

In [10]:
import operator
sorted_cities = sorted(population_city_dict.items(), key=operator.itemgetter(1), reverse= True)
sorted_cities[0:10]

[('Shanghai', 22315474.0),
 ('Istanbul', 13710512.0),
 ('Mumbai', 12442373.0),
 ('Moscow', 11979529.0),
 ('Beijing', 11716620.0),
 (u'S\xe3o Paulo', 11152344.0),
 ('Tianjin', 11090314.0),
 ('Guangzhou', 11071424.0),
 ('Delhi', 11034555.0),
 ('Shenzhen', 10358381.0)]

# 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)

In [11]:
#country population dictionary
population_dict = {}
for country in document.findall('country'):
    name = country.find('name')
    population = country.findall('population')[-1]
    population_dict[name.text] = float(population.text)

In [12]:
ethnicity_dict = {}
#initializing to zero all enthinity groups population
for group in document.findall("./country/ethnicgroup"):
    ethnicity_dict[group.text] =  0

#adding the contribution given by all countries    
for country in document.findall('country'):
    population = population_dict[country.find('name').text]
    for group in country.getiterator('ethnicgroup'):
        ethnicity_dict[group.text] += (population*float(group.attrib['percentage']))/100.0

In [13]:
sorted(ethnicity_dict.items(), reverse=True, key=operator.itemgetter(1))[0:10]

[('Han Chinese', 1245058800.0),
 ('Indo-Aryan', 871815583.44),
 ('European', 494872219.71959996),
 ('African', 318325120.369),
 ('Dravidian', 302713744.25),
 ('Mestizo', 157734354.93699998),
 ('Bengali', 146776916.72),
 ('Russian', 131856996.077),
 ('Japanese', 126534212.0),
 ('Malay', 121993550.374)]

# Name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [14]:
import pandas as pd
import numpy as np
river = {'name':[],'country':[],'length':[]}
rivers_df = pd.DataFrame(river)

In [15]:
for river in document.findall("./river"):
    name = river.find('name').text
    if river.find('length') == None:
        length = 0
    else:
        length = float(river.find('length').text)
    #country = river.find('located')
    if river.find('source') == None:
        country = 'Unknown'
    else:
        country = river.find('source').get('country')
    rivers_df = rivers_df.append({'name':name, 'country':country, 'length':length}, ignore_index=True)

In [16]:
#it seems that the Nile(s) don't have associated length
rivers_df[rivers_df['length']==0]

Unnamed: 0,country,length,name
190,SUD,0.0,Nile
193,SSD,0.0,White Nile
198,EAU,0.0,Bahr el-Djebel/Albert-Nil
200,EAU,0.0,Victoria Nile
222,ZRE,0.0,Lualaba


In [17]:
rivers_df.sort_values(by= ['length'], ascending= False)[0:1]

Unnamed: 0,country,length,name
174,PE,6448.0,Amazonas


In [19]:
lakes = {'name':[],'country':[],'area':[]}
lakes_df = pd.DataFrame(lakes)

for lake in document.findall("./lake"):
    name = lake.find('name').text
    if lake.find('area') == None:
        area = 0
    else:
        area= float(lake.find('area').text)
    country = lake.get('country')
    #if river.find('source') == None:
     #   country = 'Unknown'
    #else:
     #   country = river.find('source').get('country')
    lakes_df = lakes_df.append({'name':name, 'country':country, 'area':area}, ignore_index=True)

In [20]:
lakes_df.sort_values(by= ['area'], ascending= False)[0:1]

Unnamed: 0,area,country,name
54,386400.0,R AZ KAZ IR TM,Caspian Sea


In [21]:
airports = {'name':[],'country':[],'elevation':[]}
airports_df = pd.DataFrame(airports)

for airport in document.findall("./airport"):
    name = airport.find('name').text
    elevation = airport.find('elevation').text
    country = airport.get('country')
    airports_df = airports_df.append({'name':name, 'country':country, 'elevation':elevation}, ignore_index=True)

In [22]:
airports_df.sort_values(by= ['elevation'], ascending= False)[0:1]

Unnamed: 0,country,elevation,name
536,IR,995,Mashhad
