# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [5]:
from xml.etree import ElementTree as ET


## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [6]:
# Parse the smaller sample file; the two example cells below read from this tree.
document_tree = ET.parse( './data/mondial_database_less.xml' )

In [18]:
# List every top-level entry of the sample document by its <name> child.
for entry in document_tree.getroot():
    name_node = entry.find('name')
    print(name_node.text)

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [19]:
# print names of all countries and their cities
# Each country produces two lines: "* <country>:" followed by a
# comma-separated list of the names of every <city> below it.
for element in document_tree.iterfind('country'):
    print('* ' + element.find('name').text + ':')
    # Element.iter() walks all descendants with the given tag; it replaces
    # getiterator(), which no longer exists in Python 3.9+.
    city_names = [city.find('name').text for city in element.iter('city')]
    # join() yields an empty line for a country without cities, exactly as
    # slicing the trailing ', ' off an empty string did.
    print(', '.join(city_names))

* Albania:
Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece:
Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia:
Skopje, Kumanovo
* Serbia:
Beograd, Novi Sad, Niš
* Montenegro:
Podgorica
* Kosovo:
Prishtine
* Andorra:
Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and referring to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [27]:
# Parse the full database file that the exercise parts below work on.
document = ET.parse( './data/mondial_database.xml' )

In [28]:
# Root element of the exercise document; every exercise part searches from here.
root = document.getroot()

## Part 1: 10 countries with the lowest infant mortality rates

In [44]:
# Map country name -> infant mortality rate (float), then print the ten
# lowest rates in ascending order as "<rank>. <country>: <rate>".
inf_mort = {}

for country in root.findall('country'):
    rate_node = country.find('infant_mortality')
    # Countries without an <infant_mortality> element are skipped outright,
    # so no value from a previously visited country can be re-used for them.
    if rate_node is None:
        continue
    inf_mort[country.find('name').text] = float(rate_node.text)

# sorted() is stable, so countries with equal rates keep document order.
for rank, key in enumerate(sorted(inf_mort, key=inf_mort.get)[:10], start=1):
    print(str(rank) + '. ' + key + ": " + str(inf_mort[key]))

1. Monaco: 1.81
2. Japan: 2.13
3. Norway: 2.48
4. Bermuda: 2.48
5. Singapore: 2.53
6. Sweden: 2.6
7. Czech Republic: 2.63
8. Hong Kong: 2.73
9. Macao: 3.13
10. Iceland: 3.15


## Part 2: 10 cities with the largest population

In [49]:
# Map city name -> largest <population> figure found for that name anywhere
# in the document, then print the ten biggest in descending order.
city_pop = {}

for city_node in root.iter('city'):
    city_name = city_node.find('name').text
    for census in city_node.findall('population'):
        headcount = int(census.text)
        best_so_far = city_pop.get(city_name)
        # Only a strictly larger figure replaces one that is already stored.
        if best_so_far is None or best_so_far < headcount:
            city_pop[city_name] = headcount

largest_first = sorted(city_pop, key=city_pop.get, reverse=True)
for rank, key in enumerate(largest_first[:10], start=1):
    print(str(rank) + '. ' + key + ": " + str(city_pop[key]))

1. Shanghai: 22315474
2. Istanbul: 13710512
3. Delhi: 12877470
4. Mumbai: 12442373
5. Moskva: 11979529
6. Beijing: 11716620
7. São Paulo: 11152344
8. Tianjin: 11090314
9. Guangzhou: 11071424
10. Shenzhen: 10358381


## Part 3: 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)

In [61]:
# Map ethnic group name -> estimated head count summed over all countries,
# then print the ten largest groups in descending order.
eth_pop = {}

for nation in root.findall('country'):

    # Take the <population> entry with the highest 'year' attribute; a
    # country with no such entry contributes a population of 0.
    newest_year, headcount = 0, 0
    for census in nation.findall('population'):
        census_year = int(census.attrib['year'])
        if census_year <= newest_year:
            continue
        newest_year, headcount = census_year, int(census.text)

    # Each <ethnicgroup> carries a 'percentage' attribute; turn it into a
    # head count and add it to the running total for that group.
    for group in nation.findall('ethnicgroup'):
        share = float(group.attrib['percentage']) / 100
        eth_pop[group.text] = eth_pop.get(group.text, 0.0) + share * headcount

largest_first = sorted(eth_pop, key=eth_pop.get, reverse=True)
for rank, key in enumerate(largest_first[:10], start=1):
    print(str(rank) + '. ' + key + ": " + str(eth_pop[key]))

1. Han Chinese: 1245058800.0
2. Indo-Aryan: 871815583.4399999
3. European: 494872219.7195999
4. African: 318325120.36899996
5. Dravidian: 302713744.25
6. Mestizo: 157734354.93699998
7. Bengali: 146776916.72
8. Russian: 131856996.077
9. Japanese: 126534212.00000001
10. Malay: 121993550.374


## Part 4: name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [64]:
import pandas as pd

In [87]:
#part a
# Build one table row per river that has a <length> element, then show the
# row with the greatest length.

countries, names, lengths = [], [], []

for river in root.findall('river'):
    length_node = river.find('length')
    if length_node is None:
        continue

    country = river.attrib['country']
    name = river.find('name').text
    length = float(length_node.text)

    # Rows with an empty country code, an empty name or a zero length are dropped.
    if country == '' or name == '' or length == 0:
        continue

    countries.append(country)
    names.append(name)
    lengths.append(length)

d = {
    'country' : countries,
    'river_name' : names,
    'river_length' : lengths,
}

dfr = pd.DataFrame(d).sort_values(by='river_length', ascending=False)
dfr.head(1)


Unnamed: 0,country,river_length,river_name
174,CO BR PE,6448.0,Amazonas


In [101]:
# Scratch cell: look up the <area> child of every <lake>.
# After the loop `l` holds the lookup result for the last lake visited.
for lake_node in root.findall('lake'):
    l = lake_node.find('area')

In [112]:
#part b
# Build one table row per lake that has an <area> element, then show the
# row with the greatest area.

countries, names, areas = [], [], []

for lake in root.findall('lake'):
    # `area` starts at 0.0 for every lake and is only overwritten when the
    # lake actually has an <area> element.
    area = 0.0
    area_node = lake.find('area')
    if area_node is None:
        continue

    country = lake.attrib['country']
    name = lake.find('name').text
    area = float(area_node.text)

    # Rows with an empty country code, an empty name or a zero area are dropped.
    if country == '' or name == '' or area == 0.0:
        continue

    countries.append(country)
    names.append(name)
    areas.append(area)

d = {
    'country' : countries,
    'lake_name' : names,
    'lake_area' : areas,
}

dfl = pd.DataFrame(d).sort_values(by='lake_area', ascending=False)
dfl.head(1)


Unnamed: 0,country,lake_area,lake_name
54,R AZ KAZ IR TM,386400.0,Caspian Sea


In [126]:
#part c
# Build one table row per airport that reports an elevation, then show the
# row with the greatest elevation.

countries = []
names = []
elevations = []

for airport in root.findall('airport'):
    elevation_node = airport.find('elevation')
    # Skip airports whose <elevation> element is missing or empty; there is
    # nothing to rank them by.
    if elevation_node is None or elevation_node.text is None:
        continue

    country = airport.attrib['country']
    name = airport.find('name').text

    # The filter looks only at this airport's own fields. An elevation of 0
    # is a valid value and is kept.
    if country != '' and name != '':
        countries.append(country)
        names.append(name)
        elevations.append(int(elevation_node.text))

d = {'country' : countries,
     'airport_name' : names,
     'airport_elevation' : elevations}

dfa = pd.DataFrame(d)
dfa = dfa.sort_values(by='airport_elevation', ascending=False)
dfa.head(1)



Unnamed: 0,airport_elevation,airport_name,country
80,4063,El Alto Intl,BOL
