# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [2]:
from xml.etree import ElementTree as ET

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [3]:
# Parse the sample XML file; the example cells below query this tree.
sample_path = './data/mondial_database_less.xml'
document_tree = ET.parse(sample_path)

In [4]:
# List every country: iterate over the root's direct children and show the
# text of each child's <name> element.
root = document_tree.getroot()
for country_node in root:
    print(country_node.find('name').text)

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [5]:
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
    # Element.iter() walks the whole subtree below the country element; it
    # replaces getiterator(), which is deprecated in ElementTree.
    city_names = [city.find('name').text for city in element.iter('city')]
    # Build the complete line and print it once, so the output is the same
    # whether print is a statement (Python 2) or a function (Python 3).
    print('* ' + element.find('name').text + ': ' + ', '.join(city_names))

* Albania: Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece: Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia: Skopje, Kumanovo
* Serbia: Beograd, Novi Sad, Niš
* Montenegro: Podgorica
* Kosovo: Prishtine
* Andorra: Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and referring to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [31]:
# Parse the full XML file used by the exercise cells below.
database_path = './data/mondial_database.xml'
document = ET.parse(database_path)

In [83]:
from IPython.display import display, HTML

# Exercise 1
# Countries with no infant mortality rate provided are ignored.
# First collect [country, rate] pairs, then sort them ascending by rate.
mortality_rates = []
for country in document.findall('country'):
    name_elem = country.find('name')
    rate_elem = country.find('infant_mortality')
    if name_elem is None or rate_elem is None:
        # No name or no rate recorded for this country: skip it.
        continue
    try:
        country_mortality_rate = float(rate_elem.text)
    except (TypeError, ValueError):
        # Empty or non-numeric rate: treat it as "not provided".
        continue
    mortality_rates.append([name_elem.text, country_mortality_rate])

mortality_rates.sort(key=lambda r: r[1])

# Render the ten lowest rates as an HTML table; every row, including the
# header, is wrapped in its own <tr>...</tr>.
# NOTE(review): the '%' suffix assumes the dataset stores a percentage --
# confirm the unit of <infant_mortality> against the data source.
tbl = '<table><tr><th>Country</th><th>Rate</th></tr>'
for country, rate in mortality_rates[:10]:
    tbl += "<tr><td>" + country + "</td><td>" + format(rate, '.2f') + "%</td></tr>"
display(HTML(tbl + '</table>'))

0,1
Monaco,1.81%
Japan,2.13%
Norway,2.48%
Bermuda,2.48%
Singapore,2.53%
Sweden,2.60%
Czech Republic,2.63%
Hong Kong,2.73%
Macao,3.13%
Iceland,3.15%


In [86]:
# Exercise 2
# Cities with no population provided are ignored.
# For every city only the most recent population measurement (the one with
# the highest 'year' attribute) is considered.
# First collect [city, population] pairs, then sort them descending.
document = ET.parse('./data/mondial_database.xml')
city_population = []
for city in document.findall('.//city'):
    name_elem = city.find('name')
    measurements = city.findall('population')
    if name_elem is None or not measurements:
        # Unnamed city or no population recorded: skip it.
        continue
    try:
        latest = max(measurements, key=lambda m: int(m.attrib['year']))
        latest_count = int(latest.text)
    except (KeyError, TypeError, ValueError):
        # Missing/invalid 'year' attribute or non-numeric population.
        continue
    city_population.append([name_elem.text, latest_count])

city_population.sort(key=lambda r: r[1], reverse=True)

# Render the ten most populous cities as an HTML table. The first column
# holds city names, so it is labelled "City".
tbl = '<table><tr><th>City</th><th>Population</th></tr>'
for city, population in city_population[:10]:
    tbl += "<tr><td>" + city + "</td><td>" + format(population, ',d') + "</td></tr>"
display(HTML(tbl + "</table>"))

0,1
Shanghai,22315474
Istanbul,13710512
Mumbai,12442373
Moskva,11979529
Beijing,11716620
São Paulo,11152344
Tianjin,11090314
Guangzhou,11071424
Delhi,11034555
Shenzhen,10358381


In [113]:
# Exercise 3
# For each country we find the latest population (the <population> child
# with the highest 'year' attribute) and then all of its ethnic groups.
# The population of an ethnic group in a country is estimated as
# latest_population * percentage / 100, and the estimates are summed per
# ethnic group name in a dictionary.
import operator

ethnic_group = {}
for country in document.findall('country'):
    measurements = country.findall('population')
    if not measurements:
        # No population recorded: nothing to apportion for this country.
        continue
    try:
        latest = max(measurements, key=lambda m: int(m.attrib['year']))
        population = int(latest.text)
    except (KeyError, TypeError, ValueError):
        # Missing/invalid 'year' attribute or non-numeric population.
        continue

    for ethnicgroup in country.findall('.//ethnicgroup'):
        try:
            share = float(ethnicgroup.attrib['percentage']) / 100
        except (KeyError, ValueError):
            # Skip only this group; the country's other groups still count.
            continue
        ethnic_group_name = ethnicgroup.text
        ethnic_group[ethnic_group_name] = (
            ethnic_group.get(ethnic_group_name, 0) + round(population * share))

sorted_groups = sorted(ethnic_group.items(), key=operator.itemgetter(1), reverse=True)

# Render the ten largest groups as an HTML table; every row, including the
# header, is wrapped in its own <tr>...</tr>.
tbl = '<table><tr><th>Ethnic group</th><th>Population</th></tr>'
for en_group, pop in sorted_groups[:10]:
    tbl += "<tr><td>" + en_group + "</td><td>" + format(pop, ',.0f') + "</td></tr>"
display(HTML(tbl + '</table>'))

0,1
Han Chinese,1245058800
Indo-Aryan,871815583
European,494872221
African,318325122
Dravidian,302713744
Mestizo,157734355
Bengali,146776917
Russian,131856994
Japanese,126534212
Malay,121993550


In [164]:
# Exercise 4
# We visit each river, lake and airport and resolve the countries it
# belongs to. Then we find the longest river, the largest lake and the
# airport at the highest elevation.
# Be careful the Nile doesn't have recorded length in this dataset!


def _country_names_by_code(tree):
    """Map each top-level country's 'car_code' attribute to its <name> text.

    Only the first country seen for a given code is kept. A country with
    no <name> element is stored as None, so that any feature referencing
    it is skipped when its country names are joined.
    """
    names = {}
    for country in tree.findall('country'):
        code = country.get('car_code')
        if code is None or code in names:
            continue
        name_elem = country.find('name')
        names[code] = name_elem.text if name_elem is not None else None
    return names


def _largest_feature(tree, tag, measure_tag, country_names):
    """Return (countries, name, value) for the `tag` element with the
    greatest numeric `measure_tag` child.

    `countries` is the comma-joined list of country names resolved from the
    element's whitespace-separated 'country' attribute via `country_names`.
    Elements lacking a name, a numeric measure, a 'country' attribute, or a
    resolvable country are skipped. If nothing exceeds 0.0 the initial
    ('', '', 0.0) is returned.
    """
    best = ('', '', 0.0)
    for feature in tree.findall('.//' + tag):
        try:
            feature_name = feature.find('name').text
            value = float(feature.find(measure_tag).text)
            codes = feature.attrib['country'].split()
            countries = ",".join(country_names[code] for code in codes)
        except (AttributeError, KeyError, TypeError, ValueError):
            # Incomplete record (missing child/attribute, non-numeric
            # measure, unknown or unnamed country): ignore this feature.
            continue
        if value > best[2]:
            best = (countries, feature_name, value)
    return best


# Build the code -> name lookup once instead of querying the tree for every
# country code of every feature.
names_by_code = _country_names_by_code(document)

longest_river = _largest_feature(document, 'river', 'length', names_by_code)
largest_lake = _largest_feature(document, 'lake', 'area', names_by_code)
airport_highest_elev = _largest_feature(document, 'airport', 'elevation', names_by_code)

print("Longest River: " + str(longest_river))
print("Largest Lake: " + str(largest_lake))
print("Airport with highest elevation: " + str(airport_highest_elev))

Longest River: ('Colombia,Brazil,Peru', 'Amazonas', 6448.0)
Largest Lake: ('Russia,Azerbaijan,Kazakhstan,Iran,Turkmenistan', 'Caspian Sea', 386400.0)
Airport with highest elevation: ('Bolivia', 'El Alto Intl', 4063.0)
