# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [1]:
from xml.etree import ElementTree as ET

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [2]:
document_tree = ET.parse( './data/mondial_database_less.xml' )

In [3]:
# print names of all countries
for child in document_tree.getroot():
    print child.find('name').text

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [4]:
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
    print '* ' + element.find('name').text + ':',
    capitals_string = ''
    for subelement in element.getiterator('city'):
        capitals_string += subelement.find('name').text + ', '
    print capitals_string[:-2]

* Albania: Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece: Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia: Skopje, Kumanovo
* Serbia: Beograd, Novi Sad, Niš
* Montenegro: Podgorica
* Kosovo: Prishtine
* Andorra: Andorra la Vella


****
## XML Exercises

Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [5]:
# Open Document
document = ET.parse( './data/mondial_database.xml' )
# document
# use this library to sort a dictionary by a specific key
from operator import itemgetter
# use this library to change convert accented characters into ascii
import unicodedata
import ast

# 10 countries with lowest infant mortality rates

In [6]:
countries = document.findall(".//country")

In [7]:
countries_infantMortality_list = [(t.find('name').text, float(t.find('infant_mortality').text)) 
                                  for t in countries if t.find('infant_mortality') is not None]
#print countries_infantMortality_list
print countries_infantMortality_list[0:10]

[('Albania', 13.19), ('Greece', 4.78), ('Macedonia', 7.9), ('Serbia', 6.16), ('Andorra', 3.69), ('France', 3.31), ('Spain', 3.33), ('Austria', 4.16), ('Czech Republic', 2.63), ('Germany', 3.46)]


In [8]:
#sorted_infantMortality_list = 
countries_infantMortality_list.sort(key = itemgetter(1))
countries_infantMortality_list[0:10]

[('Monaco', 1.81),
 ('Japan', 2.13),
 ('Norway', 2.48),
 ('Bermuda', 2.48),
 ('Singapore', 2.53),
 ('Sweden', 2.6),
 ('Czech Republic', 2.63),
 ('Hong Kong', 2.73),
 ('Macao', 3.13),
 ('Iceland', 3.15)]

# 10 cities with the largest population


In [9]:
cities = document.findall(".//city")

In [10]:
# creating list with cities and their population based on 2011 population
cities_list = list()
for t in cities:
    for i in t.findall('population'):
        # use the most current year, if possible
        if i.get('year') == '2011':
            cities_list.append((t.find('name').text, int(i.text)))
            break
        elif i.get('year') == None:
            # capture population even if no year tags are present
            cities_list.append((t.find('name').text, int(i.text)))
            break
        else:
            # compare the remaining years, if most current not present, and choose largest value
            max_pop = int(max(t.iterfind('population'), key=lambda p: int(p.attrib['year'])).text)
            cities_list.append((t.find('name').text, max_pop))
            break

In [11]:
cities_list.sort(key = itemgetter(1), reverse=True)
cities_list[0:10]

[('Shanghai', 22315474),
 ('Istanbul', 13710512),
 ('Mumbai', 12442373),
 ('Moskva', 11979529),
 ('Beijing', 11716620),
 (u'S\xe3o Paulo', 11152344),
 ('Tianjin', 11090314),
 ('Guangzhou', 11071424),
 ('Delhi', 11034555),
 ('Shenzhen', 10358381)]

# 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)

In [12]:
ethnic_list = list()
for country in countries:
    # extract each country population
    country_pop = 0
    for j in country.findall('population'):
        if j.get('year') == '2011':
            country_pop = float(j.text)
    # append each ethnic group's percentage to list
    for e_group in country.findall('ethnicgroup'):
        ethnic_list.append((e_group.text, float(e_group.get('percentage')) * country_pop / 100))
#ethnic_list

In [13]:
# Find unique ethnic group names using set function
unique_ethnic_list = set([i for [i,j] in ethnic_list])
#unique_ethnic_list

In [14]:
# Find the sum of each ethnic group
ethnic_group_sums = list()
for i in unique_ethnic_list:
    population_list = [float(y) for (x,y) in ethnic_list if x == i]
    #print i, population_list
    ethnic_group_sums.append((i, int(sum(population_list))))
#ethnic_group_sums

In [15]:
# Find 10 largest ethnic groups
ethnic_group_sums.sort(key = itemgetter(1), reverse=True)
ethnic_group_sums[0:10]

[('Indo-Aryan', 871815583),
 ('Dravidian', 302713744),
 ('African', 166391983),
 ('Bengali', 146776916),
 ('German', 74278485),
 ('English', 52820300),
 ('Mediterranean Nordic', 46815916),
 ('Persian', 38326331),
 ('Polish', 38018419),
 ('Mongol', 36325649)]

# Name and Country of a) longest river, b) largest lake and c) airport at highest elevation

# Longest River

In [16]:
river_name = None
river_country = None
river_length = 0
# Extract into a data frame, then sort using a pandas function
for river in document.iterfind('river'):
    for length in river.iterfind('length'):
        # not sure why you have to type-cast, but you do in order to get the correct values
        if river_length < float(length.text):
            river_length = float(length.text)
            river_country= river.attrib['country']
            river_name = river.findtext('name')
print(river_name, river_country, river_length)

('Amazonas', 'CO BR PE', 6448.0)


# Largest Lake

In [17]:
lake_name = None
lake_country = None
lake_area = 0
for lake in document.iterfind('lake'):
    for area in lake.iterfind('area'):
        if lake_area < float(area.text):
            lake_area=float(area.text)
            lake_country= lake.attrib['country']
            lake_name = lake.findtext('name')

print(lake_name, lake_country, lake_area)

('Caspian Sea', 'R AZ KAZ IR TM', 386400.0)


In [18]:
#Example of trying to run with one loop failed.
#for t in lakes:
#    if float(lake_area) < float(t.find('area').text):
#        lake_name = t.find('name').text
#        lake_country = t.attrib['country']
#        lake_area = float(t.find('area').text)

# Highest Elevated Airport

In [19]:
airport_name = None
airport_country = None
airport_elevation = 0
for airport in document.iterfind('airport'):
    for elevation in airport.iterfind('elevation'):
        # check for null values, as some airports apparently didn't list their elevation
        if (elevation.text is not None) and (airport_elevation < int(elevation.text)):
            airport_elevation = int(elevation.text)
            airport_country= airport.attrib['country']
            airport_name = airport.findtext('name')
print(airport_name, airport_country, airport_elevation)

('El Alto Intl', 'BOL', 4063)
