# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [2]:
from xml.etree import ElementTree as ET

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [3]:
document_tree = ET.parse( './data/mondial_database_less.xml' )

In [7]:
# print names of all countries
for child in document_tree.getroot():
    print(child.find('name').text)

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [28]:
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
    print('* ' + element.find('name').text + ':'),
    capitals_string = ''
    for subelement in element.getiterator('name'):
        capitals_string += subelement.text + ', '
    print(capitals_string[:-2])

* Albania:
Albania, Tirana, Tirane, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece:
Greece, Anatolikis Makedonias kai Thrakis, Komotini, Kavala, Attikis, Athina, Peiraias, Piräus, Peristeri, Acharnes, Dytikis Elladas, Patra, Patras, Dytikis Makedonias, Kozani, Ionion Nison, Kerkyra, Korfu, Ipeiroy, Ioannina, Kentrikis Makedonias, Thessaloniki, Kritis, Iraklio, Iraklion, Chania, Notioy Aigaioy, Ermoupoli, Rhodes, Rhodos, Peloponnisos, Tripoli, Stereas Elladas, Lamia, Chalkida, Chalkis, Thessalias, Larissa, Volos, Boreioy Aigaioy, Mytilini, Aghion Oros, Karyes
* Macedonia:
Macedonia, Skopje, Kumanovo
* Serbia:
Serbia, Beograd, Novi Sad, Niš
* Montenegro:
Montenegro, Podgorica
* Kosovo:
Kosovo, Prishtine, Pristina
* Andorra:
Andorra, Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [83]:
document = ET.parse( './data/mondial_database.xml' )

In [113]:
import pandas as pd

In [114]:
#Question 1: Find the 10 countries with the lowest infant mortality rates

#Initialize dictionary
mortRates = {}

#Iterate through element tree to extract country names and infant mortality rates. Assign mortality rates to each country in the dictionary
for element in document.iterfind('country'):
    mortRates[element.findtext('name')] = element.findtext('infant_mortality')

#Convert dictionary to dataframe and remove any rows without mortality information
mortRatesDF = pd.DataFrame({'Infant Mortality': mortRates})
mortRatesDF = mortRatesDF.dropna()

#Sort by infant mortality rates and return 10 lowest values
sortedMortRates = mortRatesDF.sort_values(by = ['Infant Mortality'])
sortedMortRates[0:10]

Unnamed: 0,Infant Mortality
Monaco,1.81
Romania,10.16
Fiji,10.2
Brunei,10.48
Grenada,10.5
Mauritius,10.59
Panama,10.7
Seychelles,10.77
United Arab Emirates,10.92
Barbados,10.93


In [168]:
#Question 2: Find the 10 cities with the largest populations

#Initialize dictionary
cityPop = {}

#Iterate through element tree to extract city names and populations. Assign population to each city in the dictionary    
for country in document.iterfind('country'):
    for city in country.iterfind('city'):
        city_name = city.findtext('name')
        for popData in city.iterfind('population'):
            population = popData        
        cityPop[city_name] = population.text
        
#Convert dictionary to dataframe and convert populations to floats
cityPopDF = pd.DataFrame({'City Population': cityPop})
cityPopDF['City Population'] = cityPopDF['City Population'].apply(pd.to_numeric)

#Sort by population and return 10 highest values
sortedCityPop = cityPopDF.sort_values(by = ['City Population'], ascending = False)
sortedCityPop[0:10]


Unnamed: 0,City Population
Seoul,9708483
Al Qahirah,8471859
Bangkok,7506700
Hong Kong,7055071
Macao,7055071
Ho Chi Minh,5968384
Singapore,5076700
Al Iskandariyah,4123869
New Taipei,3939305
Busan,3403135


In [185]:
#Question 3: Find the 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)

#Initialize dictionary
ethnicGrp = {}

#Iterate through element tree to extract country population   
for country in document.iterfind('country'):
    for population in city.iterfind('population'):
         totalPopulation = int(population.text)
            
#Continue iterating through the country's ethnic groups and calculate ethnic population from ethnic % and total population
    for ethnicGroup in country.iterfind('ethnicgroup'):
        groupName = country.findtext('ethnicgroup')
        percentage = float(ethnicGroup.get('percentage'))
        ethnicPop = percentage * totalPopulation
        if groupName in ethnicGrp.keys():
            ethnicGrp[groupName] += ethnicPop
        else:
            ethnicGrp[groupName] = ethnicPop

#Convert dictionary to dataframe
ethnicGrpDF = pd.DataFrame({'Ethnic Population': ethnicGrp})

#Sort by population and return 10 highest values
sortedEthnicPop = ethnicGrpDF.sort_values(by = ['Ethnic Population'], ascending = False)
sortedEthnicPop[0:10]

Unnamed: 0,Ethnic Population
European,40669887.5
African,29342247.0
Chinese,27479485.0
Mestizo,26812786.0
Arab,15913381.0
Polynesian,14594965.0
Jewish,12452539.0
Indian,9610953.0
Black,9291337.0
Armenian,7453545.0


In [189]:
#Question 4: Find the name and country of a) longest river, b) largest lake and c) airport at highest elevation
#Part a
#Initialize length of zero to begin comparisons
maxLength = 0

#Iterate through all rivers
for river in document.iterfind('river'):
    name = river.findtext('name')
    country = river.get('country')
    try: 
        length = float(river.findtext('length'))
    except: 
        pass
    if length > maxLength:
        maxLength = length
        longestRiver = name
        countryWithRiver = country
countryWithRiver, longestRiver

('CO BR PE', 'Amazonas')

In [191]:
#Part b
#Initialize area of zero to begin comparisons
maxArea = 0

#Iterate through all lakes
for lake in document.iterfind('lake'):
    name = lake.findtext('name')
    country = lake.get('country')
    try: 
        area = float(lake.findtext('area'))
    except: 
        pass
    if area > maxArea:
        maxArea = area
        largestLake = name
        countryWithLake = country
countryWithLake, largestLake

('R AZ KAZ IR TM', 'Caspian Sea')

In [192]:
#Part c
#Initialize elevation of zero to begin comparisons
maxElevation = 0

#Iterate through all airports
for airport in document.iterfind('airport'):
    name = airport.findtext('name')
    country = airport.get('country')
    try: 
        elevation = float(airport.findtext('elevation'))
    except: 
        pass
    if elevation > maxElevation:
        maxElevation = elevation
        highestAirport = name
        countryWithAirport = country
countryWithAirport, highestAirport

('BOL', 'El Alto Intl')