# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [20]:
from xml.etree import ElementTree as ET

In [22]:
import xml.etree.ElementTree as ET

# Load the reduced Mondial sample, keep the parsed tree and its root element
# under separate names, then echo the root so the cell displays it.
tree = ET.parse(source='data/mondial_database_less.xml')
root = tree.getroot()
root

<Element 'mondial' at 0x1043fd688>

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [200]:
# Parse the reduced sample file into the ElementTree the example cells query.
document_tree = ET.parse(source='data/mondial_database_less.xml')

In [201]:
# Fetch the top-level element of the parsed sample document.
root = document_tree.getroot()

In [202]:
# Walk the direct children of the root element and print each one's <name> text.
for node in document_tree.getroot():
    name_node = node.find('name')
    print(name_node.text)


Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [203]:
# Print every country followed by a comma-separated list of its cities.
for country in document_tree.iterfind('country'):
    print('* ' + country.find('name').text + ':')
    # Element.iter() replaces getiterator(), which was removed in Python 3.9.
    # It walks the whole subtree, so cities at any depth below the country
    # element are included.
    city_names = [city.find('name').text for city in country.iter('city')]
    # join() yields an empty string for a country without cities, matching
    # what slicing the trailing ', ' off an empty string produced before.
    print(', '.join(city_names))

* Albania:
Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece:
Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia:
Skopje, Kumanovo
* Serbia:
Beograd, Novi Sad, Niš
* Montenegro:
Podgorica
* Kosovo:
Prishtine
* Andorra:
Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and referring to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [133]:
import numpy as np
import pandas as pd

In [197]:
# Parse the complete Mondial database; the exercise cells query this tree.
document = ET.parse(source='./data/mondial_database.xml')

In [261]:
# Collect one (country name, infant-mortality rate) pair per <country> element.
# Only <country> elements are visited, so other top-level nodes of the
# document no longer end up in the frame as spurious NaN rows.
mortality_records = []
for country_el in document.iterfind('country'):
    rate_el = country_el.find('infant_mortality')
    # Countries without an <infant_mortality> child are kept, with NaN as rate.
    rate = float('nan') if rate_el is None else float(rate_el.text)
    mortality_records.append((country_el.findtext('name'), rate))

# Build the frame once from the collected records instead of enlarging two
# Series one label at a time. Columns keep their positional labels:
# 0 = country name, 1 = infant-mortality rate.
df = pd.DataFrame(mortality_records, columns=[0, 1])

In [263]:
# Order by infant-mortality rate (column 1), lowest first, and show ten rows.
df.sort_values(by=1).iloc[:10]

Unnamed: 0,0,1
38,Monaco,1.81
98,Japan,2.13
117,Bermuda,2.48
36,Norway,2.48
106,Singapore,2.53
37,Sweden,2.6
10,Czech Republic,2.63
78,Hong Kong,2.73
79,Macao,3.13
44,Iceland,3.15


In [266]:
# Gather one (city name, population, year) record per <population> entry of
# every <city> found anywhere below a <country> element.
city_records = []
for country_el in document.iterfind('country'):
    for city_el in country_el.iterfind('.//city'):
        city_name = city_el.find('name').text
        for pop_el in city_el.findall('population'):
            city_records.append(
                (city_name, int(pop_el.text), int(pop_el.attrib['year']))
            )

# Build the frame in one step rather than growing three Series label by label.
# Columns keep their positional labels: 0 = city, 1 = population, 2 = year.
# Population stays a float column, as it was when built from a float Series,
# so downstream results render the same way.
df = pd.DataFrame(city_records, columns=[0, 1, 2]).astype({1: float})

In [276]:
# For each city name (column 0) keep the single record holding its largest
# population (column 1), so the year shown (column 2) belongs to that figure
# instead of being an independent per-column maximum. Then list the top ten.
# NOTE(review): cities are grouped by name only, so different cities that
# share a name are still treated as one entry.
(
    df.loc[df.groupby(0)[1].idxmax()]
    .set_index(0)
    .sort_values(by=1, ascending=False)
    .head(10)
)

Unnamed: 0_level_0,1,2
0,Unnamed: 1_level_1,Unnamed: 2_level_1
Shanghai,22315474.0,2010
Istanbul,13710512.0,2012
Delhi,12877470.0,2011
Mumbai,12442373.0,2011
Moskva,11979529.0,2013
Beijing,11716620.0,2010
São Paulo,11152344.0,2010
Tianjin,11090314.0,2010
Guangzhou,11071424.0,2010
Shenzhen,10358381.0,2010


In [385]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Estimate the size of every ethnic group in every country by applying the
# group's percentage to the country's population figure.
ethnic_records = []
for country_el in document.iterfind('country'):
    population_els = country_el.findall('./population')
    if not population_els:
        # Nothing to scale the percentages by. Skipping also prevents the
        # previous country's population from being silently reused.
        continue
    # The last <population> child is taken as the latest estimate
    # (assumes the file lists them chronologically — TODO confirm).
    latest_population = int(population_els[-1].text)
    for group_el in country_el.iterfind('./ethnicgroup'):
        share = float(group_el.attrib['percentage'])
        # Truncate to whole people, as before.
        ethnic_records.append(
            (group_el.text, int(latest_population * share / 100))
        )

# Columns keep their positional labels: 0 = ethnic group, 1 = estimated people.
df = pd.DataFrame(ethnic_records, columns=[0, 1])

In [391]:
# Total the per-country estimates for each ethnic group (column 0) and show
# the ten largest totals.
df.groupby(0).sum().sort_values(by=1, ascending=False).iloc[:10]

Unnamed: 0_level_0,1
0,Unnamed: 1_level_1
Han Chinese,1245058800
Indo-Aryan,871815583
European,494872201
African,318325104
Dravidian,302713744
Mestizo,157734349
Bengali,146776916
Russian,131856989
Japanese,126534212
Malay,121993548


In [390]:
# Grand total across all ethnic groups, as a sanity check on the estimates.
df.groupby(0).sum().sum()

1    5960384710
dtype: int64

In [441]:
def _largest_feature(tree, tag, measure_tag):
    """Find the ``tag`` element whose ``measure_tag`` child holds the largest number.

    Parameters:
        tree: parsed ElementTree whose top-level ``tag`` elements are searched.
        tag: element name to scan, e.g. ``'river'``.
        measure_tag: name of the child element holding the numeric measure.

    Returns:
        A ``(name, value, country)`` tuple: the text of the winner's ``name``
        child, its measure as a float, and the raw value of its ``country``
        attribute. If no element has a measure above zero the result is
        ``("", 0.0, "")``.

    Raises:
        ValueError: if a measure's text is not a valid number.
    """
    best_name, best_value, best_country = "", 0.0, ""
    for feature in tree.iterfind(tag):
        measure = feature.find(measure_tag)
        # A missing child and an empty child (text is None) both mean there is
        # no measurement; such a feature can never win, so skip it instead of
        # failing on it.
        if measure is None or measure.text is None:
            continue
        value = float(measure.text)
        if value > best_value:
            best_value = value
            best_name = feature.findtext('name', default="")
            best_country = feature.get('country', "")
    return best_name, best_value, best_country


longestRiverNameTxt, longestRiverLengthNum, longestRiverCountryCode = _largest_feature(
    document, 'river', 'length')
print('*LongestRiver* ' + longestRiverNameTxt, longestRiverLengthNum, longestRiverCountryCode)

largestLakeNameTxt, largestLakeAreaNum, largestLakeCountryCode = _largest_feature(
    document, 'lake', 'area')
print('*Largest Lake* ' + largestLakeNameTxt, largestLakeAreaNum, largestLakeCountryCode)

highestAirportNameTxt, highestAirportElevationNum, highestAirportCountryCode = _largest_feature(
    document, 'airport', 'elevation')
print('*Highest Airport* ' + highestAirportNameTxt, highestAirportElevationNum, highestAirportCountryCode)

*LongestRiver* Amazonas 6448.0 CO BR PE
*Largest Lake* Caspian Sea 386400.0 R AZ KAZ IR TM
*Highest Airport* El Alto Intl 4063.0 BOL
