# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [6]:
from xml.etree import ElementTree as ET

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [76]:
document_tree = ET.parse( './data/mondial_database_less.xml' )

In [8]:
import pandas as pd

In [10]:
# print names of all countries
for child in document_tree.getroot():
    print (child.find('name').text)

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [11]:
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
    print ('* ' + element.find('name').text + ':'),
    capitals_string = ''
    for subelement in element.getiterator('city'):
        capitals_string += subelement.find('name').text + ', '
    print (capitals_string[:-2])

* Albania:
Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece:
Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia:
Skopje, Kumanovo
* Serbia:
Beograd, Novi Sad, Niš
* Montenegro:
Podgorica
* Kosovo:
Prishtine
* Andorra:
Andorra la Vella


In [77]:
document_root = document_tree.getroot()

In [21]:
for child in document_root:
    print (child.tag)

country
country
country
country
country
country
country


In [43]:
for element in document_tree.iterfind('country'):
    print ('* ' + element.find('name').text + ':'),
    infant_mortal = ''
    for subelement in element.getiterator('infant_mortality'):
        infant_mortal += subelement.text + ', '
    print (infant_mortal[:-2]) #this is just to get rid of the comma at the end, so if I'm not doing a list, 
                                #just git rid of the comma in the previous line

* Albania:
13.19
* Greece:
4.78
* Macedonia:
7.9
* Serbia:
6.16
* Montenegro:

* Kosovo:

* Andorra:
3.69


In [69]:
for element in document_tree.iterfind('country'):
    print ('* ' + element.find('name').text + ':'),
    infant_mortal = 1000   
    for subelement in element.getiterator('infant_mortality'):
        infant_mortal = float(subelement.text)
    print (infant_mortal)

* Albania:
13.19
* Greece:
4.78
* Macedonia:
7.9
* Serbia:
6.16
* Montenegro:
1000
* Kosovo:
1000
* Andorra:
3.69


In [99]:
document_root[1].attrib

{'area': '131940',
 'capital': 'cty-Greece-Athens',
 'car_code': 'GR',
 'memberships': 'org-AG org-BIS org-BSEC org-CD org-SELEC org-CE org-EMU org-EAPC org-EBRD org-ECB org-EIB org-CERN org-ESA org-EU org-FATF org-FAO org-IGAD org-IPU org-IAEA org-IBRD org-ICC org-ICAO org-ICJ org-ICCt org-Interpol org-IDA org-IEA org-IFRCS org-IFC org-IFAD org-IHO org-ILO org-IMO org-IMSO org-IMF org-IOC org-IOM org-OIF org-ITSO org-ITU org-ITUC org-MIGA org-NATO org-NEA org-NSG org-OECD org-OSCE org-OPCW org-OAS org-PCA org-UN org-UNCTAD org-UNESCO org-UNHCR org-UNIDO org-UNIFIL org-UPU org-WCO org-WFTU org-WHO org-WIPO org-WMO org-UNWTO org-WTO org-ZC'}

In [143]:
for child in document_root[1]:#[1:11]: #seeing the children under the main elements
    print(child.tag)

name
localname
population
population
population
population
population
population
population
population
population
population
population
population
population
population
population
population_growth
infant_mortality
gdp_total
gdp_agri
gdp_ind
gdp_serv
inflation
unemployment
indep_date
government
encompassed
ethnicgroup
religion
religion
language
border
border
border
border
province
province
province
province
province
province
province
province
province
province
province
province
province
province


In [140]:
for child in document_root[0]:#[1:11]: #seeing the children under the main elements
    print(child.attrib)

{}
{'measured': 'est.', 'year': '1950'}
{'measured': 'est.', 'year': '1960'}
{'measured': 'est.', 'year': '1970'}
{'measured': 'est.', 'year': '1980'}
{'measured': 'est.', 'year': '1990'}
{'year': '1997'}
{'measured': 'est.', 'year': '2000'}
{'year': '2001', 'measured': 'census'}
{'year': '2011', 'measured': 'census'}
{}
{}
{}
{}
{}
{}
{}
{}
{'from': 'Ottoman Empire'}
{}
{'continent': 'europe', 'percentage': '100'}
{'percentage': '95'}
{'percentage': '3'}
{'percentage': '70'}
{'percentage': '10'}
{'percentage': '20'}
{'percentage': '98.8'}
{'percentage': '0.5'}
{'country': 'GR', 'length': '282'}
{'country': 'MK', 'length': '151'}
{'country': 'MNE', 'length': '172'}
{'country': 'KOS', 'length': '112'}
{'id': 'cty-Albania-Tirane', 'country': 'AL'}
{'id': 'stadt-Shkoder-AL-AL', 'country': 'AL'}
{'id': 'stadt-Durres-AL-AL', 'country': 'AL'}
{'id': 'stadt-Vlore-AL-AL', 'country': 'AL'}
{'id': 'stadt-Elbasan-AL-AL', 'country': 'AL'}
{'id': 'stadt-Korce-AL-AL', 'country': 'AL'}


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

# Answer 1

Finding the 10 lowest infnant mortality rates and the countries associated with them

In [68]:
from xml.etree import ElementTree as ET
import pandas as pd
import numpy as np

In [9]:
document = ET.parse( './data/mondial_database.xml' )

In [10]:
root = document.getroot()

In [11]:
# find all the infant mortality rates and sort them.
inf_list = []
for country in document.iterfind('country'):
   
    infant_mortal = 1000 #since the 10 lowest are wanted, this prevents missing data from appearing as the lowest (0)
    
    for infmor in country.getiterator('infant_mortality'):
        infant_mortal = float(infmor.text)
        inf_list = inf_list + [infant_mortal]
list.sort(inf_list)
short_inf_list = inf_list[:10]

In [13]:
a=root.findall("./country[infant_mortality='1.81']/name") # finding the right syntax to call the name of the country
a[0].text                                                 # corresponding to a mortatlity rate

'Monaco'

In [14]:
root.findall("./country[infant_mortality='1.81']/name")[0].text

'Monaco'

In [15]:
d = "./country[infant_mortality='1.81']/name"
root.findall(d)[0].text

'Monaco'

In [16]:
#Answer 1!
for entry in short_inf_list:
    d =  "./country[infant_mortality='"+ str(entry)+"']/name"
    print (root.findall(d)[0].text +  ': ' + str(entry))
    

Monaco: 1.81
Japan: 2.13
Norway: 2.48
Norway: 2.48
Singapore: 2.53
Sweden: 2.6
Czech Republic: 2.63
Hong Kong: 2.73
Macao: 3.13
Iceland: 3.15


# Answer 2

Finding the top 10 cities by population

In [None]:
#Some exploration of the data
for child in document_root[1]:#[1:11]: #seeing the children under the main elements
    print(child.tag)

In [None]:
for child in document_root[0]:#[1:11]: #seeing the children under the main elements
    print(child.attrib)

In [17]:
cityname = ""
citypopulation = ""
document = ET.parse( './data/mondial_database.xml' ) #NOTE this line is not technically necessary as it is already defined, 
                                                        #but it makes this answer independent of the previous one
df = pd.DataFrame(columns=['CityName','Population']) #create data frame to hold country name and its popuplation
df['Population'] = df['Population'].astype(float)

#loop through country element to find city name and its population
for country in document.iterfind( 'country' ):
    for city in country.iter('city'): #find all cities within each country element
        cityname = city.find('name').text
        year = int(0)
        for node in city.iterfind('population'): #find all population elements with each city
            year = node.attrib['year'] #there are multiple population elements with different 'year' attribute
            if node.attrib['year'] >= year: #store the population number of the latest year
                citypopulation = int(node.text)
        df.loc[len(df)] = [cityname,citypopulation] #add city name and its population to data frame
        cityname = ""
        

#sort data frame to find 10 cities with largest population
df.sort_values(by = 'Population', ascending=False).head(10)

Unnamed: 0,CityName,Population
1341,Shanghai,22315474.0
771,Istanbul,13710512.0
1527,Mumbai,12442373.0
479,Moskva,11979529.0
1340,Beijing,11716620.0
2810,São Paulo,11152344.0
1342,Tianjin,11090314.0
1064,Guangzhou,11071424.0
1582,Delhi,11034555.0
1067,Shenzhen,10358381.0


# Answer 3

Finding the 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)

In [18]:
countryname = ""            #same pattern as last answer, creating a dataframe to store results
countrypopulation = 0
ethnicpopulation = 0
document = ET.parse( './data/mondial_database.xml' ) #NOTE this line is not technically necessary as it is already defined, 
df = pd.DataFrame(columns=['Country','EthnicGroup','Population']) #but it does make the answer independent of previous ones
df['Population'] = df['Population'].astype(float)

#loop through country element to find ethnic groups and its population
for country in document.iterfind( 'country' ):
    countryname = country.find('name').text #find country name
    year = int(0)
    countrycpopulation = int(0)
    for pop in country.iterfind('population'): #find population of the country
        year = pop.attrib['year']
        if pop.attrib['year'] >= year: #find population of the latest year
            countrypopulation = (pop.text) 
    ethnicname = None
    ethnicpopulation = 0      #presetting values
    for ethnic in country.iter('ethnicgroup'): #find all ethnic groups within the same country
        ethnicname = ethnic.text
        #earlier exploration showed that ethnic populations are recorded as percentages
        #compute each ethnic population: country population * ethnic group percentage 
        ethnicpopulation = round(float(ethnic.attrib['percentage']) * 0.01 * int(countrypopulation))
        if ethnicname == None:
            ethnicname = countryname
            ethnicpopulation = countrypopulation
        df.loc[len(df)] = [countryname,ethnicname,ethnicpopulation] #store ethnic group population to data frame
    countryname = ""



In [19]:
#group ethnic group across all countries and sum them up to find top 10 ethnic groups and its total population
df.groupby('EthnicGroup').sum().sort_values(by = 'Population', ascending=False).head(10)

Unnamed: 0_level_0,Population
EthnicGroup,Unnamed: 1_level_1
Han Chinese,1245059000.0
Indo-Aryan,871815600.0
European,494872200.0
African,318325100.0
Dravidian,302713700.0
Mestizo,157734400.0
Bengali,146776900.0
Russian,131857000.0
Japanese,126534200.0
Malay,121993600.0


# Answer 4

Name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [107]:
#longest river
rivername = ""
rivercountry = ""
lengthtemp = 0
riverlength = 0
document = ET.parse( './data/mondial_database.xml' ) #NOTE this line is not technically necessary as it is already defined, 
                                                        #but it makes this answer independent of the previous one

#loop through document to find all rivers

for river in root.iter('river'): #find all rivers. Note that rivers are their own elements outside of countries
    
    rname = river.find('length')
    
    if rname != None:
        lengthtemp = rname.text
        lengthtemp = float(lengthtemp)
        if lengthtemp > riverlength:
            riverlength = lengthtemp
            rivername = river.find('name').text
            rivcount = river.attrib['country']
            rivercountry = rivcount.split(' ')

In [108]:
printlist = []
for entry in range(len(rivercountry)):
    d =  "./country[@car_code='"+ rivercountry[entry]+"']/name"
    printlist = printlist+ [root.findall(d)[0].text]


In [109]:
#Answer 4.a
print('The longest river is ' + rivername + ' at ' + str(riverlength)+ 'km. It runs through ' + printlist[0] +', '+ printlist[2]+ ', and '+ printlist[1])

The longest river is Amazonas at 6448.0km. It runs through Colombia, Peru, and Brazil


In [111]:
#largest lake
lakename = ""
lakecountry = ""
lakearea = 0
temparea=0
document = ET.parse( './data/mondial_database.xml' ) #NOTE this line is not technically necessary as it is already defined, 
                                                        #but it makes this answer independent of the previous one


#loop through document to find lakes
for lake in root.iter('lake'): #find all lakes 
    
    lname = lake.find('area')
    
    if lname != None:
        areatemp = lname.text
        areatemp = float(areatemp)
        if areatemp > lakearea:
            lakearea = areatemp
            lakename = lake.find('name').text
            lakecount = lake.attrib['country']
            lakecountry = lakecount.split(' ')

In [113]:
printlist = []
for entry in range(len(lakecountry)):
    d =  "./country[@car_code='"+ lakecountry[entry]+"']/name"
    printlist = printlist+ [root.findall(d)[0].text]

In [115]:
#Answer 4.b
print('The largest lake is ' + lakename + ' at ' + str(lakearea)+ ' sq.km. It is in ' + printlist[0] + ', ' + printlist[2]+ ', ' + printlist[3] + ', ' + printlist[4] + ', and '+ printlist[1])

The largest lake is Caspian Sea at 386400.0sq.km. It is in Russia, Kazakhstan, Iran, Turkmenistan, and Azerbaijan


In [132]:
# highest airport
airname = ""
aircountry = ""
airel = 0

document = ET.parse( './data/mondial_database.xml' ) #NOTE this line is not technically necessary as it is already defined, 
                                                        #but it makes this answer independent of the previous one

#loop through document to find airport
for ap in root.iter('airport'): #find all airports
    
    air = ap.find('elevation').text
    
    if air != None:
        air = float(air)
        if air > airel:
            airel = air
            airname = airport.find('name').text
            aircount = airport.attrib['country']
            aircountry = aircount.split(' ')

In [135]:
d =  "./country[@car_code='"+ aircountry[0] +"']/name"
printlist = [root.findall(d)[0].text]

In [137]:
#Answer 4.c
print('The highest airport is ' + airname + ' at ' + str(airel) +' m. It is in ' + printlist[0] + '.')

The highest airport is Xiangfan Airport at 4063.0 m. It is in China.
