# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [6]:
from xml.etree import ElementTree as ET
import pandas as pd
import numpy as np

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [2]:
# Parse the reduced Mondial sample file into an ElementTree.
document_tree = ET.parse('./data/mondial_database_less.xml')

In [8]:
# Show the <name> text of every direct child of the root element.
root = document_tree.getroot()
for node in root:
    print(node.find('name').text)

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [10]:
# Print each country followed by a comma-separated list of its cities.
for country_el in document_tree.iterfind('country'):
    print('* ' + country_el.find('name').text + ':')
    # Element.iter() walks every descendant with the given tag. The older
    # getiterator() alias was deprecated and no longer exists on Python 3.9+.
    city_names = [city_el.find('name').text for city_el in country_el.iter('city')]
    # join() yields the same text as appending ', ' and trimming the tail,
    # including the empty string when a country has no <city> descendants.
    print(', '.join(city_names))

* Albania:
Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece:
Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia:
Skopje, Kumanovo
* Serbia:
Beograd, Novi Sad, Niš
* Montenegro:
Podgorica
* Kosovo:
Prishtine
* Andorra:
Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and referring to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [11]:
# Parse the full Mondial database file into an ElementTree.
document = ET.parse('./data/mondial_database.xml')

### 1. 10 countries with lowest infant mortality rates

In [12]:
# Create a dataframe with the name and infant mortality of every direct child
# of the root element. Elements that have no <infant_mortality> child keep a
# missing value (NaN) instead of being dropped here.
mortality_records = []
for child in document.getroot():
    mortality_el = child.find('infant_mortality')
    mortality_records.append({
        'country': child.find('name').text,
        # Explicit None check instead of a bare `except: pass`, so unrelated
        # errors are no longer silently swallowed.
        'infant_mortality': np.nan if mortality_el is None else mortality_el.text,
    })

# Build the frame once from the collected records; growing it row by row with
# .loc re-allocates on every insert.
df1 = pd.DataFrame(mortality_records, columns=['country', 'infant_mortality'])

In [13]:
# Number of rows that carry a name
df1['country'].count()

2761

In [14]:
# Number of rows with a non-missing infant mortality value
df1['infant_mortality'].count()

228

Infant mortality data is available for only 228 countries. Let's get the 10 with the lowest rates.

In [15]:
# Rows without an infant mortality value are kept on purpose: they hold NaN
# and sort_values places NaN last by default, so they never reach the top 10.
# The values were read as text, so cast to float before sorting numerically.
df1 = df1.astype({'infant_mortality': 'float'}).sort_values(by='infant_mortality')
df1.head(10)

Unnamed: 0,country,infant_mortality
38,Monaco,1.81
98,Japan,2.13
117,Bermuda,2.48
36,Norway,2.48
106,Singapore,2.53
37,Sweden,2.6
10,Czech Republic,2.63
78,Hong Kong,2.73
79,Macao,3.13
44,Iceland,3.15


### 2. 10 cities with the largest population

In [16]:
# Create a dataframe with the name and population of every <city> found
# anywhere below a <country> element.
city_records = []
for country_el in document.iterfind('country'):
    # Element.iter() replaces getiterator(), which was removed in Python 3.9.
    for city_el in country_el.iter('city'):
        # find() returns only the first <population> child.
        # NOTE(review): if a city carries several dated figures, the first one
        # may not be the most recent -- confirm against the data before
        # relying on the ranking.
        population_el = city_el.find('population')
        city_records.append({
            'city': city_el.find('name').text,
            # Explicit None check instead of a bare `except: pass`.
            'population': np.nan if population_el is None else population_el.text,
        })

# Build the frame once; row-by-row .loc growth re-allocates on every insert.
df2 = pd.DataFrame(city_records, columns=['city', 'population'])

In [17]:
# Discard cities without a population figure, then turn the text values into integers.
df2 = df2.dropna(how='any').astype({'population': 'int'})
# Ten most populous cities
df2.sort_values(by='population', ascending=False).head(10)

Unnamed: 0,city,population
1928,Seoul,10229262
1527,Mumbai,9925891
2810,São Paulo,9412894
1757,Jakarta,8259266
1341,Shanghai,8205598
2109,Ciudad de México,8092449
479,Moskva,8010954
1876,Tokyo,7843000
1340,Beijing,7362426
1582,Delhi,7206704


### 3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)

In [18]:
# Create a dataframe with one row per (country, ethnic group): the country's
# name, its most recent population figure with that figure's year, the ethnic
# group and the group's percentage share.
ethnic_records = []
for child in document.getroot():
    groups = child.findall('./ethnicgroup')
    if not groups:
        # Elements without ethnic-group data contribute no rows.
        continue

    # Pick the <population> record with the latest year. On ties max() keeps
    # the first one in document order, matching a find() on that year.
    dated = [p for p in child.findall('./population') if 'year' in p.attrib]
    if not dated:
        # A bare `except: pass` used to hide this case, so the rows below were
        # written with the population and year left over from the previous
        # element. Skip instead of attributing another element's figures.
        continue
    latest = max(dated, key=lambda p: int(p.attrib['year']))

    country = child.find('name').text
    population = int(latest.text)
    year = int(latest.attrib['year'])
    for item in groups:
        ethnic_records.append({
            'country': country,
            'population': population,
            'year': year,
            'ethnicgroup': item.text,
            'pct': float(item.attrib['percentage']),
        })

# Build the frame once; row-by-row .loc growth re-allocates on every insert.
df3 = pd.DataFrame(ethnic_records,
                   columns=['country', 'population', 'year', 'ethnicgroup', 'pct'])

In [19]:
# Head count per row: the country's population scaled by the group's percentage share.
df3['ethic_population'] = df3['population'].mul(df3['pct']).div(100.0)

In [20]:
# Sum the per-country head counts for each ethnic group, then show the ten largest totals.
ethnic_totals = df3.groupby('ethnicgroup')[['ethic_population']].sum()
ethnic_totals.sort_values(by='ethic_population', ascending=False).head(10)

Unnamed: 0_level_0,ethic_population
ethnicgroup,Unnamed: 1_level_1
Han Chinese,1245059000.0
Indo-Aryan,871815600.0
European,494872200.0
African,318325100.0
Dravidian,302713700.0
Mestizo,157734400.0
Bengali,146776900.0
Russian,131857000.0
Japanese,126534200.0
Malay,121993600.0


### 4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

Plan:

1. Create a dataframe for country and code.
2. Build a table that has river information (name and length), starting from `document.findall('river')`.

In [22]:
# NOTE(review): this cell rebinds df3. It declares river/length/lake/area/
# airport columns but only fills country, population, year, ethnicgroup and
# pct, so the declared river/lake/airport columns stay empty (NaN).
# TODO: replace with the actual river / lake / airport extraction.
wip_records = []
for child in document.getroot():
    groups = child.findall('./ethnicgroup')
    if not groups:
        # Elements without ethnic-group data contribute no rows.
        continue

    # Pick the <population> record with the latest year. On ties max() keeps
    # the first one in document order, matching a find() on that year.
    dated = [p for p in child.findall('./population') if 'year' in p.attrib]
    if not dated:
        # A bare `except: pass` used to hide this case, so the rows below were
        # written with the population and year left over from the previous
        # element. Skip instead of attributing another element's figures.
        continue
    latest = max(dated, key=lambda p: int(p.attrib['year']))

    country = child.find('name').text
    population = int(latest.text)
    year = int(latest.attrib['year'])
    for item in groups:
        wip_records.append({
            'country': country,
            'population': population,
            'year': year,
            'ethnicgroup': item.text,
            'pct': float(item.attrib['percentage']),
        })

# Same final column order as declaring the first six columns up front and then
# letting the remaining four be appended on first assignment.
df3 = pd.DataFrame(
    wip_records,
    columns=['country', 'river', 'length', 'lake', 'area', 'airport',
             'population', 'year', 'ethnicgroup', 'pct'],
)

In [56]:
# Gather the attribute dictionary of every top-level <airport> element.
# The variable is called `rivers`, but what it holds are airport attributes.
rivers = [airport_el.attrib for airport_el in document.findall('airport')]

In [57]:
# Display the collected attribute dictionaries (taken from <airport> elements, despite the name).
rivers

[{'city': 'cty-Afghanistan-2', 'country': 'AFG', 'iatacode': 'HEA'},
 {'city': 'cty-Afghanistan-Kabul', 'country': 'AFG', 'iatacode': 'KBL'},
 {'city': 'cty-Albania-Tirane', 'country': 'AL', 'iatacode': 'TIA'},
 {'city': 'cty-Algeria-14', 'country': 'DZ', 'iatacode': 'TEE'},
 {'city': 'cty-Algeria-6', 'country': 'DZ', 'iatacode': 'BLJ'},
 {'city': 'cty-Algeria-11', 'country': 'DZ', 'iatacode': 'BJA'},
 {'city': 'cty-Algeria-19', 'country': 'DZ', 'iatacode': 'TMR'},
 {'city': 'cty-Algeria-17', 'country': 'DZ', 'iatacode': 'BSK'},
 {'city': 'cty-Algeria-4', 'country': 'DZ', 'iatacode': 'CZL'},
 {'city': 'cty-Algeria-7', 'country': 'DZ', 'iatacode': 'QSF'},
 {'city': 'cty-Algeria-3', 'country': 'DZ', 'iatacode': 'ORN'},
 {'city': 'cty-Algeria-21', 'country': 'DZ', 'iatacode': 'GHA'},
 {'city': 'cty-Algeria-5', 'country': 'DZ', 'iatacode': 'AAE'},
 {'city': 'cty-Algeria-Algiers', 'country': 'DZ', 'iatacode': 'ALG'},
 {'city': 'cty-Algeria-13', 'country': 'DZ', 'iatacode': 'TLM'},
 {'city':

In [37]:
# All <river> elements that are direct children of the root element
document.getroot().findall('river')

[<Element 'river' at 0x000000000A597818>,
 <Element 'river' at 0x000000000A597BD8>,
 <Element 'river' at 0x000000000A597EF8>,
 <Element 'river' at 0x000000000A59E3B8>,
 <Element 'river' at 0x000000000A59E868>,
 <Element 'river' at 0x000000000A59ED68>,
 <Element 'river' at 0x000000000A5A3278>,
 <Element 'river' at 0x000000000A5A36D8>,
 <Element 'river' at 0x000000000A5A3B38>,
 <Element 'river' at 0x000000000A5A3F48>,
 <Element 'river' at 0x000000000A5AA408>,
 <Element 'river' at 0x000000000A5AA9F8>,
 <Element 'river' at 0x000000000A5AAEA8>,
 <Element 'river' at 0x000000000A5AF3B8>,
 <Element 'river' at 0x000000000A5AF8B8>,
 <Element 'river' at 0x000000000A5AFDB8>,
 <Element 'river' at 0x000000000A5B52C8>,
 <Element 'river' at 0x000000000A5B5868>,
 <Element 'river' at 0x000000000A5B5D18>,
 <Element 'river' at 0x000000000A5BB318>,
 <Element 'river' at 0x000000000A5BB818>,
 <Element 'river' at 0x000000000A5BBD18>,
 <Element 'river' at 0x000000000A5C1318>,
 <Element 'river' at 0x000000000A5