# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [3]:
from xml.etree import ElementTree as ET
import pandas as pd

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [4]:
# Parse the reduced Mondial sample into an ElementTree for the examples below.
document_tree = ET.parse( './data/mondial_database_less.xml' )

In [5]:
# print names of all countries
for child in document_tree.getroot():
    print child.find('name').text

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [6]:
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
    print '* ' + element.find('name').text + ':',
    capitals_string = ''
    for subelement in element.getiterator('city'):
        capitals_string += subelement.find('name').text + ', '
    print capitals_string

* Albania: Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë, 
* Greece: Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes, 
* Macedonia: Skopje, Kumanovo, 
* Serbia: Beograd, Novi Sad, Niš, 
* Montenegro: Podgorica, 
* Kosovo: Prishtine, 
* Andorra: Andorra la Vella, 


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and referring to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [7]:
# Parse the full Mondial database; every exercise cell below reads from `document`.
document = ET.parse( './data/mondial_database.xml' )

In [8]:
# Echo the parsed object to confirm that it is an ElementTree instance.
document

<xml.etree.ElementTree.ElementTree at 0x1061b3ed0>

In [23]:
# Exploratory attempt: assumes every <country> has an <infant_mortality> child.
# find() returns None when the tag is missing, so `.text` raises AttributeError
# on the first country without one.
for element in document.iterfind('country'):
    print float(element.find('infant_mortality').text)

13.19
4.78
7.9
6.16


AttributeError: 'NoneType' object has no attribute 'text'

In [None]:
# Not all countries have an infant mortality rate listed, so find() returns None for some of them.

In [None]:
# Map country name -> infant mortality rate.
# iterfind() yields nothing for a country without an <infant_mortality> child,
# so such countries are simply left out instead of raising on a None result.
# find('name') returns the first child with that tag.
mort = {
    country.find('name').text: float(rate.text)
    for country in document.iterfind('country')
    for rate in country.iterfind('infant_mortality')
}
        

In [None]:
# Build [rate, country] pairs so that a plain sort orders them by rate first.
inf_mort1 = [[rate, name] for name, rate in mort.items()]

In [None]:
# In-place ascending sort; the [rate, country] pairs compare by rate first.
inf_mort1.sort()

In [None]:
# First ten entries of the sorted list, i.e. the ten lowest rates.
inf_mort1[:10]

In [None]:
# Same ranking with pandas: one row per country, indexed by country name.
inf_mort2 = pd.DataFrame.from_dict(mort, orient='index')
inf_mort2.columns = ['infant mortality']
# DataFrame.sort() was deprecated in pandas 0.17 and removed in 0.20;
# sort_values() is its replacement and sorts ascending by default.
inf_mort2.sort_values('infant mortality').head(10)



In [10]:
# 10 cities with the largest population
# Map each city name to the population figure with the most recent 'year'
# attribute (0 when the city lists no population at all).
city_pop = {}
for country in document.iter('country'):
    for city in country.iter('city'):
        # Start from scratch for every city.
        current_year, current_pop = 0, 0
        for pop in city.iter('population'):
            # attrib is the dictionary of the element's XML attributes
            year = int(pop.attrib['year'])
            if year > current_year:
                current_year, current_pop = year, int(pop.text)
        # Keyed by name alone: a later city with the same name replaces the earlier entry.
        city_pop[city.find('name').text] = current_pop


In [11]:
# orient='index' turns the dict keys (city names) into the row index,
# leaving a single unnamed column with the population values.
city_pop_dF = pd.DataFrame.from_dict(city_pop, orient ='index')

In [12]:
# Name the single column, then show the ten most populous cities.
city_pop_dF.columns = ['Population']
# DataFrame.sort() was deprecated in pandas 0.17 and removed in 0.20;
# sort_values() is the supported replacement.
city_pop_dF.sort_values('Population', ascending=False).head(10)

Unnamed: 0,Population
Shanghai,22315474
Istanbul,13710512
Mumbai,12442373
Moskva,11979529
Beijing,11716620
São Paulo,11152344
Tianjin,11090314
Guangzhou,11071424
Delhi,11034555
Shenzhen,10358381


In [1]:
#10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)

In [16]:
# 10 ethnic groups with the largest overall populations
# (sum of best/latest estimates over all countries)
ethn_pop = {}
for country in document.iter('country'):
    # Find the country's own most recent population figure.
    # iterfind() only visits direct children of <country>. iter() walks the
    # whole subtree and would also see the <population> elements of nested
    # cities, so a city figure with a later year could replace the country total.
    current_year = 0
    current_pop = 0
    for pop in country.iterfind('population'):
        year = int(pop.attrib['year'])  # attrib is the dict of XML attributes
        if year > current_year:
            current_year = year
            current_pop = int(pop.text)

    # Each <ethnicgroup> gives its share of the country as a percentage;
    # convert it to a head count and add it to the group's running total.
    for ethn in country.iter('ethnicgroup'):
        share = float(ethn.attrib['percentage']) * current_pop / 100
        ethn_pop[ethn.text] = ethn_pop.get(ethn.text, 0.0) + share
    

In [20]:
# One row per ethnic group (the index) with its summed population estimate.
ethn_pop_df = pd.DataFrame.from_dict(ethn_pop, orient='index')
ethn_pop_df.columns = ['Population']
# DataFrame.sort() was deprecated in pandas 0.17 and removed in 0.20;
# sort_values() is the supported replacement.
ethn_pop_df.sort_values('Population', ascending=False).head(10)

Unnamed: 0,Population
Han Chinese,1245059000.0
Indo-Aryan,871815600.0
European,494872200.0
African,318325100.0
Dravidian,302713700.0
Mestizo,157734400.0
Bengali,146776900.0
Russian,130484000.0
Japanese,126534200.0
Malay,121993600.0


In [21]:
#name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [35]:
# Exploratory attempt: print the length of every river.
# find('length') returns None for a river without a <length> child, so `.text`
# raises AttributeError when such a river is reached.
current_year= 0  # not used in this cell
river_length = 0  # not used in this cell
ethn_pop ={}  # not used in this cell; NOTE(review): rebinds the name ethn_pop to an empty dict
for riv in document.iter('river'):
    rl = float(riv.find('length').text)
    print rl
    
    
    
    
    

230.0
206.0
604.0
322.0
93.0
460.0
470.0
520.0
320.0
241.0
145.0
300.0
550.0
107.0
203.0
121.0
162.0
346.0
925.0
1013.0
647.0
812.0
480.0
453.0
290.0
776.0
514.0
1007.0
897.0
742.0
657.0
925.0
652.0
248.0
313.0
75.0
415.0
405.0
240.0
2845.0
45.9
43.0
147.0
264.0
295.0
168.0
35.0
517.0
150.0
225.0
254.0
358.0
250.0
403.0
749.0
453.0
1308.0
945.0
346.0
140.0
120.0
185.0
308.0
295.0
615.0
953.0
1352.0
440.0
211.0
281.0
292.0
221.0
1091.0
440.0
1324.0
524.0
544.0
227.0
367.0
288.0
164.0
36.3
866.0
1047.0
448.0
772.0
99.5
44.0
152.0
175.0
133.0
251.0
1364.0
193.0
1900.0
2736.0
720.0
720.0
950.0
740.0
558.0
1809.0
74.0
224.0
78.0
1020.0
1352.0
2201.0
775.0
1870.0
3531.0
1480.0
1805.0
2428.0
3650.0
4248.0
1591.0
2450.0
688.0
4092.0
1779.0
992.0
1636.0
4400.0
2824.0
560.0
1620.0
2129.0
2129.0
807.0
1415.0
1141.0
133.0
395.0
1001.0
3260.0
4845.0
6380.0
4350.0
2980.0
2170.0
2511.0
2896.0
3180.0
150.0
3185.0
1903.0
664.0
1392.0
1197.0
58.0
200.0
120.0
0.1
560.0
124.0
177.0
655.0
493.0
3778.0
4130

AttributeError: 'NoneType' object has no attribute 'text'

In [40]:
# Second attempt at the longest river. It still fails with AttributeError because
# find('length') returns None for a river that has no <length> child.
current_year= 0  # not used in this cell
river_length = 0
ethn_pop ={}  # not used in this cell; NOTE(review): rebinds the name ethn_pop to an empty dict
for riv in document.iter('river'):
    rl = float(riv.find('length').text)
    if(rl > river_length):
        country_name  = riv.attrib['country']
        river_name = riv.find('name').text
    # NOTE(review): this assignment sits outside the if, so river_length holds the
    # previous river's length rather than the running maximum.
    river_length = rl  

print country_name    

AttributeError: 'NoneType' object has no attribute 'text'

In [51]:

#Not all the entries have an river length So loop  for the element named 'length'.

river_length = 0

for river in document.iter('river'):
    for length in river.iter('length'):
        if(float(length.text) > river_length):
            country_name  = river.attrib['country']
            river_name    = river.find('name').text
            river_length = float(length.text)

print country_name,river_name,river_length    

CO BR PE Amazonas 6448.0


In [53]:
lake_area = 0
for lake in document.iterfind('lake'):
    for area in lake.iterfind('area'):
        if float(area.text) > lake_area:
            lake_area=float(area.text)
            country_name= lake.attrib['country']
            lake_name = lake.find('name').text
print country_name,lake_name,lake_area               

R AZ KAZ IR TM Caspian Sea 386400.0


In [61]:
airport_elev = 0
for airport in document.iterfind('airport'):
    for elev in airport.iterfind('elevation'):
        if (elev.text is not None ) and float(elev.text) > airport_elev:
            airport_elev=float(elev.text)
            country_name= airport.attrib['country']
            airport_name = airport.find('name').text
            
print country_name,airport_name,airport_elev                  

BOL El Alto Intl 4063.0
