# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [8]:
from xml.etree import ElementTree as ET
import operator 

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [3]:
document_tree = ET.parse( './data/mondial_database_less.xml' )

In [4]:
# print names of all countries
for child in document_tree.getroot():
    print child.find('name').text

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [5]:
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
    print '* ' + element.find('name').text + ':',
    capitals_string = ''
    for subelement in element.getiterator('city'):
        capitals_string += subelement.find('name').text + ', '
    print capitals_string[:-2]

* Albania: Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece: Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia: Skopje, Kumanovo
* Serbia: Beograd, Novi Sad, Niš
* Montenegro: Podgorica
* Kosovo: Prishtine
* Andorra: Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [3]:
document = ET.parse( './data/mondial_database.xml' )

In [12]:
d={}
for country in document.iterfind('country'):
    if country.find('infant_mortality') is None:
       pass
    else:
        name = country.find('name').text
        im = country.find('infant_mortality').text
        d[name] = float(im)

sorted_dict = sorted(d.items(), key=operator.itemgetter(1))

print sorted_dict[:10]

[('Monaco', 1.81), ('Japan', 2.13), ('Bermuda', 2.48), ('Norway', 2.48), ('Singapore', 2.53), ('Sweden', 2.6), ('Czech Republic', 2.63), ('Hong Kong', 2.73), ('Macao', 3.13), ('Iceland', 3.15)]


In [24]:
d={}
for country in document.iterfind('country'):
    d1={}
    name = country.find('name').text
    #print name
    for node in country.findall('population'):
        #print node.attrib['year'] 
        d1[node.attrib['year']]=int(node.text)
    
    #print d1
    d1_sorted=sorted(d1.items(), key=operator.itemgetter(0), reverse=True)
    #print d1_sorted
    d[name] = int(d1_sorted[0][1])
    
sorted_dict = sorted(d.items(), key=operator.itemgetter(1), reverse=True)

print sorted_dict[:10]

[('China', 1360720000), ('India', 1210854977), ('United States', 318857056), ('Indonesia', 252124458), ('Brazil', 202768562), ('Pakistan', 173149306), ('Nigeria', 164294516), ('Bangladesh', 149772364), ('Russia', 143666931), ('Japan', 127298000)]


In [47]:
document = ET.parse( './data/mondial_database.xml' )
d = {}
for country in document.iterfind('country'):
    if country.find('./ethnicgroup[1][@percentage]') is None:
        pass
    else:
        lastpop = int(country.find('./population[last()]').text)
        ethnic = country.find('./ethnicgroup[1]')
        ethnicname = ethnic.text
        ethicperc = float(ethnic.get('percentage'))/100
        #print country.find('name').text, ethnicname
        #print lastpop * ethicperc
        d[country.find('name').text, ethnicname] = lastpop * ethicperc
  
sorted_d = sorted(d.items(), key=operator.itemgetter(1), reverse=True)[:10]
print sorted_d
print len(sorted_d)

[(('China', 'Han Chinese'), 1245058800.0), (('India', 'Dravidian'), 302713744.25), (('United States', 'European'), 254958101.97759998), (('Nigeria', 'African'), 162651570.84), (('Bangladesh', 'Bengali'), 146776916.72), (('Japan', 'Japanese'), 126534212.00000001), (('Russia', 'Russian'), 114646210.938), (('Indonesia', 'Javanese'), 113456006.10000001), (('Brazil', 'European'), 108886717.794), (('Vietnam', 'Viet/Kinh'), 76078375.3)]
10


In [49]:
# name and country of a) longest river
d = {}
for river in document.iterfind('river'):
    name = river.get('id')
    #print name
    country = river.get('country')
    #print country
    length = river.find('./length')
    if length is None:
        pass
    else:
        #print float(length.text)
        d[name, country]=float(length.text)
    
sorted_d = sorted(d.items(), key=operator.itemgetter(1), reverse=True)[:1]
print sorted_d

[(('river-Amazonas', 'CO BR PE'), 6448.0)]


In [50]:
# name and country of b) largest lake
d = {}
for lake in document.iterfind('lake'):
    name = lake.get('id')
    #print name
    country = lake.get('country')
    #print country
    area = lake.find('./area')
    if area is None:
        pass
    else:
        #print float(area.text)
        d[name, country]=float(area.text)

sorted_d = sorted(d.items(), key=operator.itemgetter(1), reverse=True)[:1]
print sorted_d


[(('lake-KaspischesMeer', 'R AZ KAZ IR TM'), 386400.0)]


In [51]:
# name and country of c) airport at highest elevation
d = {}
for airport in document.iterfind('airport'):
    name = airport.get('iatacode')
    #print name
    country = airport.get('country')
    #print country
    elevation = airport.findtext('./elevation')
    #print type(elevation)
    if elevation is None:
        pass
    elif elevation=='':
        pass
    else:
        #print float(elevation)
        d[name, country]=float(elevation)

sorted_d = sorted(d.items(), key=operator.itemgetter(1), reverse=True)[:1]
print sorted_d      

[(('LPB', 'BOL'), 4063.0)]
