# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [1]:
from xml.etree import ElementTree as ET

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [14]:
document_tree = ET.parse( './data/mondial_database_less.xml' )
document_tree

<xml.etree.ElementTree.ElementTree at 0x419a550>

In [4]:
# print names of all countries
for child in document_tree.getroot():
    print child.find('name').text
    

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [5]:
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
    print '* ' + element.find('name').text + ':',
    capitals_string = ''
    for subelement in element.getiterator('city'):
        capitals_string += subelement.find('name').text + ', '
    print capitals_string[:-2]

* Albania: Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece: Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia: Skopje, Kumanovo
* Serbia: Beograd, Novi Sad, Niš
* Montenegro: Podgorica
* Kosovo: Prishtine
* Andorra: Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [3]:
document = ET.parse( './data/mondial_database.xml' )

In [4]:
root = document.getroot()

In [31]:
for country in root.findall('country'):
    try:
      print country.find('name').text,float(country.find('infant_mortality').text)
    except AttributeError:
      print('N/A')
   
    

Albania 13.19
Greece 4.78
Macedonia 7.9
Serbia 6.16
Montenegro N/A
Kosovo N/A
Andorra 3.69
France 3.31
Spain 3.33
Austria 4.16
Czech Republic 2.63
Germany 3.46
Hungary 5.09
Italy 3.31
Liechtenstein 4.33
Slovakia 5.35
Slovenia 4.04
Switzerland 3.73
Belarus 3.64
Latvia 7.91
Lithuania 6.0
Poland 6.19
Ukraine 8.1
Russia 7.08
Belgium 4.18
Luxembourg 4.28
Netherlands 3.66
Bosnia and Herzegovina 5.84
Croatia 5.87
Bulgaria 15.08
Romania 10.16
Turkey 21.43
Denmark 4.1
Estonia 6.7
Faroe Islands 5.71
Finland 3.36
Norway 2.48
Sweden 2.6
Monaco 1.81
Gibraltar 6.29
Guernsey 3.47
Holy See N/A
Ceuta N/A
Melilla N/A
Iceland 3.15
Ireland 3.74
San Marino 4.52
Jersey 3.86
Malta 3.59
Isle of Man 4.17
Moldova 12.93
Portugal 4.48
Svalbard N/A
United Kingdom 4.44
Afghanistan 117.23
China 14.79
Iran 39.0
Pakistan 57.48
Tajikistan 35.03
Turkmenistan 38.13
Uzbekistan 19.84
Armenia 13.97
Georgia 16.68
Azerbaijan 26.67
Bahrain 9.68
Bangladesh 45.67
Myanmar 44.91
India 43.19
Bhutan 37.89
Brunei 10.48
Malaysia 13.69

In [169]:
dictionary = dict()
for country in root.findall('country'):
    try:
      dictionary[country.find('name').text] = float(country.find('infant_mortality').text)
    except AttributeError:
      dictionary[country.find('name').text] = float("inf")
dictionary['Oman']

14.0

In [170]:
# Exercise 1
t = sorted(dictionary.items(), key = lambda x: x[1])
print t[0:11]


[('Monaco', 1.81), ('Japan', 2.13), ('Bermuda', 2.48), ('Norway', 2.48), ('Singapore', 2.53), ('Sweden', 2.6), ('Czech Republic', 2.63), ('Hong Kong', 2.73), ('Macao', 3.13), ('Iceland', 3.15), ('France', 3.31)]


In [207]:
# Exercise 2
dict2 = dict()
for city in root.findall('./country/city'):
    try:
      dict2[city.find('name').text] = int(city.find('./population[last()]').text)
    except AttributeError:
      dict2[city.find('name').text] = float("-inf")
s = sorted(dict2.items(), key = lambda x: x[1])
print s[-10:]

[('Pyongyang', 3255288), ('Busan', 3403135), ('New Taipei', 3939305), ('Al Iskandariyah', 4123869), ('Singapore', 5076700), ('Ho Chi Minh', 5968384), ('Hong Kong', 7055071), ('Bangkok', 7506700), ('Al Qahirah', 8471859), ('Seoul', 9708483)]


# Exercise 3
dict3 = dict()
for element in document.iterfind('country'):
    try:
      print element.find('ethnicgroup').text, int(float(element.find('ethnicgroup').attrib['percentage'])*int(element.find('./population[last()]').text)/100)
    except AttributeError:
      print('n/a')

In [210]:
dict4 = dict()
list1 = list(range(0,244))
for idx, element in enumerate(document.iterfind('country')):
      try:
         print idx, element.find('ethnicgroup').text, int(float(element.find('ethnicgroup').attrib['percentage'])*int(element.find('./population[last()]').text)/100)
         list1[idx] = (element.find('ethnicgroup').text, int(float(element.find('ethnicgroup').attrib['percentage'])*int(element.find('./population[last()]').text)/100))       
      except AttributeError:
         list1[idx] = ('n/a',0)    
         continue
list1


0 Albanian 2660131
1 Greek 10059145
2 Macedonian 1322387
3 Serb 5903032
4 Montenegrin 266612
5 Albanian 1595162
6 Spanish 33589
7 8 Mediterranean Nordic 46815916
9 Austrian 7743280
10 Czech 9548241
11 German 73401020
12 Hungarian 9172430
13 14 Italian 1831
15 Slovak 4625259
16 Slovene 1873527
17 German 5290760
18 Belorussian 7682081
19 Latvian 1305309
20 Lithuanian 2502620
21 German 500939
22 Ukrainian 35502969
23 Russian 114646210
24 Fleming 6437741
25 Luxembourgish 331182
26 Dutch 13592447
27 Muslim 1819978
28 Croat 3844388
29 Bulgarian 5601820
30 Romanian 17928382
31 Turkish 63935390
32 33 Estonian 889290
34 Scandinavian 48197
35 Finn 5095033
36 Norwegian 4167301
37 Swede 7931391
38 French 17317
39 40 Norman-French 59807
41 42 43 44 Celt 318452
45 Irish 4010132
46 47 Norman-French 97857
48 49 50 Moldavian/Romanian 2783561
51 52 Norwegian 1037
53 English 53592326
54 Tajik 6505775
55 Han Chinese 1245058800
56 Arab 2254490
57 58 Tajik 6520718
59 Turkmen 4285695
60 Uzbek 22215416
61 Arm

[('Albanian', 2660131),
 ('Greek', 10059145),
 ('Macedonian', 1322387),
 ('Serb', 5903032),
 ('Montenegrin', 266612),
 ('Albanian', 1595162),
 ('Spanish', 33589),
 ('n/a', 0),
 ('Mediterranean Nordic', 46815916),
 ('Austrian', 7743280),
 ('Czech', 9548241),
 ('German', 73401020),
 ('Hungarian', 9172430),
 ('n/a', 0),
 ('Italian', 1831),
 ('Slovak', 4625259),
 ('Slovene', 1873527),
 ('German', 5290760),
 ('Belorussian', 7682081),
 ('Latvian', 1305309),
 ('Lithuanian', 2502620),
 ('German', 500939),
 ('Ukrainian', 35502969),
 ('Russian', 114646210),
 ('Fleming', 6437741),
 ('Luxembourgish', 331182),
 ('Dutch', 13592447),
 ('Muslim', 1819978),
 ('Croat', 3844388),
 ('Bulgarian', 5601820),
 ('Romanian', 17928382),
 ('Turkish', 63935390),
 ('n/a', 0),
 ('Estonian', 889290),
 ('Scandinavian', 48197),
 ('Finn', 5095033),
 ('Norwegian', 4167301),
 ('Swede', 7931391),
 ('French', 17317),
 ('n/a', 0),
 ('Norman-French', 59807),
 ('n/a', 0),
 ('n/a', 0),
 ('n/a', 0),
 ('Celt', 318452),
 ('Irish',

In [211]:
d = dict()
for x, y in list1:
    d.setdefault(x, []).append(y)
print d 


{'Estonian': [889290], 'Hindustani': [200406], 'Austrian': [7743280], 'Kyrgyz': [3749013], 'European': [836670, 11112, 254958101, 7269928, 21284458, 2409483, 41389415, 108886717, 2891956, 370628, 243833, 143511, 49317, 4918, 15866, 14220, 35794, 187356], 'Bulgarian': [5601820], 'Azeri': [8476989], 'Berber Arab': [5859393], 'Tigrinya': [3157637], 'Hutu': [8412778], 'Muslim': [1819978], 'French': [17317, 26827, 9451], 'Belorussian': [7682081], 'Bengali': [146776916], 'European/Caribbean Amerindian': [81187], 'Bantu': [42682476], 'Albanian': [2660131, 1595162], 'Italian': [1831, 41701], 'Chamorro': [59121], 'German': [73401020, 5290760, 500939], 'Tajik': [6505775, 6520718], 'Finn': [5095033], 'Basques Bretons': [6312], 'Mossi': [4157471], 'Europeans': [751], 'n/a': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'Mande': [6992980], 'Maur': [1061210], 'South Asian': [4220768], 'White': [2988082],

In [214]:
z = {k:sum(v) for k,v in d.items()}
print z

{'Dutch': 13592447, 'Estonian': 889290, 'Hindustani': 200406, 'Austrian': 7743280, 'Kyrgyz': 3749013, 'European': 441003283, 'Bulgarian': 5601820, 'Azeri': 8476989, 'Berber Arab': 5859393, 'Tigrinya': 3157637, 'Hutu': 8412778, 'Muslim': 1819978, 'Fula': 1456790, 'Belorussian': 7682081, 'Bengali': 146776916, 'European/Caribbean Amerindian': 81187, 'Bantu': 42682476, 'Albanian': 4255293, 'Italian': 43532, 'Chamorro': 59121, 'German': 79192719, 'Slovak': 4625259, 'Finn': 5095033, 'Black': 76887, 'Mossi': 4157471, 'English': 53592326, 'Europeans': 751, 'n/a': 0, 'Mande': 6992980, 'Maur': 1061210, 'South Asian': 4220768, 'White': 2988082, 'Danish': 7891, 'Macedonian': 1322387, 'Georgian': 3757424, 'Han Chinese': 1245058800, 'Norwegian': 4168338, 'Sinhalese': 14995155, 'Tajik': 13026493, 'Turkmen': 4285695, 'Croat': 3844388, 'Slovene': 1873527, 'Uzbek': 22215416, 'Scandinavian': 48197, 'Amerindian': 14079905, 'Greek': 10706258, 'Latvian': 1305309, 'Dravidian': 302713744, 'Mediterranean Nordi

In [217]:
# Excercise 3
z2 = sorted(z.items(), key = lambda x: x[1])
print z2[-10:]

[('German', 79192719), ('Javanese', 113456006), ('Russian', 114646210), ('Japanese', 126534212), ('Mestizo', 141972914), ('Bengali', 146776916), ('African', 198605031), ('Dravidian', 302713744), ('European', 441003283), ('Han Chinese', 1245058800)]


In [234]:
# Exercise 4
dict4 = dict()
for idx, element in enumerate(document.iterfind('lake')):
        try: 
           dict4[element.find('name').text + ", " + element.find('located').attrib['country'] ] = float(element.find('area').text)
        except AttributeError:
           dict4[element.find('name').text] = 0
        
t3 = sorted(dict4.items(), key = lambda x: x[1])
print t3[-1:]

[('Caspian Sea, R', 386400.0)]


In [235]:
# Exercise 4
dict5 = dict()
for idx, element in enumerate(document.iterfind('river')):
        try: 
           dict5[element.find('name').text + ", " + element.find('located').attrib['country'] ] = float(element.find('length').text)
        except AttributeError:
           dict5[element.find('name').text] = 0
        
t4 = sorted(dict5.items(), key = lambda x: x[1])
print t4[-1:]

[('Amazonas, CO', 6448.0)]


In [249]:
# Exercise 4
dict6 = dict()
for idx, element in enumerate(document.iterfind('airport')):
        try: 
           dict6[element.find('name').text + ", " + element.find('.').attrib['country'] ] = float(element.find('elevation').text)
        except TypeError:
           dict6[element.find('name').text] = 0
        except AttributeError:
           dict6[element.find('name').text] = 0
        
t5 = sorted(dict6.items(), key = lambda x: x[1])
print t5[-1]

('El Alto Intl, BOL', 4063.0)


In [250]:
pwd

u'C:\\Users\\jason\\Desktop\\slide_rule\\data_wrangling_xml'