In [1]:
from xml.etree import ElementTree as ET

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [2]:
document_tree = ET.parse( './data/mondial_database_less.xml' )

In [3]:
# print names of all countries
for child in document_tree.getroot():
    print child.find('name').text

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [4]:
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
    print '* ' + element.find('name').text + ':',
    capitals_string = ''
    for subelement in element.getiterator('city'):
        capitals_string += subelement.find('name').text + ', '
    print capitals_string[:-2]

* Albania: Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece: Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia: Skopje, Kumanovo
* Serbia: Beograd, Novi Sad, Niš
* Montenegro: Podgorica
* Kosovo: Prishtine
* Andorra: Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [5]:
doc = ET.parse( './data/mondial_database.xml' )
root = doc.getroot()

In [6]:
InfantMortality = []
for baby in root.findall('country'):
    try:
        bd = baby.find('name').text
        size = baby.find('infant_mortality').text
        InfantMortality.append((size, bd))
    except:
        continue
InfantMortality.sort()
print InfantMortality[0:10]

[('1.81', 'Monaco'), ('10.16', 'Romania'), ('10.2', 'Fiji'), ('10.48', 'Brunei'), ('10.5', 'Grenada'), ('10.59', 'Mauritius'), ('10.7', 'Panama'), ('10.77', 'Seychelles'), ('10.92', 'United Arab Emirates'), ('10.93', 'Barbados')]


In [7]:
CityPop = []
for element in root.findall('country'):
    for subelement in element.getiterator('city'):
        city = subelement.find('name').text
        years = {}
        for date in element.iterfind('./city/population'):
            years.update({date.attrib['year']: int(date.text)})
            try:
                latest = years[str(max(map(int,years.keys())))]
                CityPop.append((city, latest))
            except:
                continue
CityPop.sort(key=lambda x: x[1])
print map(lambda x: x[0],CityPop[-10:])

['Mokpo', 'Jeju', 'Gunsan', 'Chuncheon', 'Yeosu', 'Goyang', 'Yongin', 'Bucheon', 'Ansan', 'Anyang']


In [155]:
#10 Largest Ethnic Groups with Large Populations
CountryPop = []
LEG = {}
for i in range(len(root)):
    country = root[i].find('name').text
    year = {}
    for subelement in root[i].iterfind('population'):
        year.update({subelement.attrib['year']: int(subelement.text)})
    try:    
        uptodate = year[str(max(map(int,year.keys())))]
        CountryPop.append((country, uptodate))
        for demo in root[i].iterfind('ethnicgroup'):
            pop = float(demo.attrib['percentage'])*uptodate
            if demo in demo.keys():
                LEG[demo.text] = LEG[demo.text] + pop
            else:
                LEG.update({demo.text: pop})
    except:
        continue
        
print sorted(LEG, key = LEG.get)[-10:]

['Mediterranean Nordic', 'English', 'Viet/Kinh', 'Mulatto', 'Eastern Hamitic', 'Japanese', 'Bengali', 'Dravidian', 'Indo-Aryan', 'Han Chinese']


In [179]:
#longest river
river = []
for i in range(len(root)):
    for subelement in root[i].getiterator('river'):
        try:
            name = subelement.find('name').text
            length = int(subelement.find('length').text)
            country = subelement.attrib['country']
            river.append((country, name, length))
        except:
            continue
#print river 
river.sort(key=lambda x:x[2])
print map(lambda x: x,river[-1:])

[('CO BR PE', 'Amazonas', 6448)]


Highest Airport

In [11]:
#highest airport
air = []
for element in doc.iterfind('airport'):
    for subelement in element.getiterator('airport'):
        name = subelement.findtext('name')
        ele = subelement.findtext('elevation')
        if ele == '':
            ele = 1
        air.append((int(ele), name))
air.sort(reverse=True)
print air[0:1]

[(4063, 'El Alto Intl')]


In [180]:
lakes = []
for i in range(len(root)):
    for subelement in root[i].getiterator('lake'):
        try:
            name = subelement.find('name').text
            country = subelement.attrib['country']
            area = int(subelement.find('area').text)
            lakes.append((country,name,area))
        except:
            continue
            
lakes.sort(key=lambda x: x[2])

print map(lambda x: x[:2],lakes[-1:])

[('R AZ KAZ IR TM', 'Caspian Sea')]
