In [1]:
from xml.etree import ElementTree as ET
import numpy
import pandas as pd

In [2]:
document_tree = ET.parse('./data/mondial_database.xml')

#### * 10 countries with the lowest infant mortality rates

In [3]:
#10 countries with the lowest infant mortality rates

#create a dictionary of infant mortality and country
infant_mortality = {}
for child in document_tree.getroot():
    try:
        #'infant mortality' rate is key and country name is value
        infant_mortality[float(child.find('infant_mortality').text)] = child.find('name').text
    except AttributeError:   #if 'infant_mortality' is not present this error is thrown
        pass
    
#Convert to dataframe        
im_df = pd.DataFrame.from_dict(infant_mortality,orient = 'index')
im_df = im_df.rename(columns={0:'country'})
im_df.index.rename('rate',inplace=True)
im_df = im_df.sort_index()
im_df.head(10)

Unnamed: 0_level_0,country
rate,Unnamed: 1_level_1
1.81,Monaco
2.13,Japan
2.48,Bermuda
2.53,Singapore
2.6,Sweden
2.63,Czech Republic
2.73,Hong Kong
3.13,Macao
3.15,Iceland
3.31,Italy


#### *10 cities with the largest population

In [4]:
#10 cities with the largest population

#create a dictionary of population and country
population = {}
for elt in document_tree.getiterator(tag='city'):  #iterate though tree to find 'city' element
    try:
        #population is key and city name is value
        #use index [-1] to pick latest population
        population[int(elt.findall('population')[-1].text)] = elt.find('name').text
    except AttributeError:
        pass
    except IndexError: #if population is not present this error is thrown
        pass
    
#Convert to dataframe        
im_df = pd.DataFrame.from_dict(population,orient = 'index')
im_df = im_df.rename(columns={0:'country'})
im_df.index.rename('population',inplace=True)
im_df = im_df.sort_index(ascending=False)
im_df.head(10)

Unnamed: 0_level_0,country
population,Unnamed: 1_level_1
22315474,Shanghai
13710512,Istanbul
12442373,Mumbai
11979529,Moskva
11716620,Beijing
11152344,São Paulo
11090314,Tianjin
11071424,Guangzhou
11034555,Delhi
10358381,Shenzhen


#### *10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)

In [5]:
#10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
#ethnic_group_population = ethnic_group_percentage * total_population_of_that_country

ethnic_groups = {}
for child in document_tree.getiterator('country'):
    try:
        temp_pop = int(child.findall('population')[-1].text) #current country population
    except AttributeError:
        pass
    except IndexError: #if population is not present this error is thrown
        pass
    
    for elt in child.getiterator(tag='ethnicgroup'):  #iterate though country to find 'ethnicgroup' element
        if elt.text in ethnic_groups:
            ethnic_groups[elt.text] += int((float(elt.attrib['percentage'])/100.0) * temp_pop)
        else:
            ethnic_groups[elt.text] = int((float(elt.attrib['percentage'])/100.0) * temp_pop)
    
#Convert to dataframe        
im_df = pd.DataFrame.from_dict(ethnic_groups,orient = 'index')
im_df = im_df.rename(columns={0:'population'})
im_df.index.rename('ethnicity',inplace=True)
im_df = im_df.sort_values(by='population',ascending=False)
im_df.head(10)

Unnamed: 0_level_0,population
ethnicity,Unnamed: 1_level_1
Han Chinese,1245058800
Indo-Aryan,871815583
European,494872201
African,318325104
Dravidian,302713744
Mestizo,157734349
Bengali,146776916
Russian,131856989
Japanese,126534212
Malay,121993548


#### *name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [6]:
#name and country of a) longest river, b) largest lake and c) airport at highest elevation

import collections
#collections can be used to sort countries dict to fetch from it in less time
countries = {}
for child in document_tree.getiterator('country'):
    try:
        countries[child.attrib['car_code']] = child.find('name').text #current country
    except AttributeError:
        pass
countries = collections.OrderedDict(sorted(countries.items()))
#print(countries)

#longest river
longest_river = {}
temp_len = 0
for elt in document_tree.getiterator('river'):
    try:
        if float(elt.find('length').text) > temp_len:
            temp_len = float(elt.find('length').text)
            temp_coun = elt.attrib['country']
            temp_name = elt.find('name').text
    except AttributeError:
        pass
longest_river[temp_name] = countries[temp_coun.split(' ')[0]]
print("{'Longest river':'Country'}->",longest_river)

#largest lake
largest_lake = {}
temp_volume = 0
max_volume = 0
for elt in document_tree.getiterator('lake'):
    try:
        temp_volume = float(elt.find('area').text) * float(elt.find('depth').text)  #to find largest lake = area*depth
        if temp_volume > max_volume:
            max_volume = temp_volume
            temp_coun = elt.attrib['country']
            temp_name = elt.find('name').text
    except AttributeError:
        pass
largest_lake[temp_name] = countries[temp_coun.split(' ')[0]]
print("{'Largest Lake':'Country'}->",largest_lake)

#highest airport
highest_airport = {}
temp_high = 0
for elt in document_tree.getiterator('airport'):
    try:
        if float(elt.find('elevation').text) > temp_high:
            temp_high = float(elt.find('elevation').text)
            temp_coun = elt.attrib['country']
            temp_name = elt.find('name').text
    except AttributeError:
        pass
    except TypeError:
        pass
highest_airport[temp_name] = countries[temp_coun.split(' ')[0]]
print("{'Highest Airport':'Country'}->",highest_airport)

{'Longest river':'Country'}-> {'Amazonas': 'Colombia'}
{'Largest Lake':'Country'}-> {'Caspian Sea': 'Russia'}
{'Highest Airport':'Country'}-> {'El Alto Intl': 'Bolivia'}
