# This exercise explores the Mondial database (data source: http://www.dbis.informatik.uni-goettingen.de/Mondial) as an XML document

### There are four objectives to this exercise
1. Identify the 10 countries with the lowest infant mortality rates
2. Identify the 10 cities with the largest population
3. Identify the 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. The name and country of a) the longest river, b) the largest lake and c) the airport at highest elevation


## Imports for working with xml documents

In [1]:
import pandas as pd
from lxml import etree as ET
import numpy as np

In [2]:
# use the etree library to parse the xml document into an element tree
document = ET.parse( './data/mondial_database.xml' )

# identify the root element in the xml document tree
root = document.getroot()

## The first three objectives require information that is assocated with the Country element. Therefore, I can loop through all of the country elements just once to answer questions 1-3.

In [30]:
ethnic_dict = {}
infant_mortalitys = []
city_population = []

# loops through the all of the country elements
for elem in document.iterfind('country'): # Identfies the country's name and it's latest population 
    country_name = elem.find('name').text
    country_population = float(elem.findall('population')[-1].text) # extracts the most recent population numbers
    # loops through the each country's element and extracts the infant_mortality rate
    for infant_mortality in elem.getiterator('infant_mortality'): # Identfies the country's infant mortality
        mortality = infant_mortality.text 
        infant_mortalitys.append((country_name, float(mortality))) #Appends the country's name and infant mortality to a list
    # loops through the each country's element and extracts the city name and most recent poplulation
    for city in elem.getiterator('city'): #Identifies the city's population
        city_name = city.find('name').text # extracts city name
        if city.find("population") is None:
            city_pop = 0
        else:
            city_pop = float(city.findall("population")[-1].text) # extracts most recent population
        city_population.append((city_name, city_pop)) #Appends the city name and city population to a list

    # loops through the each country's element and extracts ethnicity percentage
    # uses country population to generate the number of people in each ethnicity
    for ethnicity in elem.getiterator('ethnicgroup'): #Identifies the ethnic population in each country and sums then
        ethnic_group = ethnicity.text
        ethnic_group_percent = float(ethnicity.attrib['percentage'])
        ethnic_group_value = country_population * ethnic_group_percent
        if ethnic_group in ethnic_dict: # creates a dictionary with key= ethnic group, value = ethnic population
            ethnic_dict[ethnic_group] = ethnic_dict[ethnic_group] + ethnic_group_value # sums the ethnic population of muliple countries
        else:
            ethnic_dict[ethnic_group] = ethnic_group_value # adds ethnich group population to dictionary, if it doesn't currently exist
    

    

#### * 1. The 10 countries with the lowest infant mortality *

In [4]:
#loops through the list of city populations and print's the 10 largest cities names
for x,y in sorted(infant_mortalitys, key=lambda x: x[1])[0:10]:
    print(x)

Monaco
Japan
Norway
Bermuda
Singapore
Sweden
Czech Republic
Hong Kong
Macao
Iceland


#### * 2. Identifies the 10 cities with the largest population*

In [5]:
#loops through the list of city populations and print's the 10 largest cities names
for x,y in sorted(city_population, key=lambda x: x[1])[0:10]:
    print(x)

Komotini
Kozani
Ermoupoli
Tripoli
Mytilini
Chalons en Champagne
Toledo
Merida
Santiago de Compostela
Bregenz


#### * 3. Identifies the 10 ethnic groups with the largest overall populations*

In [6]:
#loops through the dictionary of ethnicities and their populations and print's the 10 largest ethnicities
for x,y in sorted(ethnic_dict.items(), key=lambda x:x[1], reverse = True)[0:10] :
    print(x)
       

Han Chinese
Indo-Aryan
European
African
Dravidian
Mestizo
Bengali
Russian
Japanese
Malay


#### * 4. Identifiy the name and country of a) longest river, b) largest lake and c) airport at highest elevation* 

#### * a) The name and country of the longest river *

#### My strategy for this question, was to first identify the longest river and the corresponding name, and country code

In [55]:
longest_river = 0
country_name = None
river_name = None
# finds longest river, river name, and country codes
for river in root.findall('river'): #loops through all river elements
    if river.find('length') is None: # skips over empty elements
        continue
    
    country_code = river.attrib['country'] # retrieves country code
    R_name = river.find("name").text # retrieves river name
    R_length = float(river.find("length").text) # retrieves river length, and converts it to float
    # stores largest length and corresponding river name and country code in variables
    if R_length > longest_river: 
        longest_river = R_length
        country_name = country_code
        river_name = R_name     

# prints resulting variables   
print(longest_river, river_name, country_name)  

6448.0 Amazonas CO BR PE


#### Since the river goes through multiple countries, I split the codes up into a list, and retrieved the names for all three countries

In [58]:
expr = '//country[@car_code= $code]'
name_lst = []
for c_code in country_name.split():
    name_lst.append(root.xpath(expr, code = c_code)[0].find('name').text)
print(name_lst)

['Colombia', 'Brazil', 'Peru']


In [64]:
print( river_name + ' is the longest river and it flows through '  + ', '.join(name_lst) )

Amazonas is the longest river and it flows through Colombia, Brazil, Peru


#### I then realized that questions b and c could be solved in the same way, so I wrote a function that takes the kind of geographical thing you are looking for ( lake or airport) and the correponding scalar metric (area, elevation), and a function to convert country code(s) to name(s)

In [96]:
# general function(s)
def x_of_kind(kind, scalar_type):
    scalar_value = 0
    country_name = None
    kind_name = None
    # finds scalar_type, x name, and country codes
    for x in root.findall(kind): #loops through all kind elements
        if x.find(scalar_type) is None: # skips over empty elements
            continue
        if x.find(scalar_type).text is None: # skips over empty elements
            continue

        country_code = x.attrib['country'] # retrieves country code
        name = x.find("name").text # retrieves name
        kind_scalar = float(x.find(scalar_type).text) # retrieves scalar metric
        # stores largest scalar value and corresponding name and country codes
        if kind_scalar > scalar_value: 
            scalar_value = kind_scalar
            country_name = country_code
            kind_name = name
    return (scalar_value, kind_name, country_name)

def code_to_contry(string):
    expr = '//country[@car_code= $code]'
    name_lst = []
    for c_code in string.split():
        name_lst.append(root.xpath(expr, code = c_code)[0].find('name').text)
    return name_lst

#### * b) The name and country of the largest lake *

In [102]:
answer_B = x_of_kind('lake', "area")
names = code_to_contry(answer_B[2])
print("The largest lake is the "+ answer_B[1] + ", which is in "+ ', '.join(names))

The largest lake is the Caspian Sea, which is in Russia, Azerbaijan, Kazakhstan, Iran, Turkmenistan


#### * c) The name and country of the airport at the highest elevation *

In [101]:
answer_C = x_of_kind('airport', "elevation")
names = code_to_contry(answer_C[2])
print("The highest airport is "+ answer_C[1] + ", which is in "+ ', '.join(names))

The highest airport is El Alto Intl, which is in Bolivia
