In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import requests
from functools import reduce
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

%matplotlib inline

### Solo Exploration and Presentation:
#### 1. Choose and download another data set from the UN data to explore.
[http://data.un.org/Explorer.aspx](http://data.un.org/Explorer.aspx)   
You may want to combine your new dataset with one or both of the datasets that you already worked with. Prepare a short (< 5 minute) presentation of your findings. Report any interesting correlations or trends that you find. 
#### 2. If time allows, check out the plotly library to add additional interactivity to your plots. 
[https://plotly.com/python/plotly-express/](https://plotly.com/python/plotly-express/)

#### Primary Education Completion Rates

In [2]:
# Primary-education completion query against the ArcGIS FeatureServer, restricted to
# income quantile '_T', urbanization '_T' and time_period 2004-2019.
# The two endpoints differ only by the resultOffset=1000 parameter on the second one.
_pe_service = 'https://services7.arcgis.com/gp50Ao2knMlOM89z/arcgis/rest/services/Ief9e0d38__Sc5211ad2_1644293535310441_tbl/FeatureServer/0/query'
_pe_where = "income_wealth_quantile%20%3D%20'_T'%20AND%20urbanization%20%3D%20'_T'%20AND%20time_period%20%3E%3D%202004%20AND%20time_period%20%3C%3D%202019"
_pe_fields = 'time_period,sex,sex_desc,education_lev_desc,urbanization_desc,income_wealth_quantile_desc,obs_value,unit_measure_desc,indicator_desc,ref_area_desc'
_pe_query = f'{_pe_service}?where={_pe_where}&outFields={_pe_fields}&returnGeometry=false'

endpoint1 = f'{_pe_query}&outSR=&f=json'

endpoint2 = f'{_pe_query}&resultOffset=1000&outSR=&f=json'

In [3]:
# Download both result pages of the primary-education query.
# The timeout keeps a stalled connection from hanging the kernel indefinitely, and
# raise_for_status() reports an HTTP error here instead of letting it surface later
# as a JSON-decoding or KeyError failure.
pe_pages = []
for pe_url in (endpoint1, endpoint2):
    pe_response = requests.get(pe_url, timeout=60)
    pe_response.raise_for_status()
    pe_pages.append(pe_response.json())
pe_res1, pe_res2 = pe_pages

In [4]:
# Every entry of 'features' wraps its record in an 'attributes' dict; unwrap each page.
pe_df1 = pd.DataFrame([feature['attributes'] for feature in pe_res1['features']])
pe_df2 = pd.DataFrame([feature['attributes'] for feature in pe_res2['features']])

# Stack the pages, keep the four columns of interest under friendlier names, and
# order the rows by country, year and sex.
primary_ed = (
    pd.concat([pe_df1, pe_df2])
    .loc[:, ['ref_area_desc', 'time_period', 'sex', 'obs_value']]
    .rename(columns={'ref_area_desc': 'Country',
                     'time_period': 'Year',
                     'sex': 'Sex',
                     'obs_value': 'Primary_ed_completion_percent'})
    .sort_values(['Country', 'Year', 'Sex'])
)
primary_ed

Unnamed: 0,Country,Year,Sex,Primary_ed_completion_percent
625,Afghanistan,2011,F,26.63788
228,Afghanistan,2011,M,53.74893
26,Afghanistan,2011,_T,40.72569
577,Afghanistan,2015,F,40.35416
229,Afghanistan,2015,M,67.30569
...,...,...,...,...
834,Zimbabwe,2015,M,87.02327
623,Zimbabwe,2015,_T,88.21310
227,Zimbabwe,2019,F,91.62107
835,Zimbabwe,2019,M,86.35518


In [5]:
# Alternative unwrapping of the per-feature 'attributes' dicts with a comprehension
# (the loop variable is named `feature` so it does not reuse the builtin name `dict`).
pd.DataFrame([feature['attributes'] for feature in pe_res1['features']])

Unnamed: 0,time_period,sex,sex_desc,education_lev_desc,urbanization_desc,income_wealth_quantile_desc,obs_value,unit_measure_desc,indicator_desc,ref_area_desc
0,2007,_T,Both sexes or no breakdown by sex,Primary education,All areas,Total (national average) or no breakdown,93.74000,Percent,Completion rate (primary education),Brazil
1,2004,_T,Both sexes or no breakdown by sex,Primary education,All areas,Total (national average) or no breakdown,87.69000,Percent,Completion rate (primary education),Bolivia (Plurinational State of)
2,2005,_T,Both sexes or no breakdown by sex,Primary education,All areas,Total (national average) or no breakdown,89.05000,Percent,Completion rate (primary education),Bolivia (Plurinational State of)
3,2006,_T,Both sexes or no breakdown by sex,Primary education,All areas,Total (national average) or no breakdown,87.50000,Percent,Completion rate (primary education),Bolivia (Plurinational State of)
4,2007,_T,Both sexes or no breakdown by sex,Primary education,All areas,Total (national average) or no breakdown,91.25000,Percent,Completion rate (primary education),Bolivia (Plurinational State of)
...,...,...,...,...,...,...,...,...,...,...
995,2016,F,Female,Primary education,All areas,Total (national average) or no breakdown,84.48334,Percent,Completion rate (primary education),Myanmar
996,2007,F,Female,Primary education,All areas,Total (national average) or no breakdown,84.73734,Percent,Completion rate (primary education),Namibia
997,2013,F,Female,Primary education,All areas,Total (national average) or no breakdown,87.89175,Percent,Completion rate (primary education),Namibia
998,2015,F,Female,Primary education,All areas,Total (national average) or no breakdown,49.27854,Percent,Completion rate (primary education),Mauritania


Josh's suggestions:
pd.DataFrame([dict['attributes'] for dict in pe_res1['features']])

Did you try that `GeoDataFrame.from_features(myjson)` trick? (That constructor comes from geopandas; pandas itself has no `pd.DataFrame.from_features`.)

Another powerful tool for working with these: `pd.json_normalize()` (a top-level pandas function, not a `DataFrame` method).

#### Secondary Education Completion

In [6]:
# Secondary-education completion query (urbanization '_T', income quantile '_T',
# time_period 2004-2019). se2 and se3 add resultOffset=1000 / 2000 to the same query.
_se_query = (
    'https://services7.arcgis.com/gp50Ao2knMlOM89z/arcgis/rest/services/'
    'I66c21045__Sa7501ddf_16442936354058118_tbl/FeatureServer/0/query'
    "?where=urbanization%20%3D%20'_T'%20AND%20income_wealth_quantile%20%3D%20'_T'"
    '%20AND%20time_period%20%3E%3D%202004%20AND%20time_period%20%3C%3D%202019'
    '&outFields=indicator_desc,ref_area_desc,time_period,sex,education_lev_desc,obs_value,unit_measure_desc'
    '&returnGeometry=false'
)
_se_tail = '&outSR=&f=json'

se1 = _se_query + _se_tail
se2, se3 = (f'{_se_query}&resultOffset={offset}{_se_tail}' for offset in (1000, 2000))

# Download the three result pages of the secondary-education query.
# The timeout prevents an indefinite hang; raise_for_status() fails fast on HTTP errors.
se_pages = []
for se_url in (se1, se2, se3):
    se_response = requests.get(se_url, timeout=60)
    se_response.raise_for_status()
    se_pages.append(se_response.json())
se_res1, se_res2, se_res3 = se_pages

In [7]:
# Unwrap the 'attributes' record of every feature, one DataFrame per downloaded page.
se_df1, se_df2, se_df3 = (
    pd.DataFrame([feature['attributes'] for feature in page['features']])
    for page in (se_res1, se_res2, se_res3)
)

# Stack the pages, keep and rename the four columns of interest, then sort.
secondary_ed = (
    pd.concat([se_df1, se_df2, se_df3])
    .loc[:, ['ref_area_desc', 'time_period', 'sex', 'obs_value']]
    .rename(columns={'ref_area_desc': 'Country',
                     'time_period': 'Year',
                     'sex': 'Sex',
                     'obs_value': 'Secondary_ed_completion_percent'})
    .sort_values(['Country', 'Year', 'Sex'])
)

secondary_ed

Unnamed: 0,Country,Year,Sex,Secondary_ed_completion_percent
768,Afghanistan,2011,F,11.50881
512,Afghanistan,2011,M,34.65255
19,Afghanistan,2011,_T,23.37700
769,Afghanistan,2015,F,25.17210
464,Afghanistan,2015,M,48.71595
...,...,...,...,...
164,Zimbabwe,2015,M,72.22249
766,Zimbabwe,2015,_T,72.62408
511,Zimbabwe,2019,F,54.36128
165,Zimbabwe,2019,M,53.50781


#### Maternal Death Rate per 100,000 live births

In [8]:
# Maternal-death-rate query (where=1=1, i.e. no row filter). mdr2..mdr4 add
# resultOffset=1000 / 2000 / 3000 to the same query.
_mdr_query = (
    'https://services7.arcgis.com/gp50Ao2knMlOM89z/arcgis/rest/services/'
    'I75250979__S75250979_16442942938748052_tbl/FeatureServer/0/query'
    '?where=1%3D1'
    '&outFields=indicator_desc,ref_area_desc,time_period,obs_value,unit_measure_desc'
    '&returnGeometry=false&outSR='
)

mdr1 = f'{_mdr_query}&f=json'
mdr2, mdr3, mdr4 = (
    f'{_mdr_query}&resultOffset={offset}&f=json' for offset in (1000, 2000, 3000)
)

In [9]:
# Download the four result pages of the maternal-death-rate query.
# The timeout prevents an indefinite hang; raise_for_status() fails fast on HTTP errors.
mdr_pages = []
for mdr_url in (mdr1, mdr2, mdr3, mdr4):
    mdr_response = requests.get(mdr_url, timeout=60)
    mdr_response.raise_for_status()
    mdr_pages.append(mdr_response.json())
mdr_res1, mdr_res2, mdr_res3, mdr_res4 = mdr_pages

In [10]:
# Inspect the container type of the 'features' payload before building a DataFrame from it.
type(mdr_res1['features'])

list

In [11]:
# Unwrap the 'attributes' record of every feature, one DataFrame per downloaded page.
mdr_df1, mdr_df2, mdr_df3, mdr_df4 = (
    pd.DataFrame([feature['attributes'] for feature in page['features']])
    for page in (mdr_res1, mdr_res2, mdr_res3, mdr_res4)
)

# Stack the pages, drop the two descriptive columns, rename the rest and sort.
mdr = (
    pd.concat([mdr_df1, mdr_df2, mdr_df3, mdr_df4])
    .drop(columns=['indicator_desc', 'unit_measure_desc'])
    .rename(columns={'ref_area_desc': 'Country',
                     'time_period': 'Year',
                     'obs_value': 'Maternal_Death_Rate'})
    .sort_values(['Country', 'Year'])
)

mdr

Unnamed: 0,Country,Year,Maternal_Death_Rate
184,Afghanistan,2000,1450
185,Afghanistan,2001,1390
186,Afghanistan,2002,1300
187,Afghanistan,2003,1240
188,Afghanistan,2004,1180
...,...,...,...
465,Zimbabwe,2013,509
466,Zimbabwe,2014,494
467,Zimbabwe,2015,480
468,Zimbabwe,2016,468


#### Merging the two education tables

In [12]:
# Outer join keeps every (Country, Year, Sex) combination present in either table.
education = primary_ed.merge(secondary_ed, how='outer', on=['Country', 'Year', 'Sex'])

education.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2193 entries, 0 to 2192
Data columns (total 5 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Country                          2193 non-null   object 
 1   Year                             2193 non-null   int64  
 2   Sex                              2193 non-null   object 
 3   Primary_ed_completion_percent    1836 non-null   float64
 4   Secondary_ed_completion_percent  2193 non-null   float64
dtypes: float64(2), int64(1), object(2)
memory usage: 102.8+ KB


In [13]:
# Rows with no primary-education completion rate
education[education['Primary_ed_completion_percent'].isna()]

Unnamed: 0,Country,Year,Sex,Primary_ed_completion_percent,Secondary_ed_completion_percent
1836,Australia,2010,F,,99.53
1837,Australia,2010,M,,98.82
1838,Australia,2010,_T,,99.15
1839,Austria,2005,F,,98.59
1840,Austria,2005,M,,98.73
...,...,...,...,...,...
2188,United Kingdom of Great Britain and Northern I...,2013,M,,100.00
2189,United Kingdom of Great Britain and Northern I...,2013,_T,,100.00
2190,United Kingdom of Great Britain and Northern I...,2014,F,,100.00
2191,United Kingdom of Great Britain and Northern I...,2014,M,,100.00


In [56]:
# Number of rows per country that lack a primary-education completion rate
missing_primary = education['Primary_ed_completion_percent'].isna()
education.loc[missing_primary, 'Country'].value_counts()

Italy             12
Iceland           12
Sweden            12
Spain             12
Slovenia          12
Slovakia          12
Portugal          12
Poland            12
Norway            12
Netherlands       12
Luxembourg        12
Lithuania         12
Latvia            12
Austria           12
Ireland           12
United Kingdom    12
Hungary           12
Estonia           12
Belgium           12
Greece            12
Czechia           12
Denmark           12
Cyprus            12
Finland           12
Germany           12
Malta              9
Romania            9
Croatia            9
Bulgaria           9
Switzerland        9
Georgia            3
Serbia             3
Canada             3
Australia          3
Name: Country, dtype: int64

#### Bringing in the other data sets

In [57]:
data_dir = '../data'

# nrows=6868 stops the read before the footnote rows at the bottom of the file
gdp_df = pd.read_csv(f'{data_dir}/gdp_per_capita.csv', nrows=6868)

continents = pd.read_csv(f'{data_dir}/continents.csv')

# header=2: the column names are taken from the third line of the file
life_expectancy = pd.read_csv(f'{data_dir}/life_expectancy.csv', header=2)

In [58]:
# Start cleaning up country names.
# Also limit every table to the years from 2000 on, since that is when the mdr data starts.
#
# Each filtered frame is followed by .copy(): later cells assign to the 'Country'
# column of these frames, and assigning into a .loc slice raises SettingWithCopyWarning.

gdp_df = gdp_df.drop(columns='Value Footnotes')
gdp_df.columns = ['Country', 'Year', 'GDP_Per_Capita']
# regex=False: both targets are literal text, and the default of `regex` differs
# between pandas versions. Note that every occurrence of the substring 'The' is
# removed (case-sensitive), not only a leading article.
gdp_df['Country'] = (
    gdp_df['Country']
    .str.replace('The', '', regex=False)
    .str.replace('Democratic Republic of the', 'Dem. Rep.', regex=False)
)
gdp_df = gdp_df.loc[gdp_df['Year'] > 1999].copy()

# Drop continent rows whose country name contains 'The'.
continents = continents.loc[~continents['Country'].str.contains('The')].copy()

life_expectancy = life_expectancy.drop(columns = ['Country Code', 'Indicator Name', 
                                                  'Indicator Code', '2021', 'Unnamed: 66'])
# Wide (one column per year) -> long (one row per country/year); rows without a value are dropped.
life_expectancy = life_expectancy.melt(id_vars=['Country Name']).dropna()
life_expectancy.columns = ['Country', 'Year', 'Life_Expectancy']
life_expectancy['Year'] = life_expectancy['Year'].astype(int)
life_expectancy = life_expectancy.loc[life_expectancy['Year'] > 1999].copy()

#### Looking at and addressing mismatched country names

In [59]:
# Regex of name fragments that are stripped from country names so the different
# sources agree. Raw strings keep the escaped parentheses valid (a plain '\(' is an
# invalid escape sequence), and the dots are escaped so that e.g. ', Rep.' only
# matches a literal full stop rather than any character.
pattern = '|'.join([r', The', r', Rep\.', r', RB', r', Arab Rep\.', r', Islamic Rep\.',
                    r' \(Islamic Republic of\)', r' \(Dutch part\)', r' of America',
                    r' of Great Britain and Northern Ireland',
                    r'United Republic of ', r'Republic of '])

# Exact-match renames applied after `pattern` has been stripped.
country_map = {
    # What is left of 'Democratic Republic of the Congo' once 'Republic of ' is stripped.
    'Democratic the Congo' : 'Dem. Rep. Congo',
    'Congo, Dem. Rep.' : 'Dem. Rep. Congo',
    'Burma (Myanmar)' : 'Myanmar',
    'Kyrgyzstan' : 'Kyrgyz Republic',
    'Viet Nam' : 'Vietnam',
    'Czech Republic' : 'Czechia',
    'Lao People\'s Democratic Republic' : 'Lao PDR',
    'St. Lucia' : 'Saint Lucia',
    'St. Vincent and the Grenadines' : 'Saint Vincent and the Grenadines',
    'Syria' : 'Syrian Arab Republic',
    'Slovak Republic' : 'Slovakia',
    'Russia' : 'Russian Federation',
    # Mis-encoded spelling as it appears in one of the inputs.
    'CÃ´te d\'Ivoire' : 'Côte d\'Ivoire',
    'Cote d\'Ivoire' : 'Côte d\'Ivoire',
    'Ivory Coast' : 'Côte d\'Ivoire'
}

# Substrings marking rows to exclude (regions, income groups, names that still contain
# a parenthesis, anything with 'Korea').
# NOTE(review): these are substring matches, so 'Central' also removes
# 'Central African Republic', and any name still containing '(' is dropped -
# confirm that losing those countries is intended.
_aggregate_markers = '|'.join(['Europe', 'Asia', 'Latin America', r'\(', 'countries',
                               'Central', 'Western', 'Sub', 'World', 'Korea', 'income'])


def country_name_counts(df_name, col):
    """Count the rows per cleaned country name in one DataFrame.

    The 'Country' column is normalised (fragments in `pattern` removed, whitespace
    stripped, `country_map` renames applied), rows matching `_aggregate_markers`
    and fully duplicated rows are dropped, and the remaining rows are counted
    per country.

    Parameters
    ----------
    df_name : pandas.DataFrame
        Frame with a 'Country' column. It is not modified.
    col : str
        Label used to name the count column, ``Count_<col>``.

    Returns
    -------
    pandas.DataFrame
        Two columns: 'Country' and ``Count_<col>``.
    """
    names = (
        df_name['Country']
        .str.replace(pattern, '', regex=True)
        .str.strip()
        .replace(country_map)
    )
    # assign() builds a new frame, so the caller's DataFrame keeps its original names.
    cleaned = df_name.assign(Country=names)

    is_aggregate = cleaned['Country'].str.contains(_aggregate_markers, regex=True, na=False)
    cleaned = cleaned.loc[~is_aggregate].drop_duplicates()

    # Name the index and the values explicitly: the labels that
    # value_counts().reset_index() produces differ between pandas 1.x and 2.x.
    co_df = (
        cleaned['Country']
        .value_counts()
        .rename_axis('Country')
        .reset_index(name=f'Count_{col}')
    )
    return co_df

In [60]:
# One table of per-country row counts for each source, labelled by a short source tag.
gdp_co, le_co, continent_co, ed_co, mdr_co = (
    country_name_counts(frame, tag)
    for frame, tag in [
        (gdp_df, "gdp"),
        (life_expectancy, "le"),
        (continents, "continent"),
        (education, "ed"),
        (mdr, "mdr"),
    ]
)

In [61]:
country_dfs = [gdp_co, le_co, continent_co, ed_co, mdr_co]

# Outer-join the per-source counts on Country, one table at a time, so a country
# that is missing from a source shows up as NaN in that source's count column.
all_countries = country_dfs[0]
for counts in country_dfs[1:]:
    all_countries = all_countries.merge(counts, on=['Country'], how='outer')

# Countries that are absent from at least one source.
has_gap = all_countries.isna().any(axis=1)
mismatch = (
    all_countries[has_gap]
    .sort_values('Country')
    .reset_index(drop=True)
)

mismatch.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104 entries, 0 to 103
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Country          104 non-null    object 
 1   Count_gdp        62 non-null     float64
 2   Count_le         75 non-null     float64
 3   Count_continent  59 non-null     float64
 4   Count_ed         8 non-null      float64
 5   Count_mdr        45 non-null     float64
dtypes: float64(5), object(1)
memory usage: 5.0+ KB


#### I decided to focus on those rows that contained education completion data but were missing other data as a primary means of identifying country names to address. I was able to find a fix for those that had missing data due to mismatched country names. Those that remain are those that are truly missing other data values.

In [62]:
# Mismatched countries that do have education data
mismatch[mismatch['Count_ed'].notna()]

Unnamed: 0,Country,Count_gdp,Count_le,Count_continent,Count_ed,Count_mdr
16,Cuba,,21.0,1.0,9.0,18.0
79,Sao Tome and Principe,,21.0,1.0,6.0,18.0
86,Somalia,,21.0,1.0,3.0,18.0
88,South Sudan,,21.0,1.0,3.0,18.0
91,State of Palestine,,,,6.0,18.0
93,Syrian Arab Republic,,21.0,1.0,3.0,18.0
95,Turkey,20.0,,1.0,9.0,18.0
103,Yemen,,21.0,1.0,6.0,18.0


#### Merging all the data sets

In [63]:
def clean_up_countries(df_name):
    """Return a copy of `df_name` with normalised country names and aggregates removed.

    The 'Country' column has the fragments in the module-level `pattern` regex
    removed, is stripped of surrounding whitespace, and has the `country_map`
    renames applied. Rows whose name contains one of the aggregate markers below
    are dropped, as are fully duplicated rows.

    Parameters
    ----------
    df_name : pandas.DataFrame
        Frame with a 'Country' column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The cleaned, filtered and de-duplicated frame.
    """
    # Substring markers for rows to exclude.
    # NOTE(review): substring matching means 'Central' also removes
    # 'Central African Republic' - confirm that is intended.
    aggregate_markers = '|'.join(['Europe', 'Asia', 'Latin America', r'\(', 'countries',
                                  'Central', 'Western', 'Sub', 'World', 'Korea', 'income'])

    # Work on a copy: a plain `clean_df = df_name` would only alias the caller's
    # frame, so the assignment below would rewrite the caller's data in place.
    clean_df = df_name.copy()
    clean_df['Country'] = (
        clean_df['Country']
        .str.replace(pattern, '', regex=True)
        .str.strip()
        .replace(country_map)
    )

    is_aggregate = clean_df['Country'].str.contains(aggregate_markers, regex=True, na=False)
    return clean_df.loc[~is_aggregate].drop_duplicates()

In [64]:
# Apply the same country-name clean-up to every source table.
gdp_clean, le_clean, continent_clean, ed_clean, mdr_clean = (
    clean_up_countries(frame)
    for frame in (gdp_df, life_expectancy, continents, education, mdr)
)

In [65]:
# All country/year pairs that have GDP and/or life-expectancy data:
# outer join first, then discard rows where both measures are missing.
gdp_le = (
    gdp_clean
    .merge(le_clean, how='outer', on=['Country', 'Year'])
    .dropna(how='all', subset=['GDP_Per_Capita', 'Life_Expectancy'])
)

gdp_le.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4751 entries, 0 to 4750
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Country          4751 non-null   object 
 1   Year             4751 non-null   int64  
 2   GDP_Per_Capita   4106 non-null   float64
 3   Life_Expectancy  4498 non-null   float64
dtypes: float64(2), int64(1), object(1)
memory usage: 185.6+ KB


In [66]:
# `continents` is de-duplicated here, but the merge below uses `continent_clean`.
continents = continents.drop_duplicates()

# Left join: keep every GDP/life-expectancy row and attach a continent where one is known.
gdp_le = gdp_le.merge(continent_clean, how='left', on='Country')
gdp_le.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4751 entries, 0 to 4750
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Country          4751 non-null   object 
 1   Year             4751 non-null   int64  
 2   GDP_Per_Capita   4106 non-null   float64
 3   Life_Expectancy  4498 non-null   float64
 4   Continent        4103 non-null   object 
dtypes: float64(2), int64(1), object(2)
memory usage: 222.7+ KB


In [67]:
# Outer join of education and maternal-death-rate rows on Country and Year.
ed_mdr = ed_clean.merge(mdr_clean, how='outer', on=['Country', 'Year'])
ed_mdr.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4666 entries, 0 to 4665
Data columns (total 6 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Country                          4666 non-null   object 
 1   Year                             4666 non-null   int64  
 2   Sex                              1776 non-null   object 
 3   Primary_ed_completion_percent    1419 non-null   float64
 4   Secondary_ed_completion_percent  1776 non-null   float64
 5   Maternal_Death_Rate              4534 non-null   float64
dtypes: float64(3), int64(1), object(2)
memory usage: 255.2+ KB


In [68]:
# Right join: keep every education/mdr row and attach GDP, life expectancy and
# continent where available.
merged = gdp_le.merge(ed_mdr, how='right', on=['Country', 'Year'])
full_df = merged.sort_values(['Country', 'Year']).reset_index(drop=True)

full_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4666 entries, 0 to 4665
Data columns (total 9 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Country                          4666 non-null   object 
 1   Year                             4666 non-null   int64  
 2   GDP_Per_Capita                   4262 non-null   float64
 3   Life_Expectancy                  4404 non-null   float64
 4   Continent                        4428 non-null   object 
 5   Sex                              1776 non-null   object 
 6   Primary_ed_completion_percent    1419 non-null   float64
 7   Secondary_ed_completion_percent  1776 non-null   float64
 8   Maternal_Death_Rate              4534 non-null   float64
dtypes: float64(5), int64(1), object(3)
memory usage: 328.2+ KB


In [69]:
# Summary statistics for the numeric columns of the merged table
full_df.describe()

Unnamed: 0,Year,GDP_Per_Capita,Life_Expectancy,Primary_ed_completion_percent,Secondary_ed_completion_percent,Maternal_Death_Rate
count,4666.0,4262.0,4404.0,1419.0,1776.0,4534.0
mean,2009.25718,17274.466992,69.547661,79.253765,68.588548,203.084693
std,5.123625,18582.540133,9.187309,21.40335,27.63854,292.451125
min,2000.0,630.701542,39.441,10.01228,3.33,2.0
25%,2005.0,3707.407769,63.433,66.13456,47.09216,17.0
50%,2010.0,10927.644214,72.024,88.55,75.27,67.0
75%,2014.0,23854.516404,76.221,96.086785,94.1325,280.75
max,2019.0,115256.016699,84.099756,100.0,100.0,2480.0


#### Looking at country and year counts, accounting for duplication from the different values in the sex column

In [70]:
# Rows per country, ignoring the sex-specific (M/F) rows
not_sex_specific = ~full_df['Sex'].isin(['M', 'F'])
full_df.loc[not_sex_specific, 'Country'].value_counts()

Senegal               20
Paraguay              19
Dem. Rep. Congo       19
Dominican Republic    19
Ecuador               19
                      ..
Hungary               18
Iceland               18
India                 18
Indonesia             18
Libya                 18
Name: Country, Length: 191, dtype: int64

In [71]:
# Rows per year, ignoring the sex-specific (M/F) rows
not_sex_specific = ~full_df['Sex'].isin(['M', 'F'])
full_df.loc[not_sex_specific, 'Year'].value_counts().sort_index()

2000    191
2001    191
2002    191
2003    191
2004    191
2005    191
2006    191
2007    191
2008    191
2009    191
2010    191
2011    191
2012    191
2013    191
2014    191
2015    191
2016    191
2017    191
2018     35
2019      9
Name: Year, dtype: int64

#### Looking for years that have the most education and mdr data and trends in the data over time

In [72]:
# Per-year summary statistics of the primary-education completion rate
full_df.groupby('Year')['Primary_ed_completion_percent'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2000,0.0,,,,,,,
2001,0.0,,,,,,,
2002,0.0,,,,,,,
2003,0.0,,,,,,,
2004,60.0,69.548162,23.03131,12.91138,57.7836,76.395,89.7225,94.91
2005,105.0,73.794092,27.510878,10.01228,58.99419,84.73,97.31,100.0
2006,147.0,75.291908,24.510565,13.23251,60.11984,83.02,95.70803,100.0
2007,66.0,73.126039,23.44551,20.34787,60.064187,79.237005,92.395,99.80467
2008,63.0,78.612033,21.58281,25.18382,69.103725,89.5,93.79,98.26377
2009,75.0,79.789139,18.640484,24.51903,66.19239,85.96619,94.395345,98.46702


In [73]:
# Per-year summary statistics of the secondary-education completion rate
full_df.groupby('Year')['Secondary_ed_completion_percent'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2000,0.0,,,,,,,
2001,0.0,,,,,,,
2002,0.0,,,,,,,
2003,0.0,,,,,,,
2004,60.0,46.400198,24.940393,4.35233,23.60103,56.40276,68.583772,79.33073
2005,180.0,72.293808,30.937473,3.33,56.1375,89.46,97.77,100.0
2006,147.0,56.300485,28.867384,3.74381,32.372,57.18,80.459895,99.25
2007,66.0,55.049381,23.919697,8.50965,38.286342,51.21401,74.93,98.79439
2008,63.0,59.174363,23.899064,6.14208,46.961825,68.14,76.945,90.37
2009,75.0,59.412659,21.638568,9.98548,44.797085,66.43,75.370075,95.7


In [74]:
# Per-year summary statistics of the maternal death rate
full_df.groupby('Year')['Maternal_Death_Rate'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2000,191.0,270.858639,387.849626,3.0,25.0,79.0,435.5,2480.0
2001,191.0,262.602094,372.977167,3.0,23.5,76.0,422.0,2250.0
2002,191.0,255.78534,360.956538,3.0,23.5,74.0,401.0,2080.0
2003,191.0,247.366492,347.015337,3.0,21.0,76.0,382.5,1960.0
2004,231.0,260.822511,365.148497,3.0,32.0,81.0,381.0,1850.0
2005,311.0,209.321543,321.282279,3.0,11.0,54.0,286.0,1760.0
2006,289.0,230.179931,291.48733,2.0,31.0,85.0,361.0,1680.0
2007,235.0,223.629787,292.622513,2.0,28.5,84.0,335.5,1610.0
2008,233.0,218.819742,302.785391,2.0,26.0,80.0,314.0,1530.0
2009,241.0,194.580913,258.925413,2.0,23.0,75.0,274.0,1450.0


In [75]:
# Non-null observations per year for the three measures of interest
measures = ['Primary_ed_completion_percent', 'Secondary_ed_completion_percent', 'Maternal_Death_Rate']
full_df.groupby('Year')[measures].count()

Unnamed: 0_level_0,Primary_ed_completion_percent,Secondary_ed_completion_percent,Maternal_Death_Rate
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2000,0,0,191
2001,0,0,191
2002,0,0,191
2003,0,0,191
2004,60,60,231
2005,105,180,311
2006,147,147,289
2007,66,66,235
2008,63,63,233
2009,75,75,241


In [96]:
# Correlation between female primary-education completion and maternal death rate
# (Series.corr aligns the two series on their index before correlating).
female_rows = full_df['Sex'] == 'F'
full_df.loc[female_rows, 'Primary_ed_completion_percent'].corr(full_df['Maternal_Death_Rate'])

-0.7870245052268181

In [76]:
# Correlation between female secondary-education completion and maternal death rate
female_rows = full_df['Sex'] == 'F'
full_df.loc[female_rows, 'Secondary_ed_completion_percent'].corr(full_df['Maternal_Death_Rate'])

-0.7709490227930015

In [93]:
# Correlation between GDP per capita and maternal death rate, excluding the M/F rows
not_sex_specific = ~full_df['Sex'].isin(['M', 'F'])
full_df.loc[not_sex_specific, 'GDP_Per_Capita'].corr(full_df['Maternal_Death_Rate'])

-0.47047400940066997

In [94]:
# Correlation between life expectancy and maternal death rate, excluding the M/F rows
not_sex_specific = ~full_df['Sex'].isin(['M', 'F'])
full_df.loc[not_sex_specific, 'Life_Expectancy'].corr(full_df['Maternal_Death_Rate'])

-0.8280058856385359

#### Pulling out 2005, 2010, and 2014 because they have a large number of observations across all three variables

In [78]:
# One row per country, one column per (measure, year) for the three key years.
# The value columns are listed explicitly: without `values`, the default mean
# aggregation is also attempted on the text columns (Continent, Sex), which newer
# pandas versions reject instead of silently skipping.
# NOTE(review): the mean is taken over every row of a country/year, i.e. the F, M
# and _T rows are averaged together - confirm that is the intended figure.
pivot_measures = ['GDP_Per_Capita', 'Life_Expectancy', 'Maternal_Death_Rate',
                  'Primary_ed_completion_percent', 'Secondary_ed_completion_percent']

full_pivoted = (
    pd.pivot_table(full_df.loc[full_df['Year']
                              .isin([2005, 2010, 2014])],
                   values=pivot_measures,
                   index=['Country'], 
                   columns=['Year'])
    .dropna()  # keep only countries with every measure in all three years
)
full_pivoted

Unnamed: 0_level_0,GDP_Per_Capita,GDP_Per_Capita,GDP_Per_Capita,Life_Expectancy,Life_Expectancy,Life_Expectancy,Maternal_Death_Rate,Maternal_Death_Rate,Maternal_Death_Rate,Primary_ed_completion_percent,Primary_ed_completion_percent,Primary_ed_completion_percent,Secondary_ed_completion_percent,Secondary_ed_completion_percent,Secondary_ed_completion_percent
Year,2005,2010,2014,2005,2010,2014,2005,2010,2014,2005,2010,2014,2005,2010,2014
Country,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2
Cambodia,2120.551877,2716.698409,3364.278729,63.088,66.56,68.273,351.0,248.0,189.0,58.94956,71.08916,72.409983,26.766637,36.631527,40.510197
Colombia,10083.314361,11823.387936,13899.368829,74.265,75.424,76.322,83.0,85.0,85.0,84.716367,90.637597,91.936667,62.29701,72.49546,74.603333
Costa Rica,13925.078819,16448.287478,18168.715242,78.117,78.769,79.398,33.0,32.0,29.0,90.626667,91.28,94.523333,50.82,61.356667,66.583333
Dominican Republic,10291.30888,12782.451899,14499.636116,70.771,72.046,73.003,83.0,96.0,94.0,81.63,85.83,91.521523,72.146667,79.366667,82.86837
Ecuador,9503.68919,10340.968231,12078.469844,74.112,75.089,75.923,94.0,78.0,65.0,92.176667,95.623333,97.666667,67.646667,79.01,86.983333
El Salvador,6870.622477,7328.713216,7990.444615,70.124,71.21,72.175,62.0,54.0,49.0,78.546667,82.353333,88.413333,59.93,65.21,72.483333
Honduras,4546.238807,4866.972176,5177.409416,72.026,73.317,74.278,77.0,74.0,68.0,72.78,80.9,82.546667,33.593333,46.083333,48.503333
Panama,16304.59881,21347.332749,27357.623072,75.892,76.792,77.583,88.0,79.0,60.0,91.823333,92.216753,94.13,73.543333,72.946797,77.626667
Paraguay,8568.54755,10405.230941,11612.389911,71.667,72.653,73.473,137.0,108.0,91.0,84.803333,87.186667,89.703333,64.026667,69.776667,76.743333
Peru,7519.02772,10066.469647,11877.084063,72.908,74.41,75.529,118.0,104.0,96.0,90.106667,93.433333,95.73,76.276667,82.703333,87.373333


In [79]:
# Rows for the three key years, with Year stored as a string.
# assign() returns a new frame, so the conversion is not written into a slice of
# full_df (the direct column assignment raised SettingWithCopyWarning).
key_years = (
    full_df
    .loc[full_df['Year']
         .isin([2005, 2010, 2014])]
    .assign(Year=lambda frame: frame['Year'].astype(str))
)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [80]:
# Check column dtypes and non-null counts of the key-years subset
key_years.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 919 entries, 5 to 4657
Data columns (total 9 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Country                          919 non-null    object 
 1   Year                             919 non-null    object 
 2   GDP_Per_Capita                   845 non-null    float64
 3   Life_Expectancy                  871 non-null    float64
 4   Continent                        876 non-null    object 
 5   Sex                              519 non-null    object 
 6   Primary_ed_completion_percent    348 non-null    float64
 7   Secondary_ed_completion_percent  519 non-null    float64
 8   Maternal_Death_Rate              919 non-null    float64
dtypes: float64(5), object(4)
memory usage: 71.8+ KB


#### Plots looking at scatter and distribution for MDR and Primary Education completion in those key years

In [81]:
# Female rows only: primary-education completion vs. maternal death rate,
# coloured by year, with marginal distributions on both axes.
df = key_years[key_years['Sex'] == 'F']
scatter_options = dict(
    x="Primary_ed_completion_percent",
    y="Maternal_Death_Rate",
    color="Year",
    marginal_y="violin",
    marginal_x="box",
    template="simple_white",
    hover_data=["Continent", "Country"],
)
fig = px.scatter(df, **scatter_options)
fig.show()

#### Now the same for MDR and Secondary Education completion

In [82]:
# Female rows only: secondary-education completion vs. maternal death rate,
# coloured by year, with marginal distributions on both axes.
df = key_years[key_years['Sex'] == 'F']
scatter_options = dict(
    x="Secondary_ed_completion_percent",
    y="Maternal_Death_Rate",
    color="Year",
    marginal_y="violin",
    marginal_x="box",
    template="simple_white",
    hover_data=["Continent", "Country"],
)
fig = px.scatter(df, **scatter_options)
fig.show()

#### Facet grid, adding in GDP as dot size

In [83]:
# Keep rows that have both a Sex value and a GDP figure (GDP drives the dot size).
df = key_years.dropna(subset=['Sex', 'GDP_Per_Capita'])
facet_options = dict(
    x="Primary_ed_completion_percent",
    y="Maternal_Death_Rate",
    color="Year",
    size="GDP_Per_Capita",
    facet_col="Sex",
    facet_row='Continent',
    height=1500,
    hover_data=["Country"],
)
fig = px.scatter(df, **facet_options)

fig.show()

In [84]:
# Keep rows that have both a Sex value and a GDP figure (GDP drives the dot size).
df = key_years.dropna(subset=['Sex', 'GDP_Per_Capita'])
facet_options = dict(
    x="Secondary_ed_completion_percent",
    y="Maternal_Death_Rate",
    color="Year",
    size="GDP_Per_Capita",
    facet_col="Sex",
    facet_row='Continent',
    height=1500,
    hover_data=["Country"],
)
fig = px.scatter(df, **facet_options)

fig.show()

#### Looking at trends over 10 years, adding in the dimension of life expectancy via color

In [85]:
# These 10 years (2005-2014) were chosen because they have a comparatively
# higher number of values for education completion and maternal death rate.
# Series.between is inclusive on both ends, so 2005 and 2014 are both kept.
ten_years = full_df[full_df['Year'].between(2005, 2014)]

ten_years.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2816 entries, 5 to 4657
Data columns (total 9 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Country                          2816 non-null   object 
 1   Year                             2816 non-null   int64  
 2   GDP_Per_Capita                   2582 non-null   float64
 3   Life_Expectancy                  2668 non-null   float64
 4   Continent                        2682 non-null   object 
 5   Sex                              1359 non-null   object 
 6   Primary_ed_completion_percent    1002 non-null   float64
 7   Secondary_ed_completion_percent  1359 non-null   float64
 8   Maternal_Death_Rate              2816 non-null   float64
dtypes: float64(5), int64(1), object(3)
memory usage: 220.0+ KB


In [86]:
# Count the non-null values each country has for the three variables of interest.
# NOTE(review): these are counts of rows, not of distinct years. If the frame
# holds several Sex rows per country-year, a count of 6 can be reached with
# fewer than 6 years of data -- confirm that this is the intended threshold.
ten_year_counts = (
    ten_years
    .groupby(['Country'])[
        ['Primary_ed_completion_percent',
         'Secondary_ed_completion_percent',
         'Maternal_Death_Rate']
    ]
    .count()
    .reset_index()
)

# Keep countries with at least 6 primary AND at least 6 secondary completion
# values; the maternal-death-rate count is computed but not used in the filter.
filtered_ten_yr_counts = ten_year_counts.query(
    "Primary_ed_completion_percent >= 6 and Secondary_ed_completion_percent >= 6"
)

# Plain Python list of the country names that passed the filter.
ten_year_countries = filtered_ten_yr_counts['Country'].tolist()

In [87]:
# Narrow the ten-year window to the countries that passed the count filter.
filtered_ten_years = ten_years[ten_years['Country'].isin(ten_year_countries)]

# Number of rows remaining for each country.
filtered_ten_years['Country'].value_counts()

Paraguay              30
Peru                  30
Ecuador               30
Dominican Republic    30
Costa Rica            30
                      ..
Montenegro            14
Mozambique            14
Namibia               14
Niger                 14
Lao PDR               14
Name: Country, Length: 87, dtype: int64

In [88]:
# Keep only rows that carry a value for every variable the figure encodes
# (dropna removes a row when any of the listed columns is missing).
df = filtered_ten_years.dropna(
    subset=[
        'Maternal_Death_Rate',
        'Primary_ed_completion_percent',
        'Life_Expectancy',
        'GDP_Per_Capita',
        'Sex',
    ]
)

# One row of panels per year and one column per Sex value; colour encodes life
# expectancy and marker size encodes GDP per capita.
fig = px.scatter(
    df,
    x="Primary_ed_completion_percent",
    y="Maternal_Death_Rate",
    color="Life_Expectancy",
    size="GDP_Per_Capita",
    facet_row="Year",
    facet_col="Sex",
    width=1000,
    height=2500,
    # Human-readable axis / legend labels for the raw column names.
    labels=dict(
        Primary_ed_completion_percent="% Completing Primary Ed",
        Maternal_Death_Rate="Maternal deaths/100K live births",
        Life_Expectancy="Life Expectancy(years)",
        GDP_Per_Capita="GDP per capita",
    ),
    # Facet rows run chronologically from 2005 through 2014.
    category_orders={"Year": list(range(2005, 2015))},
    hover_name="Country",
    # True shows a column as-is, a string is a number format, False hides it.
    hover_data=dict(
        Maternal_Death_Rate=True,
        Primary_ed_completion_percent=":.2f",
        GDP_Per_Capita=":$.2f",
        Life_Expectancy=":.2f",
        Continent=True,
        Year=False,
        Sex=False,
    ),
)

fig.update_layout(
    title=dict(
        text="Maternal Death Rates, Primary Education Completion Rates, GDP, and Life Expectancy",
        xanchor="auto",
        xref="paper",
    )
)

# Same axis-title font size on every facet.
fig.update_xaxes(title=dict(font=dict(size=12)))
fig.update_yaxes(title=dict(font=dict(size=12)))

# Horizontal colour bar positioned via a small negative y with a "top" anchor.
fig.update_coloraxes(
    colorbar=dict(
        orientation="h",
        len=.75,
        y=-.025,
        yanchor="top",
        title=dict(font=dict(size=12)),
    )
)

fig.show()

In [89]:
# Keep only rows that carry a value for every variable the figure encodes
# (dropna removes a row when any of the listed columns is missing).
df = filtered_ten_years.dropna(
    subset=[
        'Maternal_Death_Rate',
        'Secondary_ed_completion_percent',
        'Life_Expectancy',
        'GDP_Per_Capita',
        'Sex',
    ]
)

# One row of panels per year and one column per Sex value; colour encodes life
# expectancy and marker size encodes GDP per capita.
fig = px.scatter(
    df,
    x="Secondary_ed_completion_percent",
    y="Maternal_Death_Rate",
    color="Life_Expectancy",
    size="GDP_Per_Capita",
    facet_row="Year",
    facet_col="Sex",
    width=1000,
    height=2500,
    # Human-readable axis / legend labels for the raw column names.
    labels=dict(
        Secondary_ed_completion_percent="% Completing Secondary Ed",
        Maternal_Death_Rate="Maternal deaths/100K live births",
        Life_Expectancy="Life Expectancy(years)",
        GDP_Per_Capita="GDP per capita",
    ),
    # Facet rows run chronologically from 2005 through 2014.
    category_orders={"Year": list(range(2005, 2015))},
    hover_name="Country",
    # True shows a column as-is, a string is a number format, False hides it.
    hover_data=dict(
        Maternal_Death_Rate=True,
        Secondary_ed_completion_percent=":.2f",
        GDP_Per_Capita=":$.2f",
        Life_Expectancy=":.2f",
        Continent=True,
        Year=False,
        Sex=False,
    ),
)

fig.update_layout(
    title=dict(
        text="Maternal Death Rates, Secondary Education Completion Rates, GDP, and Life Expectancy",
        xanchor="auto",
        xref="paper",
    )
)

# Same axis-title font size on every facet.
fig.update_xaxes(title=dict(font=dict(size=12)))
fig.update_yaxes(title=dict(font=dict(size=12)))

# Horizontal colour bar positioned via a small negative y with a "top" anchor.
fig.update_coloraxes(
    colorbar=dict(
        orientation="h",
        len=.75,
        y=-.025,
        yanchor="top",
        title=dict(font=dict(size=12)),
    )
)

fig.show()

In [90]:
# Build the dropdown from the continents present in the data. Missing values are
# removed first so the widget never offers a NaN entry, which could not match
# any row in the equality filter below.
@interact(continent=filtered_ten_years['Continent'].dropna().unique())
def interactive_plot(continent):
    """Show maternal death rate vs. primary-education completion for one continent.

    Rows of ``filtered_ten_years`` are limited to Sex == 'F' and the selected
    continent, then any row missing one of the plotted variables is dropped.
    The figure has one facet row per year, coloured by life expectancy and
    sized by GDP per capita.

    Parameters
    ----------
    continent : str
        Continent name chosen in the dropdown widget.

    Returns
    -------
    None
        The figure is displayed as a side effect. If no complete rows exist for
        the selection, a short message is printed instead.
    """
    plot_data = (
        filtered_ten_years.loc[(filtered_ten_years['Sex'] == 'F')
                               &
                               (filtered_ten_years['Continent'] == continent)]
        .dropna(how='any',
                subset=['Maternal_Death_Rate',
                        'Primary_ed_completion_percent',
                        'Life_Expectancy',
                        'GDP_Per_Capita'])
    )

    # Nothing to draw: say so explicitly rather than handing an empty frame
    # to the faceted scatter plot.
    if plot_data.empty:
        print(f"No rows with complete data for continent {continent!r}.")
        return

    fig = px.scatter(plot_data,
                     x="Primary_ed_completion_percent",
                     y="Maternal_Death_Rate",
                     color="Life_Expectancy",
                     size="GDP_Per_Capita",
                     facet_row="Year",
                     height=2500,
                     width=600,
                     labels={
                         "Primary_ed_completion_percent": "% Completing Primary Ed",
                         "Maternal_Death_Rate": "Maternal deaths/100K live births",
                         "Life_Expectancy": "Life Expectancy(years)",
                         "GDP_Per_Capita": "GDP per capita"
                     },
                     # Facet rows run chronologically from 2005 through 2014.
                     category_orders={
                         "Year": [2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014]
                     },
                     hover_name="Country",
                     # True shows a column as-is, a string is a number format,
                     # False hides the column from the hover box.
                     hover_data={"Maternal_Death_Rate": True,
                                 "Primary_ed_completion_percent": ":.2f",
                                 "GDP_Per_Capita": ":$.2f",
                                 "Life_Expectancy": ":.2f",
                                 "Continent": True,
                                 "Year": False,
                                 "Sex": False})

    fig.update_layout(title_text="Maternal Death Rates, Primary Ed. Completion Rates, GDP, and Life Expectancy",
                      title_xanchor="auto",
                      title_font_size=14)

    fig.update_xaxes(title_font_size=12)

    fig.update_yaxes(title_font_size=12)

    # Horizontal colour bar positioned via a small negative y with a "top" anchor.
    fig.update_coloraxes(colorbar_len=1,
                         colorbar_title_font_size=12,
                         colorbar=dict(orientation="h"),
                         colorbar_y=-.025,
                         colorbar_yanchor="top")

    fig.show()

interactive(children=(Dropdown(description='continent', options=('Europe', 'Asia', 'North America', 'Africa', …

In [91]:
# NOTE(review): an earlier cell also defines a function called interactive_plot
# (the continent selector); this definition rebinds that name -- consider
# giving the two callbacks distinct names.
@interact(Year = filtered_ten_years['Year'].unique())
def interactive_plot(Year):
    """Show maternal death rate vs. primary-education completion for one year.

    Rows of ``filtered_ten_years`` are limited to Sex == 'F' and the selected
    year, and rows missing any plotted variable are dropped. The result is a
    single scatter panel coloured by life expectancy and sized by GDP per capita.

    Parameters
    ----------
    Year : int
        Year chosen in the dropdown widget.

    Returns
    -------
    None
        The figure is displayed as a side effect.
    """
    is_female = filtered_ten_years['Sex'].eq('F')
    in_selected_year = filtered_ten_years['Year'].eq(Year)
    plot_data = filtered_ten_years[is_female & in_selected_year].dropna(
        subset=[
            'Maternal_Death_Rate',
            'Primary_ed_completion_percent',
            'Life_Expectancy',
            'GDP_Per_Capita',
        ]
    )

    fig = px.scatter(
        plot_data,
        x="Primary_ed_completion_percent",
        y="Maternal_Death_Rate",
        color="Life_Expectancy",
        size="GDP_Per_Capita",
        width=600,
        height=600,
        # Human-readable axis / legend labels for the raw column names.
        labels=dict(
            Primary_ed_completion_percent="% Completing Primary Ed",
            Maternal_Death_Rate="Maternal deaths/100K live births",
            Life_Expectancy="Life Expectancy(years)",
            GDP_Per_Capita="GDP per capita",
        ),
        hover_name="Country",
        # True shows a column as-is, a string is a number format, False hides it.
        hover_data=dict(
            Maternal_Death_Rate=True,
            Primary_ed_completion_percent=":.2f",
            GDP_Per_Capita=":$.2f",
            Life_Expectancy=":.2f",
            Continent=True,
            Sex=False,
        ),
    )

    fig.update_layout(
        title=dict(
            text="Maternal Death Rates, Primary Ed. Completion Rates, GDP, and Life Expectancy",
            xanchor="auto",
            font=dict(size=14),
        )
    )

    # Same axis-title font size on both axes.
    fig.update_xaxes(title=dict(font=dict(size=12)))
    fig.update_yaxes(title=dict(font=dict(size=12)))

    # Horizontal colour bar positioned via a small negative y with a "top" anchor.
    fig.update_coloraxes(
        colorbar=dict(
            orientation="h",
            len=1,
            y=-.025,
            yanchor="top",
            title=dict(font=dict(size=12)),
        )
    )

    fig.show()

interactive(children=(Dropdown(description='Year', options=(2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 20…