In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import requests
from functools import reduce

%matplotlib inline

### Solo Exploration and Presentation:
#### 1. Choose and download another data set from the UN data to explore.
[http://data.un.org/Explorer.aspx](http://data.un.org/Explorer.aspx)   
You may want to combine your new dataset with one or both of the datasets that you already worked with. Prepare a short (< 5 minute) presentation of your findings. Report any interesting correlations or trends that you find. 
#### 2. If time allows, check out the plotly library to add additional interactivity to your plots. 
[https://plotly.com/python/plotly-express/](https://plotly.com/python/plotly-express/)

#### Primary Education Completion Rates

In [16]:
# The two primary-education query URLs differ only in their trailing paging
# parameters, so the shared part is written once and both URLs derive from it.
_pe_query_prefix = (
    "https://services7.arcgis.com/gp50Ao2knMlOM89z/arcgis/rest/services/"
    "Ief9e0d38__Sc5211ad2_1644293535310441_tbl/FeatureServer/0/query"
    "?where=income_wealth_quantile%20%3D%20'_T'"
    "%20AND%20urbanization%20%3D%20'_T'"
    "%20AND%20time_period%20%3E%3D%202004"
    "%20AND%20time_period%20%3C%3D%202019"
    "&outFields=time_period,sex,sex_desc,education_lev_desc,urbanization_desc,"
    "income_wealth_quantile_desc,obs_value,unit_measure_desc,indicator_desc,ref_area_desc"
    "&returnGeometry=false"
)

# Query without a resultOffset.
endpoint1 = _pe_query_prefix + "&outSR=&f=json"

# Same query with resultOffset=1000.
endpoint2 = _pe_query_prefix + "&resultOffset=1000&outSR=&f=json"

In [17]:
# Download both result pages of the primary-education query.
# The timeout keeps a stalled connection from hanging the notebook, and
# raise_for_status() surfaces HTTP error responses immediately.
pe_pages = []
for page_url in (endpoint1, endpoint2):
    page_response = requests.get(page_url, timeout=60)
    page_response.raise_for_status()
    pe_pages.append(page_response.json())

pe_res1, pe_res2 = pe_pages

In [18]:
# Each response holds a list under 'features'; every entry's 'attributes' dict
# becomes one row of the page's DataFrame.
pe_df1, pe_df2 = (
    pd.DataFrame(pd.DataFrame(page['features'])['attributes'].tolist())
    for page in (pe_res1, pe_res2)
)

# Stack the pages, keep the four columns of interest under friendlier names,
# and order the rows by country, year and sex.
primary_ed = (
    pd.concat([pe_df1, pe_df2])
    [['ref_area_desc', 'time_period', 'sex', 'obs_value']]
    .rename(columns={'ref_area_desc': 'Country',
                     'time_period': 'Year',
                     'sex': 'Sex',
                     'obs_value': 'Primary_ed_completion_percent'})
    .sort_values(['Country', 'Year', 'Sex'])
)
primary_ed

Unnamed: 0,Country,Year,Sex,Primary_ed_completion_percent
625,Afghanistan,2011,F,26.63788
228,Afghanistan,2011,M,53.74893
26,Afghanistan,2011,_T,40.72569
577,Afghanistan,2015,F,40.35416
229,Afghanistan,2015,M,67.30569
...,...,...,...,...
834,Zimbabwe,2015,M,87.02327
623,Zimbabwe,2015,_T,88.21310
227,Zimbabwe,2019,F,91.62107
835,Zimbabwe,2019,M,86.35518


#### Secondary Education Completion

In [19]:
# The three secondary-education query URLs share everything except the
# resultOffset paging parameter.
_se_query_prefix = (
    "https://services7.arcgis.com/gp50Ao2knMlOM89z/arcgis/rest/services/"
    "I66c21045__Sa7501ddf_16442936354058118_tbl/FeatureServer/0/query"
    "?where=urbanization%20%3D%20'_T'"
    "%20AND%20income_wealth_quantile%20%3D%20'_T'"
    "%20AND%20time_period%20%3E%3D%202004"
    "%20AND%20time_period%20%3C%3D%202019"
    "&outFields=indicator_desc,ref_area_desc,time_period,sex,education_lev_desc,"
    "obs_value,unit_measure_desc"
    "&returnGeometry=false"
)
se1 = _se_query_prefix + "&outSR=&f=json"
se2, se3 = (
    f"{_se_query_prefix}&resultOffset={offset}&outSR=&f=json"
    for offset in (1000, 2000)
)

# Download the three result pages of the secondary-education query.
# The timeout keeps a stalled connection from hanging the notebook, and
# raise_for_status() surfaces HTTP error responses immediately.
se_pages = []
for page_url in (se1, se2, se3):
    page_response = requests.get(page_url, timeout=60)
    page_response.raise_for_status()
    se_pages.append(page_response.json())

se_res1, se_res2, se_res3 = se_pages

In [20]:
# One DataFrame per response page, built from each feature's 'attributes' dict.
se_df1, se_df2, se_df3 = (
    pd.DataFrame(pd.DataFrame(page['features'])['attributes'].tolist())
    for page in (se_res1, se_res2, se_res3)
)

# Stack the pages, keep the four columns of interest under friendlier names,
# and order the rows by country, year and sex.
secondary_ed = (
    pd.concat([se_df1, se_df2, se_df3])
    [['ref_area_desc', 'time_period', 'sex', 'obs_value']]
    .rename(columns={'ref_area_desc': 'Country',
                     'time_period': 'Year',
                     'sex': 'Sex',
                     'obs_value': 'Secondary_ed_completion_percent'})
    .sort_values(['Country', 'Year', 'Sex'])
)

secondary_ed

Unnamed: 0,Country,Year,Sex,Secondary_ed_completion_percent
768,Afghanistan,2011,F,11.50881
512,Afghanistan,2011,M,34.65255
19,Afghanistan,2011,_T,23.37700
769,Afghanistan,2015,F,25.17210
464,Afghanistan,2015,M,48.71595
...,...,...,...,...
164,Zimbabwe,2015,M,72.22249
766,Zimbabwe,2015,_T,72.62408
511,Zimbabwe,2019,F,54.36128
165,Zimbabwe,2019,M,53.50781


#### Maternal Death Rate per 100,000 live births

In [21]:
# The four maternal-death-rate query URLs share everything except the
# resultOffset paging parameter (which here sits between outSR and f).
_mdr_query_prefix = (
    "https://services7.arcgis.com/gp50Ao2knMlOM89z/arcgis/rest/services/"
    "I75250979__S75250979_16442942938748052_tbl/FeatureServer/0/query"
    "?where=1%3D1"
    "&outFields=indicator_desc,ref_area_desc,time_period,obs_value,unit_measure_desc"
    "&returnGeometry=false&outSR="
)
mdr1 = _mdr_query_prefix + "&f=json"
mdr2, mdr3, mdr4 = (
    f"{_mdr_query_prefix}&resultOffset={offset}&f=json"
    for offset in (1000, 2000, 3000)
)

In [22]:
# Download the four result pages of the maternal-death-rate query.
# The timeout keeps a stalled connection from hanging the notebook, and
# raise_for_status() surfaces HTTP error responses immediately.
mdr_pages = []
for page_url in (mdr1, mdr2, mdr3, mdr4):
    page_response = requests.get(page_url, timeout=60)
    page_response.raise_for_status()
    mdr_pages.append(page_response.json())

mdr_res1, mdr_res2, mdr_res3, mdr_res4 = mdr_pages

In [114]:
# Quick inspection: show the Python type of the 'features' entry in the first response.
type(mdr_res1['features'])

list

In [23]:
# One DataFrame per response page, built from each feature's 'attributes' dict.
mdr_df1, mdr_df2, mdr_df3, mdr_df4 = (
    pd.DataFrame(pd.DataFrame(page['features'])['attributes'].tolist())
    for page in (mdr_res1, mdr_res2, mdr_res3, mdr_res4)
)

# Stack the pages, drop the descriptive columns, rename the rest and order
# the rows by country and year.
mdr = (
    pd.concat([mdr_df1, mdr_df2, mdr_df3, mdr_df4])
    .drop(columns=['indicator_desc', 'unit_measure_desc'])
    .rename(columns={'ref_area_desc': 'Country',
                     'time_period': 'Year',
                     'obs_value': 'Maternal_Death_Rate'})
    .sort_values(['Country', 'Year'])
)

mdr

Unnamed: 0,Country,Year,Maternal_Death_Rate
184,Afghanistan,2000,1450
185,Afghanistan,2001,1390
186,Afghanistan,2002,1300
187,Afghanistan,2003,1240
188,Afghanistan,2004,1180
...,...,...,...
465,Zimbabwe,2013,509
466,Zimbabwe,2014,494
467,Zimbabwe,2015,480
468,Zimbabwe,2016,468


#### Merging the two education tables

In [24]:
# Outer join keeps country/year/sex combinations that appear in only one of the two tables.
education = primary_ed.merge(secondary_ed, how='outer', on=['Country', 'Year', 'Sex'])

# Show the rows that have no primary education rate listed.
education[education['Primary_ed_completion_percent'].isna()]

Unnamed: 0,Country,Year,Sex,Primary_ed_completion_percent,Secondary_ed_completion_percent
1836,Australia,2010,F,,99.53
1837,Australia,2010,M,,98.82
1838,Australia,2010,_T,,99.15
1839,Austria,2005,F,,98.59
1840,Austria,2005,M,,98.73
...,...,...,...,...,...
2188,United Kingdom of Great Britain and Northern I...,2013,M,,100.00
2189,United Kingdom of Great Britain and Northern I...,2013,_T,,100.00
2190,United Kingdom of Great Britain and Northern I...,2014,F,,100.00
2191,United Kingdom of Great Britain and Northern I...,2014,M,,100.00


#### Bringing in the other data sets

In [25]:
# GDP per capita: only the first 6868 data rows are read.
gdp_df = pd.read_csv('../data/gdp_per_capita.csv', nrows=6868)
# The nrows limit prevents bringing in the rows containing footnotes at the bottom of the file.

# Country/continent table (its 'Country' and 'Continent' columns are used further down).
continents = pd.read_csv('../data/continents.csv')

# header=2 takes the column names from row index 2 of the file
# (presumably the rows above it are metadata — TODO confirm against the CSV).
life_expectancy = pd.read_csv('../data/life_expectancy.csv', header=2)

In [26]:
# GDP table: drop the footnote column, use short column names, and trim the
# 'The' / 'Democratic Republic of the' wording out of the country names.
gdp_df = gdp_df.drop(columns='Value Footnotes')
gdp_df.columns = ['Country', 'Year', 'GDP_Per_Capita']
gdp_df['Country'] = (
    gdp_df['Country']
    .str.replace('The', '')
    .str.replace('Democratic Republic of the', 'Dem. Rep.')
)

# Drop continent rows whose country name contains 'The'.
# .copy() makes the filtered table an independent frame, so later in-place
# edits of its 'Country' column do not raise SettingWithCopyWarning.
continents = continents.loc[~continents['Country'].str.contains('The')].copy()

# Life expectancy: drop the descriptive columns, reshape the per-year columns
# into long (Country, Year, Life_Expectancy) rows, discard missing values and
# keep 1990 onwards. The trailing .copy() serves the same purpose as above.
life_expectancy = (
    life_expectancy
    .drop(columns=['Country Code', 'Indicator Name',
                   'Indicator Code', '2021', 'Unnamed: 66'])
    .melt(id_vars=['Country Name'], var_name='Year', value_name='Life_Expectancy')
    .dropna()
    .rename(columns={'Country Name': 'Country'})
    .assign(Year=lambda frame: frame['Year'].astype(int))
    .loc[lambda frame: frame['Year'] > 1989]
    .copy()
)

#### Looking at mismatched country names

In [29]:
# Regex alternation of name decorations to strip from country names.
# Raw strings are used where the pattern contains backslash escapes.
pattern = '|'.join([', The', ', Rep.', ', RB', ', Arab Rep.', ', Islamic Rep.',
                    ', Fed. Sts.', r' \(Plurinational State of\)', r' \(Bolivarian Republic of\)',
                    r' \(Islamic Republic of\)', r' \(Dutch part\)'])

# Exact-name substitutions applied after the decorations have been stripped.
country_map = {
    'Democratic Republic of the Congo' : 'Dem. Rep. Congo',
    'Congo, Dem. Rep.' : 'Dem. Rep. Congo',
    'Burma (Myanmar)' : 'Myanmar',
    'Kyrgyzstan' : 'Kyrgyz Republic'
}

def country_name_counts(df_name, col):
    """Count rows per cleaned country name in one source table.

    The 'Country' column is cleaned (decorations in ``pattern`` removed,
    whitespace stripped, ``country_map`` applied), rows whose cleaned name
    contains one of the excluded substrings are dropped, exact duplicate rows
    are removed, and the remaining names are tallied.

    Parameters
    ----------
    df_name : pandas.DataFrame
        Table with a string 'Country' column. It is not modified.
    col : str
        Suffix for the count column, which is named ``Count_<col>``.

    Returns
    -------
    pandas.DataFrame
        Two columns, 'Country' and ``Count_<col>``, ordered by descending count.
    """
    # Clean the names on a separate Series so the caller's frame is left untouched.
    countries = (
        df_name['Country']
        .str.replace(pattern, '', regex=True)
        .str.strip()
        .replace(country_map)
    )

    # Names containing any of these substrings are excluded (presumably
    # regional/income aggregates; note that 'Korea' is excluded as well).
    excluded = countries.str.contains(
        r'Europe|Asia|Latin America|\(|countries|Central|Western|Sub|World|Korea|income',
        regex=True,
    )

    kept = (
        df_name
        .assign(Country=countries)
        .loc[~excluded]
        .drop_duplicates()
    )

    # rename_axis + reset_index(name=...) yields the same two column names
    # whichever way this pandas version labels the value_counts() result.
    return (
        kept['Country']
        .value_counts()
        .rename_axis('Country')
        .reset_index(name=f'Count_{col}')
    )

In [30]:
# Tally the cleaned country names of each source table, in the same order as before.
gdp_co, le_co, continent_co, ed_co, mdr_co = (
    country_name_counts(frame, label)
    for frame, label in (
        (gdp_df, "gdp"),
        (life_expectancy, "le"),
        (continents, "continent"),
        (education, "ed"),
        (mdr, "mdr"),
    )
)

In [31]:
country_dfs = [gdp_co, le_co, continent_co, ed_co, mdr_co]

# Outer-join the per-source count tables one after another on 'Country'.
all_countries = country_dfs[0]
for counts in country_dfs[1:]:
    all_countries = pd.merge(all_countries, counts, on=['Country'], how='outer')

# A country is a mismatch when at least one source has no count for it.
has_gap = all_countries.isna().any(axis=1)
mismatch = (
    all_countries[has_gap]
    .sort_values('Country')
    .reset_index(drop=True)
)

mismatch.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 128 entries, 0 to 127
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Country          128 non-null    object 
 1   Count_gdp        72 non-null     float64
 2   Count_le         85 non-null     float64
 3   Count_continent  73 non-null     float64
 4   Count_ed         20 non-null     float64
 5   Count_mdr        57 non-null     float64
dtypes: float64(5), object(1)
memory usage: 6.1+ KB


In [32]:
# Display the first 60 mismatched country names together with their per-source counts.
mismatch.head(60)

Unnamed: 0,Country,Count_gdp,Count_le,Count_continent,Count_ed,Count_mdr
0,Africa Eastern and Southern,,31.0,,,
1,Andorra,,,1.0,,
2,Antigua and Barbuda,30.0,31.0,1.0,,18.0
3,Aruba,28.0,31.0,1.0,,
4,Australia and New Zealand,,,,,18.0
5,Bahamas,30.0,31.0,1.0,,18.0
6,Bahrain,30.0,31.0,1.0,,18.0
7,Bermuda,30.0,22.0,1.0,,
8,Brunei,30.0,,1.0,,
9,Brunei Darussalam,,31.0,,,18.0


#### Merging all the data sets

In [33]:
def clean_up_countries(df_name):
    """Return a copy of ``df_name`` with cleaned, filtered country names.

    The 'Country' column has the decorations in ``pattern`` removed, is
    stripped of surrounding whitespace and is passed through ``country_map``.
    Rows whose cleaned name contains one of the excluded substrings are
    dropped, and exact duplicate rows are removed.

    Parameters
    ----------
    df_name : pandas.DataFrame
        Table with a string 'Country' column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns as ``df_name``.
    """
    # Work on a separate Series; the original aliased the input frame and
    # therefore rewrote the caller's 'Country' column as a side effect.
    countries = (
        df_name['Country']
        .str.replace(pattern, '', regex=True)
        .str.strip()
        .replace(country_map)
    )

    # Names containing any of these substrings are excluded (presumably
    # regional/income aggregates; note that 'Korea' is excluded as well).
    excluded = countries.str.contains(
        r'Europe|Asia|Latin America|\(|countries|Central|Western|Sub|World|Korea|income',
        regex=True,
    )

    return (
        df_name
        .assign(Country=countries)
        .loc[~excluded]
        .drop_duplicates()
    )

In [34]:
# Apply the shared country-name clean-up to every source table, in the same order as before.
gdp_clean, le_clean, continent_clean, ed_clean, mdr_clean = (
    clean_up_countries(frame)
    for frame in (gdp_df, life_expectancy, continents, education, mdr)
)

In [37]:
# Outer-join GDP and life expectancy on country and year, then discard rows
# where both measures are missing.
gdp_le = gdp_clean.merge(le_clean, how='outer', on=['Country', 'Year'])
gdp_le = gdp_le.dropna(subset=['GDP_Per_Capita', 'Life_Expectancy'], how='all')

gdp_le.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6980 entries, 0 to 6979
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Country          6980 non-null   object 
 1   Year             6980 non-null   int64  
 2   GDP_Per_Capita   5968 non-null   float64
 3   Life_Expectancy  6615 non-null   float64
dtypes: float64(2), int64(1), object(1)
memory usage: 272.7+ KB


In [40]:
# NOTE(review): this de-duplicates `continents`, but the merge below uses
# `continent_clean` (which clean_up_countries already de-duplicates), so this
# line does not affect gdp_le.
continents = continents.drop_duplicates()

# Left join keeps every gdp_le row; countries without a continent entry get NaN.
gdp_le = pd.merge(gdp_le, continent_clean, how='left', on='Country')
gdp_le.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6980 entries, 0 to 6979
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Country          6980 non-null   object 
 1   Year             6980 non-null   int64  
 2   GDP_Per_Capita   5968 non-null   float64
 3   Life_Expectancy  6615 non-null   float64
 4   Continent        5990 non-null   object 
dtypes: float64(2), int64(1), object(2)
memory usage: 327.2+ KB


In [41]:
# Outer-join education and maternal death rate on country and year. The
# education table has one row per sex, so the death-rate value is repeated
# on each of those rows.
ed_mdr = ed_clean.merge(mdr_clean, how='outer', on=['Country', 'Year'])
ed_mdr.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4753 entries, 0 to 4752
Data columns (total 6 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Country                          4753 non-null   object 
 1   Year                             4753 non-null   int64  
 2   Sex                              1851 non-null   object 
 3   Primary_ed_completion_percent    1494 non-null   float64
 4   Secondary_ed_completion_percent  1851 non-null   float64
 5   Maternal_Death_Rate              4618 non-null   float64
dtypes: float64(3), int64(1), object(2)
memory usage: 259.9+ KB


In [42]:
# Inner join: keep only the country/year pairs present in both combined tables,
# then order by country and year with a fresh 0..n-1 index.
full_df = gdp_le.merge(ed_mdr, how='inner', on=['Country', 'Year'])
full_df = full_df.sort_values(['Country', 'Year']).reset_index(drop=True)

full_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4267 entries, 0 to 4266
Data columns (total 9 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Country                          4267 non-null   object 
 1   Year                             4267 non-null   int64  
 2   GDP_Per_Capita                   4023 non-null   float64
 3   Life_Expectancy                  4243 non-null   float64
 4   Continent                        4229 non-null   object 
 5   Sex                              1743 non-null   object 
 6   Primary_ed_completion_percent    1422 non-null   float64
 7   Secondary_ed_completion_percent  1743 non-null   float64
 8   Maternal_Death_Rate              4132 non-null   float64
dtypes: float64(5), int64(1), object(3)
memory usage: 300.1+ KB


In [43]:
# Summary statistics for the numeric columns of the merged table.
full_df.describe()

Unnamed: 0,Year,GDP_Per_Capita,Life_Expectancy,Primary_ed_completion_percent,Secondary_ed_completion_percent,Maternal_Death_Rate
count,4267.0,4023.0,4243.0,1422.0,1743.0,4132.0
mean,2009.304195,16977.377187,69.479796,79.712374,68.659538,204.719748
std,5.118226,18573.487439,9.110026,21.138651,26.904027,295.331433
min,2000.0,630.701542,39.441,10.01228,3.33,2.0
25%,2005.0,3613.29647,63.5885,67.308462,48.37672,18.0
50%,2010.0,10594.278412,71.896,89.225,75.14,70.0
75%,2014.0,22199.467528,75.954829,95.731673,92.324245,284.0
max,2019.0,115256.016699,84.099756,100.0,100.0,2480.0


In [48]:
# Number of distinct countries in the merged table.
full_df['Country'].nunique()

170

In [53]:
# Rows per country whose Sex is neither 'M' nor 'F' (that is, '_T' or missing).
full_df.loc[~full_df['Sex'].isin(['M', 'F']), 'Country'].value_counts()

Senegal            20
Kyrgyz Republic    19
Mexico             19
Mali               19
Madagascar         19
                   ..
Jamaica            18
Italy              18
Israel             18
Ireland            18
Afghanistan        18
Name: Country, Length: 170, dtype: int64

In [50]:
# Rows per year whose Sex is neither 'M' nor 'F' (that is, '_T' or missing), in year order.
full_df.loc[~full_df['Sex'].isin(['M', 'F']), 'Year'].value_counts().sort_index()

2000    170
2001    170
2002    170
2003    170
2004    170
2005    170
2006    170
2007    170
2008    170
2009    170
2010    170
2011    170
2012    170
2013    170
2014    170
2015    170
2016    170
2017    170
2018     36
2019      9
Name: Year, dtype: int64

In [51]:
# Total number of rows per year, in year order.
full_df['Year'].value_counts().sort_index()

2000    170
2001    170
2002    170
2003    170
2004    212
2005    286
2006    266
2007    218
2008    216
2009    222
2010    250
2011    298
2012    236
2013    288
2014    306
2015    216
2016    226
2017    212
2018    108
2019     27
Name: Year, dtype: int64

In [57]:
# Per-year summary statistics of the primary completion percentage.
full_df.groupby('Year')['Primary_ed_completion_percent'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2000,0.0,,,,,,,
2001,0.0,,,,,,,
2002,0.0,,,,,,,
2003,0.0,,,,,,,
2004,63.0,71.909161,22.964333,12.91138,59.059325,82.7,89.745,94.91
2005,108.0,73.979199,27.066587,10.01228,61.571047,85.77366,95.6625,100.0
2006,144.0,76.478085,24.109278,13.23251,61.508608,86.97,95.846303,100.0
2007,72.0,74.627203,22.989402,20.34787,60.164478,83.515955,92.305,99.80467
2008,69.0,79.732728,20.935715,25.18382,71.41012,91.03,93.76,98.26377
2009,78.0,80.160922,18.35253,24.51903,66.681695,87.375,93.775,98.46702


In [58]:
# Per-year summary statistics of the secondary completion percentage.
full_df.groupby('Year')['Secondary_ed_completion_percent'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2000,0.0,,,,,,,
2001,0.0,,,,,,,
2002,0.0,,,,,,,
2003,0.0,,,,,,,
2004,63.0,50.564131,23.957092,4.35233,29.143135,57.29249,71.605,80.49
2005,174.0,70.618273,30.719565,3.33,56.01,82.425,97.0725,100.0
2006,144.0,57.802449,28.702792,3.74381,33.1719,61.5,81.9975,99.25
2007,72.0,57.141933,24.019425,8.50965,38.759855,57.85902,78.79,98.79439
2008,69.0,60.976446,23.618884,6.14208,56.74,69.96,78.12,90.37
2009,78.0,60.534937,21.96422,9.98548,45.541355,67.84267,76.1325,95.7


In [59]:
# Per-year summary statistics of the maternal death rate.
full_df.groupby('Year')['Maternal_Death_Rate'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2000,170.0,275.311765,396.672763,3.0,26.0,77.0,436.25,2480.0
2001,170.0,266.823529,381.228602,3.0,24.25,75.5,422.5,2250.0
2002,170.0,259.811765,368.71902,3.0,24.25,74.0,405.5,2080.0
2003,170.0,251.047059,354.266136,3.0,21.0,75.5,388.75,1960.0
2004,212.0,259.995283,368.50858,3.0,32.75,83.0,372.0,1850.0
2005,286.0,214.968531,325.932135,3.0,12.0,58.0,295.75,1760.0
2006,266.0,228.81203,291.924148,2.0,32.0,86.0,359.0,1680.0
2007,218.0,225.944954,294.209926,2.0,29.0,85.0,331.75,1610.0
2008,216.0,221.203704,305.615102,2.0,27.0,82.0,295.25,1530.0
2009,222.0,196.693694,259.794917,2.0,24.25,80.0,272.75,1450.0


In [60]:
# One row per country, with the mean of each numeric measure for 2005, 2010
# and 2014 side by side; countries missing any of those values are dropped.
# The value columns are listed explicitly: without `values=`, pivot_table also
# tries to average the text columns (Continent, Sex), which older pandas
# silently dropped and newer pandas rejects with an error.
# NOTE(review): the mean is taken over every row of a country/year, i.e. over
# the per-sex rows as well — confirm that this is the intended aggregate.
full_pivoted = (
    pd.pivot_table(
        full_df.loc[full_df['Year'].isin([2005, 2010, 2014])],
        values=['GDP_Per_Capita',
                'Life_Expectancy',
                'Maternal_Death_Rate',
                'Primary_ed_completion_percent',
                'Secondary_ed_completion_percent'],
        index=['Country'],
        columns=['Year'],
    )
    .dropna()
)
full_pivoted

Unnamed: 0_level_0,GDP_Per_Capita,GDP_Per_Capita,GDP_Per_Capita,Life_Expectancy,Life_Expectancy,Life_Expectancy,Maternal_Death_Rate,Maternal_Death_Rate,Maternal_Death_Rate,Primary_ed_completion_percent,Primary_ed_completion_percent,Primary_ed_completion_percent,Secondary_ed_completion_percent,Secondary_ed_completion_percent,Secondary_ed_completion_percent
Year,2005,2010,2014,2005,2010,2014,2005,2010,2014,2005,2010,2014,2005,2010,2014
Country,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2
Cambodia,2120.551877,2716.698409,3364.278729,63.088,66.56,68.273,351.0,248.0,189.0,58.94956,71.08916,72.409983,26.766637,36.631527,40.510197
Colombia,10083.314361,11823.387936,13899.368829,74.265,75.424,76.322,83.0,85.0,85.0,84.716367,90.637597,91.936667,62.29701,72.49546,74.603333
Costa Rica,13925.078819,16448.287478,18168.715242,78.117,78.769,79.398,33.0,32.0,29.0,90.626667,91.28,94.523333,50.82,61.356667,66.583333
Dominican Republic,10291.30888,12782.451899,14499.636116,70.771,72.046,73.003,83.0,96.0,94.0,81.63,85.83,91.521523,72.146667,79.366667,82.86837
Ecuador,9503.68919,10340.968231,12078.469844,74.112,75.089,75.923,94.0,78.0,65.0,92.176667,95.623333,97.666667,67.646667,79.01,86.983333
El Salvador,6870.622477,7328.713216,7990.444615,70.124,71.21,72.175,62.0,54.0,49.0,78.546667,82.353333,88.413333,59.93,65.21,72.483333
Honduras,4546.238807,4866.972176,5177.409416,72.026,73.317,74.278,77.0,74.0,68.0,72.78,80.9,82.546667,33.593333,46.083333,48.503333
Panama,16304.59881,21347.332749,27357.623072,75.892,76.792,77.583,88.0,79.0,60.0,91.823333,92.216753,94.13,73.543333,72.946797,77.626667
Paraguay,8568.54755,10405.230941,11612.389911,71.667,72.653,73.473,137.0,108.0,91.0,84.803333,87.186667,89.703333,64.026667,69.776667,76.743333
Peru,7519.02772,10066.469647,11877.084063,72.908,74.41,75.529,118.0,104.0,96.0,90.106667,93.433333,95.73,76.276667,82.703333,87.373333


In [102]:
# Non-null observations per year for the education and maternal-death measures.
full_df.groupby('Year')[
    ['Primary_ed_completion_percent', 'Secondary_ed_completion_percent', 'Maternal_Death_Rate']
].count()

Unnamed: 0_level_0,Primary_ed_completion_percent,Secondary_ed_completion_percent,Maternal_Death_Rate
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2000,0,0,170
2001,0,0,170
2002,0,0,170
2003,0,0,170
2004,63,63,212
2005,108,174,286
2006,144,144,266
2007,72,72,218
2008,69,69,216
2009,78,78,222


In [72]:
# Rows for the three comparison years, with Year stored as a string.
# .assign() builds a new frame, so no value is written into a slice of full_df
# and pandas does not emit SettingWithCopyWarning.
key_years = (
    full_df
    .loc[full_df['Year'].isin([2005, 2010, 2014])]
    .assign(Year=lambda frame: frame['Year'].astype(str))
)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [83]:
# Column dtypes and non-null counts of the key-years subset.
key_years.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 842 entries, 5 to 4258
Data columns (total 9 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Country                          842 non-null    object 
 1   Year                             842 non-null    object 
 2   GDP_Per_Capita                   796 non-null    float64
 3   Life_Expectancy                  837 non-null    float64
 4   Continent                        836 non-null    object 
 5   Sex                              498 non-null    object 
 6   Primary_ed_completion_percent    345 non-null    float64
 7   Secondary_ed_completion_percent  498 non-null    float64
 8   Maternal_Death_Rate              842 non-null    float64
dtypes: float64(5), object(4)
memory usage: 65.8+ KB


In [84]:
# Number of rows for each of the key years.
key_years['Year'].value_counts()

2014    306
2005    286
2010    250
Name: Year, dtype: int64

In [73]:
# Secondary completion vs. maternal death rate for the rows with Sex == '_T'.
df = key_years.loc[key_years['Sex'] == '_T']
fig = px.scatter(
    df,
    x="Secondary_ed_completion_percent",
    y="Maternal_Death_Rate",
    color="Year",
    marginal_y="violin",
    marginal_x="box",
    template="simple_white",
    # The title names the subset; without it the _T / F / M charts are indistinguishable.
    title="Secondary education completion vs. maternal death rate (Sex = _T)",
    labels={
        "Secondary_ed_completion_percent": "% Completing Secondary Ed",
        "Maternal_Death_Rate": "Maternal deaths/100K live births",
    },
)
fig.show()

In [74]:
# Secondary completion vs. maternal death rate for the rows with Sex == 'F'.
df = key_years.loc[key_years['Sex'] == 'F']
fig = px.scatter(
    df,
    x="Secondary_ed_completion_percent",
    y="Maternal_Death_Rate",
    color="Year",
    marginal_y="violin",
    marginal_x="box",
    template="simple_white",
    # The title names the subset; without it the _T / F / M charts are indistinguishable.
    title="Secondary education completion vs. maternal death rate (Sex = F)",
    labels={
        "Secondary_ed_completion_percent": "% Completing Secondary Ed",
        "Maternal_Death_Rate": "Maternal deaths/100K live births",
    },
)
fig.show()

In [75]:
# Secondary completion vs. maternal death rate for the rows with Sex == 'M'.
df = key_years.loc[key_years['Sex'] == 'M']
fig = px.scatter(
    df,
    x="Secondary_ed_completion_percent",
    y="Maternal_Death_Rate",
    color="Year",
    marginal_y="violin",
    marginal_x="box",
    template="simple_white",
    # The title names the subset; without it the _T / F / M charts are indistinguishable.
    title="Secondary education completion vs. maternal death rate (Sex = M)",
    labels={
        "Secondary_ed_completion_percent": "% Completing Secondary Ed",
        "Maternal_Death_Rate": "Maternal deaths/100K live births",
    },
)
fig.show()

In [76]:
# Primary completion vs. maternal death rate for the rows with Sex == '_T'.
df = key_years.loc[key_years['Sex'] == '_T']
fig = px.scatter(
    df,
    x="Primary_ed_completion_percent",
    y="Maternal_Death_Rate",
    color="Year",
    marginal_y="violin",
    marginal_x="box",
    template="simple_white",
    # The title names the subset; without it the per-sex charts are indistinguishable.
    title="Primary education completion vs. maternal death rate (Sex = _T)",
    labels={
        "Primary_ed_completion_percent": "% Completing Primary Ed",
        "Maternal_Death_Rate": "Maternal deaths/100K live births",
    },
)
fig.show()

In [87]:
# Primary completion vs. maternal death rate for the rows with Sex == 'F'.
df = key_years.loc[key_years['Sex'] == 'F']
fig = px.scatter(
    df,
    x="Primary_ed_completion_percent",
    y="Maternal_Death_Rate",
    color="Year",
    marginal_y="violin",
    marginal_x="box",
    template="simple_white",
    # The title names the subset; without it the per-sex charts are indistinguishable.
    title="Primary education completion vs. maternal death rate (Sex = F)",
    labels={
        "Primary_ed_completion_percent": "% Completing Primary Ed",
        "Maternal_Death_Rate": "Maternal deaths/100K live births",
    },
)
fig.show()

In [101]:
# Keep only rows that have both a Sex value (used for faceting) and a GDP
# value (used for marker size).
df = key_years.dropna(subset=['Sex', 'GDP_Per_Capita'])

# One panel per Sex value; colour encodes the year, marker size the GDP per capita.
fig = px.scatter(
    df,
    x="Primary_ed_completion_percent",
    y="Maternal_Death_Rate",
    color="Year",
    size="GDP_Per_Capita",
    facet_col="Sex",
    height=400,
    hover_data=["Continent", "Country"],
)

fig.show()

In [116]:
# Rows from 2005 through 2014, both ends included.
in_window = full_df['Year'].between(2005, 2014)
ten_years = full_df.loc[in_window]

ten_years.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2586 entries, 5 to 4258
Data columns (total 9 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Country                          2586 non-null   object 
 1   Year                             2586 non-null   int64  
 2   GDP_Per_Capita                   2434 non-null   float64
 3   Life_Expectancy                  2572 non-null   float64
 4   Continent                        2564 non-null   object 
 5   Sex                              1329 non-null   object 
 6   Primary_ed_completion_percent    1008 non-null   float64
 7   Secondary_ed_completion_percent  1329 non-null   float64
 8   Maternal_Death_Rate              2586 non-null   float64
dtypes: float64(5), int64(1), object(3)
memory usage: 202.0+ KB


In [132]:
# Every column used by the chart must be present on a row for it to be plotted.
required_columns = ['Maternal_Death_Rate',
                    'Primary_ed_completion_percent',
                    'Life_Expectancy',
                    'GDP_Per_Capita',
                    'Sex']
df = ten_years.dropna(subset=required_columns, how='any')

# Readable axis / legend names for the raw column names.
axis_labels = {
    "Primary_ed_completion_percent" : "% Completing Primary Ed",
    "Maternal_Death_Rate" : "Maternal deaths/100K live births",
    "Life_Expectancy" : "Life Expectancy (years)",
    "GDP_Per_Capita" : "GDP per capita"
}

# Facet rows are laid out in chronological order, 2005 through 2014.
facet_years = list(range(2005, 2015))

# Grid of panels: one column per Sex value, one row per year.
fig = px.scatter(
    df,
    x="Primary_ed_completion_percent",
    y="Maternal_Death_Rate",
    color="Life_Expectancy",
    size="GDP_Per_Capita",
    facet_col="Sex",
    facet_row="Year",
    height=2500,
    width=900,
    labels=axis_labels,
    category_orders={"Year" : facet_years},
    hover_data=["Continent", "Country"],
)

# NOTE(review): the selector restricts this update to traces of type 'heatmap';
# a px.scatter figure presumably contains none, in which case this call changes
# nothing — confirm whether a horizontal colour bar was intended.
fig.update_traces(colorbar_orientation="h", selector=dict(type='heatmap'))

fig.show()