# UN Data Exploration for NSS Data Science - Abigail Ezell

In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [23]:
# Load the raw GDP-per-capita table from the project's data directory.
gdp_df = pd.read_csv(filepath_or_buffer="../data/gdp_per_capita.csv")

In [24]:
# Discard the footnotes column, then give the remaining columns shorter names.
gdp_df = gdp_df.drop('Value Footnotes', axis='columns')
gdp_df = gdp_df.rename(
    columns={'Country or Area': 'Country', 'Value': 'GDP_Per_Capita'}
)

In [25]:
# Store country names using pandas' dedicated "string" dtype.
gdp_df['Country'] = gdp_df.Country.astype(dtype="string")

In [26]:
# Load the table that is later joined to gdp_df on 'Country'.
continents = pd.read_csv(filepath_or_buffer="../data/continents.csv")

In [27]:
# Inner join: only countries present in both gdp_df and continents are kept.
gdp_df = pd.merge(gdp_df, continents, on='Country', how='inner')

In [29]:
# Load the life-expectancy CSV, skipping the first 4 lines of the file.
life_expectancy = pd.read_csv(
    filepath_or_buffer="../data/API_SP.DYN.LE00.IN_DS2_en_csv_v2_22997.csv",
    skiprows=4,
)

In [30]:
# Step 1: drop the identifier columns that are not needed downstream.
life_expectancy = life_expectancy.drop(
    columns=['Country Code', 'Indicator Name', 'Indicator Code']
)
# Step 2: reshape wide -> long, keeping 'Country Name' as the identifier so
# every other column becomes a (variable, value) row.
life_expectancy = life_expectancy.melt(id_vars='Country Name')
# Step 3: give the long-format columns descriptive names.
life_expectancy = life_expectancy.rename(
    columns={
        'Country Name': 'Country',
        'variable': 'Year',
        'value': 'Life_Expectancy',
    }
)

In [31]:
# Cast the merge keys to pandas' "string" dtype so that 'Year' has the same
# type in both frames and 'Country' in life_expectancy is a string column too.
life_expectancy['Country'] = life_expectancy['Country'].astype(dtype="string")
life_expectancy['Year'] = life_expectancy['Year'].astype(dtype="string")
gdp_df['Year'] = gdp_df['Year'].astype(dtype="string")

In [32]:
# Normalise the join keys BEFORE merging: stray leading/trailing whitespace in
# either frame would stop otherwise-identical keys from matching. Stripping
# after the merge cannot affect gdp_le on a fresh run and would make a second
# execution of this cell see different inputs than the first.
gdp_df['Country'] = gdp_df['Country'].str.strip()
gdp_df['Year'] = gdp_df['Year'].str.strip()
life_expectancy['Country'] = life_expectancy['Country'].str.strip()
life_expectancy['Year'] = life_expectancy['Year'].str.strip()

# Inner merge on country and year: keep only the (Country, Year) pairs that
# appear in both gdp_df and life_expectancy.
gdp_le = gdp_df.merge(life_expectancy, on=['Country', 'Year'], how='inner')
print(gdp_le)

          Country  Year  GDP_Per_Capita Continent  Life_Expectancy
0     Afghanistan  2023     1992.424394      Asia           66.035
1     Afghanistan  2022     1981.710168      Asia           65.617
2     Afghanistan  2021     2144.166570      Asia           60.417
3     Afghanistan  2020     2769.685745      Asia           61.454
4     Afghanistan  2019     2927.245144      Asia           62.941
...           ...   ...             ...       ...              ...
5851     Zimbabwe  1994     3965.730986    Africa           52.537
5852     Zimbabwe  1993     3634.750494    Africa           53.976
5853     Zimbabwe  1992     3649.891947    Africa           55.602
5854     Zimbabwe  1991     4126.405247    Africa           57.037
5855     Zimbabwe  1990     4013.299059    Africa           58.319

[5856 rows x 5 columns]


Let's compare the median life expectancy for each continent across all of the years of data that we have. Perform a groupby on both Year and Continent and then aggregate using the median and save the results to a new object.

a. What type of object results from this? \
b. Look at the index of the resulting object. What do you notice about it? \
c. Use .loc to select the median life expectancy for Asia in 2010. \
d. Use .loc to select the median life expectancy for both Asia and Africa in 2010. \
e. Use .loc to select the values for all continents for the year 2010. \
f. Use .loc to select the median life expectancy for Asia across all years. Hint: One way to do this is to use the swaplevel method.

In [41]:
# Inspect the column dtypes of the merged frame before grouping on it.
gdp_le.dtypes

Country                    object
Year               string[python]
GDP_Per_Capita            float64
Continent                  object
Life_Expectancy           float64
dtype: object

In [50]:
# Median life expectancy for every (Year, Continent) pair; the result is
# indexed by the two grouping columns.
gdp_le_med = gdp_le.groupby(['Year', 'Continent'])['Life_Expectancy'].median()
gdp_le_med

Year  Continent    
1990  Africa           53.684000
      Asia             65.736500
      Europe           74.370671
      North America    70.471000
      Oceania          65.056000
                         ...    
2023  Asia             74.537000
      Europe           80.541463
      North America    74.333500
      Oceania          68.292500
      South America    77.392000
Name: Life_Expectancy, Length: 204, dtype: float64

In [51]:
# (a) Check what kind of object the groupby + median aggregation produced.
type(gdp_le_med)

pandas.core.series.Series

In [52]:
# (b) Look at the index of the aggregated result.
gdp_le_med.index

MultiIndex([('1990',        'Africa'),
            ('1990',          'Asia'),
            ('1990',        'Europe'),
            ('1990', 'North America'),
            ('1990',       'Oceania'),
            ('1990', 'South America'),
            ('1991',        'Africa'),
            ('1991',          'Asia'),
            ('1991',        'Europe'),
            ('1991', 'North America'),
            ...
            ('2022',        'Europe'),
            ('2022', 'North America'),
            ('2022',       'Oceania'),
            ('2022', 'South America'),
            ('2023',        'Africa'),
            ('2023',          'Asia'),
            ('2023',        'Europe'),
            ('2023', 'North America'),
            ('2023',       'Oceania'),
            ('2023', 'South America')],
           names=['Year', 'Continent'], length=204)

*The resulting index is a hierarchical index (a `MultiIndex`). The columns we grouped by, `Year` and `Continent`, became the levels of the resulting Series' index.*