# Data Cleaning

In [1]:
# Dependencies and reading in the files
import pandas as pd
from pathlib import Path

HDI_raw = Path('resources/HDI_dataset.csv')
Happiness_raw = Path('resources/World Happiness Index by Reports 2013-2023.csv')

HDI_data = pd.read_csv(HDI_raw)
Happiness_data = pd.read_csv(Happiness_raw)
Happiness_data = Happiness_data[['Country', 'Year', 'Index', 'Rank']].copy()
Happiness_data.head()


Unnamed: 0,Country,Year,Index,Rank
0,Afghanistan,2013,4.04,143.0
1,Afghanistan,2015,3.575,153.0
2,Afghanistan,2016,3.36,154.0
3,Afghanistan,2017,3.794,141.0
4,Afghanistan,2018,3.632,145.0


## HDI and Happiness Merge

In [2]:
# HDI cleaning for 2021
HDI_2021 = HDI_data[['Country', 'HDI Rank (2021)', 'Human Development Index (2021)', 'Life Expectancy at Birth (2021)', 
                     'Expected Years of Schooling (2021)', 'Mean Years of Schooling (2021)',
                     'Gross National Income Per Capita (2021)']].copy()


# Happiness cleaning for 2021
happiness_2021 = Happiness_data.loc[Happiness_data['Year'] == 2021]

# merge two dataframes

HDI_Happiness_merge = pd.merge(HDI_2021, happiness_2021, how="left", on=["Country"])
HDI_Happiness_merge.rename(columns={'Year': 'Happiness Year','Index':'Happiness Index', 'Rank': 'Happiness Rank'}, inplace=True)
HDI_Happiness_merge



Unnamed: 0,Country,HDI Rank (2021),Human Development Index (2021),Life Expectancy at Birth (2021),Expected Years of Schooling (2021),Mean Years of Schooling (2021),Gross National Income Per Capita (2021),Happiness Year,Happiness Index,Happiness Rank
0,Afghanistan,180.0,0.478,61.9824,10.263844,2.985070,1824.190915,2021.0,2.523,149.0
1,Angola,148.0,0.586,61.6434,12.172100,5.417391,5465.617791,2021.0,,
2,Albania,67.0,0.796,76.4626,14.448000,11.286455,14131.110390,2021.0,5.117,93.0
3,Andorra,40.0,0.858,80.3684,13.300239,10.555120,51166.626610,,,
4,United Arab Emirates,26.0,0.911,78.7104,15.717690,12.694030,62573.591810,2021.0,6.561,25.0
...,...,...,...,...,...,...,...,...,...,...
190,Samoa,111.0,0.707,72.7675,12.418859,11.403800,5307.953374,,,
191,Yemen,183.0,0.455,63.7534,9.098710,3.200000,1314.270189,2021.0,3.658,141.0
192,South Africa,109.0,0.713,62.3410,13.643710,11.373160,12948.373250,2021.0,4.956,103.0
193,Zambia,154.0,0.565,61.2234,10.928760,7.187091,3217.767739,2021.0,4.073,137.0


In [3]:
# clean merged DF
HDI_Happiness_merge.dropna(inplace=True)
HDI_Happiness_merge.reset_index()
HDI_Happiness_merge


Unnamed: 0,Country,HDI Rank (2021),Human Development Index (2021),Life Expectancy at Birth (2021),Expected Years of Schooling (2021),Mean Years of Schooling (2021),Gross National Income Per Capita (2021),Happiness Year,Happiness Index,Happiness Rank
0,Afghanistan,180.0,0.478,61.9824,10.263844,2.985070,1824.190915,2021.0,2.523,149.0
2,Albania,67.0,0.796,76.4626,14.448000,11.286455,14131.110390,2021.0,5.117,93.0
4,United Arab Emirates,26.0,0.911,78.7104,15.717690,12.694030,62573.591810,2021.0,6.561,25.0
5,Argentina,47.0,0.842,75.3899,17.874870,11.147269,20925.268140,2021.0,5.929,57.0
6,Armenia,85.0,0.759,72.0431,13.116760,11.330300,13157.993900,2021.0,5.283,86.0
...,...,...,...,...,...,...,...,...,...,...
187,Venezuela,120.0,0.691,70.5536,12.816080,11.107277,4810.882621,2021.0,4.892,107.0
191,Yemen,183.0,0.455,63.7534,9.098710,3.200000,1314.270189,2021.0,3.658,141.0
192,South Africa,109.0,0.713,62.3410,13.643710,11.373160,12948.373250,2021.0,4.956,103.0
193,Zambia,154.0,0.565,61.2234,10.928760,7.187091,3217.767739,2021.0,4.073,137.0


In [4]:
# Save merge and Happiness as CSVs
HDI_Happiness_merge.to_csv('resources/HDI_Happiness_2021.csv')
happiness_2021.to_csv('resources/Happiness_2021.csv')

## HDI 2011-2021 Data Cleaning

In [7]:
# Keep only Human Development Index from 2011-2021
HDI_range = HDI_data[['Country', 'Human Development Index (2011)', 'Human Development Index (2012)',
                      'Human Development Index (2013)', 'Human Development Index (2014)', 
                      'Human Development Index (2015)', 'Human Development Index (2016)',
                      'Human Development Index (2017)', 'Human Development Index (2018)',
                      'Human Development Index (2019)', 'Human Development Index (2020)',
                      'Human Development Index (2021)']].copy()
HDI_range.head()
# Save as CSV
HDI_range.to_csv('resources/HDI_range.csv', index=False)