# Analysis of the UN's World Happiness Index with machine learning  
Maaike de Jong  
June 2020  
  
See the repository's [README](https://github.com/maaikedj/happiness-machine-learning/blob/master/README.md) file for background and details on the analysis and data.  

### Notebook 1: combining data files

In this notebook I combine data on the World Happiness Index 2015-2019 with data on World Development Indicators from the World Bank.


In [None]:
# import packages

import numpy as np
import pandas as pd

In [None]:
# import happiness index files of the past 5 years

hap15 = pd.read_csv('../data/2015.csv')
hap15.head()

In [None]:
hap16 = pd.read_csv('../data/2016.csv')
hap16.head()

In [None]:
hap17 = pd.read_csv('../data/2017.csv')
hap17.head()

In [None]:
hap18 = pd.read_csv('../data/2018.csv')
hap18.head()

In [None]:
hap19 = pd.read_csv('../data/2019.csv')
hap19.head()

In [None]:
# Select only country/ Region, happiness rank and happiness score
# Dealing with inconsistencies in column names

hap15_select = hap15[['Country', 'Region', 'Happiness Rank', 'Happiness Score']]
hap16_select = hap16[['Country', 'Happiness Rank', 'Happiness Score']]
hap17_select = hap17[['Country', 'Happiness.Rank', 'Happiness.Score']]
hap18_select = hap18[['Country or region', 'Overall rank', 'Score']]
hap19_select = hap19[['Country or region', 'Overall rank', 'Score']]

In [None]:
hap15_s1 = hap15_select.rename(columns={'Happiness Rank': 'Happiness Rank 2015', 'Happiness Score': 'Happiness Score 2015'})
hap15_s1.head()

In [None]:
hap16_s1 = hap16_select.rename(columns={'Happiness Rank': 'Happiness Rank 2016', 'Happiness Score': 'Happiness Score 2016'})
hap16_s1.head()

In [None]:
hap17_s1 = hap17_select.rename(columns={'Happiness.Rank': 'Happiness Rank 2017', 'Happiness.Score': 'Happiness Score 2017'})
hap17_s1.head()

In [None]:
hap18_s1 = hap18_select.rename(columns={'Country or region': 'Country', 'Overall rank': 'Happiness Rank 2018', 'Score': 'Happiness Score 2018'})
hap18_s1.head()

In [None]:
hap19_s1 = hap19_select.rename(columns={'Country or region': 'Country', 'Overall rank': 'Happiness Rank 2019', 'Score': 'Happiness Score 2019'})
hap19_s1.head()

In [None]:
# before merging on country, check whether countries are written the same

list15 = hap15_s1['Country'].values.tolist()
list16 = hap16_s1['Country'].values.tolist()

In [None]:
(list(set(list15) - set(list16)))

In [None]:
(list(set(list16) - set(list15)))

# Somaliland region occurs in both lists but written slightly different, needs to be changed

In [None]:
hap16_s2 = hap16_s1.replace({'Country': {'Somaliland Region': 'Somaliland region'}})

In [None]:
# merge 2016 df on 2015 df

df = pd.merge(hap15_s1, hap16_s2, on = 'Country', how = 'outer')

In [None]:
# before merging 2017 df on country, again check whether countries are written the same

df_list = df['Country'].values.tolist()
list17 = hap17_s1['Country'].values.tolist()

In [None]:
(list(set(df_list) - set(list17)))

In [None]:
(list(set(list17) - set(df_list)))

# in the 2017 df the names for Hong Kong and Taiwan need to be changed

In [None]:
hap17_s2 = hap17_s1.replace({'Country': {'Hong Kong S.A.R., China': 'Hong Kong', 'Taiwan Province of China': 'Taiwan'}})

In [None]:
# merge dataframes

df2 = pd.merge(df, hap17_s2, on = 'Country', how = 'outer')

In [None]:
# again before merging 2018, check whether countries are written the same

df2_list = df2['Country'].values.tolist()
list18 = hap18_s1['Country'].values.tolist()

In [None]:
(list(set(df2_list) - set(list18)))

In [None]:
(list(set(list18) - set(df2_list)))

In [None]:
# fix country names

hap18_s2 = hap18_s1.replace({'Country': {'Trinidad & Tobago': 'Trinidad and Tobago', 'Northern Cyprus': 'North Cyprus'}})

In [None]:
# merge dataframes

df3 = pd.merge(df2, hap18_s2, on = 'Country', how = 'outer')

In [None]:
# again before merging 2019, check whether countries are written the same

df3_list = df3['Country'].values.tolist()
list19 = hap19_s1['Country'].values.tolist()

In [None]:
(list(set(df3_list) - set(list19)))

In [None]:
(list(set(list19) - set(df3_list)))

In [None]:
# fix country names

hap19_s2 = hap19_s1.replace({'Country': {'Trinidad & Tobago': 'Trinidad and Tobago', 'Northern Cyprus': 'North Cyprus', 'North Macedonia': 'Macedonia'}})

In [None]:
# merge dataframes

df4 = pd.merge(df3, hap19_s2, on = 'Country', how = 'outer')

In [None]:
# check complete df with years 2015-2019

df4.head(30)

In [None]:
# make new column with average rank over the years

col = df4[['Happiness Rank 2015', 'Happiness Rank 2016', 'Happiness Rank 2017', 'Happiness Rank 2018', 'Happiness Rank 2019']]

df4['Rank mean'] = col.mean(axis = 1, skipna = True)
df4.head()

In [None]:
# make new rank column based on mean rank

df4['Rank overall'] = df4['Rank mean'].rank(method='first', ascending=True)

In [None]:
# make new column with average happiness score over 5 years

col2 = df4[['Happiness Score 2015', 'Happiness Score 2016', 'Happiness Score 2017', 'Happiness Score 2018', 'Happiness Score 2019']]

df4['Score mean'] = col2.mean(axis = 1, skipna = True)
df4.head()

In [None]:
# sort on score mean

df4_sorted = df4.sort_values('Score mean', ascending = False)
df4_sorted.head(10)

In [None]:
# Import World Bank data with World Development Indicator values per country 1960-2019

WDI = pd.read_csv('../data/WDIData.csv')
WDI.head()

In [None]:
# select relevant columns, years 2010-2019

WDI2 = WDI[['Country Name', 'Country Code', 'Indicator Name', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019']]

In [None]:
# create new column with average values over 2010-2019 (missing years are skipped)

col3 = WDI2[['2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019']]

# WDI2 was obtained by selecting columns from WDI, so adding a column to it
# directly makes pandas emit a SettingWithCopyWarning. Rebinding WDI2 to an
# explicit copy makes it an independent frame before it is modified.
WDI2 = WDI2.copy()
WDI2['2010_2019'] = col3.mean(axis = 1, skipna = True)

WDI2.head()

In [None]:
# select relevant columns

WDI3 = WDI2[['Country Name', 'Country Code', 'Indicator Name', '2010_2019']]
WDI3.head()

In [None]:
# transpose dataframe so that each country has one row and the indicators each have a column

WDI3_pivot = WDI3.pivot_table(index='Country Name', columns='Indicator Name', values='2010_2019', aggfunc = 'max').reset_index()

In [None]:
WDI3_pivot.head()
# there are 1417 columns, too many to review 

In [None]:
# only select columns with less than 10% missing data

WDI_select = WDI3_pivot.loc[:, WDI3_pivot.isnull().mean() < .10]

In [None]:
# list the remaining indicators

WDI_column_list = WDI_select.columns.tolist()

WDI_column_list

In [None]:
# select 20 relevant indicators:

WDI_df = WDI_select[['Country Name', 'Access to electricity (% of population)',
                    'CO2 emissions (metric tons per capita)',
                    'Compulsory education, duration (years)',
                    'GDP growth (annual %)',
                    'GDP per capita (current US$)',
                    'Individuals using the Internet (% of population)',
                    'Land area (sq. km)',
                    'Life expectancy at birth, total (years)',
                    'PM2.5 air pollution, mean annual exposure (micrograms per cubic meter)',
                    'People using at least basic drinking water services (% of population)',
                    'Population density (people per sq. km of land area)',
                    'Population growth (annual %)',
                    'Proportion of seats held by women in national parliaments (%)',
                    'Refugee population by country or territory of origin',
                    'Population, total',
                    'Renewable energy consumption (% of total final energy consumption)',
                    'School enrollment, primary (gross), gender parity index (GPI)',
                    'Terrestrial protected areas (% of total land area)',
                    'Urban population (% of total population)']]

In [None]:
WDI_df.head()

In [None]:
# Merge happiness index df with WDI df
# First, change column name to 'Country' so both frames share the merge key.
#
# WDI_df was obtained by selecting columns from WDI_select, so renaming it
# in place makes pandas emit a SettingWithCopyWarning. Rebinding the name to
# the frame returned by rename() gives the same result without modifying a
# selection in place.

WDI_df = WDI_df.rename(columns = {'Country Name': 'Country'})
WDI_df.head()


In [None]:
# Check whether country names need to be renamed

df4_list = df4['Country'].values.tolist()

WDI_list = WDI_df['Country'].values.tolist()

In [None]:
sorted((list(set(df4_list) - set(WDI_list))))

In [None]:
sorted(list(set(WDI_list) - set(df4_list)))

In [None]:
# Change country names that are in both lists so that they are the same

WDI_df2 = WDI_df.replace({'Country': {'Congo, Rep.': 'Congo (Brazzaville)', 'Congo, Dem. Rep.': 'Congo (Kinshasa)', 'Egypt, Arab Rep.': 'Egypt', 'Gambia, The': 'Gambia', 'Hong Kong SAR, China': 'Hong Kong', 'Iran, Islamic Rep.': 'Iran', "Cote d'Ivoire": 'Ivory Coast', 'Kyrgyz Republic': 'Kyrgyzstan', 'Lao PDR': 'Laos', 'North Macedonia': 'Macedonia', 'West Bank and Gaza': 'Palestinian Territories', 'Russian Federation': 'Russia', 'Slovak Republic': 'Slovakia', 'Korea, Rep.': 'South Korea', 'Eswatini': 'Swaziland', 'Syrian Arab Republic': 'Syria', 'Venezuela, RB': 'Venezuela', 'Yemen, Rep.': 'Yemen'}})


In [None]:
# check

WDI_list2 = WDI_df2['Country'].values.tolist()
sorted((list(set(df4_list) - set(WDI_list2))))

# remove 'North Cyprus', 'Somaliland region', 'Taiwan' after merge (no WDI data)

In [None]:
# Left merge WDI df on happiness index df on country

df5 = pd.merge(df4, WDI_df2, on = 'Country', how = 'left')

In [None]:
df5.head()

In [None]:
print(df4.shape)
print(df5.shape)

In [None]:
# remove 'North Cyprus', 'Somaliland region', 'Taiwan'

df5[df5['Country'] == 'North Cyprus']

In [None]:
df5[df5['Country'] == 'Somaliland region']

In [None]:
df5[df5['Country'] == 'Taiwan']

In [None]:
# drop rows of the countries without WDI data
#
# The rows are selected by country name rather than by hard-coded row
# positions: positions only identify these countries for one specific row
# order, and the row order produced by the outer merges differs between
# pandas versions (newer versions sort the merge keys).

countries_without_wdi = ['North Cyprus', 'Somaliland region', 'Taiwan']
df6 = df5.drop(df5.index[df5['Country'].isin(countries_without_wdi)])

In [None]:
df6 = df6.reset_index(drop = True)

In [None]:
# check number of rows

df6.shape

In [None]:
# Check missing data

df6.isnull().sum()

In [None]:
# fix missing regions

df6[df6['Region'].isnull()]

In [None]:
# check region names

df6['Region'].value_counts()

In [None]:
# fill in regions
#
# Regions are assigned by country name instead of by hard-coded row labels,
# so the result does not depend on the row order of the merged frame (which
# differs between pandas versions). Only rows whose region is still missing
# are filled; existing region values are left untouched.
#
# NOTE(review): the six country names are inferred to be the rows previously
# addressed as 155-160 (countries that are absent from the 2015 table, which
# is the only source of 'Region') - confirm against the output of the
# missing-region check above.

region_by_country = {
    'Puerto Rico': 'Latin America and Caribbean',
    'Belize': 'Latin America and Caribbean',
    'Somalia': 'Sub-Saharan Africa',
    'Namibia': 'Sub-Saharan Africa',
    'South Sudan': 'Sub-Saharan Africa',
    'Gambia': 'Sub-Saharan Africa',
}
df6['Region'] = df6['Region'].fillna(df6['Country'].map(region_by_country))


In [None]:
# make new df for ML analysis with just the country, region, average happiness score and the selected WDIs

dfML = df6.drop(['Happiness Rank 2015', 'Happiness Score 2015', 'Happiness Rank 2016', 'Happiness Score 2016', 'Happiness Rank 2017', 'Happiness Score 2017', 'Happiness Rank 2018', 'Happiness Score 2018', 'Happiness Rank 2019', 'Happiness Score 2019', 'Rank mean', 'Rank overall', 'Compulsory education, duration (years)', 'GDP growth (annual %)', 'School enrollment, primary (gross), gender parity index (GPI)'], axis=1)
dfML.head()

In [None]:
# make new column for refugees, percentage of total population (existing column gives absolute numbers)

dfML['Refugees country of origin (% of total population'] = dfML['Refugee population by country or territory of origin'] / dfML['Population, total']
dfML.head()

In [None]:
# save df for future use

dfML.to_csv('dfML.csv', index=False)