# Analysis of the UN's World Happiness Index with machine learning

Maaike de Jong  
June 2020  
  
See the repository's [README](https://github.com/maaikedj/happiness-machine-learning/blob/master/README.md) file for background and details on the analysis and data.  

### Notebook 2: data exploration and cleaning
In this notebook I explore, clean and transform the data for further analysis. 
  

In [None]:
# import packages

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the data set produced by the previous notebook and preview the first rows

df = pd.read_csv(filepath_or_buffer='dfML.csv')
df.head(n=5)

In [None]:
# Check whether the columns have the right data type:
# info() prints each column's dtype and non-null count, plus memory usage.

df.info()


In [None]:
# Number of missing values per column

df.isna().sum()

In [None]:
# Check whether variables are correlated with a (Pearson) correlation matrix.
#
# The frame still contains text columns at this point ('Country', 'Region').
# Older pandas silently skipped them in corr(); pandas >= 2.0 raises instead,
# because numeric_only now defaults to False. Selecting the numeric/bool
# columns explicitly gives the same matrix on every pandas version.

corr = df.select_dtypes(include=['number', 'bool']).corr()
corr

In [None]:
# Show the correlation matrix as a heatmap on a 15 x 12 inch figure

plt.figure(figsize=(15, 12))
sns.heatmap(data=corr, ax=plt.gca())

In [None]:
# Drop the columns that are redundant or highly correlated with other columns
# (one entry per line so the list is easy to review), then preview the result.

df2 = df.drop(
    columns=[
        'Country',
        'Region',
        'Access to electricity (% of population)',
        'Refugee population by country or territory of origin',
        'Population, total',
    ]
)
df2.head(n=5)

In [None]:
# Show every row that has at least one missing value

df2.loc[df2.isna().any(axis='columns')]

In [None]:
# Keep only complete rows: any row containing a missing value is removed

df_clean = df2.dropna(axis='index', how='any')

In [None]:
# Verify the result: missing-value count per column after dropping rows

df_clean.isna().sum()

In [None]:
# Write the cleaned (untransformed) data to disk, without the index column

df_clean.to_csv(path_or_buf='dfML_clean.csv', index=False)

In [None]:
# Inspect the distribution of each variable with a boxplot to decide
# whether a transformation is needed. First: CO2 emissions.

df_clean.boxplot(column=['CO2 emissions (metric tons per capita)'], figsize=(6, 4))

In [None]:
# Same variable on a log10 scale: does the distribution improve?

df_clean[['CO2 emissions (metric tons per capita)']].pipe(np.log10).boxplot(figsize=(6, 4))

# Conclusion: yes, use the log transformation

In [None]:
# Distribution of GDP per capita
df_clean.boxplot(column=['GDP per capita (current US$)'], figsize=(6, 4))

In [None]:
# Same variable on a log10 scale: does the distribution improve?

df_clean[['GDP per capita (current US$)']].pipe(np.log10).boxplot(figsize=(6, 4))

# Conclusion: yes, use the log transformation

In [None]:
# Distribution of internet usage
df_clean.boxplot(column=['Individuals using the Internet (% of population)'], figsize=(6, 4))

In [None]:
# Distribution of land area
df_clean.boxplot(column=['Land area (sq. km)'], figsize=(6, 4))

In [None]:
# Same variable on a log10 scale: does the distribution improve?

df_clean[['Land area (sq. km)']].pipe(np.log10).boxplot(figsize=(6, 4))

# Conclusion: yes, use the log transformation

In [None]:
# Distribution of life expectancy
df_clean.boxplot(column=['Life expectancy at birth, total (years)'], figsize=(6, 4))

In [None]:
# Distribution of PM2.5 air pollution exposure
df_clean.boxplot(column=['PM2.5 air pollution, mean annual exposure (micrograms per cubic meter)'], figsize=(6, 4))

In [None]:
# Same variable on a log10 scale: does the distribution improve?

df_clean[['PM2.5 air pollution, mean annual exposure (micrograms per cubic meter)']].pipe(np.log10).boxplot(figsize=(6, 4))

# Conclusion: yes, use the log transformation

In [None]:
# Distribution of access to basic drinking water services
df_clean.boxplot(column=['People using at least basic drinking water services (% of population)'], figsize=(6, 4))

In [None]:
# Distribution of population density
df_clean.boxplot(column=['Population density (people per sq. km of land area)'], figsize=(6, 4))

In [None]:
# Same variable on a log10 scale: does the distribution improve?

df_clean[['Population density (people per sq. km of land area)']].pipe(np.log10).boxplot(figsize=(6, 4))

# Conclusion: yes, use the log transformation

In [None]:
# Distribution of annual population growth
df_clean.boxplot(column=['Population growth (annual %)'], figsize=(6, 4))

In [None]:
# Distribution of primary education duration
df_clean.boxplot(column=['Primary education, duration (years)'], figsize=(6, 4))

In [None]:
# Distribution of the share of parliamentary seats held by women
df_clean.boxplot(column=['Proportion of seats held by women in national parliaments (%)'], figsize=(6, 4))

In [None]:
# Distribution of renewable energy consumption
df_clean.boxplot(column=['Renewable energy consumption (% of total final energy consumption)'], figsize=(6, 4))

In [None]:
# Distribution of secondary education duration
df_clean.boxplot(column=['Secondary education, duration (years)'], figsize=(6, 4))

In [None]:
# Distribution of terrestrial protected areas
df_clean.boxplot(column=['Terrestrial protected areas (% of total land area)'], figsize=(6, 4))

In [None]:
# Distribution of urban population share
df_clean.boxplot(column=['Urban population (% of total population)'], figsize=(6, 4))

In [None]:
# Distribution of refugees by country of origin (as % of population)
df_clean.boxplot(column=['Refugees country of origin (% of total population'], figsize=(6, 4))

In [None]:
# Same variable on a log10 scale: does the distribution improve?

df_clean[['Refugees country of origin (% of total population']].pipe(np.log10).boxplot(figsize=(6, 4))

# Conclusion: yes, use the log transformation

In [None]:
# Work on an independent (deep) copy so df_clean itself stays untransformed
df_clean_tr = df_clean.copy(deep=True)

In [None]:
# Apply a log10 transformation to the variables selected in the boxplot checks above

log_columns = [
    'CO2 emissions (metric tons per capita)',
    'GDP per capita (current US$)',
    'Land area (sq. km)',
    'PM2.5 air pollution, mean annual exposure (micrograms per cubic meter)',
    'Population density (people per sq. km of land area)',
    'Refugees country of origin (% of total population',
]

# NOTE(review): log10 gives -inf for zeros and NaN for negative values;
# confirm that none of these columns contain such entries.
df_clean_tr[log_columns] = np.log10(df_clean_tr[log_columns])

In [None]:
# Mark the log-transformed columns by appending ' log' to their names

df_clean_tr.rename(
    columns={name: name + ' log' for name in log_columns},
    inplace=True,
)

In [None]:
# Preview the transformed data frame
df_clean_tr.head(n=5)

In [None]:
# Write the cleaned, log-transformed data to disk, without the index column

df_clean_tr.to_csv(path_or_buf='dfML_clean_tr.csv', index=False)