# Analysis of the UN's World Happiness Index with machine learning

Maaike de Jong  
June 2020  
  
See the repository's [README](https://github.com/maaikedj/happiness-machine-learning/blob/master/README.md) file for background and details on the analysis and data.  

### Notebook 2: data exploration and cleaning
In this notebook I explore, clean and transform the data for further analysis. 
  

In [None]:
# import packages

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# import data resulting from previous notebook

df = pd.read_csv('dfML.csv')
df.head()

In [None]:
# check whether the columns are the right data type

df.info()

In [None]:
# Check for missing values

df.isnull().sum()

In [None]:
# check whether variables are correlated with heatmap

# Correlate only the numeric/boolean columns. df still contains 'Country' and
# 'Region' at this point (they are dropped only in a later cell); these are
# presumably text columns, and pandas >= 2.0 raises on non-numeric data in
# DataFrame.corr() instead of silently leaving it out as older versions did.
corr = df.select_dtypes(include=['number', 'bool']).corr()

plt.figure(figsize=(15,12))
sns.heatmap(corr)

In [None]:
# Columns left out of the analysis because they correlate highly with other
# columns or are redundant.
redundant_columns = [
    'Country',
    'Region',
    'Access to electricity (% of population)',
    'Refugee population by country or territory of origin',
    'Population, total',
]

df2 = df.drop(columns=redundant_columns)
df2.head()

In [None]:
# drop rows with missing values

df3 = df2.dropna()

In [None]:
# check missing values again

df3.isnull().sum()

In [None]:
# Shorter column labels for easier use and plotting (original label -> new label).
short_names = {
    'Score mean': 'Happiness score',
    'CO2 emissions (metric tons per capita)': 'CO2 emission per capita',
    'Compulsory education, duration (years)': 'Compulsory education (years)',
    'GDP per capita (current US$)': 'GDP per capita',
    'Individuals using the Internet (% of population)': 'Internet use (% of population)',
    'Land area (sq. km)': 'Land area',
    'Life expectancy at birth, total (years)': 'Life expectancy',
    'PM2.5 air pollution, mean annual exposure (micrograms per cubic meter)': 'Air pollution',
    'People using at least basic drinking water services (% of population)': 'Drinking water services',
    'Population density (people per sq. km of land area)': 'Population density',
    'Population growth (annual %)': 'Population growth',
    'Proportion of seats held by women in national parliaments (%)': 'Women in parliament %',
    'Renewable energy consumption (% of total final energy consumption)': 'Renewable energy %',
    'School enrollment, primary (gross), gender parity index (GPI)': 'Gender parity index (GPI)',
    'Terrestrial protected areas (% of total land area)': 'Protected land %',
    'Urban population (% of total population)': 'Urban population',
    # NOTE(review): this key has no closing parenthesis. Presumably it matches
    # the column label as written by the previous notebook -- confirm, since
    # rename() ignores keys that match no column by default.
    'Refugees country of origin (% of total population': 'Refugees %',
}

df_clean = df3.rename(columns=short_names)

In [None]:
list(df_clean.columns)

In [None]:
# save clean data frame for future use:

df_clean.to_csv('dfML_clean.csv', index = False)

### Visual data exploration

Visualise the distribution of each variable with a boxplot and a histogram, and its relationship with the Happiness score with a scatterplot.

In [None]:
# define function for figures

def figure(col_name):
    """Show a boxplot, a histogram and a scatterplot for one column.

    The data is read from the notebook-level ``df_clean`` data frame. The
    three panels sit side by side in one figure; the scatterplot has
    'Happiness score' on the x-axis and the column's values on the y-axis.

    Parameters
    ----------
    col_name : str
        Label of the ``df_clean`` column to plot; also used as figure title.

    Returns
    -------
    Whatever ``plt.show()`` returns.
    """
    happiness = df_clean['Happiness score']

    fig, axes = plt.subplots(1, 3, figsize=(15, 4))
    box_ax, hist_ax, scatter_ax = axes

    values = df_clean[col_name]

    box_ax.boxplot(values)
    hist_ax.hist(values)
    scatter_ax.scatter(happiness, values)

    fig.suptitle(col_name, fontsize=16)

    return plt.show()

In [None]:
# define function for figures with log scale for the variable (y)

def figure_log(col_name):
    """Show a boxplot, a histogram and a scatterplot with the column log-scaled.

    The data is read from the notebook-level ``df_clean`` data frame. In the
    boxplot and the scatterplot the column is on the y-axis, which is set to
    a log scale. In the histogram the column is on the x-axis, so that axis
    is log-scaled and the bins are log-spaced.

    Parameters
    ----------
    col_name : str
        Label of the ``df_clean`` column to plot; the figure title is this
        label followed by ' log scale'.

    Returns
    -------
    Whatever ``plt.show()`` returns.
    """
    happiness = df_clean['Happiness score']

    fig, (box_ax, hist_ax, scatter_ax) = plt.subplots(1, 3, figsize=(15, 4))

    values = df_clean[col_name]

    box_ax.boxplot(values)
    box_ax.set_yscale('log')

    # A histogram has the variable on the x-axis; a log y-axis would only
    # rescale the bin counts. Log-spaced edges (11 edges = 10 bins) keep the
    # bars equally wide on the log axis. Values <= 0 have no logarithm and are
    # left out of this panel.
    positive = values[values > 0]
    if not positive.empty:
        edges = np.geomspace(positive.min(), positive.max(), 11)
        hist_ax.hist(positive, bins=edges)
    hist_ax.set_xscale('log')

    scatter_ax.scatter(happiness, values)
    scatter_ax.set_yscale('log')

    fig.suptitle(col_name + ' log scale', fontsize=16)

    return plt.show()

In [None]:
# df summary stats

df_clean.describe()

In [None]:
figure('CO2 emission per capita')

# data very skewed

In [None]:
# on log scale

figure_log('CO2 emission per capita')

# improved but not great

In [None]:
figure('Compulsory education (years)')

In [None]:
figure('GDP growth (annual %)')

In [None]:
figure('GDP per capita')

# very skewed

In [None]:
# on a log scale

figure_log('GDP per capita')

In [None]:
figure('Internet use (% of population)')

In [None]:
figure('Land area')

In [None]:
# on a log scale

figure_log('Land area')

In [None]:
figure('Life expectancy')

In [None]:
figure('Air pollution')

In [None]:
# on a log scale

figure_log('Air pollution')

In [None]:
figure('Drinking water services')

In [None]:
# on a log scale

figure_log('Drinking water services')

In [None]:
figure('Population density')

In [None]:
# on a log scale

figure_log('Population density')

In [None]:
figure('Population growth')

In [None]:
figure('Women in parliament %')

In [None]:
figure('Renewable energy %')

In [None]:
figure('Gender parity index (GPI)')

In [None]:
# on a log scale

figure_log('Gender parity index (GPI)')

In [None]:
figure('Protected land %')

In [None]:
figure('Urban population')

In [None]:
figure('Refugees %')

In [None]:
# on a log scale

figure_log('Refugees %')

# plot issue due to lots of zeroes on a log scale

In [None]:
# make df with several variables log transformed
# first make copy of df_clean
df_clean_tr = df_clean.copy()

In [None]:
# perform log transformation on selected variables:

# Each column must appear exactly once here: a repeated entry would be
# log-transformed twice by the loop below.
log_columns = ['CO2 emission per capita', 'GDP per capita', 'Land area', 'Air pollution', 'Population density', 'Drinking water services', 'Gender parity index (GPI)', 'Refugees %']

# NOTE(review): np.log10 maps 0 to -inf. An earlier cell notes that
# 'Refugees %' has many zeroes -- confirm that -inf values are acceptable
# for the later analysis, or add an offset before transforming.
for col in log_columns:
    df_clean_tr[col] = np.log10(df_clean_tr[col])

In [None]:
# rename log transformed columns

# Build the old -> new label mapping once and rename in a single in-place call.
log_labels = {name: name + ' (log)' for name in log_columns}
df_clean_tr.rename(columns=log_labels, inplace=True)

In [None]:
df_clean_tr.head()

In [None]:
# save clean data with transformed variables for future use:

df_clean_tr.to_csv('dfML_clean_tr.csv', index = False)