# Analysis of the UN's World Happiness Index with machine learning

Maaike de Jong  
June 2020  
  
See the repository's [README](https://github.com/maaikedj/happiness-machine-learning/blob/master/README.md) file for background and details on the analysis and data.  

### Notebook 4: Data visualization with graphs and maps
In this notebook I visualize the relationship between the World Happiness Index and the World Bank's World Development Indicators.
  

In [None]:
# import packages

import numpy as np
import pandas as pd
from scipy import stats

import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.patches as mpatches
import seaborn as sns

import geopandas as gpd
from shapely.geometry import Point
from shapely.geometry.polygon import Polygon
from shapely import wkt

In [None]:
# Load the data file in which some of the variables have been log-transformed
# (see the end of notebook 2) and preview the first rows.

df = pd.read_csv('dfML_clean_tr.csv')
df.head()

### Regression plots

Figures to visualise the regression between the happiness score and selected variables.

In [None]:
# define function for figure

def regplot(df, col_name):
    """Show a scatter plot with a fitted regression line for one variable.

    The 'Happiness score' column is drawn on the x-axis and ``col_name`` on
    the y-axis.

    Parameters
    ----------
    df : pandas.DataFrame
        Data holding a 'Happiness score' column and the ``col_name`` column.
    col_name : str
        Column to plot against the happiness score. It is also used as the
        y-axis label and in the figure title.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # NOTE: these calls change the seaborn/matplotlib styling globally,
    # not only for the figure created here.
    sns.set()
    sns.set_style('white')
    sns.set_color_codes('pastel')

    f, ax = plt.subplots(figsize=(8, 6))

    # Draw on the axes created above explicitly instead of relying on
    # matplotlib's implicit "current axes".
    sns.regplot(x='Happiness score', y=col_name, color='royalblue', data=df, ax=ax)

    ax.tick_params(axis='both', which='both', labelsize=14)
    ax.set_xticks(range(3, 9))  # fixed x ticks at 3, 4, ..., 8

    ax.set_xlabel('Happiness score', fontsize=16)
    ax.set_ylabel(col_name, fontsize=16)
    f.suptitle('Regression between Happiness score and ' + col_name, fontsize=16)

    # remove the top and right spines
    sns.despine(ax=ax)

    plt.show()

In [None]:
# plot the three top variables, one figure per cell; first: GDP per capita (log)

regplot(df, 'GDP per capita (log)')

In [None]:
# second variable: life expectancy
regplot(df, 'Life expectancy')

In [None]:
# third variable: the 'Refugees % (log)' column
regplot(df, 'Refugees % (log)')

### Creating a world map of World Happiness Score

The happiness score map is then compared with world maps of other variables.

In [None]:
# Load the low-resolution Natural Earth world map that can be read directly through
# geopandas, and rename its 'name' column to 'Country'.
# NOTE(review): the bundled `gpd.datasets` module was deprecated in geopandas 0.14 and
# removed in 1.0 -- confirm the installed version still provides it.

world = (
    gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
    .rename(columns={'name': 'Country'})
)
world.head()

In [None]:
# (rows, columns) of the world map geodataframe
world.shape

In [None]:
# Load the merged dataframe saved at the end of notebook 1.

df2 = pd.read_csv('dfML.csv')

In [None]:
# (rows, columns) of the merged dataframe
df2.shape

In [None]:
# The world map will be merged with the main df on country.
# First collect the country names of both dataframes to check whether they are the same.
list1 = df2['Country'].tolist()

list2 = world['Country'].tolist()

In [None]:
# country names that occur in df2 but not in the world map
sorted(set(list1).difference(list2))

In [None]:
# country names that occur in the world map but not in df2
sorted(set(list2).difference(list1))

In [None]:
# Replace the world map's country names listed below (keys) with the alternative
# spellings (values); all other names are left unchanged.

world2 = world.replace({
    'Country': {
        'Bosnia and Herz.': 'Bosnia and Herzegovina',
        'Central African Rep.': 'Central African Republic',
        'Congo': 'Congo (Brazzaville)',
        'Dem. Rep. Congo': 'Congo (Kinshasa)',
        'Czechia': 'Czech Republic',
        'Dominican Rep.': 'Dominican Republic',
        "Côte d'Ivoire": 'Ivory Coast',
        'Palestine': 'Palestinian Territories',
        'S. Sudan': 'South Sudan',
        'eSwatini': 'Swaziland',
        'United States of America': 'United States',
    }
})


In [None]:
# Keep every row of the world df except the one(s) named 'Antarctica'.

world3 = world2.loc[world2['Country'].ne('Antarctica')]

In [None]:
# Left-join the main df onto the world map on 'Country': every map row is kept,
# and rows without a match in df2 get NaN in the df2 columns.

dfgeo = pd.merge(left=world3, right=df2, how='left', on='Country')

In [None]:
# Wrap the merged table in a GeoDataFrame, using its 'geometry' column as the geometry,
# and preview the first rows.
gdf = gpd.GeoDataFrame(dfgeo, geometry = 'geometry')
gdf.head()

In [None]:
# check the type of the geometry column
type(gdf['geometry'])

In [None]:
# Choropleth maps of the happiness score and other variables per country.

# Start with a plain plot of the country shapes to check whether the map data is complete.
plt.rcParams['figure.figsize'] = (16, 8)
gdf.plot(
    facecolor='silver',
    edgecolor='grey',
);

In [None]:
# Replace missing happiness scores with 0; these countries then fall into the lowest
# class of the map plotted below, which is labelled 'No data'.
gdf['Score mean'] = gdf['Score mean'].fillna(0)

In [None]:
# plot the map with happiness scores

plt.rcParams['figure.figsize'] = [20, 10]
fig, ax = plt.subplots(1, 1)

# Classify the scores with user-defined upper class edges. The legend that geopandas
# creates for legend=True is replaced by the hand-made legend below.
gdf.plot(
    column='Score mean',
    legend=True,
    ax=ax,
    cmap='summer_r',
    scheme='user_defined',
    classification_kwds={'bins': [3, 4, 5, 6, 7, 8]},
)

# plt.get_cmap is used instead of matplotlib.cm.get_cmap, which was deprecated in
# matplotlib 3.7 and removed in 3.9.
cmap = plt.get_cmap('summer_r')

# One legend entry per class; colours are sampled at 0.0, 0.2, ..., 1.0 along the colormap.
labels = ['No data', '3 - 4', '4 - 5', '5 - 6', '6 - 7', '7 - 8']
handles = [
    mpatches.Patch(color=cmap(i / (len(labels) - 1)), label=label)
    for i, label in enumerate(labels)
]

ax.legend(handles=handles, prop={'size': 16}, loc='lower left', frameon=False)

plt.show()


In [None]:
# hide the axes of the map created in the previous cell and display the figure again
ax.set_axis_off()
ax.get_figure()

In [None]:
# investigate GDP per capita: summary statistics of the column

gdf['GDP per capita (current US$)'].describe()


In [None]:
# Replace missing GDP values with 0; these countries then fall into the lowest
# class of the GDP map plotted below, which is labelled 'No data'.
gdp_col = 'GDP per capita (current US$)'
gdf[gdp_col] = gdf[gdp_col].fillna(0)

In [None]:
# plot map with GDP

plt.rcParams['figure.figsize'] = [20, 10]
fig, ax = plt.subplots(1, 1)

# Classify GDP per capita with user-defined upper class edges. The legend that geopandas
# creates for legend=True is replaced by the hand-made legend below.
gdf.plot(
    column='GDP per capita (current US$)',
    legend=True,
    ax=ax,
    cmap='summer_r',
    scheme='user_defined',
    classification_kwds={'bins': [100, 1000, 5000, 10000, 50000, 200000]},
)

# plt.get_cmap is used instead of matplotlib.cm.get_cmap, which was deprecated in
# matplotlib 3.7 and removed in 3.9.
cmap = plt.get_cmap('summer_r')

# One legend entry per class; colours are sampled at 0.0, 0.2, ..., 1.0 along the colormap.
# The top class runs up to the last bin edge (200000), so it is labelled '50k - 200k'.
labels = ['No data', '100 - 1k', '1k - 5k', '5k - 10k', '10k - 50k', '50k - 200k']
handles = [
    mpatches.Patch(color=cmap(i / (len(labels) - 1)), label=label)
    for i, label in enumerate(labels)
]

ax.legend(handles=handles, prop={'size': 16}, loc='lower left', frameon=False)

plt.show()

In [None]:
# hide the axes of the GDP map created in the previous cell and display the figure again
ax.set_axis_off()
ax.get_figure()