In [1]:
# Initial imports
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

## Loading and Preprocessing World Happiness Data

In [2]:
# Read the World Happiness CSV into a DataFrame
world_happiness_data_df = pd.read_csv(
    filepath_or_buffer="../Datasets_ML/World_Happiness_ML.csv",
)
# Display the loaded frame as the cell output
world_happiness_data_df

Unnamed: 0,Country name,alpha-3,region,sub-region,year,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,Negative affect,Life Ladder
0,Afghanistan,AFG,Asia,Southern Asia,2008,7.350416,0.450662,50.500000,0.718114,0.164055,0.881686,0.414297,0.258195,3.723590
1,Afghanistan,AFG,Asia,Southern Asia,2009,7.508646,0.552308,50.799999,0.678896,0.187297,0.850035,0.481421,0.237092,4.401778
2,Afghanistan,AFG,Asia,Southern Asia,2010,7.613900,0.539075,51.099998,0.600127,0.117861,0.706766,0.516907,0.275324,4.758381
3,Afghanistan,AFG,Asia,Southern Asia,2011,7.581259,0.521104,51.400002,0.495901,0.160098,0.731109,0.479835,0.267175,3.831719
4,Afghanistan,AFG,Asia,Southern Asia,2012,7.660506,0.520637,51.700001,0.530935,0.234157,0.775620,0.613513,0.267919,3.782938
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2358,Zimbabwe,ZWE,Africa,Sub-Saharan Africa,2019,7.697755,0.759162,53.099998,0.631908,-0.050874,0.830652,0.658434,0.235354,2.693523
2359,Zimbabwe,ZWE,Africa,Sub-Saharan Africa,2020,7.596050,0.717243,53.575001,0.643303,0.002848,0.788523,0.660658,0.345736,3.159802
2360,Zimbabwe,ZWE,Africa,Sub-Saharan Africa,2021,7.656878,0.685151,54.049999,0.667636,-0.079007,0.756945,0.609917,0.241682,3.154578
2361,Zimbabwe,ZWE,Africa,Sub-Saharan Africa,2022,7.670073,0.666172,54.525002,0.651987,-0.072935,0.752632,0.640609,0.191350,3.296220


# 1) Analysis without Geographical features

In [3]:
# Build the feature matrix for the analysis without geography: remove the
# country/geography identifiers, the year and the "Life Ladder" target,
# keeping the remaining indicator columns.
X = world_happiness_data_df.drop(
    columns=[
        "Country name",
        "alpha-3",
        "region",
        "sub-region",
        "year",
        "Life Ladder",
    ],
)
# Preview the first rows of the feature matrix
X.head()

Unnamed: 0,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,Negative affect
0,7.350416,0.450662,50.5,0.718114,0.164055,0.881686,0.414297,0.258195
1,7.508646,0.552308,50.799999,0.678896,0.187297,0.850035,0.481421,0.237092
2,7.6139,0.539075,51.099998,0.600127,0.117861,0.706766,0.516907,0.275324
3,7.581259,0.521104,51.400002,0.495901,0.160098,0.731109,0.479835,0.267175
4,7.660506,0.520637,51.700001,0.530935,0.234157,0.77562,0.613513,0.267919


In [4]:
# Target vector: the "Life Ladder" column of the loaded frame
y = world_happiness_data_df["Life Ladder"]
# Preview the first five target values
y.head(n=5)

0    3.723590
1    4.401778
2    4.758381
3    3.831719
4    3.782938
Name: Life Ladder, dtype: float64

In [5]:
# Hold out part of the rows for evaluation. No test_size is passed, so
# scikit-learn's default split proportion applies; the fixed seed keeps the
# partition reproducible.
# NOTE(review): the data holds several yearly rows per country, so a random
# row-level split can put the same country in both sets — confirm intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=78,
)

## Fitting the Random Forest Regressor

In [6]:
# Configure a 500-tree random forest regressor with a fixed seed for reproducibility
rf_regressor = RandomForestRegressor(
    n_estimators=500,
    random_state=78,
)

In [7]:
# Train the forest on the training split
rf_regressor.fit(X=X_train, y=y_train)

## Making Predictions Using the Random Forest Regressor Model

In [8]:
# Predict the target for the held-out test rows
predictions = rf_regressor.predict(X=X_test)

## Model Evaluation

In [9]:
# Score the test-set predictions with scikit-learn's regression metrics
# and print each one.

# Mean Absolute Error (MAE):
mae = mean_absolute_error(y_test, predictions)
print(f'Mean Absolute Error (MAE): {mae}')

# Mean Squared Error (MSE):
mse = mean_squared_error(y_test, predictions)
print(f'Mean Squared Error (MSE): {mse}')

# Root Mean Squared Error (RMSE):
# taken as the square root of the MSE computed above, so the metric is not
# evaluated a second time.
rmse = np.sqrt(mse)
print(f'Root Mean Squared Error (RMSE): {rmse}')

# R-squared (R²):
r2 = r2_score(y_test, predictions)
print(f'R-squared (R²): {r2}')

Mean Absolute Error (MAE): 0.2948343398433853
Mean Squared Error (MSE): 0.15392384925733996
Root Mean Squared Error (RMSE): 0.39233130037933495
R-squared (R²): 0.8782121750688385


## Feature Importance

In [10]:
# Read the per-feature importance scores from the fitted forest.
# The following cell pairs them with X.columns, which assumes the array
# follows the column order of the training features.
importances = rf_regressor.feature_importances_

In [11]:
# Pair every importance score with its feature name, then rank the pairs
# from the highest score to the lowest
imp_features = list(zip(importances, X.columns))
imp_features.sort(reverse=True)

# Keep at most the ten highest-ranked (importance, feature) pairs and show them
top10 = imp_features[:10]
top10

[(0.5651989548891986, 'Log GDP per capita'),
 (0.1378442922393359, 'Positive affect'),
 (0.11582904534740077, 'Healthy life expectancy at birth'),
 (0.0686704994440647, 'Social support'),
 (0.031179006663997758, 'Freedom to make life choices'),
 (0.027376976422064887, 'Generosity'),
 (0.027302557997474053, 'Perceptions of corruption'),
 (0.02659866699646329, 'Negative affect')]

# 2) Same analysis WITH REGION

In [12]:
# Prepare the frame for the region analysis: drop the country identifiers,
# the sub-region and the year. "region" and the "Life Ladder" target are
# still present at this point.
world_happiness_data_region_df = world_happiness_data_df.drop(
    columns=[
        "Country name",
        "alpha-3",
        "sub-region",
        "year",
    ],
)
# Preview the first rows
world_happiness_data_region_df.head()

Unnamed: 0,region,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,Negative affect,Life Ladder
0,Asia,7.350416,0.450662,50.5,0.718114,0.164055,0.881686,0.414297,0.258195,3.72359
1,Asia,7.508646,0.552308,50.799999,0.678896,0.187297,0.850035,0.481421,0.237092,4.401778
2,Asia,7.6139,0.539075,51.099998,0.600127,0.117861,0.706766,0.516907,0.275324,4.758381
3,Asia,7.581259,0.521104,51.400002,0.495901,0.160098,0.731109,0.479835,0.267175,3.831719
4,Asia,7.660506,0.520637,51.700001,0.530935,0.234157,0.77562,0.613513,0.267919,3.782938


In [13]:
# One-hot encode the region column into integer 0/1 indicator columns,
# one column per distinct region
region_dummies = pd.get_dummies(
    world_happiness_data_region_df["region"],
    dtype=int,
)
# Preview the first rows of the indicator columns
region_dummies.head()

Unnamed: 0,Africa,Americas,Asia,Europe,Oceania
0,0,0,1,0,0
1,0,0,1,0,0
2,0,0,1,0,0
3,0,0,1,0,0
4,0,0,1,0,0


In [14]:
# Join region_dummies onto world_happiness_data_region_df.
# Dummy columns left behind by an earlier run of this cell are removed first,
# so re-running the cell cannot append a second copy of them
# (errors="ignore" makes the drop a no-op on the first run).
world_happiness_data_region_df = pd.concat(
    [
        world_happiness_data_region_df.drop(columns=region_dummies.columns, errors="ignore"),
        region_dummies,
    ],
    axis=1,
)

# Drop the region column and the target column to get the features set
X1 = world_happiness_data_region_df.drop(columns=["region", "Life Ladder"])
X1.head()

Unnamed: 0,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,Negative affect,Africa,Americas,Asia,Europe,Oceania
0,7.350416,0.450662,50.5,0.718114,0.164055,0.881686,0.414297,0.258195,0,0,1,0,0
1,7.508646,0.552308,50.799999,0.678896,0.187297,0.850035,0.481421,0.237092,0,0,1,0,0
2,7.6139,0.539075,51.099998,0.600127,0.117861,0.706766,0.516907,0.275324,0,0,1,0,0
3,7.581259,0.521104,51.400002,0.495901,0.160098,0.731109,0.479835,0.267175,0,0,1,0,0
4,7.660506,0.520637,51.700001,0.530935,0.234157,0.77562,0.613513,0.267919,0,0,1,0,0


In [15]:
# Target vector for the region analysis: the "Life Ladder" column
y1 = world_happiness_data_region_df["Life Ladder"]
# Preview the first five target values
y1.head(n=5)

0    3.723590
1    4.401778
2    4.758381
3    3.831719
4    3.782938
Name: Life Ladder, dtype: float64

In [16]:
# Hold out part of the rows for evaluation, using the same seed (78) as the
# split in the analysis without geography. No test_size is passed, so
# scikit-learn's default split proportion applies.
# NOTE(review): the data holds several yearly rows per country, so a random
# row-level split can put the same country in both sets — confirm intended.
X1_train, X1_test, y1_train, y1_test = train_test_split(
    X1,
    y1,
    random_state=78,
)

## Fitting the Random Forest Regressor

In [17]:
# Configure a 500-tree random forest regressor for the region feature set,
# with a fixed seed for reproducibility
rf_regressor1 = RandomForestRegressor(
    n_estimators=500,
    random_state=78,
)

In [18]:
# Train the forest on the region training split
rf_regressor1.fit(X=X1_train, y=y1_train)

## Making Predictions Using the Random Forest Regressor Model

In [19]:
# Predict the target for the held-out rows of the region feature set
predictions1 = rf_regressor1.predict(X=X1_test)

## Model Evaluation

In [20]:
# Score the region model's test-set predictions with scikit-learn's
# regression metrics and print each one.

# Mean Absolute Error (MAE):
mae1 = mean_absolute_error(y1_test, predictions1)
print(f'Region Mean Absolute Error (MAE): {mae1}')

# Mean Squared Error (MSE):
mse1 = mean_squared_error(y1_test, predictions1)
print(f'Region Mean Squared Error (MSE): {mse1}')

# Root Mean Squared Error (RMSE):
# taken as the square root of the MSE computed above, so the metric is not
# evaluated a second time.
rmse1 = np.sqrt(mse1)
print(f'Region Root Mean Squared Error (RMSE): {rmse1}')

# R-squared (R²):
r2a = r2_score(y1_test, predictions1)
print(f'Region R-squared (R²): {r2a}')

Region Mean Absolute Error (MAE): 0.28861962039983013
Region Mean Squared Error (MSE): 0.14898104844360902
Region Root Mean Squared Error (RMSE): 0.3859806322130801
Region R-squared (R²): 0.8821230242522282


## Feature Importance

In [21]:
# Read the per-feature importance scores from the fitted region forest.
# The following cell pairs them with X1.columns, which assumes the array
# follows the column order of the training features.
importances1 = rf_regressor1.feature_importances_

In [22]:
# Pair every importance score with its feature name, then rank the pairs
# from the highest score to the lowest
imp_features1 = list(zip(importances1, X1.columns))
imp_features1.sort(reverse=True)

# Keep the ten highest-ranked (importance, feature) pairs and show them
top10a = imp_features1[:10]
top10a

[(0.5640123820020433, 'Log GDP per capita'),
 (0.1175209002037702, 'Positive affect'),
 (0.10997602373379696, 'Healthy life expectancy at birth'),
 (0.06132876397168316, 'Social support'),
 (0.031259881208526054, 'Freedom to make life choices'),
 (0.030561132620884435, 'Americas'),
 (0.02710961383815463, 'Negative affect'),
 (0.025326895270166423, 'Perceptions of corruption'),
 (0.024379288389580717, 'Generosity'),
 (0.006251285951022005, 'Asia')]

# 3) Same analysis WITH SUB-REGION

In [23]:
# Prepare the frame for the sub-region analysis: drop the country
# identifiers, the region and the year. "sub-region" and the "Life Ladder"
# target are still present at this point.
world_happiness_data_subregion_df = world_happiness_data_df.drop(
    columns=[
        "Country name",
        "alpha-3",
        "region",
        "year",
    ],
)
# Preview the first rows
world_happiness_data_subregion_df.head()

Unnamed: 0,sub-region,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,Negative affect,Life Ladder
0,Southern Asia,7.350416,0.450662,50.5,0.718114,0.164055,0.881686,0.414297,0.258195,3.72359
1,Southern Asia,7.508646,0.552308,50.799999,0.678896,0.187297,0.850035,0.481421,0.237092,4.401778
2,Southern Asia,7.6139,0.539075,51.099998,0.600127,0.117861,0.706766,0.516907,0.275324,4.758381
3,Southern Asia,7.581259,0.521104,51.400002,0.495901,0.160098,0.731109,0.479835,0.267175,3.831719
4,Southern Asia,7.660506,0.520637,51.700001,0.530935,0.234157,0.77562,0.613513,0.267919,3.782938


In [24]:
# One-hot encode the sub-region column into integer 0/1 indicator columns,
# one column per distinct sub-region
subregion_dummies = pd.get_dummies(
    world_happiness_data_subregion_df["sub-region"],
    dtype=int,
)
# Preview the first rows of the indicator columns
subregion_dummies.head()

Unnamed: 0,Australia and New Zealand,Central Asia,Eastern Asia,Eastern Europe,Latin America and the Caribbean,Northern Africa,Northern America,Northern Europe,South-eastern Asia,Southern Asia,Southern Europe,Sub-Saharan Africa,Western Asia,Western Europe
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
2,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,0,0,0,0,1,0,0,0,0


In [25]:
# Join subregion_dummies onto world_happiness_data_subregion_df.
# Dummy columns left behind by an earlier run of this cell are removed first,
# so re-running the cell cannot append a second copy of them
# (errors="ignore" makes the drop a no-op on the first run).
world_happiness_data_subregion_df = pd.concat(
    [
        world_happiness_data_subregion_df.drop(columns=subregion_dummies.columns, errors="ignore"),
        subregion_dummies,
    ],
    axis=1,
)

# Drop the sub-region column and the target column to get the features set
X2 = world_happiness_data_subregion_df.drop(columns=["sub-region", "Life Ladder"])
X2.head()

Unnamed: 0,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,Negative affect,Australia and New Zealand,Central Asia,...,Latin America and the Caribbean,Northern Africa,Northern America,Northern Europe,South-eastern Asia,Southern Asia,Southern Europe,Sub-Saharan Africa,Western Asia,Western Europe
0,7.350416,0.450662,50.5,0.718114,0.164055,0.881686,0.414297,0.258195,0,0,...,0,0,0,0,0,1,0,0,0,0
1,7.508646,0.552308,50.799999,0.678896,0.187297,0.850035,0.481421,0.237092,0,0,...,0,0,0,0,0,1,0,0,0,0
2,7.6139,0.539075,51.099998,0.600127,0.117861,0.706766,0.516907,0.275324,0,0,...,0,0,0,0,0,1,0,0,0,0
3,7.581259,0.521104,51.400002,0.495901,0.160098,0.731109,0.479835,0.267175,0,0,...,0,0,0,0,0,1,0,0,0,0
4,7.660506,0.520637,51.700001,0.530935,0.234157,0.77562,0.613513,0.267919,0,0,...,0,0,0,0,0,1,0,0,0,0


In [26]:
# Target vector for the sub-region analysis: the "Life Ladder" column
y2 = world_happiness_data_subregion_df["Life Ladder"]
# Preview the first five target values
y2.head(n=5)

0    3.723590
1    4.401778
2    4.758381
3    3.831719
4    3.782938
Name: Life Ladder, dtype: float64

In [27]:
# Hold out part of the rows for evaluation, using the same seed (78) as the
# earlier splits. No test_size is passed, so scikit-learn's default split
# proportion applies.
# NOTE(review): the data holds several yearly rows per country, so a random
# row-level split can put the same country in both sets — confirm intended.
X2_train, X2_test, y2_train, y2_test = train_test_split(
    X2,
    y2,
    random_state=78,
)

## Fitting the Random Forest Regressor

In [28]:
# Configure a 500-tree random forest regressor for the sub-region feature set,
# with a fixed seed for reproducibility
rf_regressor2 = RandomForestRegressor(
    n_estimators=500,
    random_state=78,
)

In [29]:
# Train the forest on the sub-region training split
rf_regressor2.fit(X=X2_train, y=y2_train)

## Making Predictions Using the Random Forest Regressor Model

In [30]:
# Predict the target for the held-out rows of the sub-region feature set
predictions2 = rf_regressor2.predict(X=X2_test)

## Model Evaluation

In [31]:
# Score the sub-region model's test-set predictions with scikit-learn's
# regression metrics and print each one.

# Mean Absolute Error (MAE):
mae2 = mean_absolute_error(y2_test, predictions2)
print(f'Sub-Region Mean Absolute Error (MAE): {mae2}')

# Mean Squared Error (MSE):
mse2 = mean_squared_error(y2_test, predictions2)
print(f'Sub-Region Mean Squared Error (MSE): {mse2}')

# Root Mean Squared Error (RMSE):
# taken as the square root of the MSE computed above, so the metric is not
# evaluated a second time.
rmse2 = np.sqrt(mse2)
print(f'Sub-Region Root Mean Squared Error (RMSE): {rmse2}')

# R-squared (R²):
r2b = r2_score(y2_test, predictions2)
print(f'Sub-Region R-squared (R²): {r2b}')

Sub-Region Mean Absolute Error (MAE): 0.2853071723315114
Sub-Region Mean Squared Error (MSE): 0.14633519720367172
Sub-Region Root Mean Squared Error (RMSE): 0.38253783761044047
Sub-Region R-squared (R²): 0.884216478055249


## Feature Importance

In [32]:
# Read the per-feature importance scores from the fitted sub-region forest.
# The following cell pairs them with X2.columns, which assumes the array
# follows the column order of the training features.
importances2 = rf_regressor2.feature_importances_

In [33]:
# Pair every importance score with its feature name, then rank the pairs
# from the highest score to the lowest
imp_features2 = list(zip(importances2, X2.columns))
imp_features2.sort(reverse=True)

# Keep the ten highest-ranked (importance, feature) pairs and show them
top10b = imp_features2[:10]
top10b

[(0.5640197566528156, 'Log GDP per capita'),
 (0.11755144636797753, 'Positive affect'),
 (0.10776411369028253, 'Healthy life expectancy at birth'),
 (0.061241134901209665, 'Social support'),
 (0.030368532768174267, 'Latin America and the Caribbean'),
 (0.029256417659358498, 'Freedom to make life choices'),
 (0.027188059808241873, 'Negative affect'),
 (0.024619769927191455, 'Perceptions of corruption'),
 (0.02342494292877963, 'Generosity'),
 (0.005749810318015004, 'Southern Asia')]