In [14]:
import json
import numpy as np
import pandas as pd
import statsmodels.api as sm

from itertools import combinations
from sklearn.model_selection import train_test_split
from sklearn import linear_model

#### import the data

In [4]:
happiness_df = pd.read_csv("../World Happiness Report.csv")
happiness_df.head()

Unnamed: 0,Country Name,Regional Indicator,Year,Life Ladder,Log GDP Per Capita,Social Support,Healthy Life Expectancy At Birth,Freedom To Make Life Choices,Generosity,Perceptions Of Corruption,Positive Affect,Negative Affect,Confidence In National Government
0,Afghanistan,South Asia,2008,3.72359,7.350416,0.450662,50.5,0.718114,0.167652,0.881686,0.414297,0.258195,0.612072
1,Afghanistan,South Asia,2009,4.401778,7.508646,0.552308,50.799999,0.678896,0.190809,0.850035,0.481421,0.237092,0.611545
2,Afghanistan,South Asia,2010,4.758381,7.6139,0.539075,51.099998,0.600127,0.121316,0.706766,0.516907,0.275324,0.299357
3,Afghanistan,South Asia,2011,3.831719,7.581259,0.521104,51.400002,0.495901,0.163571,0.731109,0.479835,0.267175,0.307386
4,Afghanistan,South Asia,2012,3.782938,7.660506,0.520637,51.700001,0.530935,0.237588,0.77562,0.613513,0.267919,0.43544


#### let's explore it a little bit...

checking the correlation will allow us to see which variables are most commonly related. this allows us to check to see which things relate to "Life Ladder", and also which independent variables relate to each other.

In [6]:
happiness_df.corr(numeric_only=True)

Unnamed: 0,Year,Life Ladder,Log GDP Per Capita,Social Support,Healthy Life Expectancy At Birth,Freedom To Make Life Choices,Generosity,Perceptions Of Corruption,Positive Affect,Negative Affect,Confidence In National Government
Year,1.0,0.045943,0.077772,-0.02975,0.1635,0.234105,0.005726,-0.081358,0.019182,0.205369,0.012638
Life Ladder,0.045943,1.0,0.784871,0.721663,0.713493,0.534532,0.181658,-0.431569,0.518207,-0.339992,-0.081887
Log GDP Per Capita,0.077772,0.784871,1.0,0.683619,0.818126,0.36756,-0.0008,-0.352884,0.237986,-0.24756,-0.188351
Social Support,-0.02975,0.721663,0.683619,1.0,0.597682,0.409439,0.068593,-0.222584,0.431139,-0.4418,-0.169573
Healthy Life Expectancy At Birth,0.1635,0.713493,0.818126,0.597682,1.0,0.373448,0.010876,-0.299055,0.223119,-0.140726,-0.178003
Freedom To Make Life Choices,0.234105,0.534532,0.36756,0.409439,0.373448,1.0,0.325107,-0.476537,0.578752,-0.27547,0.408533
Generosity,0.005726,0.181658,-0.0008,0.068593,0.010876,0.325107,1.0,-0.279494,0.30714,-0.080837,0.294399
Perceptions Of Corruption,-0.081358,-0.431569,-0.352884,-0.222584,-0.299055,-0.476537,-0.279494,1.0,-0.28062,0.266267,-0.460789
Positive Affect,0.019182,0.518207,0.237986,0.431139,0.223119,0.578752,0.30714,-0.28062,1.0,-0.330236,0.123241
Negative Affect,0.205369,-0.339992,-0.24756,-0.4418,-0.140726,-0.27547,-0.080837,0.266267,-0.330236,1.0,-0.125631


we can also do a basic linear regression model with single variables to see what the R-value is of each indepedent variable against the dependent variable

In [44]:
happiness_df = happiness_df.dropna()
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numbers_df = happiness_df.select_dtypes(include=numerics)

In [103]:
score_dict = {}

for variable in numbers_df.columns:
    temp_df = numbers_df.copy()
    lr = linear_model.LinearRegression()
    lr.fit(temp_df[[variable]], temp_df["Life Ladder"])
    score_dict[variable] = {"score": lr.score(temp_df[[variable]], numbers_df["Life Ladder"])}

print(json.dumps(score_dict, indent=4))

{
    "Year": {
        "score": 0.0051478832185841705
    },
    "Life Ladder": {
        "score": 1.0
    },
    "Log GDP Per Capita": {
        "score": 0.6281698077144784
    },
    "Social Support": {
        "score": 0.5192853987535683
    },
    "Healthy Life Expectancy At Birth": {
        "score": 0.5404614758255641
    },
    "Freedom To Make Life Choices": {
        "score": 0.2940861329423261
    },
    "Generosity": {
        "score": 0.03753479127663817
    },
    "Perceptions Of Corruption": {
        "score": 0.24222475409085542
    },
    "Positive Affect": {
        "score": 0.26300717570566623
    },
    "Negative Affect": {
        "score": 0.09594787291010276
    },
    "Confidence In National Government": {
        "score": 0.0047124322248692785
    },
    "int": {
        "score": 0.08155250752791565
    }
}


We can actually add the correlation to this now, too...

In [104]:
for variable in numbers_df.columns:
    lr = linear_model.LinearRegression()
    lr.fit(numbers_df[[variable]], numbers_df["Life Ladder"])
    score_dict[variable].update(
        {"correlation": numbers_df["Life Ladder"].corr(numbers_df[variable])}
    )

print(json.dumps(score_dict, indent=4))

{
    "Year": {
        "score": 0.0051478832185841705,
        "correlation": 0.07174875064127634
    },
    "Life Ladder": {
        "score": 1.0,
        "correlation": 1.0
    },
    "Log GDP Per Capita": {
        "score": 0.6281698077144784,
        "correlation": 0.7925716420075085
    },
    "Social Support": {
        "score": 0.5192853987535683,
        "correlation": 0.7206145979326039
    },
    "Healthy Life Expectancy At Birth": {
        "score": 0.5404614758255641,
        "correlation": 0.7351608503079881
    },
    "Freedom To Make Life Choices": {
        "score": 0.2940861329423261,
        "correlation": 0.5422970891885057
    },
    "Generosity": {
        "score": 0.03753479127663817,
        "correlation": 0.1937389771745434
    },
    "Perceptions Of Corruption": {
        "score": 0.24222475409085542,
        "correlation": -0.49216334086444846
    },
    "Positive Affect": {
        "score": 0.26300717570566623,
        "correlation": 0.5128422522624927
    }

let's try these variables to start...

In [105]:
columns = [
    "Positive Affect", 
    "Freedom To Make Life Choices",
    "Healthy Life Expectancy At Birth",
    "Social Support",
    "Log GDP Per Capita",
]

here are some easy ways to train and test the models!

In [106]:
def train_model(df, cols):
    X = df[cols]
    y = df["Life Ladder"]
    model = sm.OLS(y, X).fit()
    return model

def test_model(df, cols, model):
    X = df[cols]
    df_p = df
    df_p = df_p.assign(predicted_life_ladder=model.predict(X))
    df_p = df_p.assign(error=df_p.predicted_life_ladder - df_p["Life Ladder"])
    return df_p

def compute_rmse(df):
    return np.sqrt(np.square(df.error).mean())

lets split the data into a train and test dataset

In [107]:
train_df, test_df = train_test_split(happiness_df, test_size=0.75, random_state=58)

let's try training the model now!

In [108]:
model = train_model(train_df, columns)
model.summary()

0,1,2,3
Dep. Variable:,Life Ladder,R-squared (uncentered):,0.986
Model:,OLS,Adj. R-squared (uncentered):,0.986
Method:,Least Squares,F-statistic:,5897.0
Date:,"Fri, 19 May 2023",Prob (F-statistic):,0.0
Time:,18:39:45,Log-Likelihood:,-420.58
No. Observations:,420,AIC:,851.2
Df Residuals:,415,BIC:,871.4
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Positive Affect,0.8214,0.360,2.281,0.023,0.114,1.529
Freedom To Make Life Choices,0.9881,0.284,3.475,0.001,0.429,1.547
Healthy Life Expectancy At Birth,-0.0258,0.008,-3.045,0.002,-0.042,-0.009
Social Support,1.7565,0.397,4.422,0.000,0.976,2.537
Log GDP Per Capita,0.4751,0.061,7.843,0.000,0.356,0.594

0,1,2,3
Omnibus:,2.228,Durbin-Watson:,2.121
Prob(Omnibus):,0.328,Jarque-Bera (JB):,2.126
Skew:,-0.174,Prob(JB):,0.345
Kurtosis:,3.02,Cond. No.,888.0


In [109]:
predict_df = test_model(train_df, columns, model)

print("RMSE:", compute_rmse(predict_df))
print("Correlation:", predict_df["Life Ladder"].corr(predict_df["predicted_life_ladder"]))

RMSE: 0.6586462588002616
Correlation: 0.8544157928721342


85% correlation is really good! we can try doing some feature engineering to see if we can get anything better

feature engineering is usually manipulating the independent variables you have access to, to see if combining or dividing them yields better variables

let's try a brute force method here

In [110]:
columns_list = numbers_df.columns
interactions = list(combinations(columns_list, 2))
interactions

[('Year', 'Life Ladder'),
 ('Year', 'Log GDP Per Capita'),
 ('Year', 'Social Support'),
 ('Year', 'Healthy Life Expectancy At Birth'),
 ('Year', 'Freedom To Make Life Choices'),
 ('Year', 'Generosity'),
 ('Year', 'Perceptions Of Corruption'),
 ('Year', 'Positive Affect'),
 ('Year', 'Negative Affect'),
 ('Year', 'Confidence In National Government'),
 ('Year', 'int'),
 ('Life Ladder', 'Log GDP Per Capita'),
 ('Life Ladder', 'Social Support'),
 ('Life Ladder', 'Healthy Life Expectancy At Birth'),
 ('Life Ladder', 'Freedom To Make Life Choices'),
 ('Life Ladder', 'Generosity'),
 ('Life Ladder', 'Perceptions Of Corruption'),
 ('Life Ladder', 'Positive Affect'),
 ('Life Ladder', 'Negative Affect'),
 ('Life Ladder', 'Confidence In National Government'),
 ('Life Ladder', 'int'),
 ('Log GDP Per Capita', 'Social Support'),
 ('Log GDP Per Capita', 'Healthy Life Expectancy At Birth'),
 ('Log GDP Per Capita', 'Freedom To Make Life Choices'),
 ('Log GDP Per Capita', 'Generosity'),
 ('Log GDP Per Cap

In [118]:
y = numbers_df["Life Ladder"]

for variable in interactions:
   temp_df = numbers_df.copy()
   var_1 = variable[0]
   var_2 = variable[1]
   temp_df['int'] = temp_df[var_1] * temp_df[var_2]
   lr1 = linear_model.LinearRegression()
   lr1.fit(temp_df[['int']], y)
   score_dict[f"{var_1} x {var_2}"] = {"score": lr1.score(temp_df[['int']], y)}
   score_dict[f"{var_1} x {var_2}"].update({"correlation": temp_df['int'].corr(y)})
   numbers_df[f"{var_1} x {var_2}"] = temp_df['int']

print(json.dumps(score_dict, indent=4))

{
    "Year": {
        "score": 0.0051478832185841705,
        "correlation": 0.07174875064127634,
        "combined": 0.038448316929930254
    },
    "Life Ladder": {
        "score": 1.0,
        "correlation": 1.0,
        "combined": 1.0
    },
    "Log GDP Per Capita": {
        "score": 0.6281698077144784,
        "correlation": 0.7925716420075085,
        "combined": 0.7103707248609934
    },
    "Social Support": {
        "score": 0.5192853987535683,
        "correlation": 0.7206145979326039,
        "combined": 0.6199499983430861
    },
    "Healthy Life Expectancy At Birth": {
        "score": 0.5404614758255641,
        "correlation": 0.7351608503079881,
        "combined": 0.6378111630667761
    },
    "Freedom To Make Life Choices": {
        "score": 0.2940861329423261,
        "correlation": 0.5422970891885057,
        "combined": 0.4181916110654159
    },
    "Generosity": {
        "score": 0.03753479127663817,
        "correlation": 0.1937389771745434,
        "comb

In [112]:
for key in score_dict.keys():
    score_dict[key].update({"combined": np.average(
        [score_dict[key]["score"], score_dict[key]["correlation"]])}
    )

print(json.dumps(score_dict, indent=4))

{
    "Year": {
        "score": 0.0051478832185841705,
        "correlation": 0.07174875064127634,
        "combined": 0.038448316929930254
    },
    "Life Ladder": {
        "score": 1.0,
        "correlation": 1.0,
        "combined": 1.0
    },
    "Log GDP Per Capita": {
        "score": 0.6281698077144784,
        "correlation": 0.7925716420075085,
        "combined": 0.7103707248609934
    },
    "Social Support": {
        "score": 0.5192853987535683,
        "correlation": 0.7206145979326039,
        "combined": 0.6199499983430861
    },
    "Healthy Life Expectancy At Birth": {
        "score": 0.5404614758255641,
        "correlation": 0.7351608503079881,
        "combined": 0.6378111630667761
    },
    "Freedom To Make Life Choices": {
        "score": 0.2940861329423261,
        "correlation": 0.5422970891885057,
        "combined": 0.4181916110654159
    },
    "Generosity": {
        "score": 0.03753479127663817,
        "correlation": 0.1937389771745434,
        "comb

In [114]:
keys = list(score_dict.keys())
values = [score["combined"] for score in score_dict.values()]
sorted_value_index = np.argsort(values)[::-1]
sorted_dict = {keys[i]: values[i] for i in sorted_value_index}
 
print(json.dumps(sorted_dict, indent=4))

{
    "Life Ladder": 1.0,
    "Year x Life Ladder": 0.9999149043749638,
    "Life Ladder x Healthy Life Expectancy At Birth": 0.9604972407845338,
    "Life Ladder x Log GDP Per Capita": 0.9586713979837236,
    "Life Ladder x Social Support": 0.9416596365343406,
    "Life Ladder x Positive Affect": 0.8697452426399582,
    "Life Ladder x Freedom To Make Life Choices": 0.860992958777492,
    "Log GDP Per Capita x Social Support": 0.7575567385763021,
    "Social Support x Healthy Life Expectancy At Birth": 0.7532676941256362,
    "Log GDP Per Capita x Positive Affect": 0.7310206517842504,
    "Log GDP Per Capita x Healthy Life Expectancy At Birth": 0.7299493606743718,
    "Log GDP Per Capita": 0.7103707248609934,
    "Year x Log GDP Per Capita": 0.7100103485284618,
    "Log GDP Per Capita x Freedom To Make Life Choices": 0.6905783352901975,
    "Healthy Life Expectancy At Birth x Positive Affect": 0.6763345280841442,
    "Social Support x Freedom To Make Life Choices": 0.6622437151259137,


maybe we can try these variables?

In [122]:
train_df, test_df = train_test_split(numbers_df, test_size=0.75, random_state=58)

In [125]:
new_columns = [
    "Log GDP Per Capita x Social Support",
    "Healthy Life Expectancy At Birth x Positive Affect",
    "Freedom To Make Life Choices"
]

In [126]:
new_model = train_model(train_df, new_columns)
new_model.summary()

0,1,2,3
Dep. Variable:,Life Ladder,R-squared (uncentered):,0.99
Model:,OLS,Adj. R-squared (uncentered):,0.99
Method:,Least Squares,F-statistic:,13260.0
Date:,"Fri, 19 May 2023",Prob (F-statistic):,0.0
Time:,18:48:20,Log-Likelihood:,-359.44
No. Observations:,420,AIC:,724.9
Df Residuals:,417,BIC:,737.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Log GDP Per Capita x Social Support,0.3636,0.020,18.050,0.000,0.324,0.403
Healthy Life Expectancy At Birth x Positive Affect,0.0457,0.005,9.164,0.000,0.036,0.055
Freedom To Make Life Choices,1.0147,0.216,4.704,0.000,0.591,1.439

0,1,2,3
Omnibus:,30.096,Durbin-Watson:,2.023
Prob(Omnibus):,0.0,Jarque-Bera (JB):,65.177
Skew:,0.387,Prob(JB):,7.03e-15
Kurtosis:,4.768,Cond. No.,336.0


In [127]:
predict_df = test_model(train_df, new_columns, new_model)

print("RMSE:", compute_rmse(predict_df))
print("Correlation:", predict_df["Life Ladder"].corr(predict_df["predicted_life_ladder"]))

RMSE: 0.5694230055432464
Correlation: 0.8735477061676316


We see that we bumped up the correlation to 87%, even better! And the RSME went down too! 