In [1]:
import json
import numpy as np
import pandas as pd
import statsmodels.api as sm

from itertools import combinations
from sklearn.model_selection import train_test_split
from sklearn import linear_model

#### import the data

In [2]:
happiness_df = pd.read_csv("../World Happiness Report.csv")
happiness_df.head()

Unnamed: 0,Country Name,Regional Indicator,Year,Life Ladder,Log GDP Per Capita,Social Support,Healthy Life Expectancy At Birth,Freedom To Make Life Choices,Generosity,Perceptions Of Corruption,Positive Affect,Negative Affect,Confidence In National Government
0,Afghanistan,South Asia,2008,3.72359,7.350416,0.450662,50.5,0.718114,0.167652,0.881686,0.414297,0.258195,0.612072
1,Afghanistan,South Asia,2009,4.401778,7.508646,0.552308,50.799999,0.678896,0.190809,0.850035,0.481421,0.237092,0.611545
2,Afghanistan,South Asia,2010,4.758381,7.6139,0.539075,51.099998,0.600127,0.121316,0.706766,0.516907,0.275324,0.299357
3,Afghanistan,South Asia,2011,3.831719,7.581259,0.521104,51.400002,0.495901,0.163571,0.731109,0.479835,0.267175,0.307386
4,Afghanistan,South Asia,2012,3.782938,7.660506,0.520637,51.700001,0.530935,0.237588,0.77562,0.613513,0.267919,0.43544


#### let's explore it a little bit...

checking the correlation will allow us to see which variables are most commonly related. this allows us to check to see which things relate to "Life Ladder", and also which independent variables relate to each other.

In [3]:
happiness_df.corr(numeric_only=True)

Unnamed: 0,Year,Life Ladder,Log GDP Per Capita,Social Support,Healthy Life Expectancy At Birth,Freedom To Make Life Choices,Generosity,Perceptions Of Corruption,Positive Affect,Negative Affect,Confidence In National Government
Year,1.0,0.045943,0.077772,-0.02975,0.1635,0.234105,0.005726,-0.081358,0.019182,0.205369,0.012638
Life Ladder,0.045943,1.0,0.784871,0.721663,0.713493,0.534532,0.181658,-0.431569,0.518207,-0.339992,-0.081887
Log GDP Per Capita,0.077772,0.784871,1.0,0.683619,0.818126,0.36756,-0.0008,-0.352884,0.237986,-0.24756,-0.188351
Social Support,-0.02975,0.721663,0.683619,1.0,0.597682,0.409439,0.068593,-0.222584,0.431139,-0.4418,-0.169573
Healthy Life Expectancy At Birth,0.1635,0.713493,0.818126,0.597682,1.0,0.373448,0.010876,-0.299055,0.223119,-0.140726,-0.178003
Freedom To Make Life Choices,0.234105,0.534532,0.36756,0.409439,0.373448,1.0,0.325107,-0.476537,0.578752,-0.27547,0.408533
Generosity,0.005726,0.181658,-0.0008,0.068593,0.010876,0.325107,1.0,-0.279494,0.30714,-0.080837,0.294399
Perceptions Of Corruption,-0.081358,-0.431569,-0.352884,-0.222584,-0.299055,-0.476537,-0.279494,1.0,-0.28062,0.266267,-0.460789
Positive Affect,0.019182,0.518207,0.237986,0.431139,0.223119,0.578752,0.30714,-0.28062,1.0,-0.330236,0.123241
Negative Affect,0.205369,-0.339992,-0.24756,-0.4418,-0.140726,-0.27547,-0.080837,0.266267,-0.330236,1.0,-0.125631


we can also do a basic linear regression model with single variables to see what the R-value is of each indepedent variable against the dependent variable

In [4]:
happiness_df = happiness_df.dropna()
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numbers_df = happiness_df.select_dtypes(include=numerics)

In [5]:
score_dict = {}

for variable in numbers_df.columns:
    temp_df = numbers_df.copy()
    lr = linear_model.LinearRegression()
    lr.fit(temp_df[[variable]], temp_df["Life Ladder"])
    score_dict[variable] = {"score": lr.score(temp_df[[variable]], numbers_df["Life Ladder"])}

print(json.dumps(score_dict, indent=4))

{
    "Year": {
        "score": 0.0051478832185841705
    },
    "Life Ladder": {
        "score": 1.0
    },
    "Log GDP Per Capita": {
        "score": 0.6281698077144784
    },
    "Social Support": {
        "score": 0.5192853987535683
    },
    "Healthy Life Expectancy At Birth": {
        "score": 0.5404614758255641
    },
    "Freedom To Make Life Choices": {
        "score": 0.2940861329423261
    },
    "Generosity": {
        "score": 0.03753479127663817
    },
    "Perceptions Of Corruption": {
        "score": 0.24222475409085542
    },
    "Positive Affect": {
        "score": 0.26300717570566623
    },
    "Negative Affect": {
        "score": 0.09594787291010276
    },
    "Confidence In National Government": {
        "score": 0.0047124322248692785
    }
}


We can actually add the correlation to this now, too...

In [6]:
for variable in numbers_df.columns:
    lr = linear_model.LinearRegression()
    lr.fit(numbers_df[[variable]], numbers_df["Life Ladder"])
    score_dict[variable].update(
        {"correlation": numbers_df["Life Ladder"].corr(numbers_df[variable])}
    )

print(json.dumps(score_dict, indent=4))

{
    "Year": {
        "score": 0.0051478832185841705,
        "correlation": 0.07174875064127634
    },
    "Life Ladder": {
        "score": 1.0,
        "correlation": 1.0
    },
    "Log GDP Per Capita": {
        "score": 0.6281698077144784,
        "correlation": 0.7925716420075085
    },
    "Social Support": {
        "score": 0.5192853987535683,
        "correlation": 0.7206145979326039
    },
    "Healthy Life Expectancy At Birth": {
        "score": 0.5404614758255641,
        "correlation": 0.7351608503079881
    },
    "Freedom To Make Life Choices": {
        "score": 0.2940861329423261,
        "correlation": 0.5422970891885057
    },
    "Generosity": {
        "score": 0.03753479127663817,
        "correlation": 0.1937389771745434
    },
    "Perceptions Of Corruption": {
        "score": 0.24222475409085542,
        "correlation": -0.49216334086444846
    },
    "Positive Affect": {
        "score": 0.26300717570566623,
        "correlation": 0.5128422522624927
    }

let's try these variables to start...

In [7]:
columns = [
    "Positive Affect", 
    "Freedom To Make Life Choices",
    "Healthy Life Expectancy At Birth",
    "Social Support",
    "Log GDP Per Capita",
]

here are some easy ways to train and test the models!

In [8]:
def train_model(df, cols):
    X = df[cols]
    y = df["Life Ladder"]
    model = sm.OLS(y, X).fit()
    return model

def test_model(df, cols, model):
    X = df[cols]
    df_p = df
    df_p = df_p.assign(predicted_life_ladder=model.predict(X))
    df_p = df_p.assign(error=df_p.predicted_life_ladder - df_p["Life Ladder"])
    return df_p

def compute_rmse(df):
    return np.sqrt(np.square(df.error).mean())

lets split the data into a train and test dataset

In [9]:
train_df, test_df = train_test_split(happiness_df, test_size=0.75, random_state=58)

let's try training the model now!

In [10]:
model = train_model(train_df, columns)
model.summary()

0,1,2,3
Dep. Variable:,Life Ladder,R-squared (uncentered):,0.986
Model:,OLS,Adj. R-squared (uncentered):,0.986
Method:,Least Squares,F-statistic:,5897.0
Date:,"Mon, 26 Jun 2023",Prob (F-statistic):,0.0
Time:,12:54:45,Log-Likelihood:,-420.58
No. Observations:,420,AIC:,851.2
Df Residuals:,415,BIC:,871.4
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Positive Affect,0.8214,0.360,2.281,0.023,0.114,1.529
Freedom To Make Life Choices,0.9881,0.284,3.475,0.001,0.429,1.547
Healthy Life Expectancy At Birth,-0.0258,0.008,-3.045,0.002,-0.042,-0.009
Social Support,1.7565,0.397,4.422,0.000,0.976,2.537
Log GDP Per Capita,0.4751,0.061,7.843,0.000,0.356,0.594

0,1,2,3
Omnibus:,2.228,Durbin-Watson:,2.121
Prob(Omnibus):,0.328,Jarque-Bera (JB):,2.126
Skew:,-0.174,Prob(JB):,0.345
Kurtosis:,3.02,Cond. No.,888.0


In [11]:
predict_df = test_model(train_df, columns, model)

print("RMSE:", compute_rmse(predict_df))
print("Correlation:", predict_df["Life Ladder"].corr(predict_df["predicted_life_ladder"]))

RMSE: 0.6586462588002616
Correlation: 0.8544157928721342


85% correlation is really good! we can try doing some feature engineering to see if we can get anything better

feature engineering is usually manipulating the independent variables you have access to, to see if combining or dividing them yields better variables

let's try a brute force method here

In [12]:
columns_list = numbers_df.columns
interactions = list(combinations(columns_list, 2))
interactions

[('Year', 'Life Ladder'),
 ('Year', 'Log GDP Per Capita'),
 ('Year', 'Social Support'),
 ('Year', 'Healthy Life Expectancy At Birth'),
 ('Year', 'Freedom To Make Life Choices'),
 ('Year', 'Generosity'),
 ('Year', 'Perceptions Of Corruption'),
 ('Year', 'Positive Affect'),
 ('Year', 'Negative Affect'),
 ('Year', 'Confidence In National Government'),
 ('Life Ladder', 'Log GDP Per Capita'),
 ('Life Ladder', 'Social Support'),
 ('Life Ladder', 'Healthy Life Expectancy At Birth'),
 ('Life Ladder', 'Freedom To Make Life Choices'),
 ('Life Ladder', 'Generosity'),
 ('Life Ladder', 'Perceptions Of Corruption'),
 ('Life Ladder', 'Positive Affect'),
 ('Life Ladder', 'Negative Affect'),
 ('Life Ladder', 'Confidence In National Government'),
 ('Log GDP Per Capita', 'Social Support'),
 ('Log GDP Per Capita', 'Healthy Life Expectancy At Birth'),
 ('Log GDP Per Capita', 'Freedom To Make Life Choices'),
 ('Log GDP Per Capita', 'Generosity'),
 ('Log GDP Per Capita', 'Perceptions Of Corruption'),
 ('Log 

In [13]:
y = numbers_df["Life Ladder"]

for variable in interactions:
   temp_df = numbers_df.copy()
   var_1 = variable[0]
   var_2 = variable[1]
   temp_df['int'] = temp_df[var_1] * temp_df[var_2]
   lr1 = linear_model.LinearRegression()
   lr1.fit(temp_df[['int']], y)
   score_dict[f"{var_1} x {var_2}"] = {"score": lr1.score(temp_df[['int']], y)}
   score_dict[f"{var_1} x {var_2}"].update({"correlation": temp_df['int'].corr(y)})
   numbers_df[f"{var_1} x {var_2}"] = temp_df['int']

print(json.dumps(score_dict, indent=4))

{
    "Year": {
        "score": 0.0051478832185841705,
        "correlation": 0.07174875064127634
    },
    "Life Ladder": {
        "score": 1.0,
        "correlation": 1.0
    },
    "Log GDP Per Capita": {
        "score": 0.6281698077144784,
        "correlation": 0.7925716420075085
    },
    "Social Support": {
        "score": 0.5192853987535683,
        "correlation": 0.7206145979326039
    },
    "Healthy Life Expectancy At Birth": {
        "score": 0.5404614758255641,
        "correlation": 0.7351608503079881
    },
    "Freedom To Make Life Choices": {
        "score": 0.2940861329423261,
        "correlation": 0.5422970891885057
    },
    "Generosity": {
        "score": 0.03753479127663817,
        "correlation": 0.1937389771745434
    },
    "Perceptions Of Corruption": {
        "score": 0.24222475409085542,
        "correlation": -0.49216334086444846
    },
    "Positive Affect": {
        "score": 0.26300717570566623,
        "correlation": 0.5128422522624927
    }

In [14]:
for key in score_dict.keys():
    score_dict[key].update({"combined": np.average(
        [score_dict[key]["score"], score_dict[key]["correlation"]])}
    )

print(json.dumps(score_dict, indent=4))

{
    "Year": {
        "score": 0.0051478832185841705,
        "correlation": 0.07174875064127634,
        "combined": 0.038448316929930254
    },
    "Life Ladder": {
        "score": 1.0,
        "correlation": 1.0,
        "combined": 1.0
    },
    "Log GDP Per Capita": {
        "score": 0.6281698077144784,
        "correlation": 0.7925716420075085,
        "combined": 0.7103707248609934
    },
    "Social Support": {
        "score": 0.5192853987535683,
        "correlation": 0.7206145979326039,
        "combined": 0.6199499983430861
    },
    "Healthy Life Expectancy At Birth": {
        "score": 0.5404614758255641,
        "correlation": 0.7351608503079881,
        "combined": 0.6378111630667761
    },
    "Freedom To Make Life Choices": {
        "score": 0.2940861329423261,
        "correlation": 0.5422970891885057,
        "combined": 0.4181916110654159
    },
    "Generosity": {
        "score": 0.03753479127663817,
        "correlation": 0.1937389771745434,
        "comb

In [15]:
keys = list(score_dict.keys())
values = [score["combined"] for score in score_dict.values()]
sorted_value_index = np.argsort(values)[::-1]
sorted_dict = {keys[i]: values[i] for i in sorted_value_index}
 
print(json.dumps(sorted_dict, indent=4))

{
    "Life Ladder": 1.0,
    "Year x Life Ladder": 0.9999149043749638,
    "Life Ladder x Healthy Life Expectancy At Birth": 0.9604972407845338,
    "Life Ladder x Log GDP Per Capita": 0.9586713979837236,
    "Life Ladder x Social Support": 0.9416596365343406,
    "Life Ladder x Positive Affect": 0.8697452426399582,
    "Life Ladder x Freedom To Make Life Choices": 0.860992958777492,
    "Log GDP Per Capita x Social Support": 0.7575567385763021,
    "Social Support x Healthy Life Expectancy At Birth": 0.7532676941256362,
    "Log GDP Per Capita x Positive Affect": 0.7310206517842504,
    "Log GDP Per Capita x Healthy Life Expectancy At Birth": 0.7299493606743718,
    "Log GDP Per Capita": 0.7103707248609934,
    "Year x Log GDP Per Capita": 0.7100103485284618,
    "Log GDP Per Capita x Freedom To Make Life Choices": 0.6905783352901975,
    "Healthy Life Expectancy At Birth x Positive Affect": 0.6763345280841442,
    "Social Support x Freedom To Make Life Choices": 0.6622437151259137,


maybe we can try these variables?

In [16]:
train_df, test_df = train_test_split(numbers_df, test_size=0.75, random_state=58)

In [17]:
new_columns = [
    "Log GDP Per Capita x Social Support",
    "Healthy Life Expectancy At Birth x Positive Affect",
    "Freedom To Make Life Choices"
]

In [18]:
new_model = train_model(train_df, new_columns)
new_model.summary()

0,1,2,3
Dep. Variable:,Life Ladder,R-squared (uncentered):,0.99
Model:,OLS,Adj. R-squared (uncentered):,0.99
Method:,Least Squares,F-statistic:,13260.0
Date:,"Mon, 26 Jun 2023",Prob (F-statistic):,0.0
Time:,12:54:46,Log-Likelihood:,-359.44
No. Observations:,420,AIC:,724.9
Df Residuals:,417,BIC:,737.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Log GDP Per Capita x Social Support,0.3636,0.020,18.050,0.000,0.324,0.403
Healthy Life Expectancy At Birth x Positive Affect,0.0457,0.005,9.164,0.000,0.036,0.055
Freedom To Make Life Choices,1.0147,0.216,4.704,0.000,0.591,1.439

0,1,2,3
Omnibus:,30.096,Durbin-Watson:,2.023
Prob(Omnibus):,0.0,Jarque-Bera (JB):,65.177
Skew:,0.387,Prob(JB):,7.03e-15
Kurtosis:,4.768,Cond. No.,336.0


In [19]:
predict_df = test_model(train_df, new_columns, new_model)

print("RMSE:", compute_rmse(predict_df))
print("Correlation:", predict_df["Life Ladder"].corr(predict_df["predicted_life_ladder"]))

RMSE: 0.5694230055432464
Correlation: 0.8735477061676316


We see that we bumped up the correlation to 87%, even better! And the RSME went down too! 

### Let's look at the feature selection stuff from Sklearn

In [49]:
from sklearn.feature_selection import SelectKBest, mutual_info_regression\

cols = [column for column in score_dict.keys() if "Life Ladder" not in column]

X = train_df[cols]
y = train_df["Life Ladder"]

# select top X features based on mutual info regression
selector = SelectKBest(mutual_info_regression, k=10)
selector.fit(X, y.values.ravel())
X_columns_SKB = list(X.columns[selector.get_support()])

X_columns_SKB

['Log GDP Per Capita',
 'Healthy Life Expectancy At Birth',
 'Year x Log GDP Per Capita',
 'Year x Healthy Life Expectancy At Birth',
 'Log GDP Per Capita x Social Support',
 'Log GDP Per Capita x Healthy Life Expectancy At Birth',
 'Log GDP Per Capita x Freedom To Make Life Choices',
 'Log GDP Per Capita x Positive Affect',
 'Social Support x Healthy Life Expectancy At Birth',
 'Social Support x Freedom To Make Life Choices']

In [54]:
X_columns_SKB = [
    # 'Log GDP Per Capita',
    # 'Healthy Life Expectancy At Birth',
    # 'Year x Log GDP Per Capita',
    # 'Year x Healthy Life Expectancy At Birth',
    # 'Log GDP Per Capita x Social Support',
    'Log GDP Per Capita x Healthy Life Expectancy At Birth',
    # 'Log GDP Per Capita x Freedom To Make Life Choices',
    # 'Log GDP Per Capita x Positive Affect',
    # 'Social Support x Healthy Life Expectancy At Birth',
    'Social Support x Freedom To Make Life Choices'
 ]

### Retest it 

In [55]:
new_model = train_model(train_df, X_columns_SKB)
new_model.summary()

0,1,2,3
Dep. Variable:,Life Ladder,R-squared (uncentered):,0.988
Model:,OLS,Adj. R-squared (uncentered):,0.988
Method:,Least Squares,F-statistic:,17890.0
Date:,"Mon, 26 Jun 2023",Prob (F-statistic):,0.0
Time:,13:10:13,Log-Likelihood:,-381.93
No. Observations:,420,AIC:,767.9
Df Residuals:,418,BIC:,775.9
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Log GDP Per Capita x Healthy Life Expectancy At Birth,0.0064,0.000,27.864,0.000,0.006,0.007
Social Support x Freedom To Make Life Choices,2.6353,0.219,12.032,0.000,2.205,3.066

0,1,2,3
Omnibus:,16.29,Durbin-Watson:,2.162
Prob(Omnibus):,0.0,Jarque-Bera (JB):,20.182
Skew:,0.37,Prob(JB):,4.14e-05
Kurtosis:,3.778,Cond. No.,4540.0


In [56]:
predict_df = test_model(train_df, X_columns_SKB, new_model)

print("RMSE:", compute_rmse(predict_df))
print("Correlation:", predict_df["Life Ladder"].corr(predict_df["predicted_life_ladder"]))

RMSE: 0.600740926330158
Correlation: 0.8664091616161355


### Bringing in external data

In [61]:
raw_fertility_df = pd.read_excel("../tfr-by-gapminder.xlsx", sheet_name=1)
raw_fertility_df

Unnamed: 0,geo.name,indicator.name,geo,indicator,1800,1801,1802,1803,1804,1805,...,2091,2092,2093,2094,2095,2096,2097,2098,2099,2100
0,Abkhazia,Total fertility rate,abkh,tfr,,,,,,,...,,,,,,,,,,
1,Afghanistan,Total fertility rate,afg,tfr,7.00,7.00,7.00,7.00,7.00,7.00,...,1.74,1.74,1.74,1.74,1.74,1.74,1.74,1.74,1.74,1.74
2,Akrotiri and Dhekelia,Total fertility rate,akr_a_dhe,tfr,,,,,,,...,,,,,,,,,,
3,Albania,Total fertility rate,alb,tfr,4.60,4.60,4.60,4.60,4.60,4.60,...,1.78,1.78,1.78,1.79,1.79,1.79,1.79,1.79,1.79,1.79
4,Algeria,Total fertility rate,dza,tfr,6.99,6.99,6.99,6.99,6.99,6.99,...,1.86,1.86,1.86,1.86,1.86,1.86,1.86,1.86,1.86,1.86
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
270,Northern Marianas,Total fertility rate,n_marianas,tfr,,,,,,,...,,,,,,,,,,
271,South Georgia and the South Sandwich Islands,Total fertility rate,sgero_a_ssandw,tfr,,,,,,,...,,,,,,,,,,
272,US Minor Outlying Islands,Total fertility rate,usa_minor_out_isl,tfr,,,,,,,...,,,,,,,,,,
273,Virgin Islands,Total fertility rate,virg_isl,tfr,,,,,,,...,,,,,,,,,,


In [71]:
fertility_df = raw_fertility_df[["geo.name", 2016]].rename(columns={"geo.name": "Country Name", 2016: "Fertility Rate"})
fertility_df

Unnamed: 0,Country Name,Fertility Rate
0,Abkhazia,
1,Afghanistan,4.64
2,Akrotiri and Dhekelia,
3,Albania,1.71
4,Algeria,2.78
...,...,...
270,Northern Marianas,
271,South Georgia and the South Sandwich Islands,
272,US Minor Outlying Islands,
273,Virgin Islands,


In [73]:
combined_df = pd.merge(happiness_df, fertility_df)
combined_df

Unnamed: 0,Country Name,Regional Indicator,Year,Life Ladder,Log GDP Per Capita,Social Support,Healthy Life Expectancy At Birth,Freedom To Make Life Choices,Generosity,Perceptions Of Corruption,Positive Affect,Negative Affect,Confidence In National Government,Fertility Rate
0,Afghanistan,South Asia,2008,3.723590,7.350416,0.450662,50.500000,0.718114,0.167652,0.881686,0.414297,0.258195,0.612072,4.64
1,Afghanistan,South Asia,2009,4.401778,7.508646,0.552308,50.799999,0.678896,0.190809,0.850035,0.481421,0.237092,0.611545,4.64
2,Afghanistan,South Asia,2010,4.758381,7.613900,0.539075,51.099998,0.600127,0.121316,0.706766,0.516907,0.275324,0.299357,4.64
3,Afghanistan,South Asia,2011,3.831719,7.581259,0.521104,51.400002,0.495901,0.163571,0.731109,0.479835,0.267175,0.307386,4.64
4,Afghanistan,South Asia,2012,3.782938,7.660506,0.520637,51.700001,0.530935,0.237588,0.775620,0.613513,0.267919,0.435440,4.64
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1609,Zimbabwe,Sub-Saharan Africa,2017,3.638300,7.754387,0.754147,52.150002,0.752826,-0.080725,0.751208,0.733641,0.224051,0.682647,3.76
1610,Zimbabwe,Sub-Saharan Africa,2018,3.616480,7.783066,0.775388,52.625000,0.762675,-0.051219,0.844209,0.657524,0.211726,0.550508,3.76
1611,Zimbabwe,Sub-Saharan Africa,2019,2.693523,7.697755,0.759162,53.099998,0.631908,-0.047464,0.830652,0.658434,0.235354,0.456455,3.76
1612,Zimbabwe,Sub-Saharan Africa,2020,3.159802,7.596050,0.717243,53.575001,0.643303,0.006313,0.788523,0.660658,0.345736,0.577302,3.76


In [74]:
numbers_df = combined_df.select_dtypes(include=numerics)
score_dict = {}

for variable in numbers_df.columns:
    temp_df = numbers_df.copy()
    lr = linear_model.LinearRegression()
    lr.fit(temp_df[[variable]], temp_df["Life Ladder"])
    score_dict[variable] = {"score": lr.score(temp_df[[variable]], numbers_df["Life Ladder"])}

print(json.dumps(score_dict, indent=4))

{
    "Year": {
        "score": 0.0048089938640693
    },
    "Life Ladder": {
        "score": 1.0
    },
    "Log GDP Per Capita": {
        "score": 0.6312612709625487
    },
    "Social Support": {
        "score": 0.5246476429232784
    },
    "Healthy Life Expectancy At Birth": {
        "score": 0.5398453760503432
    },
    "Freedom To Make Life Choices": {
        "score": 0.30600144452139455
    },
    "Generosity": {
        "score": 0.03895921198883456
    },
    "Perceptions Of Corruption": {
        "score": 0.2514971792609363
    },
    "Positive Affect": {
        "score": 0.26404435150713734
    },
    "Negative Affect": {
        "score": 0.0993745881642103
    },
    "Confidence In National Government": {
        "score": 0.0037497060969370333
    },
    "Fertility Rate": {
        "score": 0.35698354902135243
    }
}


In [75]:
for variable in numbers_df.columns:
    lr = linear_model.LinearRegression()
    lr.fit(numbers_df[[variable]], numbers_df["Life Ladder"])
    score_dict[variable].update(
        {"correlation": numbers_df["Life Ladder"].corr(numbers_df[variable])}
    )

print(json.dumps(score_dict, indent=4))

{
    "Year": {
        "score": 0.0048089938640693,
        "correlation": 0.06934690954952095
    },
    "Life Ladder": {
        "score": 1.0,
        "correlation": 1.0
    },
    "Log GDP Per Capita": {
        "score": 0.6312612709625487,
        "correlation": 0.7945195220776826
    },
    "Social Support": {
        "score": 0.5246476429232784,
        "correlation": 0.7243256470147101
    },
    "Healthy Life Expectancy At Birth": {
        "score": 0.5398453760503432,
        "correlation": 0.7347417070306703
    },
    "Freedom To Make Life Choices": {
        "score": 0.30600144452139455,
        "correlation": 0.5531739731055636
    },
    "Generosity": {
        "score": 0.03895921198883456,
        "correlation": 0.19738088050476046
    },
    "Perceptions Of Corruption": {
        "score": 0.2514971792609363,
        "correlation": -0.501494944402171
    },
    "Positive Affect": {
        "score": 0.26404435150713734,
        "correlation": 0.5138524608359266
    },
  

### Lets look at taking the square root or square of a variable

In [93]:
for col in numbers_df.columns:
    numbers_df[col + " ^ Squared"] = numbers_df[col].pow(2)
    numbers_df[col + " ^ Sqrt"] = numbers_df[col].pow(1/2)

numbers_df

Unnamed: 0,Year,Life Ladder,Log GDP Per Capita,Social Support,Healthy Life Expectancy At Birth,Freedom To Make Life Choices,Generosity,Perceptions Of Corruption,Positive Affect,Negative Affect,...,Perceptions Of Corruption ^ Squared,Perceptions Of Corruption ^ Sqrt,Positive Affect ^ Squared,Positive Affect ^ Sqrt,Negative Affect ^ Squared,Negative Affect ^ Sqrt,Confidence In National Government ^ Squared,Confidence In National Government ^ Sqrt,Fertility Rate ^ Squared,Fertility Rate ^ Sqrt
0,2008,3.723590,7.350416,0.450662,50.500000,0.718114,0.167652,0.881686,0.414297,0.258195,...,0.777371,0.938982,0.171642,0.643659,0.066665,0.508129,0.374632,0.782350,21.5296,2.154066
1,2009,4.401778,7.508646,0.552308,50.799999,0.678896,0.190809,0.850035,0.481421,0.237092,...,0.722560,0.921974,0.231767,0.693845,0.056213,0.486921,0.373988,0.782014,21.5296,2.154066
2,2010,4.758381,7.613900,0.539075,51.099998,0.600127,0.121316,0.706766,0.516907,0.275324,...,0.499518,0.840694,0.267193,0.718962,0.075803,0.524713,0.089615,0.547136,21.5296,2.154066
3,2011,3.831719,7.581259,0.521104,51.400002,0.495901,0.163571,0.731109,0.479835,0.267175,...,0.534520,0.855049,0.230241,0.692701,0.071382,0.516889,0.094486,0.554424,21.5296,2.154066
4,2012,3.782938,7.660506,0.520637,51.700001,0.530935,0.237588,0.775620,0.613513,0.267919,...,0.601586,0.880693,0.376398,0.783271,0.071781,0.517609,0.189608,0.659879,21.5296,2.154066
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1609,2017,3.638300,7.754387,0.754147,52.150002,0.752826,-0.080725,0.751208,0.733641,0.224051,...,0.564313,0.866723,0.538230,0.856529,0.050199,0.473341,0.466007,0.826224,14.1376,1.939072
1610,2018,3.616480,7.783066,0.775388,52.625000,0.762675,-0.051219,0.844209,0.657524,0.211726,...,0.712688,0.918808,0.432337,0.810878,0.044828,0.460137,0.303059,0.741962,14.1376,1.939072
1611,2019,2.693523,7.697755,0.759162,53.099998,0.631908,-0.047464,0.830652,0.658434,0.235354,...,0.689983,0.911401,0.433535,0.811439,0.055392,0.485133,0.208351,0.675615,14.1376,1.939072
1612,2020,3.159802,7.596050,0.717243,53.575001,0.643303,0.006313,0.788523,0.660658,0.345736,...,0.621768,0.887988,0.436469,0.812809,0.119534,0.587994,0.333277,0.759804,14.1376,1.939072


### Retest it 

In [83]:
train_df, test_df = train_test_split(combined_df, test_size=0.75, random_state=58)

### Let's check the correlation

In [98]:
numbers_df.corr(numeric_only=True)["Life Ladder"].sort_values(ascending=False)

Life Ladder                                    1.000000
Life Ladder ^ Sqrt                             0.997969
Life Ladder ^ Squared                          0.993288
Log GDP Per Capita ^ Squared                   0.802465
Log GDP Per Capita                             0.794520
Log GDP Per Capita ^ Sqrt                      0.789081
Healthy Life Expectancy At Birth ^ Squared     0.758752
Social Support ^ Squared                       0.747078
Healthy Life Expectancy At Birth               0.734742
Social Support                                 0.724326
Healthy Life Expectancy At Birth ^ Sqrt        0.709770
Social Support ^ Sqrt                          0.708488
Freedom To Make Life Choices ^ Squared         0.570541
Freedom To Make Life Choices                   0.553174
Freedom To Make Life Choices ^ Sqrt            0.540870
Positive Affect ^ Squared                      0.517140
Positive Affect                                0.513852
Positive Affect ^ Sqrt                         0

In [87]:
combined_columns = [
    "Positive Affect", 
    "Freedom To Make Life Choices",
    "Healthy Life Expectancy At Birth",
    "Social Support",
    "Log GDP Per Capita",
    'Fertility Rate',
]

In [88]:
model = train_model(train_df, combined_columns)
model.summary()

0,1,2,3
Dep. Variable:,Life Ladder,R-squared (uncentered):,0.989
Model:,OLS,Adj. R-squared (uncentered):,0.989
Method:,Least Squares,F-statistic:,6101.0
Date:,"Mon, 26 Jun 2023",Prob (F-statistic):,0.0
Time:,13:28:04,Log-Likelihood:,-361.27
No. Observations:,403,AIC:,734.5
Df Residuals:,397,BIC:,758.5
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Positive Affect,2.1525,0.366,5.874,0.000,1.432,2.873
Freedom To Make Life Choices,1.3028,0.273,4.773,0.000,0.766,1.839
Healthy Life Expectancy At Birth,-0.0078,0.008,-1.009,0.313,-0.023,0.007
Social Support,1.4223,0.410,3.467,0.001,0.616,2.229
Log GDP Per Capita,0.3381,0.060,5.642,0.000,0.220,0.456
Fertility Rate,-0.2580,0.023,-11.362,0.000,-0.303,-0.213

0,1,2,3
Omnibus:,18.112,Durbin-Watson:,2.001
Prob(Omnibus):,0.0,Jarque-Bera (JB):,27.241
Skew:,-0.34,Prob(JB):,1.22e-06
Kurtosis:,4.077,Cond. No.,986.0


In [89]:
predict_df = test_model(train_df, combined_columns, model)

print("RMSE:", compute_rmse(predict_df))
print("Correlation:", predict_df["Life Ladder"].corr(predict_df["predicted_life_ladder"]))

RMSE: 0.593048248705372
Correlation: 0.8678868004767604
