In [19]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Read the happiness dataset (path is relative to the working directory)
# and preview its first five rows.
happy_df = pd.read_csv(filepath_or_buffer="happiness_score_dataset.csv")
happy_df.head(n=5)

Unnamed: 0,Country,Region,Happiness Rank,Happiness Score,Standard Error,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual
0,Switzerland,Western Europe,1,7.587,0.03411,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678,2.51738
1,Iceland,Western Europe,2,7.561,0.04884,1.30232,1.40223,0.94784,0.62877,0.14145,0.4363,2.70201
2,Denmark,Western Europe,3,7.527,0.03328,1.32548,1.36058,0.87464,0.64938,0.48357,0.34139,2.49204
3,Norway,Western Europe,4,7.522,0.0388,1.459,1.33095,0.88521,0.66973,0.36503,0.34699,2.46531
4,Canada,North America,5,7.427,0.03553,1.32629,1.32261,0.90563,0.63297,0.32957,0.45811,2.45176


In [20]:
# Print column names, dtypes, and non-null counts for the loaded frame.
happy_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 158 entries, 0 to 157
Data columns (total 12 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Country                        158 non-null    object 
 1   Region                         158 non-null    object 
 2   Happiness Rank                 158 non-null    int64  
 3   Happiness Score                158 non-null    float64
 4   Standard Error                 158 non-null    float64
 5   Economy (GDP per Capita)       158 non-null    float64
 6   Family                         158 non-null    float64
 7   Health (Life Expectancy)       158 non-null    float64
 8   Freedom                        158 non-null    float64
 9   Trust (Government Corruption)  158 non-null    float64
 10  Generosity                     158 non-null    float64
 11  Dystopia Residual              158 non-null    float64
dtypes: float64(9), int64(1), object(2)
memory usage: 1

In [21]:
# Count missing entries per column; the result is displayed as the cell output.
missingval = happy_df.isna().sum(axis=0)
missingval

Country                          0
Region                           0
Happiness Rank                   0
Happiness Score                  0
Standard Error                   0
Economy (GDP per Capita)         0
Family                           0
Health (Life Expectancy)         0
Freedom                          0
Trust (Government Corruption)    0
Generosity                       0
Dystopia Residual                0
dtype: int64

In [22]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns.
# NOTE(review): the name `statistics` would shadow the stdlib module of the same
# name if that module were ever imported in this notebook.
statistics = happy_df.describe()
statistics

Unnamed: 0,Happiness Rank,Happiness Score,Standard Error,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual
count,158.0,158.0,158.0,158.0,158.0,158.0,158.0,158.0,158.0,158.0
mean,79.493671,5.375734,0.047885,0.846137,0.991046,0.630259,0.428615,0.143422,0.237296,2.098977
std,45.754363,1.14501,0.017146,0.403121,0.272369,0.247078,0.150693,0.120034,0.126685,0.55355
min,1.0,2.839,0.01848,0.0,0.0,0.0,0.0,0.0,0.0,0.32858
25%,40.25,4.526,0.037268,0.545808,0.856823,0.439185,0.32833,0.061675,0.150553,1.75941
50%,79.5,5.2325,0.04394,0.910245,1.02951,0.696705,0.435515,0.10722,0.21613,2.095415
75%,118.75,6.24375,0.0523,1.158448,1.214405,0.811013,0.549092,0.180255,0.309883,2.462415
max,158.0,7.587,0.13693,1.69042,1.40223,1.02525,0.66973,0.55191,0.79588,3.60214


In [23]:
# Predictors are the six factor columns; the target is the overall happiness score.
feature_columns = [
    'Economy (GDP per Capita)',
    'Family',
    'Health (Life Expectancy)',
    'Freedom',
    'Generosity',
    'Trust (Government Corruption)',
]
X = happy_df[feature_columns]
y = happy_df['Happiness Score']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the scaler on the training rows only, then apply the same transform to the
# test rows so no test-set statistics leak into the scaling.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [24]:
# Sanity check: dimensions of the scaled training matrix.
X_train_scaled.shape

(126, 6)

In [25]:
# Sanity check: dimensions of the scaled test matrix.
X_test_scaled.shape

(32, 6)

In [26]:
# Baseline model: linear regression fitted on the standardized training features
# and evaluated on the held-out test split.
df_model = LinearRegression()
df_model.fit(X_train_scaled, y_train)
y_pred = df_model.predict(X_test_scaled)
# Take the square root of the MSE explicitly: the `squared=False` flag of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# so this form works on both old and current releases.
rmse_score = mean_squared_error(y_test, y_pred) ** 0.5
f'RMSE: {rmse_score}'

'RMSE: 0.49187277657503803'

In [27]:
# Random forest with 100 trees and a fixed seed, trained and evaluated on the
# same splits as the linear baseline so the RMSE values are comparable.
random_model = RandomForestRegressor(n_estimators=100, random_state=42)
random_model.fit(X_train_scaled, y_train)
y_pred_random = random_model.predict(X_test_scaled)
# Take the square root of the MSE explicitly: the `squared=False` flag of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
rmse_random = mean_squared_error(y_test, y_pred_random) ** 0.5
f'RF RMSE: {rmse_random}'

'RF RMSE: 0.512538403399565'

In [28]:
# Hyperparameter search for the random forest: every combination in the grid is
# scored with 5-fold cross-validation on the training split only.
param_Hgrid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}
grid_searchCV = GridSearchCV(
    RandomForestRegressor(random_state=42),
    param_Hgrid,
    cv=5,
    scoring='neg_mean_squared_error',
)
grid_searchCV.fit(X_train_scaled, y_train)

# Evaluate the best configuration found by the search on the held-out test split.
Hbest_model = grid_searchCV.best_estimator_
y_pred_Hbest = Hbest_model.predict(X_test_scaled)
# Take the square root of the MSE explicitly: the `squared=False` flag of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
rmse_Hbest = mean_squared_error(y_test, y_pred_Hbest) ** 0.5
f'HBest RMSE: {rmse_Hbest}'

'HBest RMSE: 0.5399633877854125'

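Comparing the RMSE scores above, linear regression performed best on the test set (RMSE ≈ 0.49, versus ≈ 0.51 for the default random forest and ≈ 0.54 for the grid-searched random forest). This suggests that a linear model can effectively capture the relationship between the overall happiness score and the predictors, such as GDP per capita, family, and life expectancy.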