In [157]:
# Import our dependencies
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error


In [158]:
# Mount Google Drive at /content/drive so the notebook can read files stored there.
# NOTE(review): google.colab is presumably only importable inside a Colab runtime — confirm
# before running this notebook elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [159]:
# NOTE(review): pandas is already imported in the first cell, and `os` is not used in any
# cell visible here — consider consolidating imports in the first cell.
import pandas as pd
import os

# Location of the happiness workbook on the mounted Google Drive
# (edit the folder portion of this path to match your own Drive layout)
file_path = '/content/drive/MyDrive/Machine_Learning_Project/Happinness_Data.xlsx'

# Read the Excel file into a DataFrame
happy_df = pd.read_excel(file_path)

# Display the frame to verify the file was read correctly
happy_df

Unnamed: 0,Country name,year,Life Ladder,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,Negative affect
0,Afghanistan,2008,3.723590,7.350416,0.450662,50.500000,0.718114,0.164055,0.881686,0.414297,0.258195
1,Afghanistan,2009,4.401778,7.508646,0.552308,50.799999,0.678896,0.187297,0.850035,0.481421,0.237092
2,Afghanistan,2010,4.758381,7.613900,0.539075,51.099998,0.600127,0.117861,0.706766,0.516907,0.275324
3,Afghanistan,2011,3.831719,7.581259,0.521104,51.400002,0.495901,0.160098,0.731109,0.479835,0.267175
4,Afghanistan,2012,3.782938,7.660506,0.520637,51.700001,0.530935,0.234157,0.775620,0.613513,0.267919
...,...,...,...,...,...,...,...,...,...,...,...
2358,Zimbabwe,2019,2.693523,7.697755,0.759162,53.099998,0.631908,-0.050874,0.830652,0.658434,0.235354
2359,Zimbabwe,2020,3.159802,7.596050,0.717243,53.575001,0.643303,0.002848,0.788523,0.660658,0.345736
2360,Zimbabwe,2021,3.154578,7.656878,0.685151,54.049999,0.667636,-0.079007,0.756945,0.609917,0.241682
2361,Zimbabwe,2022,3.296220,7.670073,0.666172,54.525002,0.651987,-0.072935,0.752632,0.640609,0.191350


Gradient Boost Model #1

In [160]:
# Drop the unnecessary identifier columns "Country name" and "year" from the data set.
# The result is reassigned instead of mutating in place, and errors="ignore" skips labels
# that are already gone, so re-running this cell is a no-op rather than a KeyError.
happy_df = happy_df.drop(columns=["Country name", "year"], errors="ignore")
happy_df.head()

Unnamed: 0,Life Ladder,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,Negative affect
0,3.72359,7.350416,0.450662,50.5,0.718114,0.164055,0.881686,0.414297,0.258195
1,4.401778,7.508646,0.552308,50.799999,0.678896,0.187297,0.850035,0.481421,0.237092
2,4.758381,7.6139,0.539075,51.099998,0.600127,0.117861,0.706766,0.516907,0.275324
3,3.831719,7.581259,0.521104,51.400002,0.495901,0.160098,0.731109,0.479835,0.267175
4,3.782938,7.660506,0.520637,51.700001,0.530935,0.234157,0.77562,0.613513,0.267919


In [161]:
# Keep only complete rows for modelling: any row with a missing value in any column is removed
happy_df = happy_df.dropna(axis=0, how='any')

In [162]:
# Feature matrix: every column of the cleaned frame except the target
X = happy_df.drop(columns='Life Ladder')

In [163]:
# Target vector: the Life Ladder score; preview its first five values
y = happy_df['Life Ladder']
y.head()

Unnamed: 0,Life Ladder
0,3.72359
1,4.401778
2,4.758381
3,3.831719
4,3.782938


In [164]:
# Hold out 20% of the rows for testing; the fixed seed makes the partition repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=12,
)


In [166]:
# Configure the first Gradient Boosting Regressor (construction only; fitting happens in the next cell)
gb_regressor = GradientBoostingRegressor(
    loss='squared_error',
    learning_rate=0.01,     # learning rate
    n_estimators=300,       # number of boosting stages
    max_depth=3,            # maximum depth of the individual trees
    min_samples_split=2,    # minimum number of samples required to split an internal node
    random_state=42,        # fixed seed so repeated fits give the same model
)

In [167]:
# Train the regressor on the training partition
gb_regressor.fit(X=X_train, y=y_train)

In [168]:
# Predict Life Ladder for the held-out rows
y_pred = gb_regressor.predict(X=X_test)


In [169]:
# Score the hold-out predictions with three regression metrics
# (each metric is called as metric(y_true, y_pred))
mse, mae, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

# Emit the whole report with a single print; sep="\n" puts one entry per line
print(
    "Model Performance:",
    f"Mean Squared Error: {mse:.4f}",
    f"Mean Absolute Error: {mae:.4f}",
    f"R-squared Score: {r2:.4f}",
    sep="\n",
)

Model Performance:
Mean Squared Error: 0.2030
Mean Absolute Error: 0.3488
R-squared Score: 0.8347


In [170]:
# Pair every predictor name with the importance reported by the fitted regressor
feature_names = list(X.columns)
feature_importance = gb_regressor.feature_importances_

# Tabulate, then rank from most to least important; the original row labels are kept,
# so the printed index is each feature's column position in X
feature_imp_df = pd.DataFrame(
    {'feature': feature_names, 'importance': feature_importance}
)
feature_imp_df = feature_imp_df.sort_values(by='importance', ascending=False)

print("\nFeature Importances:", feature_imp_df, sep="\n")


Feature Importances:
                            feature  importance
0                Log GDP per capita    0.504486
1                    Social support    0.225135
6                   Positive affect    0.118543
2  Healthy life expectancy at birth    0.109400
3      Freedom to make life choices    0.021974
5         Perceptions of corruption    0.008168
4                        Generosity    0.006216
7                   Negative affect    0.006076


In [171]:
# Checking for overfitting: score the model on the rows it was trained on and on the
# held-out rows, then compare the two.
# gb_regressor was already fitted on (X_train, y_train) in an earlier cell and is built with
# a fixed random_state, so fitting it again here would only repeat the same 300-stage
# training run and yield the same model; no refit is performed.
train_pred = gb_regressor.predict(X_train)
test_pred = gb_regressor.predict(X_test)

train_r2 = r2_score(y_train, train_pred)
test_r2 = r2_score(y_test, test_pred)

train_mse = mean_squared_error(y_train, train_pred)
test_mse = mean_squared_error(y_test, test_pred)

# A training score far better than the testing score would point to overfitting
print("Overfitting Diagnostics:")
print(" Performance Comparison:")
print(f"   Training R² Score: {train_r2:.4f}")
print(f"   Testing R² Score:  {test_r2:.4f}")
print(f"   Training MSE:      {train_mse:.4f}")
print(f"   Testing MSE:       {test_mse:.4f}")


Overfitting Diagnostics:
 Performance Comparison:
   Training R² Score: 0.8483
   Testing R² Score:  0.8347
   Training MSE:      0.1984
   Testing MSE:       0.2030


Gradient Boost Model #2

In [172]:
# Feature matrix for the second model: every column of the cleaned frame except the target
X = happy_df.drop(columns='Life Ladder')

In [173]:
# Target vector for the second model: the Life Ladder score; preview its first five values
y = happy_df['Life Ladder']
y.head()

Unnamed: 0,Life Ladder
0,3.72359
1,4.401778
2,4.758381
3,3.831719
4,3.782938


In [174]:
# Hold out 20% of the rows for testing; the fixed seed makes the partition repeatable.
# NOTE(review): this split is seeded with 14 whereas the first model's split uses 12, so the
# two models are scored on different test rows — reuse one seed if their metrics are meant
# to be compared directly.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=14,
)

In [175]:
# Configure the second Gradient Boosting Regressor (construction only; fitting happens in the next cell)
gb_regressor = GradientBoostingRegressor(
    loss='squared_error',
    learning_rate=0.01,     # learning rate
    n_estimators=300,       # number of boosting stages
    max_depth=5,            # maximum depth of the individual trees
    min_samples_split=10,   # minimum number of samples required to split an internal node
    max_features='sqrt',    # max features to consider when splitting
    random_state=42,        # fixed seed so repeated fits give the same model
)

In [176]:
# Train the second regressor on the training partition (a plain fit; no grid search is involved)
gb_regressor.fit(X=X_train, y=y_train)

In [177]:
# Predict Life Ladder for the held-out rows
y_pred = gb_regressor.predict(X=X_test)

In [178]:
# Score the hold-out predictions with three regression metrics
# (each metric is called as metric(y_true, y_pred))
mse, mae, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

# Emit the whole report with a single print; sep="\n" puts one entry per line
print(
    "\nModel Performance:",
    f"Mean Squared Error: {mse:.4f}",
    f"Mean Absolute Error: {mae:.4f}",
    f"R-squared Score: {r2:.4f}",
    sep="\n",
)


Model Performance:
Mean Squared Error: 0.2095
Mean Absolute Error: 0.3537
R-squared Score: 0.8541


In [179]:
# Pair every predictor name with the importance reported by the fitted regressor
feature_names = list(X.columns)
feature_importance = gb_regressor.feature_importances_

# Tabulate, then rank from most to least important; the original row labels are kept,
# so the printed index is each feature's column position in X
feature_imp_df = pd.DataFrame(
    {'feature': feature_names, 'importance': feature_importance}
)
feature_imp_df = feature_imp_df.sort_values(by='importance', ascending=False)

print("\nFeature Importances:", feature_imp_df, sep="\n")


Feature Importances:
                            feature  importance
0                Log GDP per capita    0.350485
1                    Social support    0.181580
2  Healthy life expectancy at birth    0.178234
6                   Positive affect    0.108517
3      Freedom to make life choices    0.069160
5         Perceptions of corruption    0.060095
7                   Negative affect    0.032011
4                        Generosity    0.019918


In [180]:
# Checking for overfitting: score the model on the rows it was trained on and on the
# held-out rows, then compare the two.
# gb_regressor was already fitted on (X_train, y_train) in an earlier cell and is built with
# a fixed random_state, so fitting it again here would only repeat the same 300-stage
# training run and yield the same model; no refit is performed.
train_pred = gb_regressor.predict(X_train)
test_pred = gb_regressor.predict(X_test)

train_r2 = r2_score(y_train, train_pred)
test_r2 = r2_score(y_test, test_pred)

train_mse = mean_squared_error(y_train, train_pred)
test_mse = mean_squared_error(y_test, test_pred)

# A training score far better than the testing score would point to overfitting
print("Overfitting Diagnostics:")
print(" Performance Comparison:")
print(f"   Training R² Score: {train_r2:.4f}")
print(f"   Testing R² Score:  {test_r2:.4f}")
print(f"   Training MSE:      {train_mse:.4f}")
print(f"   Testing MSE:       {test_mse:.4f}")


Overfitting Diagnostics:
 Performance Comparison:
   Training R² Score: 0.8988
   Testing R² Score:  0.8541
   Training MSE:      0.1270
   Testing MSE:       0.2095
