# **_Machine Learning Model Training_**

In [105]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_validate, KFold
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import make_scorer, mean_squared_error, mean_absolute_error, r2_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import time
import numpy as np
from catboost import CatBoostRegressor

## **_Data Loading_**

In [106]:
df = pd.read_csv('../data/happiness_clean_data.csv')

In [107]:
df.head()

Unnamed: 0,country,happiness_rank,happiness_score,gdp_per_capita,social_support,life_expectancy,freedom,corruption,generosity,year,region
0,Switzerland,1,7.587,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678,2015,Europe
1,Iceland,2,7.561,1.30232,1.40223,0.94784,0.62877,0.14145,0.4363,2015,Europe
2,Denmark,3,7.527,1.32548,1.36058,0.87464,0.64938,0.48357,0.34139,2015,Europe
3,Norway,4,7.522,1.459,1.33095,0.88521,0.66973,0.36503,0.34699,2015,Europe
4,Canada,5,7.427,1.32629,1.32261,0.90563,0.63297,0.32957,0.45811,2015,North America


## **_Preprocessing_**

In [108]:
df = df.drop(columns=['country', 'happiness_rank', 'year', 'generosity'])

categorical_features = ['region']
numeric_features = ['gdp_per_capita', 'social_support', 'life_expectancy', 'freedom', 'corruption']
target = 'happiness_score'

## **_Data Splitting_**

In [109]:
X = df.drop(columns=['happiness_score'], axis=1)
y = df[target]

In [110]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1425)

## **_Model Training_**

In [111]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ]
)

In [112]:
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    'Decision Tree': DecisionTreeRegressor(random_state=1425),
    'Random Forest': RandomForestRegressor(random_state=1425),
    'XGBoost': XGBRegressor(random_state=1425),
    'CatBoost': CatBoostRegressor(verbose=0, random_state=1425)
}

In [113]:
cv = KFold(n_splits=5, shuffle=True, random_state=1425)

scoring = {
    'r2': 'r2',
    'rmse': make_scorer(lambda y_true, y_pred: np.sqrt(mean_squared_error(y_true, y_pred))),
    'mae': make_scorer(mean_absolute_error)
}


metrics = []

for name, model in models.items():
    print(f"Training Model: {name}")
    
    pipeline = Pipeline([
        ('preprocessing', preprocessor),
        ('regressor', model)
    ])
    
    start = time.time()
    scores = cross_validate(pipeline, X_train, y_train, cv=cv, scoring=scoring, n_jobs=1)
    elapsed = time.time() - start

    metrics.append({
        'Model': name,
        'R2': round(np.mean(scores['test_r2']), 4),
        'RMSE': round(np.mean(scores['test_rmse']), 4),
        'MAE': round(np.mean(scores['test_mae']), 4),
        'Time (s)': round(elapsed, 2)
    })

Training Model: Linear Regression
Training Model: Ridge Regression
Training Model: Decision Tree
Training Model: Random Forest
Training Model: XGBoost
Training Model: CatBoost


In [114]:
results_df = pd.DataFrame(metrics).sort_values(by='R2', ascending=False)

print("\nModel Results:")
print(results_df)


Model Results:
               Model      R2    RMSE     MAE  Time (s)
5           CatBoost  0.8463  0.4380  0.3382      3.80
3      Random Forest  0.8324  0.4556  0.3533      1.37
4            XGBoost  0.8181  0.4747  0.3685      1.76
0  Linear Regression  0.8014  0.4989  0.3797      0.28
1   Ridge Regression  0.8014  0.4989  0.3795      0.10
2      Decision Tree  0.6397  0.6683  0.5012      0.07


In [118]:
import joblib

pipeline_rf = Pipeline([
    ('preprocessing', preprocessor),
    ('regressor', CatBoostRegressor(random_state=100))
])

pipeline_rf.fit(X_train, y_train)

joblib.dump(pipeline_rf, 'CatBoost_model.pkl')

print("Modelo fue guardado como 'CatBoost_model.pkl'")

Learning rate set to 0.038013
0:	learn: 1.1025505	total: 38ms	remaining: 38s
1:	learn: 1.0742461	total: 43.8ms	remaining: 21.8s
2:	learn: 1.0485888	total: 45.6ms	remaining: 15.2s
3:	learn: 1.0234220	total: 46.2ms	remaining: 11.5s
4:	learn: 0.9983019	total: 46.8ms	remaining: 9.3s
5:	learn: 0.9762073	total: 47.3ms	remaining: 7.83s
6:	learn: 0.9538478	total: 48.3ms	remaining: 6.85s
7:	learn: 0.9320123	total: 48.8ms	remaining: 6.05s
8:	learn: 0.9115068	total: 49.3ms	remaining: 5.43s
9:	learn: 0.8900908	total: 49.8ms	remaining: 4.93s
10:	learn: 0.8722948	total: 50.4ms	remaining: 4.53s
11:	learn: 0.8537786	total: 51ms	remaining: 4.2s
12:	learn: 0.8364232	total: 53.3ms	remaining: 4.05s
13:	learn: 0.8191537	total: 55.5ms	remaining: 3.9s
14:	learn: 0.8017191	total: 56.7ms	remaining: 3.72s
15:	learn: 0.7864474	total: 57.4ms	remaining: 3.53s
16:	learn: 0.7728120	total: 58.1ms	remaining: 3.36s
17:	learn: 0.7579447	total: 58.6ms	remaining: 3.2s
18:	learn: 0.7438123	total: 59.1ms	remaining: 3.05s
19