# **_Machine Learning Model Training_**

In [12]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_validate, KFold
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import make_scorer, mean_squared_error, mean_absolute_error, r2_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import time
import numpy as np
from catboost import CatBoostRegressor

## **_Data Loading_**

In [13]:
df = pd.read_csv('../data/happiness_clean_data.csv')

In [14]:
df.head()

Unnamed: 0,country,happiness_rank,happiness_score,gdp_per_capita,social_support,life_expectancy,freedom,corruption,generosity,year,region
0,Switzerland,1,7.587,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678,2015,Western Europe
1,Iceland,2,7.561,1.30232,1.40223,0.94784,0.62877,0.14145,0.4363,2015,Western Europe
2,Denmark,3,7.527,1.32548,1.36058,0.87464,0.64938,0.48357,0.34139,2015,Western Europe
3,Norway,4,7.522,1.459,1.33095,0.88521,0.66973,0.36503,0.34699,2015,Western Europe
4,Canada,5,7.427,1.32629,1.32261,0.90563,0.63297,0.32957,0.45811,2015,North America


## **_Preprocessing_**

In [15]:
df = df.drop(columns=['country', 'happiness_rank', 'year', 'generosity'])

categorical_features = ['region']
numeric_features = ['gdp_per_capita', 'social_support', 'life_expectancy', 'freedom', 'corruption']
target = 'happiness_score'

## **_Data Splitting_**

In [16]:
X = df.drop(columns=['happiness_score'], axis=1)
y = df[target]

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1425)

## **_Model Training_**

In [18]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ]
)

In [19]:
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    'Decision Tree': DecisionTreeRegressor(random_state=1425),
    'Random Forest': RandomForestRegressor(random_state=1425),
    'XGBoost': XGBRegressor(random_state=1425),
    'CatBoost': CatBoostRegressor(verbose=0, random_state=1425)
}

In [20]:
cv = KFold(n_splits=5, shuffle=True, random_state=1425)

scoring = {
    'r2': 'r2',
    'rmse': make_scorer(lambda y_true, y_pred: np.sqrt(mean_squared_error(y_true, y_pred))),
    'mae': make_scorer(mean_absolute_error)
}


metrics = []

for name, model in models.items():
    print(f"Training Model: {name}")
    
    pipeline = Pipeline([
        ('preprocessing', preprocessor),
        ('regressor', model)
    ])
    
    start = time.time()
    scores = cross_validate(pipeline, X_train, y_train, cv=cv, scoring=scoring, n_jobs=1)
    elapsed = time.time() - start

    metrics.append({
        'Model': name,
        'R2': round(np.mean(scores['test_r2']), 4),
        'RMSE': round(np.mean(scores['test_rmse']), 4),
        'MAE': round(np.mean(scores['test_mae']), 4),
        'Time (s)': round(elapsed, 2)
    })

Training Model: Linear Regression
Training Model: Ridge Regression
Training Model: Decision Tree
Training Model: Random Forest
Training Model: XGBoost
Training Model: CatBoost


In [21]:
results_df = pd.DataFrame(metrics).sort_values(by='R2', ascending=False)

print("\nModel Results:")
print(results_df)


Model Results:
               Model      R2    RMSE     MAE  Time (s)
5           CatBoost  0.8650  0.4112  0.3193      5.01
3      Random Forest  0.8379  0.4493  0.3470      1.27
4            XGBoost  0.8323  0.4571  0.3537      0.71
0  Linear Regression  0.8082  0.4904  0.3714      0.11
1   Ridge Regression  0.8080  0.4906  0.3716      0.05
2      Decision Tree  0.6808  0.6317  0.4660      0.06


In [22]:
import joblib

pipeline_rf = Pipeline([
    ('preprocessing', preprocessor),
    ('regressor', CatBoostRegressor(random_state=100))
])

pipeline_rf.fit(X_train, y_train)

Learning rate set to 0.038013
0:	learn: 1.1004805	total: 889us	remaining: 889ms
1:	learn: 1.0739828	total: 1.4ms	remaining: 697ms
2:	learn: 1.0462631	total: 5.33ms	remaining: 1.77s
3:	learn: 1.0218235	total: 6.11ms	remaining: 1.52s
4:	learn: 0.9980026	total: 9.23ms	remaining: 1.84s
5:	learn: 0.9760279	total: 10.8ms	remaining: 1.79s
6:	learn: 0.9541994	total: 12ms	remaining: 1.71s
7:	learn: 0.9321680	total: 13.2ms	remaining: 1.64s
8:	learn: 0.9126596	total: 13.9ms	remaining: 1.53s
9:	learn: 0.8924336	total: 15.1ms	remaining: 1.49s
10:	learn: 0.8742228	total: 15.6ms	remaining: 1.4s
11:	learn: 0.8549646	total: 16.6ms	remaining: 1.36s
12:	learn: 0.8364582	total: 17.9ms	remaining: 1.35s
13:	learn: 0.8193916	total: 18.3ms	remaining: 1.29s
14:	learn: 0.8038353	total: 19.5ms	remaining: 1.28s
15:	learn: 0.7872189	total: 20ms	remaining: 1.23s
16:	learn: 0.7729096	total: 20.6ms	remaining: 1.19s
17:	learn: 0.7597450	total: 22.4ms	remaining: 1.22s
18:	learn: 0.7457112	total: 23.2ms	remaining: 1.2s


In [23]:
y_pred_test = pipeline_rf.predict(X_test)
r2_test = r2_score(y_test, y_pred_test)
mae_test = mean_absolute_error(y_test, y_pred_test)
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))

print(f"\n📊 EVALUACIÓN FINAL en el 20% de TEST:")
print(f" R2: {r2_test:.4f}")
print(f" MAE: {mae_test:.4f}")
print(f" RMSE: {rmse_test:.4f}")


📊 EVALUACIÓN FINAL en el 20% de TEST:
 R2: 0.8528
 MAE: 0.3191
 RMSE: 0.4269


In [24]:
joblib.dump(pipeline_rf, 'CatBoost_model.pkl')

print("Modelo fue guardado como 'CatBoost_model.pkl'")

Modelo fue guardado como 'CatBoost_model.pkl'
