In [1]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import joblib
import numpy as np
from sklearn.linear_model import Ridge, Lasso, ElasticNet

# Input dataset and every artifact path this notebook writes, grouped in one place.
CLEAN_CSV = "../data/happiness_2015to2019_cleaned.csv"
CLEAN_WITH_SPLIT = "../data/happiness_2015to2019_cleaned_with_split.csv"
MODEL_PATH = "../model/happiness_regression.pkl"
SPLIT_MAP_PATH = "../model/train_test_split.csv"

# Load the cleaned dataset.
df = pd.read_csv(CLEAN_CSV)

# Make sure the folder that will hold the pickled model exists before anything is saved.
model_dir = os.path.dirname(MODEL_PATH)
os.makedirs(model_dir, exist_ok=True)


In [2]:
# Predictor columns fed to every regressor, one per line for easy diffing.
FEATURES = [
    "GDP_per_Capita",
    "Social_Support",
    "Healthy_Life_Expectancy",
    "Freedom",
    "Generosity",
    "Perceptions_of_Corruption",
]
# Column the models are trained to predict.
TARGET = "Happiness_Score"

# Work on copies so later changes to `df` do not leak into the modelling inputs.
X = df.loc[:, FEATURES].copy()
y = df.loc[:, TARGET].copy()

In [3]:
# Composite "Country|Year" identifier, used later to tag each row as train or test.
country_part = df["Country"].astype(str)
year_part = df["Year"].astype(int).astype(str)
df["Key"] = country_part + "|" + year_part

# 4) 70/30 split (test_size=0.30, as the professor requested).
#    The keys are split together with X and y so they stay row-aligned.
split_parts = train_test_split(X, y, df["Key"], test_size=0.30, random_state=42)
X_train, X_test, y_train, y_test, key_train, key_test = split_parts

# 5) NaN checks on the full modelling columns and on the training split.
nan_all_columns = df[FEATURES + [TARGET]].isna().sum()
print(nan_all_columns.sort_values(ascending=False))

nan_train_features = X_train.isna().sum()
print("NaN en X_train:\n", nan_train_features.sort_values(ascending=False))
print("NaN en y_train:", y_train.isna().sum())


GDP_per_Capita               0
Social_Support               0
Healthy_Life_Expectancy      0
Freedom                      0
Generosity                   0
Perceptions_of_Corruption    0
Happiness_Score              0
dtype: int64
NaN en X_train:
 GDP_per_Capita               0
Social_Support               0
Healthy_Life_Expectancy      0
Freedom                      0
Generosity                   0
Perceptions_of_Corruption    0
dtype: int64
NaN en y_train: 0


In [4]:
# 6) Fit the four candidate regressors on the same training split.
model = LinearRegression()
model.fit(X_train, y_train)

ridge = Ridge(alpha=1.0)
ridge.fit(X_train, y_train)

lasso = Lasso(alpha=0.1)
lasso.fit(X_train, y_train)

elastic = ElasticNet(alpha=0.1, l1_ratio=0.5)
elastic.fit(X_train, y_train)

# Predictions of each fitted model on the held-out test split.
y_pred = model.predict(X_test)
ypred_ridge = ridge.predict(X_test)
ypred_lasso = lasso.predict(X_test)
ypred_elastic = elastic.predict(X_test)


# 7) Metrics
def _test_metrics(y_true, predictions):
    """Return the (R2, MAE, RMSE) triple of `predictions` against `y_true`."""
    r2 = r2_score(y_true, predictions)
    mae = mean_absolute_error(y_true, predictions)
    rmse = np.sqrt(mean_squared_error(y_true, predictions))
    return r2, mae, rmse


r2_lin, mae_lin, rmse_lin = _test_metrics(y_test, y_pred)
r2_ridge, mae_ridge, rmse_ridge = _test_metrics(y_test, ypred_ridge)
r2_lasso, mae_lasso, rmse_lasso = _test_metrics(y_test, ypred_lasso)
r2_elastic, mae_elastic, rmse_elastic = _test_metrics(y_test, ypred_elastic)

# One row per model, best (lowest RMSE) first; the original row labels 0..3 are kept.
resultados = pd.DataFrame(
    [
        ("Lineal", r2_lin, mae_lin, rmse_lin),
        ("Ridge", r2_ridge, mae_ridge, rmse_ridge),
        ("Lasso", r2_lasso, mae_lasso, rmse_lasso),
        ("Elastic Net", r2_elastic, mae_elastic, rmse_elastic),
    ],
    columns=["Modelo", "R2", "MAE", "RMSE"],
).sort_values("RMSE")

print("\nResultados en TEST (ordenado por RMSE asc):\n", resultados)


Resultados en TEST (ordenado por RMSE asc):
         Modelo        R2       MAE      RMSE
1        Ridge  0.713785  0.466613  0.597792
0       Lineal  0.711784  0.466901  0.599878
3  Elastic Net  0.577811  0.578578  0.726035
2        Lasso  0.550739  0.590066  0.748951


In [5]:
# 8) Persist the plain linear model (the one chosen for this project).
joblib.dump(model, MODEL_PATH)
print(f"\nModelo lineal guardado en: {MODEL_PATH}")

# 9) Build and save the Key -> Data_Set map.
#    The frame takes its index from the concatenated key Series, i.e. the row
#    labels of `df` that train_test_split preserved; the label list is aligned
#    to it by position (train rows first, then test rows).
split_map = pd.DataFrame({
    "Key": pd.concat([key_train, key_test], axis=0),
    "Data_Set": (["train"] * len(key_train)) + (["test"] * len(key_test)),
})
split_map.to_csv(SPLIT_MAP_PATH, index=False)
print(f"Mapa train/test guardado en: {SPLIT_MAP_PATH}")

# 10) Add Data_Set to the full dataframe and save the CSV with that column.
#     Rows are labelled through their index instead of merging on the string
#     "Key": a merge multiplies rows whenever a Country|Year pair is repeated,
#     whereas index alignment always yields exactly one label per row.
data_set = split_map["Data_Set"].reindex(df.index)

# Same "just in case" fallback as before, but no longer silent: report how
# many rows had no split label before defaulting them to "train".
n_unlabeled = int(data_set.isna().sum())
if n_unlabeled:
    print(f"WARNING: {n_unlabeled} rows had no split label; defaulting them to 'train'")
data_set = data_set.fillna("train")

# errors="ignore" keeps the cell re-runnable after "Key" was already dropped.
df = df.drop(columns=["Key"], errors="ignore").assign(Data_Set=data_set)
df.to_csv(CLEAN_WITH_SPLIT, index=False)
print(f"Dataset con Data_Set guardado en: {CLEAN_WITH_SPLIT}")


Modelo lineal guardado en: ../model/happiness_regression.pkl
Mapa train/test guardado en: ../model/train_test_split.csv
Dataset con Data_Set guardado en: ../data/happiness_2015to2019_cleaned_with_split.csv


In [6]:
# Redundant: the previous cell already saves `model` to MODEL_PATH (this same file).
#joblib.dump(model, "../model/happiness_regression.pkl")
