In [2]:
# ============================================
# 1. Imports
# ============================================
import os
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder

from xgboost import XGBRegressor

In [3]:
# ============================================
# 2. Load encoded datasets (train + test) for linear/baseline
# ============================================
# The data root can be overridden with the ACIT4510_DATA_DIR environment
# variable so the notebook is not tied to one machine; when the variable is
# unset, the original absolute location is used unchanged.
processed_path = Path(
    os.environ.get(
        "ACIT4510_DATA_DIR",
        r"C:\Users\sacii\OneDrive\Desktop\Oslomet\ACIT4510 Statistical learning\ACIT4510_statistical_learning_project\data",
    )
) / "processed"

X_train_encoded = np.load(processed_path / "X_train_encoded.npy")
X_test_encoded  = np.load(processed_path / "X_test_encoded.npy")

# squeeze("columns") converts a one-column frame to a Series but, unlike a
# bare squeeze(), never collapses a single-row file down to a scalar.
y_train = pd.read_csv(processed_path / "y_train.csv").squeeze("columns")
y_test  = pd.read_csv(processed_path / "y_test.csv").squeeze("columns")

print("Encoded shapes")
print("X_train_encoded:", X_train_encoded.shape)
print("X_test_encoded :", X_test_encoded.shape)
print("y_train        :", y_train.shape)
print("y_test         :", y_test.shape)

# Fail here, next to the printed shapes, rather than deep inside a model fit.
assert X_train_encoded.shape[0] == len(y_train), "train features/target row count mismatch"
assert X_test_encoded.shape[0] == len(y_test), "test features/target row count mismatch"
assert X_train_encoded.shape[1] == X_test_encoded.shape[1], "train/test feature count mismatch"

# Wrap in DataFrames (for consistency with sklearn linear models).
# The .npy files carry no column names, so positional names f0..f{n-1} are used.
n_features_enc = X_train_encoded.shape[1]
enc_feature_names = [f"f{i}" for i in range(n_features_enc)]

X_train_enc_df = pd.DataFrame(X_train_encoded, columns=enc_feature_names)
X_test_enc_df  = pd.DataFrame(X_test_encoded,  columns=enc_feature_names)

# Plain NumPy targets, shared by every model fitted later in the notebook.
y_train_arr = y_train.to_numpy()
y_test_arr  = y_test.to_numpy()

Encoded shapes
X_train_encoded: (1920, 14)
X_test_encoded : (600, 14)
y_train        : (1920,)
y_test         : (600,)


In [4]:

# ============================================
# 3. Prepare XGBoost features (unscaled numeric + OHE categoricals)
#    Recompute from raw train/test to keep notebook self-contained
# ============================================
# The data root can be overridden with the ACIT4510_DATA_DIR environment
# variable; when it is unset, the original absolute location is used unchanged.
raw_path = Path(
    os.environ.get(
        "ACIT4510_DATA_DIR",
        r"C:\Users\sacii\OneDrive\Desktop\Oslomet\ACIT4510 Statistical learning\ACIT4510_statistical_learning_project\data",
    )
) / "raw"

train_df_raw = pd.read_csv(raw_path / "train_df.csv")
test_df_raw  = pd.read_csv(raw_path / "test_df.csv")

# Split each raw frame into predictors and the regression target.
target_col = "Happiness Score"
X_train_raw = train_df_raw.drop(columns=target_col).copy()
y_train_raw = train_df_raw[target_col].copy()

X_test_raw = test_df_raw.drop(columns=target_col).copy()
y_test_raw = test_df_raw[target_col].copy()

# Feature engineering for XGBoost
# Each map turns a category label into a score between 0.0 and 1.0; the three
# scores are averaged into "Healthy_index" by add_engineered_features.
exercise_map = {"Low": 0.0, "Moderate": 0.5, "High": 1.0}
stress_map = {"High": 0.0, "Moderate": 0.5, "Low": 1.0}
diet_map = {"Junk Food": 0.0, "Keto": 0.5, "Vegetarian": 0.7, "Vegan": 0.9, "Balanced": 1.0}


def add_engineered_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with five engineered columns appended.

    Added columns, in order:
      * ``Exercise_Num``, ``Diet_Num``, ``Stress_Num`` - numeric scores looked
        up from ``exercise_map``, ``diet_map`` and ``stress_map``; labels that
        are missing from a map become NaN.
      * ``Sleep_exercise`` - ``Sleep Hours`` multiplied by ``Exercise_Num``.
      * ``Healthy_index`` - mean of the three numeric scores.

    The input frame is not modified.
    """
    scored = df.assign(
        Exercise_Num=df["Exercise Level"].map(exercise_map),
        Diet_Num=df["Diet Type"].map(diet_map),
        Stress_Num=df["Stress Level"].map(stress_map),
    )
    # Second assign so the derived columns can read the scores created above.
    return scored.assign(
        Sleep_exercise=lambda d: d["Sleep Hours"] * d["Exercise_Num"],
        Healthy_index=lambda d: (d["Exercise_Num"] + d["Diet_Num"] + d["Stress_Num"]) / 3.0,
    )

# Columns handed to XGBoost: raw numerics plus the two engineered interaction
# features pass through unscaled; the three categoricals are one-hot encoded.
categorical_features_xgb = ["Exercise Level", "Diet Type", "Stress Level"]
numeric_features_xgb = [
    "Sleep Hours",
    "Screen Time per Day (Hours)",
    "Work Hours per Week",
    "Social Interaction Score",
    "Sleep_exercise",
    "Healthy_index",
]

X_train_fe = add_engineered_features(X_train_raw)
X_test_fe  = add_engineered_features(X_test_raw)

# remainder="drop" discards every column not listed above, which includes the
# intermediate *_Num score columns added during feature engineering.
ohe_only_transformer = ColumnTransformer(
    transformers=[
        ("num", "passthrough", numeric_features_xgb),
        (
            "cat",
            OneHotEncoder(drop="first", sparse_output=False),
            categorical_features_xgb,
        ),
    ],
    remainder="drop",
)

# Fit the encoder on train only, reuse it for test, and store both as float32.
X_train_xgb = ohe_only_transformer.fit_transform(X_train_fe).astype("float32")
X_test_xgb  = ohe_only_transformer.transform(X_test_fe).astype("float32")

print("\nXGBoost feature shapes")
print(f"X_train_xgb: {X_train_xgb.shape}\nX_test_xgb : {X_test_xgb.shape}")

# Sanity check: y consistency
# The targets loaded from the processed folder must match the raw-file targets,
# otherwise the two feature sets would be scored against different labels.
assert np.allclose(y_train_arr, y_train_raw.to_numpy())
assert np.allclose(y_test_arr, y_test_raw.to_numpy())



XGBoost feature shapes
X_train_xgb: (1920, 14)
X_test_xgb : (600, 14)


In [5]:
# ============================================
# 4. Define models
#    - Baseline (DummyRegressor)
#    - Ridge (on encoded features)
#    - XGBoost (on XGBoost feature set, best params from Optuna)
# ============================================

# Reference point: always predicts the median of the training target.
baseline_model = DummyRegressor(strategy="median")

# Linear model with L2 penalty strength 1.0.
ridge_model = Ridge(alpha=1.0)

# XGBoost hyperparameters (best params from Optuna, per the header above),
# followed by the fixed run settings.
xgb_best_params = dict(
    n_estimators=432,
    max_depth=3,
    learning_rate=0.011214720931319094,
    subsample=0.9091705575686873,
    colsample_bytree=0.7710206698824025,
    min_child_weight=10,
    gamma=4.637013156646108,
    reg_alpha=0.0022450836842314863,
    reg_lambda=5.305992936571093e-07,
    # fixed run settings
    random_state=42,
    n_jobs=-1,
    tree_method="hist",
    objective="reg:squarederror",
)
xgb_model = XGBRegressor(**xgb_best_params)


In [6]:
# ============================================
# 5. Fit and evaluate models
# ============================================
def _split_metrics(y_true, y_pred, split):
    """Return MAE, RMSE and R^2 for one data split.

    Parameters
    ----------
    y_true, y_pred : array-like
        Observed and predicted target values for the split.
    split : str
        Suffix appended to each metric key (e.g. "train" or "test").

    Returns
    -------
    dict
        Keys ``mae_<split>``, ``rmse_<split>`` and ``r2_<split>``, in that order.
    """
    return {
        f"mae_{split}": mean_absolute_error(y_true, y_pred),
        f"rmse_{split}": np.sqrt(mean_squared_error(y_true, y_pred)),
        f"r2_{split}": r2_score(y_true, y_pred),
    }


# One entry per model: (row label, estimator, train features, test features).
# Baseline and Ridge use the encoded feature set; XGBoost uses its own.
# All three are fitted and scored against the same target arrays.
model_runs = [
    ("Baseline_DummyMedian", baseline_model, X_train_enc_df, X_test_enc_df),
    ("Ridge_Alpha1", ridge_model, X_train_enc_df, X_test_enc_df),
    ("XGBoost_OptunaBest", xgb_model, X_train_xgb, X_test_xgb),
]

results = []
for model_name, estimator, X_fit, X_eval in model_runs:
    estimator.fit(X_fit, y_train_arr)

    # After the loop these hold the predictions of the last model (XGBoost),
    # the same state the original copy-pasted version left behind.
    y_train_pred = estimator.predict(X_fit)
    y_test_pred  = estimator.predict(X_eval)

    results.append({
        "model": model_name,
        **_split_metrics(y_train_arr, y_train_pred, "train"),
        **_split_metrics(y_test_arr, y_test_pred, "test"),
    })



In [7]:

# ============================================
# 6. Build comparison table
# ============================================
# One row per model, one column per metric/split.
results_df = pd.DataFrame(results).set_index("model")

# NOTE(review): in the recorded run, Ridge and XGBoost both have a negative
# test R^2 and a slightly higher test MAE than the median baseline, so neither
# beats the baseline out of sample. Confirm this after re-running.
print("\nCombined model comparison (train vs test):")

# Ending the cell with the frame gives Jupyter's rich table display;
# print(results_df) wrapped the metric columns into two separate chunks.
results_df


Combined model comparison (train vs test):
                      mae_train  rmse_train  r2_train  mae_test  rmse_test  \
model                                                                        
Baseline_DummyMedian   2.217656    2.560881 -0.000002  2.212333   2.553214   
Ridge_Alpha1           2.192331    2.541054  0.015422  2.227348   2.574835   
XGBoost_OptunaBest     2.080469    2.415781  0.110108  2.214780   2.570807   

                       r2_test  
model                           
Baseline_DummyMedian -0.000061  
Ridge_Alpha1         -0.017071  
XGBoost_OptunaBest   -0.013891  
