In [8]:
# ============================================
# 1. Imports
# ============================================
import os

import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from xgboost import XGBRegressor

In [9]:

# ============================================
# 2. Load raw data
# ============================================
# NOTE(review): absolute, machine-specific path; the notebook will not run on
# another machine without editing this line.
data_path_raw = r"C:\Users\sacii\OneDrive\Desktop\Oslomet\ACIT4510 Statistical learning\ACIT4510_statistical_learning_project\data\raw"

# Read both pre-split CSVs from the raw-data folder in one pass.
train_df, test_df = (
    pd.read_csv(data_path_raw + "\\" + csv_name)
    for csv_name in ("train_df.csv", "test_df.csv")
)

print("Train:", train_df.shape, "| Test:", test_df.shape)

Train: (1920, 12) | Test: (600, 12)


In [10]:
# ============================================
# 3. Separate target
# ============================================
target_col = "Happiness Score"

# Work on copies so train_df / test_df stay untouched; popping the target
# column leaves the feature frame behind and hands back the target Series.
X_train = train_df.copy()
y_train = X_train.pop(target_col)

X_test = test_df.copy()
y_test = X_test.pop(target_col)

In [11]:

# ============================================
# 4. Feature engineering: Sleep_exercise + Healthy_index
# ============================================
# Ordinal scores in [0, 1] for the three lifestyle label columns.
exercise_map = {"Low": 0.0, "Moderate": 0.5, "High": 1.0}
diet_map = {
    "Junk Food": 0.0,
    "Keto": 0.5,
    "Vegetarian": 0.7,
    "Vegan": 0.9,
    "Balanced": 1.0,
}
# Inverted relative to exercise_map: "Low" stress gets the highest score.
stress_map = {"High": 0.0, "Moderate": 0.5, "Low": 1.0}


def add_engineered_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with ordinal scores and two derived features.

    Columns appended, in this order:

    * ``Exercise_Num`` -- ``Exercise Level`` mapped through ``exercise_map``
    * ``Diet_Num``     -- ``Diet Type`` mapped through ``diet_map``
    * ``Stress_Num``   -- ``Stress Level`` mapped through ``stress_map``
    * ``Sleep_exercise`` -- ``Sleep Hours`` * ``Exercise_Num``
    * ``Healthy_index``  -- arithmetic mean of the three ``*_Num`` scores

    The input frame is not modified. Missing labels (NaN/None) stay NaN in
    the score columns and therefore in the derived features.

    Raises
    ------
    KeyError
        If one of the required input columns is absent.
    ValueError
        If a label column contains a non-null value that its map does not
        cover. A plain ``.map()`` would silently turn such values (typos,
        different casing, stray whitespace) into NaN.
    """
    df = df.copy()

    ordinal_columns = (
        ("Exercise Level", "Exercise_Num", exercise_map),
        ("Diet Type", "Diet_Num", diet_map),
        ("Stress Level", "Stress_Num", stress_map),
    )
    for label_col, score_col, mapping in ordinal_columns:
        labels = df[label_col]
        # Only non-null values count as unmapped; genuine missing data passes through.
        unmapped = labels[labels.notna() & ~labels.isin(list(mapping))]
        if not unmapped.empty:
            raise ValueError(
                f"Unmapped value(s) in {label_col!r}: "
                f"{sorted(unmapped.astype(str).unique())}; "
                f"expected one of {list(mapping)}"
            )
        df[score_col] = labels.map(mapping)

    df["Sleep_exercise"] = df["Sleep Hours"] * df["Exercise_Num"]
    df["Healthy_index"] = (df["Exercise_Num"] + df["Diet_Num"] + df["Stress_Num"]) / 3.0
    return df

# Run the identical feature-engineering step on both splits.
X_train_fe, X_test_fe = (
    add_engineered_features(split) for split in (X_train, X_test)
)

print("Columns after feature engineering:")
print(list(X_train_fe.columns))


Columns after feature engineering:
['Country', 'Age', 'Gender', 'Exercise Level', 'Diet Type', 'Sleep Hours', 'Stress Level', 'Mental Health Condition', 'Work Hours per Week', 'Screen Time per Day (Hours)', 'Social Interaction Score', 'Exercise_Num', 'Diet_Num', 'Stress_Num', 'Sleep_exercise', 'Healthy_index']


In [12]:
# ============================================
# 5. One-hot encode categoricals (NO SCALING)
# ============================================
# Columns forwarded to the model unchanged (no scaling step in this pipeline).
numeric_features_xgb = [
    "Sleep Hours",
    "Screen Time per Day (Hours)",
    "Work Hours per Week",
    "Social Interaction Score",
    "Sleep_exercise",
    "Healthy_index",
]

# Label columns that get one-hot encoded, with the first level of each dropped.
categorical_features_xgb = ["Exercise Level", "Diet Type", "Stress Level"]

# remainder="drop": every column not listed above (including the *_Num helper
# columns) is excluded from the model matrix.
ohe_only_transformer = ColumnTransformer(
    [
        ("num", "passthrough", numeric_features_xgb),
        ("cat", OneHotEncoder(drop="first", sparse_output=False), categorical_features_xgb),
    ],
    remainder="drop",
)

# Fit the encoder on the training split only, reuse it for the test split,
# and cast both matrices to float32.
X_train_xgb = ohe_only_transformer.fit_transform(X_train_fe).astype("float32")
X_test_xgb = ohe_only_transformer.transform(X_test_fe).astype("float32")

print("X_train_xgb shape:", X_train_xgb.shape)
print("X_test_xgb  shape:", X_test_xgb.shape)


X_train_xgb shape: (1920, 14)
X_test_xgb  shape: (600, 14)


In [13]:

# ============================================
# 6. Final XGBoost model with best Optuna params
# ============================================
# Hyperparameters hard-coded from the Optuna search (the search itself is not
# part of this notebook), followed by the fixed run settings.
best_params = dict(
    n_estimators=432,
    max_depth=3,
    learning_rate=0.011214720931319094,
    subsample=0.9091705575686873,
    colsample_bytree=0.7710206698824025,
    min_child_weight=10,
    gamma=4.637013156646108,
    reg_alpha=0.0022450836842314863,
    reg_lambda=5.305992936571093e-07,
    # Fixed settings: seed, parallelism, tree builder and loss.
    random_state=42,
    n_jobs=-1,
    tree_method="hist",
    objective="reg:squarederror",
)

# Train the final regressor on the full training matrix.
final_model = XGBRegressor(**best_params)
final_model.fit(X_train_xgb, y_train)

# ============================================
# 7. Evaluate on TEST (final performance)
# ============================================
y_test_pred = final_model.predict(X_test_xgb)

# MAE, RMSE (square root of the MSE) and R² on the held-out test split.
mae_test, rmse_test, r2_test = (
    mean_absolute_error(y_test, y_test_pred),
    np.sqrt(mean_squared_error(y_test, y_test_pred)),
    r2_score(y_test, y_test_pred),
)

# NOTE(review): the recorded run reports a slightly negative R² (about -0.014),
# i.e. no improvement over predicting the test-set mean.
print("\nFinal XGBoost Results (TEST set)")
print(
    *(
        f"{label} {score!s}"
        for label, score in (("MAE :", mae_test), ("RMSE:", rmse_test), ("R²  :", r2_test))
    ),
    sep="\n",
)



Final XGBoost Results (TEST set)
MAE : 2.214779945373535
RMSE: 2.5708066621486463
R²  : -0.013890756178241093


In [15]:

# ============================================
# 8. Save outputs
# ============================================
# NOTE(review): absolute, machine-specific path; the notebook will not run on
# another machine without editing this line.
save_path_proc = r"C:\Users\sacii\OneDrive\Desktop\Oslomet\ACIT4510 Statistical learning\ACIT4510_statistical_learning_project\data\processed"

# np.save and DataFrame.to_csv do not create missing folders, so make sure the
# output directory exists before writing anything.
os.makedirs(save_path_proc, exist_ok=True)

# Encoded model matrices, stored as .npy.
np.save(os.path.join(save_path_proc, "X_train_xgb.npy"), X_train_xgb)
np.save(os.path.join(save_path_proc, "X_test_xgb.npy"), X_test_xgb)

# Test-set targets next to the model predictions, one row per test sample.
pd.DataFrame({"y_test": y_test, "y_test_pred": y_test_pred}).to_csv(
    os.path.join(save_path_proc, "y_test_preds_xgb.csv"), index=False
)

print("\nSaved XGBoost data + predictions.")



Saved XGBoost data + predictions.
