In [7]:
# ==============================================
# 1. Imports
# ==============================================
import os
from pathlib import Path

import mlflow
import mlflow.xgboost
import numpy as np
import optuna
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
# Directory holding the pre-split CSV files. The original local path remains
# the default; set the ACIT4510_DATA_DIR environment variable to run this
# notebook from another machine without editing the cell.
data_path = os.environ.get(
    "ACIT4510_DATA_DIR",
    r"C:\Users\sacii\OneDrive\Desktop\Oslomet\ACIT4510 Statistical learning\ACIT4510_statistical_learning_project\data\raw",
)
# Join file names with pathlib instead of a literal "\" so an overridden
# directory also works with POSIX-style paths.
data_dir = Path(data_path)

train_df = pd.read_csv(data_dir / "train_df.csv")
test_df = pd.read_csv(data_dir / "test_df.csv")
eval_df = pd.read_csv(data_dir / "eval_df.csv")

print("Train:", train_df.shape, "| Test:", test_df.shape, "| Eval:", eval_df.shape)

target_col = "Happiness Score"

# Separate features (X) from the target (y) for the train and test splits.
# eval_df is only loaded here; it is not split in this cell.
X_train = train_df.drop(columns=target_col).copy()
y_train = train_df[target_col].copy()

X_test = test_df.drop(columns=target_col).copy()
y_test = test_df[target_col].copy()


Train: (1920, 12) | Test: (600, 12) | Eval: (480, 12)


In [13]:
# Numeric scores on a 0-1 scale for each categorical lifestyle column.
# They are averaged into "Healthy_index" by add_engineered_features().
exercise_map = {"Low": 0.0, "Moderate": 0.5, "High": 1.0}
diet_map = {
    "Junk Food": 0.0,
    "Keto": 0.5,
    "Vegetarian": 0.7,
    "Vegan": 0.9,
    "Balanced": 1.0,
}
stress_map = {"High": 0.0, "Moderate": 0.5, "Low": 1.0}


def _encode_levels(df: pd.DataFrame, column: str, mapping: dict) -> pd.Series:
    """Map the labels in ``df[column]`` to their numeric score, rejecting unknown labels."""
    labels = df[column]
    scores = labels.map(mapping)
    # A NaN score on a non-missing label means the label is absent from the
    # mapping (typo, new category, different casing). Fail loudly instead of
    # letting NaN leak into the derived features.
    unknown = labels[scores.isna() & labels.notna()]
    if not unknown.empty:
        raise ValueError(
            f"Unmapped value(s) in column {column!r}: "
            f"{sorted(unknown.astype(str).unique())}; "
            f"expected one of {list(mapping)}"
        )
    return scores


def add_engineered_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with numeric lifestyle scores and two derived features.

    Added columns:
      * ``Exercise_Num``, ``Diet_Num``, ``Stress_Num`` - the 0-1 scores of
        "Exercise Level", "Diet Type" and "Stress Level".
      * ``Sleep_exercise`` - "Sleep Hours" multiplied by ``Exercise_Num``.
      * ``Healthy_index`` - mean of the three numeric scores.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain "Exercise Level", "Diet Type", "Stress Level" and
        "Sleep Hours". The input frame is not modified.

    Returns
    -------
    pd.DataFrame
        Copy of ``df`` with the five columns above appended.

    Raises
    ------
    KeyError
        If a required column is missing.
    ValueError
        If a categorical column holds a non-missing label that has no entry
        in its mapping. Missing values (NaN/None) are passed through as NaN.
    """
    df = df.copy()
    df["Exercise_Num"] = _encode_levels(df, "Exercise Level", exercise_map)
    df["Diet_Num"] = _encode_levels(df, "Diet Type", diet_map)
    df["Stress_Num"] = _encode_levels(df, "Stress Level", stress_map)

    df["Sleep_exercise"] = df["Sleep Hours"] * df["Exercise_Num"]
    df["Healthy_index"] = (df["Exercise_Num"] + df["Diet_Num"] + df["Stress_Num"]) / 3.0
    return df

# Derive the engineered columns for the train and test feature frames, in that order.
X_train_fe, X_test_fe = (
    add_engineered_features(frame) for frame in (X_train, X_test)
)


In [16]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Numeric inputs handed to XGBoost unchanged: four raw columns plus the two
# engineered ones ("Sleep_exercise", "Healthy_index").
numeric_features_xgb = [
    "Sleep Hours", "Screen Time per Day (Hours)", "Work Hours per Week",
    "Social Interaction Score", "Sleep_exercise", "Healthy_index",
]

# Categorical inputs that get one-hot encoded.
categorical_features_xgb = [
    "Exercise Level",
    "Diet Type",
    "Stress Level",
]

# Numerics pass straight through; categoricals are one-hot encoded with the
# first level of each dropped; every column not listed above is discarded.
ohe_only_transformer = ColumnTransformer(
    [
        ("num", "passthrough", numeric_features_xgb),
        (
            "cat",
            OneHotEncoder(drop="first", sparse_output=False),
            categorical_features_xgb,
        ),
    ],
    remainder="drop",
)

# Fit on the training frame only, reuse the fitted transformer for the test
# frame, and cast both matrices to float32.
X_train_xgb = ohe_only_transformer.fit_transform(X_train_fe).astype("float32")
X_test_xgb = ohe_only_transformer.transform(X_test_fe).astype("float32")

print("X_train_xgb shape:", X_train_xgb.shape)
print("X_test_xgb  shape:", X_test_xgb.shape)


X_train_xgb shape: (1920, 14)
X_test_xgb  shape: (600, 14)


In [17]:
def objective(trial):
    """Optuna objective: fit one XGBRegressor with sampled hyperparameters and return its RMSE.

    The model is trained on the module-level ``X_train_xgb`` / ``y_train`` and
    scored on ``X_test_xgb`` / ``y_test``. Every call is recorded as an MLflow
    run (opened with ``nested=True``) holding the full parameter set together
    with the RMSE, MAE and R2 metrics.

    Parameters
    ----------
    trial : optuna trial object
        Used to draw the hyperparameter values for this evaluation.

    Returns
    -------
    float
        RMSE of the fitted model on the test split.
    """
    # Values drawn from the search space for this trial.
    sampled = {
        "n_estimators": trial.suggest_int("n_estimators", 200, 1000),
        "max_depth": trial.suggest_int("max_depth", 3, 8),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2, log=True),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "gamma": trial.suggest_float("gamma", 0.0, 5.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 10.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 10.0, log=True),
    }
    # Settings held constant across all trials.
    fixed = {
        "random_state": 42,
        "n_jobs": -1,
        "tree_method": "hist",
        "objective": "reg:squarederror",
    }
    params = {**sampled, **fixed}

    with mlflow.start_run(nested=True):
        regressor = XGBRegressor(**params)
        regressor.fit(X_train_xgb, y_train)

        # NOTE(review): hyperparameters are selected against the test split;
        # confirm a separate hold-out (e.g. eval_df) is kept for the final estimate.
        predictions = regressor.predict(X_test_xgb)

        scores = {
            "rmse": float(np.sqrt(mean_squared_error(y_test, predictions))),
            "mae": float(mean_absolute_error(y_test, predictions)),
            "r2": float(r2_score(y_test, predictions)),
        }

        mlflow.log_params(params)
        mlflow.log_metrics(scores)

    return scores["rmse"]


In [18]:
# ============================================
# 7. Run Optuna study with MLflow
# ============================================
mlflow_path = r"C:\Users\sacii\OneDrive\Desktop\Oslomet\ACIT4510 Statistical learning\ACIT4510_statistical_learning_project\mlflow"

mlflow.set_tracking_uri(f"file:///{mlflow_path}")
mlflow.set_experiment("xgboost_optuna_ACIT4510_statistical_learning_project")

# Seed the sampler so a rerun proposes the same sequence of hyperparameters;
# without it the search (and the reported best trial) changes on every run.
# The seed matches the random_state used for the model in objective().
study = optuna.create_study(
    direction="minimize",
    study_name="xgb_happiness_optuna_testset",
    sampler=optuna.samplers.TPESampler(seed=42),
)

# objective() opens each trial's run with nested=True, so provide the parent
# run for those trials to nest under and record the winning configuration on
# it. nested=True here keeps the cell usable when a run is already active.
with mlflow.start_run(run_name=study.study_name, nested=True):
    study.optimize(objective, n_trials=15)
    mlflow.log_params({f"best_{name}": value for name, value in study.best_params.items()})
    mlflow.log_metric("best_rmse", study.best_value)

print("\n====================")
print("Best RMSE:", study.best_value)
print("Best parameters:")
for k, v in study.best_params.items():
    print(f"  {k}: {v}")
print("====================")

[I 2025-11-16 00:58:45,751] A new study created in memory with name: xgb_happiness_optuna_testset
[I 2025-11-16 00:58:46,451] Trial 0 finished with value: 2.865516031271365 and parameters: {'n_estimators': 758, 'max_depth': 5, 'learning_rate': 0.06613216171036432, 'subsample': 0.6338707668470208, 'colsample_bytree': 0.6813336471522998, 'min_child_weight': 3, 'gamma': 1.8843892715888981, 'reg_alpha': 6.813824749654972e-07, 'reg_lambda': 2.1455406366811024e-05}. Best is trial 0 with value: 2.865516031271365.
[I 2025-11-16 00:58:47,107] Trial 1 finished with value: 2.7776298924614657 and parameters: {'n_estimators': 810, 'max_depth': 6, 'learning_rate': 0.04484319350612686, 'subsample': 0.6858224152712168, 'colsample_bytree': 0.6896249760755115, 'min_child_weight': 9, 'gamma': 1.270049489993691, 'reg_alpha': 5.107827076840978, 'reg_lambda': 0.00019296230726206483}. Best is trial 1 with value: 2.7776298924614657.
[I 2025-11-16 00:58:47,583] Trial 2 finished with value: 2.6974948036361055 a


Best RMSE: 2.5708066621486463
Best parameters:
  n_estimators: 432
  max_depth: 3
  learning_rate: 0.011214720931319094
  subsample: 0.9091705575686873
  colsample_bytree: 0.7710206698824025
  min_child_weight: 10
  gamma: 4.637013156646108
  reg_alpha: 0.0022450836842314863
  reg_lambda: 5.305992936571093e-07
