# Optuna Hyperparameter Tuning – Pruned XGBoost Model

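This notebook performs automated hyperparameter tuning using Optuna on the pruned and regularized feature set derived in the previous notebook. It assumes that feature set and its target are already available in the kernel as `X_red` and `y_red`; neither is loaded or defined in the cells below.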

Objective: Minimize cross-validated RMSE on the training window (2019–2024).


In [None]:
# Libraries
import optuna
import numpy as np

from sklearn.model_selection import KFold
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
import xgboost as xgb

## Preprocessing Pipeline for Reduced Feature Set


In [None]:
# Identify numeric & categorical columns for this reduced feature set.
# "number" matches every numeric dtype (int32, float32, ...), not only
# int64/float64, so narrower numeric columns are no longer left out.
numeric_features_B = X_red.select_dtypes(include="number").columns.tolist()
categorical_features_B = X_red.select_dtypes(include=["object", "category"]).columns.tolist()

# ColumnTransformer drops every column that is routed to no transformer
# (remainder="drop" is its default), so surface such columns here instead of
# losing them silently further down.
_unassigned_B = [
    col for col in X_red.columns
    if col not in numeric_features_B and col not in categorical_features_B
]
if _unassigned_B:
    print("WARNING: columns not routed to any transformer (will be dropped):", _unassigned_B)

def create_preprocessor_B(numeric_features=None, categorical_features=None):
    """Build a fresh, unfitted preprocessing ColumnTransformer.

    Numeric columns are median-imputed and then standardized; categorical
    columns are imputed with the most frequent value and then one-hot
    encoded, with categories unseen during fit ignored at transform time.

    Parameters
    ----------
    numeric_features : list of column names, optional
        Columns sent through the numeric branch. Defaults to the
        module-level ``numeric_features_B``.
    categorical_features : list of column names, optional
        Columns sent through the categorical branch. Defaults to the
        module-level ``categorical_features_B``.

    Returns
    -------
    ColumnTransformer
        A new, unfitted transformer. Columns listed in neither argument are
        dropped, since ``remainder`` is left at its default.
    """
    # Fall back to the notebook-level lists so existing zero-argument calls
    # behave exactly as before.
    if numeric_features is None:
        numeric_features = numeric_features_B
    if categorical_features is None:
        categorical_features = categorical_features_B

    numeric_transformer_B = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ])

    categorical_transformer_B = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore"))
    ])

    # Copy the column lists so later mutation of the caller's lists cannot
    # change an already-built transformer.
    preprocessor_B = ColumnTransformer(
        transformers=[
            ("num", numeric_transformer_B, list(numeric_features)),
            ("cat", categorical_transformer_B, list(categorical_features)),
        ]
    )
    return preprocessor_B


## Optuna Objective Function (3-Fold Cross-Validation)


In [None]:
def objective(trial):
    """Optuna objective: mean 3-fold cross-validated RMSE of one candidate model.

    Samples one XGBoost configuration from ``trial``, wraps it in a pipeline
    behind the preprocessor from ``create_preprocessor_B()``, and scores it
    with shuffled 3-fold CV on the notebook-level ``X_red`` / ``y_red``.

    Parameters
    ----------
    trial : optuna trial object
        Used only through its ``suggest_int`` / ``suggest_float`` methods.

    Returns
    -------
    The mean of the per-fold RMSE values (lower is better).
    """
    # Candidate hyperparameters. The suggest_* calls are kept in a fixed
    # order so a seeded sampler draws the same sequence.
    search_space = dict(
        n_estimators=trial.suggest_int("n_estimators", 200, 800),
        learning_rate=trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        max_depth=trial.suggest_int("max_depth", 3, 8),
        min_child_weight=trial.suggest_int("min_child_weight", 1, 10),
        subsample=trial.suggest_float("subsample", 0.5, 1.0),
        colsample_bytree=trial.suggest_float("colsample_bytree", 0.5, 1.0),
        reg_lambda=trial.suggest_float("reg_lambda", 0.0, 5.0),
        reg_alpha=trial.suggest_float("reg_alpha", 0.0, 5.0),
    )

    regressor = xgb.XGBRegressor(
        objective="reg:squarederror",
        eval_metric="rmse",
        tree_method="hist",
        random_state=2025,
        **search_space,
    )

    # One pipeline object is reused for all folds; each fit() call refits
    # the preprocessing and the model on that fold's training rows.
    estimator = Pipeline(steps=[
        ("preprocess", create_preprocessor_B()),
        ("model", regressor),
    ])

    def _fold_rmse(fit_rows, holdout_rows):
        """Fit on ``fit_rows`` and return the RMSE on ``holdout_rows``."""
        estimator.fit(X_red.iloc[fit_rows], y_red.iloc[fit_rows])
        predictions = estimator.predict(X_red.iloc[holdout_rows])
        return np.sqrt(mean_squared_error(y_red.iloc[holdout_rows], predictions))

    # Fixed seed => every trial is scored on the same three splits.
    splitter = KFold(n_splits=3, shuffle=True, random_state=2025)
    fold_scores = [
        _fold_rmse(fit_rows, holdout_rows)
        for fit_rows, holdout_rows in splitter.split(X_red)
    ]

    # Optuna will MINIMIZE the average RMSE
    return np.mean(fold_scores)

## Run Optuna Study


In [None]:
# Seed the sampler so the search proposes the same sequence of trials on
# every Restart & Run All; without an explicit sampler Optuna uses an
# unseeded one and the best parameters differ between runs.
# NOTE: this only yields repeatable results with sequential optimization
# (the default here) and an objective that is itself deterministic.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=2025),
)

# Each trial fits the pipeline three times (3-fold CV), so 700 trials is a
# long run; lower n_trials for a quick smoke test.
study.optimize(objective, n_trials=700, show_progress_bar=True)

print("Best RMSE:", study.best_value)
print("Best params:", study.best_params)


## Final Optimized Model Construction


In [None]:
# Winning hyperparameters found by the study.
best_params = study.best_params
print(best_params)

# Fresh, unfitted preprocessing for the final pipeline.
preprocessor_B = create_preprocessor_B()

# Same fixed XGBoost settings as in the objective, plus the tuned values.
best_xgb = xgb.XGBRegressor(
    **best_params,
    random_state=2025,
    tree_method="hist",
    eval_metric="rmse",
    objective="reg:squarederror",
)

# Assembled but not fitted here.
best_pipeline = Pipeline(
    steps=[
        ("preprocess", preprocessor_B),
        ("model", best_xgb),
    ]
)


## Optimized Pipeline Ready for Final Training

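The pipeline constructed above (`best_pipeline`) is configured with the best hyperparameters found but has not been fitted yet. It will be used in the next notebook to train the final model and evaluate performance on the holdout test set.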
