In [13]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import optuna
import mlflow
import mlflow.xgboost

In [14]:
#import sys, xgboost as xgb
#print(sys.executable)        
#print(xgb.__version__)     
#print(xgb.__file__)         

## Loading Datasets

In [21]:
train_df = pd.read_csv("../data/processed/train_fe.csv")
val_df = pd.read_csv("../data/processed/val_fe.csv")

target = "price"
x_train = train_df.drop(columns=[target])
y_train = train_df[target]
x_val = val_df.drop(columns=[target])
y_val = val_df[target]

## Hyperparametertuning on the xgboost alogorithm

In [16]:
# Define the objective function for Optuna
def objective(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 200, 1000),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "gamma": trial.suggest_float("gamma", 0.0, 5.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 10.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 10.0, log=True),
        "random_state": 42,
        "n_jobs": -1,
        "tree_method": "hist",
    }
    
    with mlflow.start_run():
        # Train the model
        model = XGBRegressor(**params)
        model.fit(x_train, y_train)
        y_pred = model.predict(x_val)
        # Evaluate the model
        rmse = mean_squared_error(y_val, y_pred)
        mae = mean_absolute_error(y_val, y_pred)
        r2 = r2_score(y_val, y_pred)
        # Log parameters and metrics to MLflow
        mlflow.log_params(params)
        mlflow.log_metrics({"rmse": rmse, "mae": mae, "r2": r2})
        
        return mae

In [17]:
# Force MLflow to always use the root project mlruns folder
mlflow.set_tracking_uri(R"../mlruns")
mlflow.set_experiment("xgboost_optuna_housing")

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=15)

print("Best params:", study.best_trial.params)

[I 2025-11-14 19:17:54,151] A new study created in memory with name: no-name-f4a29568-0a9d-4632-aeb3-a333a67fb141
[I 2025-11-14 19:18:14,181] Trial 0 finished with value: 33763.297872346964 and parameters: {'n_estimators': 635, 'max_depth': 7, 'learning_rate': 0.21390188711524002, 'subsample': 0.8453805116332027, 'colsample_bytree': 0.5184616404852285, 'min_child_weight': 4, 'gamma': 0.7926847632515838, 'reg_alpha': 2.695521781386684e-08, 'reg_lambda': 0.008564557788379529}. Best is trial 0 with value: 33763.297872346964.
[I 2025-11-14 19:18:30,466] Trial 1 finished with value: 32930.02844735159 and parameters: {'n_estimators': 460, 'max_depth': 6, 'learning_rate': 0.05960676405524866, 'subsample': 0.5475112753470821, 'colsample_bytree': 0.711990596581848, 'min_child_weight': 4, 'gamma': 2.4290958864233234, 'reg_alpha': 0.019191977370647994, 'reg_lambda': 0.0003166663797957635}. Best is trial 1 with value: 32930.02844735159.
[I 2025-11-14 19:19:05,596] Trial 2 finished with value: 3223

Best params: {'n_estimators': 991, 'max_depth': 9, 'learning_rate': 0.023991526368951355, 'subsample': 0.680945671730209, 'colsample_bytree': 0.9745226692964004, 'min_child_weight': 7, 'gamma': 3.8769230844534013, 'reg_alpha': 9.577996336599302, 'reg_lambda': 4.420484791413238e-08}


## Fitting best Model

In [None]:
print("Best trial:", study.best_trial)
print("Number of finished trials:", len(study.trials))

print("Best MAE:", study.best_value)
# Fitting best Model
best_params = study.best_trial.params
print("Training best model with params:", best_params)

Best trial: FrozenTrial(number=13, state=1, values=[30703.453509694627], datetime_start=datetime.datetime(2025, 11, 14, 19, 23, 38, 161752), datetime_complete=datetime.datetime(2025, 11, 14, 19, 24, 34, 912586), params={'n_estimators': 991, 'max_depth': 9, 'learning_rate': 0.023991526368951355, 'subsample': 0.680945671730209, 'colsample_bytree': 0.9745226692964004, 'min_child_weight': 7, 'gamma': 3.8769230844534013, 'reg_alpha': 9.577996336599302, 'reg_lambda': 4.420484791413238e-08}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'n_estimators': IntDistribution(high=1000, log=False, low=200, step=1), 'max_depth': IntDistribution(high=10, log=False, low=3, step=1), 'learning_rate': FloatDistribution(high=0.3, log=True, low=0.01, step=None), 'subsample': FloatDistribution(high=1.0, log=False, low=0.5, step=None), 'colsample_bytree': FloatDistribution(high=1.0, log=False, low=0.5, step=None), 'min_child_weight': IntDistribution(high=10, log=False, low=1, step=1), 

## Registering the best model to MLflow Model Registry

In [19]:
best_params = study.best_trial.params
best_model = XGBRegressor(**best_params)
best_model.fit(x_train, y_train)

y_pred = best_model.predict(x_val)

mae = mean_absolute_error(y_val, y_pred)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
r2 = r2_score(y_val, y_pred)

print("Final tuned model performance:")
print("MAE:", mae)
print("RMSE:", rmse)
print("R²:", r2)

# Log final model
with mlflow.start_run(run_name="best_xgboost_model"):
    mlflow.log_params(best_params)
    mlflow.log_metrics({"rmse": rmse, "mae": mae, "r2": r2})
    mlflow.xgboost.log_model(best_model, name="model",
        registered_model_name="housing_xgboost_model" )

Final tuned model performance:
MAE: 30918.97183521038
RMSE: 75106.29011173922
R²: 0.9563977137984534


  self.get_booster().save_model(fname)
Registered model 'housing_xgboost_model' already exists. Creating a new version of this model...
Created version '2' of model 'housing_xgboost_model'.


## Loading best model to MLflow Model Registry

In [23]:
# Load the model for inference

model_uri = "models:/housing_xgboost_model/2"
loaded_model = mlflow.xgboost.load_model(model_uri)

preds = loaded_model.predict(x_val)

In [24]:
print("Made predictions:", preds[:5])

Made predictions: [ 168306.12  136872.55 1577454.2   502785.22  194170.83]
