In [11]:
import pandas as pd
from sklearn.metrics import root_mean_squared_error
from sklearn.feature_extraction import DictVectorizer

In [2]:
import xgboost as xgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

## what is this ?
1. xgboost module is imported to use the XGBoost model. see "XGBoost.ipynb" where i tried to make a simple regression model
2. hyperopt: a library that uses **Bayesian methods** to search for the best set of hyperparameters.
   From hyperopt's point of view, your model is just a function that takes an input (a combination of hyperparameters) and outputs a value. That function is called the ***objective function***.
   1. fmin: Runs the optimization loop to find the best hyperparameter set (the combination that produces the lowest RMSE).
      It always minimizes the output of the objective function, so if your evaluation metric should be maximized instead (e.g. accuracy), return its negative as the loss.
   2. tpe: The algorithm used to pick the next best parameters to try (TPE = Tree-structured Parzen Estimator)
   3. hp: library that contains a bunch of different methods to define the search space (the ranges of each hyperparameter that we will use).
   4. STATUS_OK: signal that we will send at the end of each run to tell hyperopt that the objective function has run successfully.
   5. Trials: will keep track of the information of each run
3. scope: used to cast values inside the **search space** to integer, float, etc.

Summary:
Different hyperparameter sets are sampled from the search space defined with the **hp** module, and each new set is chosen by the **tpe** algorithm. **fmin** runs the objective function with each set and returns the set that produces the minimum RMSE. As its name suggests, it minimizes the RMSE by running the model again and again with different hyperparameter sets.

In [3]:
def read_dataframe(filename):
    """Load a green-taxi trip file and prepare it for trip-duration modelling.

    Parameters
    ----------
    filename : str
        Path to a ``.csv`` or ``.parquet`` file. The data must contain the
        ``lpep_pickup_datetime``, ``lpep_dropoff_datetime``, ``PULocationID``
        and ``DOLocationID`` columns.

    Returns
    -------
    pandas.DataFrame
        Only the trips lasting between 1 and 60 minutes (inclusive), with an
        added ``duration`` column (minutes) and both location-ID columns cast
        to ``str``.

    Raises
    ------
    ValueError
        If ``filename`` ends with neither ``.csv`` nor ``.parquet``.
    """
    if filename.endswith('.csv'):
        df = pd.read_csv(filename)

        # CSV has no datetime type, so the timestamp columns are parsed explicitly.
        df.lpep_dropoff_datetime = pd.to_datetime(df.lpep_dropoff_datetime)
        df.lpep_pickup_datetime = pd.to_datetime(df.lpep_pickup_datetime)
    elif filename.endswith('.parquet'):
        # NOTE(review): assumes the parquet file already stores typed datetimes.
        df = pd.read_parquet(filename)
    else:
        # Previously this fell through and failed later with an unbound `df`.
        raise ValueError(
            f"Unsupported file type for {filename!r}: expected '.csv' or '.parquet'"
        )

    # Vectorized timedelta -> minutes conversion (no per-row Python lambda).
    trip_length = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df['duration'] = trip_length.dt.total_seconds() / 60

    # Copy the filtered rows so the column assignment below writes to an
    # independent frame instead of a view of the unfiltered data.
    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

In [4]:
# Training data: the 2021-01 green-taxi file.
df_train = read_dataframe(filename='../data/green_tripdata_2021-01.parquet')
# Validation data: the 2021-02 green-taxi file.
df_val = read_dataframe(filename='../data/green_tripdata_2021-02.parquet')

In [5]:
# Number of rows in the training and validation frames, in that order.
tuple(len(frame) for frame in (df_train, df_val))

(73908, 61921)

In [6]:
# Join pickup and drop-off location IDs into one "<pickup>_<dropoff>" feature.
df_train['PU_DO'] = df_train['PULocationID'].str.cat(df_train['DOLocationID'], sep='_')
df_val['PU_DO'] = df_val['PULocationID'].str.cat(df_val['DOLocationID'], sep='_')

In [7]:
# Feature columns: the combined pickup/drop-off feature is used in place of
# the separate 'PULocationID' and 'DOLocationID' columns.
categorical = ['PU_DO']
numerical = ['trip_distance']

# One dict per row, as expected by DictVectorizer.
train_dicts = df_train[categorical + numerical].to_dict(orient='records')
val_dicts = df_val[categorical + numerical].to_dict(orient='records')

# Fit the vectorizer on the training rows only, then reuse it unchanged for
# the validation rows.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

In [8]:
# Regression target: the 'duration' column, extracted as plain arrays.
target = 'duration'
y_train, y_val = df_train[target].values, df_val[target].values

In [15]:
import mlflow

# Store tracking data in a local SQLite database file.
mlflow.set_tracking_uri(uri="sqlite:///mlflow.db")
# Select the experiment that subsequent runs are recorded under.
mlflow.set_experiment(experiment_name="nyc-taxi-experiment")

<Experiment: artifact_location='/workspaces/MLOps-zoomcamp/02. Experiment Tracking/mlruns/1', creation_time=1749581082720, experiment_id='1', last_update_time=1749581082720, lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>

In [10]:
# xgb.DMatrix is XGBoost's own data container. It bundles the feature matrix
# with its labels and is the input type the native xgb.train API works with.
# The same two objects are reused by every training cell below.

train = xgb.DMatrix(data=X_train, label=y_train)
valid = xgb.DMatrix(data=X_val, label=y_val)

In [None]:

def objective(params: dict) -> dict:
    """Train one XGBoost model for a hyperopt trial and report its validation RMSE.

    Training works as follows:
        Start training (up to ``num_boost_round`` = 100 rounds).
        After each round:
            Train a new tree.
            Evaluate on the validation set.
            Check whether the validation metric improved.
        If it has not improved for ``early_stopping_rounds`` = 20 rounds in a
        row, training stops early.

    The sampled parameters and the resulting RMSE are logged to a new MLflow
    run tagged ``model=xgboost``.

    Parameters
    ----------
    params : dict
        XGBoost training parameters for this trial (one sample from the
        hyperopt search space).

    Returns
    -------
    dict
        ``{'loss': <validation RMSE>, 'status': STATUS_OK}``.
    """
    with mlflow.start_run():
        mlflow.set_tag("model", "xgboost")
        mlflow.log_params(params)
        booster = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=100,  # upper bound on the number of boosting rounds (trees)
            evals=[(valid, 'validation')],  # evaluated after every round; drives early stopping
            early_stopping_rounds=20  # stop once the validation score has not improved for 20 rounds
        )
        # Score with the trees up to the best round found by early stopping, so
        # the loss reported to hyperopt is not inflated by the extra rounds
        # trained after the optimum. Passing the range explicitly is harmless
        # if the installed XGBoost already defaults to the best iteration.
        y_pred = booster.predict(
            valid, iteration_range=(0, booster.best_iteration + 1)
        )
        rmse = root_mean_squared_error(y_val, y_pred)
        mlflow.log_metric("rmse", rmse)

    return {'loss': rmse, 'status': STATUS_OK}

In [12]:
# Search space explored by hyperopt.
search_space = {
    # hp.quniform draws from 4 to 100 in steps of 1; scope.int casts the draw
    # to an integer.
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
    # hp.loguniform(label, low, high) draws exp(uniform(low, high)), so the
    # bounds below are exponents: learning_rate lies in [exp(-3), exp(0)].
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
    # Squared-error regression objective; the validation metric printed after
    # each round (and used for early stopping) is RMSE.
    'objective': 'reg:squarederror',
    # Fixed seed for reproducible XGBoost training.
    'seed': 42
}

# Keep the Trials object in a variable so the per-trial results are still
# available after the search (previously it was created inline and discarded).
trials = Trials()

# fn is the objective function that trains the model: it takes the sampled
# params as input and returns a dict with the keys "loss" and "status".
best_result = fmin(
    fn=objective,
    space=search_space,
    # How the next parameter set is chosen: TPE (Tree-structured Parzen
    # Estimator) uses past trials to propose better values.
    algo=tpe.suggest,
    # Number of trials, each run with a different hyperparameter set.
    max_evals=10,
    trials=trials
)

NameError: name 'objective' is not defined

In [None]:
# Best hyperparameter combination found by fmin (the one with the lowest loss).
# Values are returned as floats (max_depth shows up as 74.0 below), so
# integer parameters need to be cast before they are reused for training.
best_result

{'learning_rate': np.float64(0.06795566766046571),
 'max_depth': np.float64(74.0),
 'min_child_weight': np.float64(1.1034760099449035),
 'reg_alpha': np.float64(0.08418429054929681),
 'reg_lambda': np.float64(0.007240669500118009)}

### MLFlow auto logging

I will try to train a new model with best hyper-parameters obtained from the previous step using mlflow.autolog

Autolog works only for specific frameworks; XGBoost is one of them.

In [13]:
# Best parameters from the hyperopt search, with max_depth written as an int.
best_hps = dict(
    learning_rate=0.06795566766046571,
    max_depth=74,
    min_child_weight=1.1034760099449035,
    reg_alpha=0.08418429054929681,
    reg_lambda=0.007240669500118009,
    objective='reg:squarederror',
    seed=42,
)

In [None]:
# Turn on MLflow autologging for XGBoost before training, so the run below is
# tracked without explicit log_* calls.
mlflow.xgboost.autolog()

# Retrain with the best hyperparameters: at most 1000 rounds, stopping early
# after 50 rounds without improvement on the validation set.
booster = xgb.train(
  best_hps,
  train,
  num_boost_round=1000,
  early_stopping_rounds=50,
  evals=[(valid, "validation")],
)

2025/06/11 04:17:03 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '374d05143647477991b1e1c99275b53f', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current xgboost workflow


[0]	validation-rmse:11.65641
[1]	validation-rmse:11.14813
[2]	validation-rmse:10.68574
[3]	validation-rmse:10.26421
[4]	validation-rmse:9.88213
[5]	validation-rmse:9.53492
[6]	validation-rmse:9.22268
[7]	validation-rmse:8.93914
[8]	validation-rmse:8.68309
[9]	validation-rmse:8.45363
[10]	validation-rmse:8.24733
[11]	validation-rmse:8.06091
[12]	validation-rmse:7.89570
[13]	validation-rmse:7.74545
[14]	validation-rmse:7.61146
[15]	validation-rmse:7.49190
[16]	validation-rmse:7.38524
[17]	validation-rmse:7.29063
[18]	validation-rmse:7.20510
[19]	validation-rmse:7.12936
[20]	validation-rmse:7.06096
[21]	validation-rmse:7.00094
[22]	validation-rmse:6.94597
[23]	validation-rmse:6.89824
[24]	validation-rmse:6.85446
[25]	validation-rmse:6.81548
[26]	validation-rmse:6.78070
[27]	validation-rmse:6.74898
[28]	validation-rmse:6.71989
[29]	validation-rmse:6.69555
[30]	validation-rmse:6.67299
[31]	validation-rmse:6.65151
[32]	validation-rmse:6.63385
[33]	validation-rmse:6.61697
[34]	validation-rmse



### Saving and Loading the model to make predictions

#### using mlflow.log_artifact()

The model must first be saved to the local file system; it is then copied into the MLflow artifacts. \
To run the model again, you need to download it from the artifacts and then load it.

In [18]:
import pickle
from pathlib import Path

# Autologging is switched off so this run contains only what is logged
# explicitly below.
mlflow.xgboost.autolog(disable=True)

# Make sure the local output directory exists; otherwise open(..., "wb")
# below fails with FileNotFoundError on a fresh checkout.
Path("models").mkdir(parents=True, exist_ok=True)

with mlflow.start_run():
  mlflow.set_tag("developer", "kamal")

  mlflow.log_param("train-data-path", "../data/green_tripdata_2021-01.parquet")
  mlflow.log_param("validation-data-path", "../data/green_tripdata_2021-02.parquet")

  mlflow.log_params(best_hps)

  # Only 10 rounds are trained here, fewer than the early-stopping patience
  # of 50, so early stopping cannot trigger and all 10 rounds run.
  booster = xgb.train(
    params=best_hps,
    dtrain=train,
    num_boost_round=10,
    evals=[(valid, "validation")],
    early_stopping_rounds=50
  )

  y_valid_predictions = booster.predict(valid)
  rmse = root_mean_squared_error(y_true=y_val, y_pred=y_valid_predictions)

  mlflow.log_metric("rmse", rmse)

  # Serialize the booster locally first; log_artifact then copies that file
  # into the run's artifacts under "pickle_models".
  with open("models/booster.bin", "wb") as model_file:
    pickle.dump(booster, model_file)

  mlflow.log_artifact(local_path="models/booster.bin", artifact_path="pickle_models")

[0]	validation-rmse:11.65641
[1]	validation-rmse:11.14813
[2]	validation-rmse:10.68574
[3]	validation-rmse:10.26421
[4]	validation-rmse:9.88213
[5]	validation-rmse:9.53492
[6]	validation-rmse:9.22268
[7]	validation-rmse:8.93914
[8]	validation-rmse:8.68309
[9]	validation-rmse:8.45363


In [19]:
# Read the pickled booster back from the local file written by the previous
# run. pickle.load executes arbitrary code, so only use it on files you
# created yourself.
with open("models/booster.bin", mode="rb") as fh:
  loaded_booster = pickle.load(fh)

# Predict on the validation DMatrix with the reloaded booster.
temp = loaded_booster.predict(valid)

In [20]:
# First 10 validation predictions from the reloaded pickled booster.
temp[:10]

array([15.629837, 11.955143, 16.33734 , 20.584774, 13.196257, 16.97349 ,
       15.429887, 12.681261, 13.089675, 18.735954], dtype=float32)

#### Using mlflow.xgboost.log_model()

This provides a more reliable way to load and deploy the trained model 

In [22]:
import mlflow.xgboost
from pathlib import Path


mlflow.xgboost.autolog(disable=True)

# Make sure the local output directory exists; otherwise writing
# models/preprocessor.bin below fails with FileNotFoundError.
Path("models").mkdir(parents=True, exist_ok=True)

with mlflow.start_run(run_name="saving_using_mlflow.xgboost.log_model()_with_preprocessor"):
  mlflow.set_tag("developer", "kamal")

  mlflow.log_param("train-data-path", "../data/green_tripdata_2021-01.parquet")
  mlflow.log_param("validation-data-path", "../data/green_tripdata_2021-02.parquet")

  mlflow.log_params(best_hps)

  # Only 10 rounds are trained here, fewer than the early-stopping patience
  # of 50, so early stopping cannot trigger and all 10 rounds run.
  booster = xgb.train(
    params=best_hps,
    dtrain=train,
    num_boost_round=10,
    evals=[(valid, "validation")],
    early_stopping_rounds=50
  )

  y_valid_predictions = booster.predict(valid)
  rmse = root_mean_squared_error(y_true=y_val, y_pred=y_valid_predictions)

  mlflow.log_metric("rmse", rmse)

  # The fitted DictVectorizer is stored next to the model so that raw records
  # can be transformed the same way at prediction time.
  with open("models/preprocessor.bin", "wb") as preprocessor_file:
    pickle.dump(dv, preprocessor_file)

  mlflow.log_artifact("models/preprocessor.bin", "preprocessors")
  # Log the booster through the XGBoost flavour instead of a hand-made pickle.
  mlflow.xgboost.log_model(booster, artifact_path="models_mlflow")


[0]	validation-rmse:11.65641
[1]	validation-rmse:11.14813
[2]	validation-rmse:10.68574
[3]	validation-rmse:10.26421
[4]	validation-rmse:9.88213
[5]	validation-rmse:9.53492
[6]	validation-rmse:9.22268
[7]	validation-rmse:8.93914
[8]	validation-rmse:8.68309
[9]	validation-rmse:8.45363




### Loading Model from artifacts and make predictions

#### Loading the model as a python function flavour

In [34]:
# URI of the model logged under "models_mlflow" in run ab3f2144e5b5471fa60f5ba979ae2a04.
logged_model = 'runs:/ab3f2144e5b5471fa60f5ba979ae2a04/models_mlflow'

# Load it through the generic python-function (pyfunc) flavour.
loaded_model = mlflow.pyfunc.load_model(model_uri=logged_model)

In [35]:
# Display the pyfunc wrapper's metadata (artifact path, flavour, run ID).
loaded_model

mlflow.pyfunc.loaded_model:
  artifact_path: models_mlflow
  flavor: mlflow.xgboost
  run_id: ab3f2144e5b5471fa60f5ba979ae2a04

In [38]:
# The pyfunc model is given the vectorized feature matrix X_val directly,
# not the xgb.DMatrix built from it.
loaded_model.predict(X_val)[:3]

array([15.629837, 11.955143, 16.33734 ], dtype=float32)

#### Loading the model as XGBoost object

In [39]:
# Same logged model as before, this time loaded through the XGBoost flavour.
logged_model = 'runs:/ab3f2144e5b5471fa60f5ba979ae2a04/models_mlflow'

# Note: this rebinds loaded_model, replacing the pyfunc wrapper loaded earlier.
loaded_model = mlflow.xgboost.load_model(model_uri=logged_model)

In [40]:
# The XGBoost flavour returns a native xgboost Booster object.
loaded_model

<xgboost.core.Booster at 0x77e00d2313a0>

In [41]:
# The native Booster is given the validation xgb.DMatrix (`valid`) here; the
# first three predictions match the pyfunc output above.
loaded_model.predict(valid)[:3]

array([15.629837, 11.955143, 16.33734 ], dtype=float32)