In [2]:
!python -V

Python 3.9.24


In [3]:
import pandas as pd

In [4]:
import pickle

In [5]:
import seaborn as sns
import matplotlib.pyplot as plt

In [6]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.metrics import mean_squared_error

In [7]:
import mlflow

# Point MLflow at the local SQLite database file `mlflow.db` as its tracking backend.
mlflow.set_tracking_uri("sqlite:///mlflow.db")
# Select "nyc-taxi-experiment" as the active experiment for the runs started below.
# Being the last expression of the cell, the returned Experiment object is displayed.
mlflow.set_experiment("nyc-taxi-experiment")

<Experiment: artifact_location='/workspaces/mlops-zoomcamp/02-experiment-tracking/mlruns/1', creation_time=1761828359873, experiment_id='1', last_update_time=1761828359873, lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>

In [8]:
# Location-ID columns that are treated as categorical (string) features.
CATEGORICAL = ['PULocationID', 'DOLocationID']
# Only these columns are read from the parquet file.
NEEDED_COLS = CATEGORICAL + ['tpep_pickup_datetime', 'tpep_dropoff_datetime', 'trip_distance']


def read_dataframe(filename):
    """Load one trip-data parquet file and prepare it for modelling.

    Reads only ``NEEDED_COLS``, parses the pickup/dropoff timestamps, adds a
    ``duration`` column (dropoff minus pickup, in minutes), keeps the rows whose
    duration lies in the closed interval [1, 60] minutes, and casts the
    ``CATEGORICAL`` columns to strings.

    Args:
        filename: Path of the parquet file, passed straight to ``pd.read_parquet``.

    Returns:
        A new DataFrame with the ``NEEDED_COLS`` columns plus ``duration``.
        The original row index of the kept rows is preserved.
    """
    trips = pd.read_parquet(filename, columns=NEEDED_COLS)

    for ts_col in ('tpep_pickup_datetime', 'tpep_dropoff_datetime'):
        trips[ts_col] = pd.to_datetime(trips[ts_col])

    elapsed = trips['tpep_dropoff_datetime'] - trips['tpep_pickup_datetime']
    trips['duration'] = elapsed.dt.total_seconds() / 60

    # `between` is inclusive on both ends: 1 <= duration <= 60.
    within_bounds = trips['duration'].between(1, 60)
    trips = trips[within_bounds].copy()

    trips[CATEGORICAL] = trips[CATEGORICAL].astype(str)
    return trips


In [9]:
# Load the training file (2021-01) and the validation file (2021-02) through the same
# preparation function so both frames share identical columns and filtering.
df_train, df_val = (
    read_dataframe(parquet_path)
    for parquet_path in (
        '/workspaces/mlops-zoomcamp/data/yellow_tripdata_2021-01.parquet',
        '/workspaces/mlops-zoomcamp/data/yellow_tripdata_2021-02.parquet',
    )
)

In [10]:
df_train.head()

Unnamed: 0,PULocationID,DOLocationID,tpep_pickup_datetime,tpep_dropoff_datetime,trip_distance,duration
0,142,43,2021-01-01 00:30:10,2021-01-01 00:36:12,2.1,6.033333
2,132,165,2021-01-01 00:43:30,2021-01-01 01:11:06,14.7,27.6
3,138,132,2021-01-01 00:15:48,2021-01-01 00:31:01,10.6,15.216667
4,68,33,2021-01-01 00:31:49,2021-01-01 00:48:21,4.94,16.533333
5,224,68,2021-01-01 00:16:29,2021-01-01 00:24:30,1.6,8.016667


In [11]:
len(df_train), len(df_val)

(1343254, 1340859)

In [12]:
# Add the 'PU_DO' route feature: the pickup and dropoff location IDs joined by '_'
# (for example '142_43'), so every pickup/dropoff pair becomes a single category.
# Both ID columns were already cast to strings by read_dataframe.
df_train['PU_DO'] = df_train['PULocationID'].str.cat(df_train['DOLocationID'], sep='_')
df_val['PU_DO'] = df_val['PULocationID'].str.cat(df_val['DOLocationID'], sep='_')

In [13]:
# Feature columns fed to the vectorizer. The individual 'PULocationID' and
# 'DOLocationID' columns are not used here; only the combined 'PU_DO' is.
categorical = ['PU_DO']
numerical = ['trip_distance']

dv = DictVectorizer()

# One dict per row, for each split.
train_dicts = df_train.loc[:, categorical + numerical].to_dict(orient='records')
val_dicts = df_val.loc[:, categorical + numerical].to_dict(orient='records')

# The vectorizer is fitted on the training rows only; validation rows are
# transformed with that already-fitted vectorizer.
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

In [14]:
# Regression target: the trip duration column computed in read_dataframe.
target = 'duration'
y_train, y_val = (frame[target].to_numpy() for frame in (df_train, df_val))

In [15]:
import numpy as np

In [16]:
# lr = LinearRegression()
# lr.fit(X_train, y_train)

# y_pred = lr.predict(X_val)

# np.sqrt(mean_squared_error(y_val, y_pred))

In [17]:
# with open('models/lin_reg.bin', 'wb') as f_out:
#     pickle.dump((dv, lr), f_out)

In [18]:
# # Start a new MLflow run — everything logged inside this block 
# # will be grouped under the same experiment run in the MLflow UI.
# with mlflow.start_run():

#     # Tag this run with metadata — useful for filtering or identifying runs later.
#     mlflow.set_tag("developer", "cristian")

#     # Log input data paths as parameters to keep track of which datasets were used for training and validation.
#     mlflow.log_param("train-data-path", "./data/yellow_tripdata_2021-01.parquet")
#     mlflow.log_param("valid-data-path", "./data/yellow_tripdata_2021-02.parquet")

#     # Define and log the model hyperparameter 'alpha' for the Lasso regression.
#     alpha = 0.1
#     mlflow.log_param("alpha", alpha)
    
#     # Initialize and train the Lasso regression model using the training data.
#     lr = Lasso(alpha)
#     lr.fit(X_train, y_train)

#     # Make predictions on the validation dataset.
#     y_pred = lr.predict(X_val)

#     # Calculate the Root Mean Squared Error (RMSE) to evaluate model performance.
#     rmse = np.sqrt(mean_squared_error(y_val, y_pred))

#     # Log the RMSE metric so it appears in MLflow for comparison across runs.
#     mlflow.log_metric("rmse", rmse)

#     # Log the trained model file as an artifact — this saves the model binary in the MLflow run directory.
#     # 'artifact_path' defines the subfolder within the run's artifact storage.
#     mlflow.log_artifact(local_path="models/lin_reg.bin", artifact_path="models_pickle")

In [19]:
# # Check current tracking URI
# print("Tracking URI:", mlflow.get_tracking_uri())

# # List all experiments
# experiments = mlflow.search_experiments()
# for exp in experiments:
#     print(f"Experiment: {exp.name} (ID: {exp.experiment_id})")

# # List runs for a specific experiment
# runs = mlflow.search_runs(experiment_ids=["1"])
# print(runs[['run_id', 'metrics.rmse', 'params.alpha']].head())

## Hyperparameter Tuning with MLflow and Hyperopt

In [20]:
import xgboost as xgb

In [21]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

  import pkg_resources


In [22]:
# Wrap the vectorized features and their targets in XGBoost DMatrix objects;
# these are what `xgb.train` and `Booster.predict` receive in the cells below.
train = xgb.DMatrix(data=X_train, label=y_train)
valid = xgb.DMatrix(data=X_val, label=y_val)

In [23]:
def objective(params):
    """Hyperopt objective: train one XGBoost model and report its validation RMSE.

    Each call opens its own MLflow run, tags it with model="xgboost", logs the
    received ``params`` and the resulting ``rmse`` metric.

    Relies on the notebook-level ``train`` / ``valid`` DMatrix objects and the
    ``y_val`` array.

    Args:
        params: Parameter dict passed unchanged to ``xgb.train`` and to
            ``mlflow.log_params``.

    Returns:
        ``{'loss': rmse, 'status': STATUS_OK}`` — the dict shape returned to Hyperopt.
    """
    with mlflow.start_run():
        mlflow.set_tag("model", "xgboost")
        mlflow.log_params(params)

        # At most 100 boosting rounds, evaluated on the validation DMatrix after each
        # round; early_stopping_rounds=20 is forwarded to xgb.train.
        fitted = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=100,
            evals=[(valid, 'validation')],
            early_stopping_rounds=20,
        )

        val_mse = mean_squared_error(y_val, fitted.predict(valid))
        rmse = np.sqrt(val_mse)
        mlflow.log_metric("rmse", rmse)

    return {'loss': rmse, 'status': STATUS_OK}

In [24]:
# Hyperopt search space; each sampled point is the `params` dict given to `objective`.
# The hp.* entries vary per trial, while 'objective' and 'seed' are fixed values.
search_space = dict(
    # Quantized uniform over [4, 50] with step 1, wrapped in scope.int for an integer value.
    max_depth=scope.int(hp.quniform('max_depth', 4, 50, 1)),
    # hyperopt's loguniform takes its bounds in log space: exp(-3) .. exp(0).
    learning_rate=hp.loguniform('learning_rate', -3, 0),
    # exp(-5) .. exp(-1)
    reg_alpha=hp.loguniform('reg_alpha', -5, -1),
    # exp(-6) .. exp(-1)
    reg_lambda=hp.loguniform('reg_lambda', -6, -1),
    # exp(-1) .. exp(3)
    min_child_weight=hp.loguniform('min_child_weight', -1, 3),
    objective='reg:squarederror',
    seed=42,
)

In [25]:
# # Use Hyperopt to find the best hyperparameters for the XGBoost model.
# best_result = fmin(
#     fn=objective,          # Objective function to minimize (returns validation RMSE)
#     space=search_space,    # The hyperparameter space defined above
#     algo=tpe.suggest,      # TPE algorithm: Bayesian optimizer that models p(x|y)
#     max_evals=10,          # Number of trials (iterations) to perform
#     trials=Trials()        # Object to store details of each run (params, loss, status)
# )

In [26]:
# Turn off MLflow's XGBoost autologging so that the runs below contain only what
# the cells log explicitly (parameters, the rmse metric, the preprocessor artifact
# and the model) instead of whatever autologging would record on its own.
mlflow.xgboost.autolog(disable=True)

In [27]:
import os

from mlflow.models import infer_signature

with mlflow.start_run():

    # Build the DMatrix objects inside the cell so it can be re-run on its own.
    train = xgb.DMatrix(X_train, label=y_train)
    valid = xgb.DMatrix(X_val, label=y_val)

    # Best hyperparameters found from hyperparameter tuning (e.g., Hyperopt),
    # pasted here as fixed values.
    best_params = {
        'learning_rate': 0.09585355369315604,
        'max_depth': 30,
        'min_child_weight': 1.060597050922164,
        'objective': 'reg:squarederror',
        'reg_alpha': 0.018060244040060163,
        'reg_lambda': 0.011658731377413597,
        'seed': 42
    }

    # Log all chosen hyperparameters to MLflow for reproducibility.
    mlflow.log_params(best_params)

    # Train the XGBoost model using the defined parameters.
    # - num_boost_round: maximum number of boosting iterations
    # - evals: only the validation set is evaluated (printed as 'validation-rmse')
    # - early_stopping_rounds: forwarded as 20 to xgb.train
    booster = xgb.train(
        params=best_params,
        dtrain=train,
        num_boost_round=100,
        evals=[(valid, 'validation')],
        early_stopping_rounds=20
    )

    # Evaluate on the validation set and log the RMSE so runs can be compared.
    y_pred = booster.predict(valid)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    mlflow.log_metric("rmse", rmse)

    # Save the fitted DictVectorizer next to the model. The target directory is
    # created first so the cell does not fail on a fresh checkout without `models/`.
    os.makedirs("models", exist_ok=True)
    with open("models/preprocessor.b", "wb") as f_out:
        pickle.dump(dv, f_out)
    mlflow.log_artifact("models/preprocessor.b", artifact_path="preprocessor")

    # The signature describes the model's input and output schema.
    signature = infer_signature(X_val, y_pred)

    # A small dense input example: X_val comes from DictVectorizer and is sparse,
    # so the first three rows are converted with .toarray() before being logged.
    input_example = X_val[:3].toarray() if hasattr(X_val, 'toarray') else X_val[:3]

    # Log the trained booster together with BOTH the signature and the input example.
    # Previously the signature was inferred but never handed to log_model.
    mlflow.xgboost.log_model(
        booster,
        artifact_path="models_mlflow",  # Path within the run's artifacts folder
        signature=signature,
        input_example=input_example
    )

[0]	validation-rmse:8.70473
[1]	validation-rmse:8.18719
[2]	validation-rmse:7.73507
[3]	validation-rmse:7.34101
[4]	validation-rmse:6.99911
[5]	validation-rmse:6.70309
[6]	validation-rmse:6.44805
[7]	validation-rmse:6.22863
[8]	validation-rmse:6.03993
[9]	validation-rmse:5.87898
[10]	validation-rmse:5.74031
[11]	validation-rmse:5.62193
[12]	validation-rmse:5.52015
[13]	validation-rmse:5.43335
[14]	validation-rmse:5.35946
[15]	validation-rmse:5.29515
[16]	validation-rmse:5.23961
[17]	validation-rmse:5.19222
[18]	validation-rmse:5.15073
[19]	validation-rmse:5.11507
[20]	validation-rmse:5.08359
[21]	validation-rmse:5.05622
[22]	validation-rmse:5.03200
[23]	validation-rmse:5.01159
[24]	validation-rmse:4.99262
[25]	validation-rmse:4.97618
[26]	validation-rmse:4.96099
[27]	validation-rmse:4.94774
[28]	validation-rmse:4.93420
[29]	validation-rmse:4.92215
[30]	validation-rmse:4.91162
[31]	validation-rmse:4.90200
[32]	validation-rmse:4.89302
[33]	validation-rmse:4.88474
[34]	validation-rmse:4.8



## Logging Model with Signature for Predictions

To see the "Make Predictions" section in the MLflow UI, the model must be logged with a **signature** that defines its input/output schema.


In [33]:
from mlflow.models import infer_signature

# Train the XGBoost model inside a single MLflow run and log parameters, the rmse
# metric, the pickled preprocessor, and the model with a signature and input example.
with mlflow.start_run():
    
    # DMatrix wrappers around the vectorized features and their targets.
    # NOTE: `valid` is reused by the native-Booster prediction cell further down.
    train = xgb.DMatrix(X_train, label=y_train)
    valid = xgb.DMatrix(X_val, label=y_val)

    # Fixed hyperparameters — presumably copied from an earlier Hyperopt search; confirm
    # against the MLflow UI before relying on them.
    best_params = {
        'learning_rate': 0.09585355369315604,
        'max_depth': 30,
        'min_child_weight': 1.060597050922164,
        'objective': 'reg:squarederror',
        'reg_alpha': 0.018060244040060163,
        'reg_lambda': 0.011658731377413597,
        'seed': 42
    }

    # Record the hyperparameters on the run.
    mlflow.log_params(best_params)

    # Up to 100 boosting rounds; the validation DMatrix is evaluated each round
    # (printed as 'validation-rmse') and early_stopping_rounds=20 is forwarded to xgb.train.
    booster = xgb.train(
        params=best_params,
        dtrain=train,
        num_boost_round=100,
        evals=[(valid, 'validation')],
        early_stopping_rounds=20
    )

    # Validation predictions and their RMSE, logged as the run's "rmse" metric.
    y_pred = booster.predict(valid)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    mlflow.log_metric("rmse", rmse)

    # Save the preprocessor (the fitted DictVectorizer `dv`) and attach it to the run.
    # Assumes the local `models/` directory already exists.
    with open("models/preprocessor.b", "wb") as f_out:
        pickle.dump(dv, f_out)
    mlflow.log_artifact("models/preprocessor.b", artifact_path="preprocessor")

    # === KEY STEP: infer the signature from the model's input and output ===
    # The signature defines the input and output schema for the model.
    # Passing it to log_model below is what enables the "Make Predictions" section in the MLflow UI.
    signature = infer_signature(X_val, y_pred)

    # Log the model with BOTH signature and input_example.
    # The first three rows of X_val are converted from a sparse matrix to a dense array
    # for input_example to avoid serialization issues.
    input_example = X_val[:3].toarray() if hasattr(X_val, 'toarray') else X_val[:3]
    mlflow.xgboost.log_model(
        booster, 
        artifact_path="models_mlflow",
        signature=signature,  # <-- This is the key!
        input_example=input_example
    )
    
    # Print the run ID while the run is still active; it is the <run_id> part of a
    # 'runs:/<run_id>/models_mlflow' model URI.
    print(f"Run ID: {mlflow.active_run().info.run_id}")


[0]	validation-rmse:8.70473
[1]	validation-rmse:8.18719
[2]	validation-rmse:7.73507
[3]	validation-rmse:7.34101
[4]	validation-rmse:6.99911
[5]	validation-rmse:6.70309
[6]	validation-rmse:6.44805
[7]	validation-rmse:6.22863
[8]	validation-rmse:6.03993
[9]	validation-rmse:5.87898
[10]	validation-rmse:5.74031
[11]	validation-rmse:5.62193
[12]	validation-rmse:5.52015
[13]	validation-rmse:5.43335
[14]	validation-rmse:5.35946
[15]	validation-rmse:5.29515
[16]	validation-rmse:5.23961
[17]	validation-rmse:5.19222
[18]	validation-rmse:5.15073
[19]	validation-rmse:5.11507
[20]	validation-rmse:5.08359
[21]	validation-rmse:5.05622
[22]	validation-rmse:5.03200
[23]	validation-rmse:5.01159
[24]	validation-rmse:4.99262
[25]	validation-rmse:4.97618
[26]	validation-rmse:4.96099
[27]	validation-rmse:4.94774
[28]	validation-rmse:4.93420
[29]	validation-rmse:4.92215
[30]	validation-rmse:4.91162
[31]	validation-rmse:4.90200
[32]	validation-rmse:4.89302
[33]	validation-rmse:4.88474
[34]	validation-rmse:4.8



Run ID: 3f9865448f1540b69b325dc61ef5dba6


In [None]:
# from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
# from sklearn.svm import LinearSVR

# # Enable automatic MLflow logging for all scikit-learn models
# # This logs:
# #   - Model parameters (e.g., n_estimators, max_depth)
# #   - Evaluation metrics (e.g., RMSE)
# #   - Trained model artifacts (serialized .pkl files)
# #   - Model signature and environment info (for reproducibility)
# mlflow.sklearn.autolog()

# # Loop through multiple model classes to train and compare them easily
# # Each iteration will:
# #   1. Start a new MLflow run
# #   2. Train one model
# #   3. Evaluate it
# #   4. Log results to MLflow automatically
# for model_class in (RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor, LinearSVR):

#     with mlflow.start_run():
        
#         # Log paths to the training and validation datasets for reproducibility
#         mlflow.log_param("train-data-path", "./data/yellow_tripdata_2021-01.parquet")
#         mlflow.log_param("valid-data-path", "./data/yellow_tripdata_2021-02.parquet")

#         # Log the preprocessor as an artifact in MLflow
#         mlflow.log_artifact("models/preprocessor.b", artifact_path="preprocessor")

#         # Initialize and train the model
#         mlmodel = model_class()
#         mlmodel.fit(X_train, y_train)

#         # Make predictions on the validation dataset
#         y_pred = mlmodel.predict(X_val)
#         rmse = np.sqrt(mean_squared_error(y_val, y_pred))

#         # Log the RMSE metric to MLflow for comparison across runs
#         mlflow.log_metric("rmse", rmse)



## Loading the Logged Model (PyFunc and Native XGBoost Flavors): Making Predictions

In [None]:
import mlflow

# Define the model URI (Uniform Resource Identifier), in the form
# 'runs:/<run_id>/<artifact_path>'; 'models_mlflow' is the artifact_path used when logging.
# NOTE(review): this run ID is hard-coded and differs from the run ID printed by the
# signature-logging cell above (3f9865448f1540b69b325dc61ef5dba6) — confirm it points
# at the run you intend to load before re-running the notebook.
logged_model = 'runs:/712e9c4fb3294a75bd60b15f76102062/models_mlflow'

# Load the model using MLflow's PyFunc interface/flavor.
# This loads the model as a generic Python function (PyFuncModel),
# which can make predictions on pandas DataFrames and other supported input types.
loaded_model = mlflow.pyfunc.load_model(logged_model)

In [30]:
loaded_model

mlflow.pyfunc.loaded_model:
  artifact_path: models_mlflow
  flavor: mlflow.xgboost
  run_id: 712e9c4fb3294a75bd60b15f76102062

In [37]:
# Predict with the PyFunc model directly on X_val — the matrix produced by the
# DictVectorizer — without wrapping it in an xgb.DMatrix first.
y_pred_pyfunc = loaded_model.predict(X_val)

# Show the first 10 predictions
print(y_pred_pyfunc[:10])

[11.896988   8.88484   21.050125  13.681334   4.0140877 10.455057
 41.32619   21.868849  10.650156   4.305702 ]


In [None]:
# Load the same logged model again, this time through the XGBoost flavor.
# The result is a native xgboost Booster instead of a PyFunc wrapper.
xgboost_model = mlflow.xgboost.load_model(logged_model)

In [32]:
xgboost_model

<xgboost.core.Booster at 0x77566dbc33d0>

In [33]:
# The native Booster is given the DMatrix `valid`, not the X_val matrix used with PyFunc.
# NOTE: this overwrites the `y_pred` produced by the training cell.
y_pred = xgboost_model.predict(valid)

In [34]:
# Show the first 10 native-Booster predictions; in the saved output they equal the
# first 10 PyFunc predictions printed earlier.
y_pred[:10]

array([11.896988 ,  8.88484  , 21.050125 , 13.681334 ,  4.0140877,
       10.455057 , 41.32619  , 21.868849 , 10.650156 ,  4.305702 ],
      dtype=float32)