In [1]:
!uv run python --version

Python 3.12.10


In [2]:
import pickle

import catppuccin
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope
import matplotlib as mpl
import matplotlib.pyplot as plt
import mlflow
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.metrics import root_mean_squared_error
import xgboost as xgb

  import pkg_resources


In [3]:
mpl.style.use(catppuccin.PALETTE.mocha.identifier)
%matplotlib inline

In [4]:
mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("nyc-taxi-experiment")

2025/12/15 16:34:33 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2025/12/15 16:34:33 INFO mlflow.store.db.utils: Updating database tables
2025/12/15 16:34:33 INFO alembic.runtime.migration: Context impl SQLiteImpl.
2025/12/15 16:34:33 INFO alembic.runtime.migration: Will assume non-transactional DDL.
2025/12/15 16:34:33 INFO alembic.runtime.migration: Context impl SQLiteImpl.
2025/12/15 16:34:33 INFO alembic.runtime.migration: Will assume non-transactional DDL.


<Experiment: artifact_location='/home/ludo/Public/Projets_IA/MLOps_ZoomCamp/02-experiment-tracking/mlruns/1', creation_time=1765564082300, experiment_id='1', last_update_time=1765564082300, lifecycle_stage='active', name='nyc-taxi-experiment', tags={'mlflow.experimentKind': 'custom_model_development'}>

In [5]:
# Jan 2021
# january2021 = 'https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-01.parquet'
january2021 = './data/green_tripdata_2021-01.parquet'

# Feb 2021
# february2021 = 'https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-02.parquet'
february2021 = './data/green_tripdata_2021-02.parquet'

In [6]:
def read_dataframe(filename):
    df = pd.read_parquet(filename)

    df['duration'] = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)

    mask = (df.duration >= 1) & (df.duration <= 60)
    df = df.loc[mask].assign(
        PULocationID=lambda d: d['PULocationID'].astype('string'),
        DOLocationID=lambda d: d['DOLocationID'].astype('string')
    )

    return df

In [7]:
df_train = read_dataframe(january2021)
df_val = read_dataframe(february2021)

In [None]:
len(df_train), len(df_val)

In [8]:
df_train['PU_DO'] = df_train['PULocationID'] + '_' + df_train['DOLocationID']
df_val['PU_DO'] = df_val['PULocationID'] + '_' + df_val['DOLocationID']

In [9]:
categorical = ['PU_DO']
numerical = ['trip_distance']

dv = DictVectorizer()

train_dicts = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

val_dicts = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [None]:
df_train.info()

In [10]:
target = 'duration'
y_train = df_train[target].values
y_val = df_val[target].values

In [None]:
# lr = LinearRegression()
# lr.fit(X_train, y_train)

# y_pred = lr.predict(X_val)

# root_mean_squared_error(y_val, y_pred)

In [None]:
# fig, ax = plt.subplots(figsize=(8, 6))
# sns.histplot(y_pred, label='prediction', kde=True, ax=ax)
# sns.histplot(y_train, label='actual', kde=True, ax=ax)
# ax.legend()
# plt.show()

In [None]:
# with open('models/lin_reg.bin', 'wb') as f_out:
#     pickle.dump((dv, lr), f_out)

In [None]:
# with mlflow.start_run():
#     alpha = 0.1

#     mlflow.set_tag("developper", "ludo")
#     mlflow.log_param("train-data-path", january2021)
#     mlflow.log_param("valid-data-path", february2021)

#     mlflow.log_param("alpha", alpha)
#     lr = Lasso(alpha)
#     lr.fit(X_train, y_train)

#     y_pred = lr.predict(X_val)
#     rmse = root_mean_squared_error(y_val, y_pred)
#     mlflow.log_metric("rmse", rmse)

#     mlflow.log_artifact(
#         local_path="models/lin_reg.bin",
#         artifact_path="models_pickle/",
#     )

#     print(f"{lr} â†’ {rmse=:.2f}")

In [11]:
train = xgb.DMatrix(X_train, label=y_train)
valid = xgb.DMatrix(X_val, label=y_val)

In [None]:
# def objective(params):
#     with mlflow.start_run():
#         mlflow.set_tag("model", "xgboost")
#         mlflow.log_params(params)
#         booster = xgb.train(
#             params=params,
#             dtrain=train,
#             num_boost_round=1000,
#             evals=[(valid, 'validation')],
#             early_stopping_rounds=50
#         )
#         y_pred = booster.predict(valid)
#         rmse = root_mean_squared_error(y_val, y_pred)
#         mlflow.log_metric("rmse", rmse)

#     return {'loss': rmse, 'status': STATUS_OK}

In [None]:
# search_space = {
#     'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
#     'learning_rate': hp.loguniform('learning_rate', -3, 0),
#     'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
#     'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
#     'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
#     'objective': 'reg:linear',
#     'seed': 42,
# }

In [None]:
# best_result = fmin(
#     fn=objective,
#     space=search_space,
#     algo=tpe.suggest,
#     max_evals=50,
#     trials=Trials()
# )

In [12]:
best_params = dict(
    learning_rate=0.11003124365651852,
    max_depth=69,
    min_child_weight=1.3037172702760245,
    objective="reg:linear",
    reg_alpha=0.06605109955963756,
    reg_lambda=0.34862327720948777,
    seed=42
)

In [None]:
# mlflow.xgboost.autolog()
# booster = xgb.train(
#     params=best_params,
#     dtrain=train,
#     num_boost_round=1000,
#     evals=[(valid, 'validation')],
#     early_stopping_rounds=50,
# )

In [16]:
with mlflow.start_run():
    mlflow.log_params(best_params)

    booster = xgb.train(
        params=best_params,
        dtrain=train,
        num_boost_round=1000,
        evals=[(valid, 'validation')],
        early_stopping_rounds=50,
    )

    y_pred = booster.predict(valid)
    rmse = root_mean_squared_error(y_val, y_pred)
    mlflow.log_metric("rmse", rmse)

    with open("models/preprocessor.b", "wb") as f_out:
        pickle.dump(dv, f_out)

    mlflow.log_artifact(local_path="models/preprocessor.b", artifact_path="preprocessor")
    mlflow.xgboost.log_model(booster, name="models_mlflow")
    run_id = mlflow.active_run().info.run_id

  self.starting_round = model.num_boosted_rounds()


[0]	validation-rmse:11.31828
[1]	validation-rmse:10.54834
[2]	validation-rmse:9.89274
[3]	validation-rmse:9.33546
[4]	validation-rmse:8.86674
[5]	validation-rmse:8.46925
[6]	validation-rmse:8.13982
[7]	validation-rmse:7.86014
[8]	validation-rmse:7.63159
[9]	validation-rmse:7.43854
[10]	validation-rmse:7.27795
[11]	validation-rmse:7.14613
[12]	validation-rmse:7.03560
[13]	validation-rmse:6.94464
[14]	validation-rmse:6.86734
[15]	validation-rmse:6.80527
[16]	validation-rmse:6.75230
[17]	validation-rmse:6.70729
[18]	validation-rmse:6.67050
[19]	validation-rmse:6.63864
[20]	validation-rmse:6.61298
[21]	validation-rmse:6.59038
[22]	validation-rmse:6.56887
[23]	validation-rmse:6.55207
[24]	validation-rmse:6.53656
[25]	validation-rmse:6.52211
[26]	validation-rmse:6.51043
[27]	validation-rmse:6.50008
[28]	validation-rmse:6.49207
[29]	validation-rmse:6.48401
[30]	validation-rmse:6.47751
[31]	validation-rmse:6.47171
[32]	validation-rmse:6.46612
[33]	validation-rmse:6.46015
[34]	validation-rmse:6



In [27]:
# run_id = "4650c9a8af3e49d3aad80452a95ff93a"
# Load the model from the tracking server and perform inference
loaded_model = mlflow.pyfunc.load_model(f"runs:/{run_id}/models_mlflow")

print(loaded_model)


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

mlflow.pyfunc.loaded_model:
  artifact_path: /home/ludo/Public/Projets_IA/MLOps_ZoomCamp/02-experiment-tracking/mlruns/1/models/m-5429961bdc47465fa70cb5401fc1b3c3/artifacts
  flavor: mlflow.xgboost
  run_id: 4650c9a8af3e49d3aad80452a95ff93a



In [28]:
# Convert DMatrix to Pandas DataFrame
data = pd.DataFrame(valid.get_data().toarray())#, columns=[f'feature_{i}' for i in range(5)])
loaded_model.predict(data)

array([82.937614, 66.0882  , 74.3409  , ..., 97.70136 , 66.0882  ,
       66.18339 ], shape=(61921,), dtype=float32)

In [30]:
xgboost_model = mlflow.xgboost.load_model(f"runs:/{run_id}/models_mlflow")

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

In [31]:
xgboost_model

<xgboost.core.Booster at 0x7f1ab0ad7320>

In [32]:
y_pred = xgboost_model.predict(valid)

In [33]:
y_pred[:10]

array([14.07166  ,  6.844341 , 13.6745825, 24.444021 ,  9.200103 ,
       17.146769 , 10.754836 ,  8.060489 ,  9.561232 , 18.993908 ],
      dtype=float32)