In [12]:
import os, mlflow
from dotenv import load_dotenv

load_dotenv(override=True)  
EXPERIMENT_NAME = "/Users/diegooctavioperez21@gmail.com/nyc-taxi-experiments"

mlflow.set_tracking_uri("databricks")
experiment = mlflow.set_experiment(experiment_name=EXPERIMENT_NAME)

In [2]:
import pickle
import pandas as pd
from sklearn.metrics import  root_mean_squared_error
from sklearn.feature_extraction import  DictVectorizer

In [3]:
def read_dataframe(filename):

    df = pd.read_parquet(filename)

    df['duration'] = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)

    df = df[(df.duration >= 1) & (df.duration <= 60)]

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

In [4]:
df_train = read_dataframe('../data/green_tripdata_2025-01.parquet')
df_val = read_dataframe('../data/green_tripdata_2025-02.parquet')

In [5]:
def preprocess(df, dv):
    df['PU_DO'] = df['PULocationID'] + '_' + df['DOLocationID']
    categorical = ['PU_DO']
    numerical = ['trip_distance']
    train_dicts = df[categorical + numerical].to_dict(orient='records')
    return dv.transform(train_dicts)

In [6]:
categorical = ['PULocationID', 'DOLocationID']
numerical = ['trip_distance']
dv = DictVectorizer()

train_dicts = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

X_val = preprocess(df_val, dv)

In [7]:
target = 'duration'
y_train = df_train[target].values
y_val = df_val[target].values

In [8]:
training_dataset = mlflow.data.from_numpy(X_train.data, targets=y_train, name="green_tripdata_2025-01")
validation_dataset = mlflow.data.from_numpy(X_val.data, targets=y_val, name="green_tripdata_2025-02")

In [9]:
import math
import optuna
import pathlib
import xgboost as xgb
from optuna.samplers import TPESampler
from mlflow.models.signature import infer_signature

In [10]:
train = xgb.DMatrix(X_train, label=y_train)
valid = xgb.DMatrix(X_val, label=y_val)

In [13]:
# ------------------------------------------------------------
# Definir la función objetivo para Optuna
#    - Recibe un `trial`, que se usa para proponer hiperparámetros.
#    - Entrena un modelo con esos hiperparámetros.
#    - Calcula la métrica de validación (RMSE) y la retorna (Optuna la minimizará).
#    - Abrimos un run anidado de MLflow para registrar cada trial.
# ------------------------------------------------------------
def objective(trial: optuna.trial.Trial):
    # Hiperparámetros MUESTREADOS por Optuna en CADA trial.
    # Nota: usamos log=True para emular rangos log-uniformes (similar a loguniform).
    params = {
        "max_depth": trial.suggest_int("max_depth", 4, 100),
        "learning_rate": trial.suggest_float("learning_rate", math.exp(-3), 1.0, log=True),
        "reg_alpha": trial.suggest_float("reg_alpha",   math.exp(-5), math.exp(-1), log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", math.exp(-6), math.exp(-1), log=True),
        "min_child_weight": trial.suggest_float("min_child_weight", math.exp(-1), math.exp(3), log=True),
        "objective": "reg:squarederror",  
        "seed": 42,                      
    }

    # Run anidado para dejar rastro de cada trial en MLflow
    with mlflow.start_run(nested=True):
        mlflow.set_tag("model_family", "xgboost")  # etiqueta informativa
        mlflow.log_params(params)                  # registra hiperparámetros del trial

        # Entrenamiento con early stopping en el conjunto de validación
        booster = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=100,
            evals=[(valid, "validation")],
            early_stopping_rounds=10,
        )

        # Predicción y métrica en validación
        y_pred = booster.predict(valid)
        rmse = root_mean_squared_error(y_val, y_pred)

        # Registrar la métrica principal
        mlflow.log_metric("rmse", rmse)

        # La "signature" describe la estructura esperada de entrada y salida del modelo:
        # incluye los nombres, tipos y forma (shape) de las variables de entrada y el tipo de salida.
        # MLflow la usa para validar datos en inferencia y documentar el modelo en el Model Registry.
        signature = infer_signature(X_val, y_pred)

        # Guardar el modelo del trial como artefacto en MLflow.
        mlflow.xgboost.log_model(
            booster,
            name="model",
            input_example=X_val[:5],
            signature=signature
        )

    # Optuna minimiza el valor retornado
    return rmse

In [14]:
mlflow.xgboost.autolog(log_models=False)

# ------------------------------------------------------------
# Crear el estudio de Optuna
#    - Usamos TPE (Tree-structured Parzen Estimator) como sampler.
#    - direction="minimize" porque queremos minimizar el RMSE.
# ------------------------------------------------------------
sampler = TPESampler(seed=42)
study = optuna.create_study(direction="minimize", sampler=sampler)

# ------------------------------------------------------------
# Ejecutar la optimización (n_trials = número de intentos)
#    - Cada trial ejecuta la función objetivo con un set distinto de hiperparámetros.
#    - Abrimos un run "padre" para agrupar toda la búsqueda.
# ------------------------------------------------------------
with mlflow.start_run(run_name="XGBoost Hyperparameter Optimization (Optuna)", nested=True):
    study.optimize(objective, n_trials=10)

    # --------------------------------------------------------
    # Recuperar y registrar los mejores hiperparámetros
    # --------------------------------------------------------
    best_params = study.best_params
    # Asegurar tipos/campos fijos (por claridad y consistencia)
    best_params["max_depth"] = int(best_params["max_depth"])
    best_params["seed"] = 42
    best_params["objective"] = "reg:squarederror"

    mlflow.log_params(best_params)

    # Etiquetas del run "padre" (metadatos del experimento)
    mlflow.set_tags({
        "project": "NYC Taxi Time Prediction Project",
        "optimizer_engine": "optuna",
        "model_family": "xgboost",
        "feature_set_version": 1,
    })

    # --------------------------------------------------------
    # 7) Entrenar un modelo FINAL con los mejores hiperparámetros
    #    (normalmente se haría sobre train+val o con CV; aquí mantenemos el patrón original)
    # --------------------------------------------------------
    booster = xgb.train(
        params=best_params,
        dtrain=train,
        num_boost_round=100,
        evals=[(valid, "validation")],
        early_stopping_rounds=10,
    )

    # Evaluar y registrar la métrica final en validación
    y_pred = booster.predict(valid)
    rmse = root_mean_squared_error(y_val, y_pred)
    mlflow.log_metric("rmse", rmse)

    # --------------------------------------------------------
    # 8) Guardar artefactos adicionales (p. ej. el preprocesador)
    # --------------------------------------------------------
    pathlib.Path("preprocessor").mkdir(exist_ok=True)
    with open("preprocessor/preprocessor.b", "wb") as f_out:
        pickle.dump(dv, f_out)

    mlflow.log_artifact("preprocessor/preprocessor.b", artifact_path="preprocessor")

    # La "signature" describe la estructura esperada de entrada y salida del modelo:
    # incluye los nombres, tipos y forma (shape) de las variables de entrada y el tipo de salida.
    # MLflow la usa para validar datos en inferencia y documentar el modelo en el Model Registry.
    # Si X_val es la matriz dispersa (scipy.sparse) salida de DictVectorizer:
    feature_names = dv.get_feature_names_out()
    input_example = pd.DataFrame(X_val[:5].toarray(), columns=feature_names)

    # Para que las longitudes coincidan, usa el mismo slice en y_pred
    signature = infer_signature(input_example, y_val[:5])

    # Guardar el modelo del trial como artefacto en MLflow.
    mlflow.xgboost.log_model(
        booster,
        name="model",
        input_example=input_example,
        signature=signature
    )

[I 2025-10-28 20:18:20,689] A new study created in memory with name: no-name-8417fa9f-ed25-4225-abaf-0e7eff624d88


[0]	validation-rmse:5.98960
[1]	validation-rmse:5.92413
[2]	validation-rmse:5.92287
[3]	validation-rmse:5.92324
[4]	validation-rmse:5.92483
[5]	validation-rmse:5.92489
[6]	validation-rmse:5.92498
[7]	validation-rmse:5.92343
[8]	validation-rmse:5.92347
[9]	validation-rmse:5.92467
[10]	validation-rmse:5.92456
[11]	validation-rmse:5.92460


  xgb_model.save_model(model_data_path)
  model.load_model(xgb_model_path)
2025/10/28 20:18:28 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
[I 2025-10-28 20:18:32,000] Trial 0 finished with value: 5.924581902007077 and parameters: {'max_depth': 40, 'learning_rate': 0.8625543817410922, 'reg_alpha': 0.12593061066249622, 'reg_lambda': 0.049454235173237264, 'min_child_weight': 0.6866535292359801}. Best is trial 0 with value: 5.924581902007077.


🏃 View run victorious-stoat-433 at: https://dbc-8a29a60a-e19d.cloud.databricks.com/ml/experiments/3813900182186722/runs/acb488b97f5c438fbec7763d9fa858f7
🧪 View experiment at: https://dbc-8a29a60a-e19d.cloud.databricks.com/ml/experiments/3813900182186722
[0]	validation-rmse:8.79510
[1]	validation-rmse:8.50586
[2]	validation-rmse:8.24215
[3]	validation-rmse:8.00493
[4]	validation-rmse:7.78830
[5]	validation-rmse:7.59575
[6]	validation-rmse:7.41983
[7]	validation-rmse:7.26129
[8]	validation-rmse:7.11270
[9]	validation-rmse:6.98583
[10]	validation-rmse:6.87056
[11]	validation-rmse:6.76533
[12]	validation-rmse:6.67029
[13]	validation-rmse:6.58378
[14]	validation-rmse:6.50773
[15]	validation-rmse:6.43768
[16]	validation-rmse:6.37633
[17]	validation-rmse:6.32116
[18]	validation-rmse:6.27142
[19]	validation-rmse:6.22765
[20]	validation-rmse:6.18963
[21]	validation-rmse:6.15802
[22]	validation-rmse:6.12794
[23]	validation-rmse:6.10295
[24]	validation-rmse:6.07940
[25]	validation-rmse:6.05863
[2

  xgb_model.save_model(model_data_path)
  model.load_model(xgb_model_path)
[I 2025-10-28 20:18:45,067] Trial 1 finished with value: 5.902611653559419 and parameters: {'max_depth': 19, 'learning_rate': 0.059264241587996896, 'reg_alpha': 0.21539205131792016, 'reg_lambda': 0.05006540936006931, 'min_child_weight': 6.248180561354165}. Best is trial 1 with value: 5.902611653559419.


🏃 View run sneaky-colt-761 at: https://dbc-8a29a60a-e19d.cloud.databricks.com/ml/experiments/3813900182186722/runs/e798fce2fbf04087b3c64895ff1ab00c
🧪 View experiment at: https://dbc-8a29a60a-e19d.cloud.databricks.com/ml/experiments/3813900182186722
[0]	validation-rmse:5.90396
[1]	validation-rmse:5.88901
[2]	validation-rmse:5.88875
[3]	validation-rmse:5.88908
[4]	validation-rmse:5.88973
[5]	validation-rmse:5.88241
[6]	validation-rmse:5.88348
[7]	validation-rmse:5.88292
[8]	validation-rmse:5.87483
[9]	validation-rmse:5.87457
[10]	validation-rmse:5.87441
[11]	validation-rmse:5.87453
[12]	validation-rmse:5.87449
[13]	validation-rmse:5.87444
[14]	validation-rmse:5.87358
[15]	validation-rmse:5.87331
[16]	validation-rmse:5.87310
[17]	validation-rmse:5.87302
[18]	validation-rmse:5.87068
[19]	validation-rmse:5.87074
[20]	validation-rmse:5.87061
[21]	validation-rmse:5.87062
[22]	validation-rmse:5.87063
[23]	validation-rmse:5.87068
[24]	validation-rmse:5.87073
[25]	validation-rmse:5.87077
[26]	va

  xgb_model.save_model(model_data_path)
  model.load_model(xgb_model_path)
[I 2025-10-28 20:18:52,342] Trial 2 finished with value: 5.873624736518469 and parameters: {'max_depth': 5, 'learning_rate': 0.9136840519292246, 'reg_alpha': 0.18820387978911576, 'reg_lambda': 0.007166739666045858, 'min_child_weight': 0.7613210498541186}. Best is trial 2 with value: 5.873624736518469.


🏃 View run magnificent-cod-404 at: https://dbc-8a29a60a-e19d.cloud.databricks.com/ml/experiments/3813900182186722/runs/950a59a2c884442baa8b21360f8070ab
🧪 View experiment at: https://dbc-8a29a60a-e19d.cloud.databricks.com/ml/experiments/3813900182186722
[0]	validation-rmse:8.46241
[1]	validation-rmse:7.91472
[2]	validation-rmse:7.48600
[3]	validation-rmse:7.13502
[4]	validation-rmse:6.85854
[5]	validation-rmse:6.63463
[6]	validation-rmse:6.46571
[7]	validation-rmse:6.33570
[8]	validation-rmse:6.22592
[9]	validation-rmse:6.14853
[10]	validation-rmse:6.08544
[11]	validation-rmse:6.02964
[12]	validation-rmse:5.99678
[13]	validation-rmse:5.96951
[14]	validation-rmse:5.94694
[15]	validation-rmse:5.92906
[16]	validation-rmse:5.91642
[17]	validation-rmse:5.90839
[18]	validation-rmse:5.90074
[19]	validation-rmse:5.89693
[20]	validation-rmse:5.89365
[21]	validation-rmse:5.88900
[22]	validation-rmse:5.88617
[23]	validation-rmse:5.88358
[24]	validation-rmse:5.88146
[25]	validation-rmse:5.87956
[26

  xgb_model.save_model(model_data_path)
  model.load_model(xgb_model_path)
[I 2025-10-28 20:19:06,973] Trial 3 finished with value: 5.864844695204969 and parameters: {'max_depth': 21, 'learning_rate': 0.12402485733085497, 'reg_alpha': 0.054969638498598095, 'reg_lambda': 0.02148769342025257, 'min_child_weight': 1.1792947151892554}. Best is trial 3 with value: 5.864844695204969.


🏃 View run dashing-owl-514 at: https://dbc-8a29a60a-e19d.cloud.databricks.com/ml/experiments/3813900182186722/runs/01d0d0f7dfb048b796a16d1ffc77627e
🧪 View experiment at: https://dbc-8a29a60a-e19d.cloud.databricks.com/ml/experiments/3813900182186722
[0]	validation-rmse:8.71345
[1]	validation-rmse:8.36334
[2]	validation-rmse:8.04501
[3]	validation-rmse:7.76755
[4]	validation-rmse:7.52872
[5]	validation-rmse:7.31340
[6]	validation-rmse:7.12851
[7]	validation-rmse:6.97028
[8]	validation-rmse:6.84009
[9]	validation-rmse:6.72166
[10]	validation-rmse:6.62237
[11]	validation-rmse:6.53410
[12]	validation-rmse:6.45743
[13]	validation-rmse:6.39329
[14]	validation-rmse:6.34602
[15]	validation-rmse:6.30134
[16]	validation-rmse:6.26784
[17]	validation-rmse:6.23305
[18]	validation-rmse:6.20800
[19]	validation-rmse:6.18572
[20]	validation-rmse:6.16471
[21]	validation-rmse:6.14918
[22]	validation-rmse:6.13697
[23]	validation-rmse:6.12587
[24]	validation-rmse:6.11877
[25]	validation-rmse:6.11067
[26]	va

  xgb_model.save_model(model_data_path)
  model.load_model(xgb_model_path)
[I 2025-10-28 20:19:38,052] Trial 4 finished with value: 6.073036521827848 and parameters: {'max_depth': 63, 'learning_rate': 0.07565903471570516, 'reg_alpha': 0.021678779375600917, 'reg_lambda': 0.015480241912324163, 'min_child_weight': 2.2802382585441565}. Best is trial 3 with value: 5.864844695204969.


🏃 View run peaceful-sloth-572 at: https://dbc-8a29a60a-e19d.cloud.databricks.com/ml/experiments/3813900182186722/runs/b0419f99d228467c9c851fe3119a646c
🧪 View experiment at: https://dbc-8a29a60a-e19d.cloud.databricks.com/ml/experiments/3813900182186722
[0]	validation-rmse:8.59780
[1]	validation-rmse:8.16119
[2]	validation-rmse:7.79831
[3]	validation-rmse:7.48799
[4]	validation-rmse:7.22840
[5]	validation-rmse:7.00633
[6]	validation-rmse:6.83299
[7]	validation-rmse:6.68783
[8]	validation-rmse:6.55593
[9]	validation-rmse:6.45251
[10]	validation-rmse:6.37103
[11]	validation-rmse:6.30053
[12]	validation-rmse:6.23849
[13]	validation-rmse:6.19291
[14]	validation-rmse:6.15025
[15]	validation-rmse:6.12218
[16]	validation-rmse:6.09826
[17]	validation-rmse:6.08039
[18]	validation-rmse:6.06759
[19]	validation-rmse:6.05658
[20]	validation-rmse:6.05018
[21]	validation-rmse:6.04127
[22]	validation-rmse:6.03601
[23]	validation-rmse:6.03347
[24]	validation-rmse:6.02894
[25]	validation-rmse:6.02623
[26]

  xgb_model.save_model(model_data_path)
  model.load_model(xgb_model_path)
[I 2025-10-28 20:20:22,824] Trial 5 finished with value: 6.036797300854083 and parameters: {'max_depth': 80, 'learning_rate': 0.09062921527362071, 'reg_alpha': 0.05270408847118816, 'reg_lambda': 0.04793414660944966, 'min_child_weight': 0.4429943118354462}. Best is trial 3 with value: 5.864844695204969.


🏃 View run legendary-shrew-466 at: https://dbc-8a29a60a-e19d.cloud.databricks.com/ml/experiments/3813900182186722/runs/7c20c567158f4a2cb6acce0a6babd499
🧪 View experiment at: https://dbc-8a29a60a-e19d.cloud.databricks.com/ml/experiments/3813900182186722
[0]	validation-rmse:8.66674
[1]	validation-rmse:8.28130
[2]	validation-rmse:7.94388
[3]	validation-rmse:7.65203
[4]	validation-rmse:7.39768
[5]	validation-rmse:7.17932
[6]	validation-rmse:6.99064
[7]	validation-rmse:6.82788
[8]	validation-rmse:6.68845
[9]	validation-rmse:6.56954
[10]	validation-rmse:6.47247
[11]	validation-rmse:6.38585
[12]	validation-rmse:6.31795
[13]	validation-rmse:6.25891
[14]	validation-rmse:6.21037
[15]	validation-rmse:6.17153
[16]	validation-rmse:6.13977
[17]	validation-rmse:6.11184
[18]	validation-rmse:6.09137
[19]	validation-rmse:6.07189
[20]	validation-rmse:6.05592
[21]	validation-rmse:6.04469
[22]	validation-rmse:6.03666
[23]	validation-rmse:6.03048
[24]	validation-rmse:6.02557
[25]	validation-rmse:6.02108
[26

  xgb_model.save_model(model_data_path)
  model.load_model(xgb_model_path)
[I 2025-10-28 20:20:36,283] Trial 6 finished with value: 6.024001572606648 and parameters: {'max_depth': 62, 'learning_rate': 0.08304043435235499, 'reg_alpha': 0.008740449782948887, 'reg_lambda': 0.2849127420798682, 'min_child_weight': 17.505727836123448}. Best is trial 3 with value: 5.864844695204969.


🏃 View run rebellious-swan-217 at: https://dbc-8a29a60a-e19d.cloud.databricks.com/ml/experiments/3813900182186722/runs/29a665327b8f4cf7801c6de80b13860a
🧪 View experiment at: https://dbc-8a29a60a-e19d.cloud.databricks.com/ml/experiments/3813900182186722
[0]	validation-rmse:8.45358
[1]	validation-rmse:7.91575
[2]	validation-rmse:7.48583
[3]	validation-rmse:7.13998
[4]	validation-rmse:6.87703
[5]	validation-rmse:6.67951
[6]	validation-rmse:6.52718
[7]	validation-rmse:6.41368
[8]	validation-rmse:6.33050
[9]	validation-rmse:6.27070
[10]	validation-rmse:6.23098
[11]	validation-rmse:6.19687
[12]	validation-rmse:6.17169
[13]	validation-rmse:6.16418
[14]	validation-rmse:6.16001
[15]	validation-rmse:6.15569
[16]	validation-rmse:6.15360
[17]	validation-rmse:6.15365
[18]	validation-rmse:6.15601
[19]	validation-rmse:6.15932
[20]	validation-rmse:6.16316
[21]	validation-rmse:6.16303
[22]	validation-rmse:6.16401
[23]	validation-rmse:6.17012
[24]	validation-rmse:6.16999
[25]	validation-rmse:6.17199
[26

  xgb_model.save_model(model_data_path)
  model.load_model(xgb_model_path)
[I 2025-10-28 20:20:58,285] Trial 7 finished with value: 6.172334932747793 and parameters: {'max_depth': 82, 'learning_rate': 0.12416316985362412, 'reg_alpha': 0.009958672056108932, 'reg_lambda': 0.0758623422350637, 'min_child_weight': 2.1395809133199974}. Best is trial 3 with value: 5.864844695204969.


🏃 View run nosy-loon-274 at: https://dbc-8a29a60a-e19d.cloud.databricks.com/ml/experiments/3813900182186722/runs/a5faf1cfeea649909d82518fac9568d7
🧪 View experiment at: https://dbc-8a29a60a-e19d.cloud.databricks.com/ml/experiments/3813900182186722
[0]	validation-rmse:7.96125
[1]	validation-rmse:7.19956
[2]	validation-rmse:6.69631
[3]	validation-rmse:6.37886
[4]	validation-rmse:6.17661
[5]	validation-rmse:6.05039
[6]	validation-rmse:5.98463
[7]	validation-rmse:5.93473
[8]	validation-rmse:5.90786
[9]	validation-rmse:5.89293
[10]	validation-rmse:5.88940
[11]	validation-rmse:5.88270
[12]	validation-rmse:5.87865
[13]	validation-rmse:5.87724
[14]	validation-rmse:5.87452
[15]	validation-rmse:5.87431
[16]	validation-rmse:5.87284
[17]	validation-rmse:5.87217
[18]	validation-rmse:5.87229
[19]	validation-rmse:5.87087
[20]	validation-rmse:5.87091
[21]	validation-rmse:5.87085
[22]	validation-rmse:5.87091
[23]	validation-rmse:5.86997
[24]	validation-rmse:5.86922
[25]	validation-rmse:5.86934
[26]	vali

  xgb_model.save_model(model_data_path)
  model.load_model(xgb_model_path)


🏃 View run unruly-roo-249 at: https://dbc-8a29a60a-e19d.cloud.databricks.com/ml/experiments/3813900182186722/runs/07a71d088fac4cf2a09b4f86251af81d
🧪 View experiment at: https://dbc-8a29a60a-e19d.cloud.databricks.com/ml/experiments/3813900182186722


[I 2025-10-28 20:21:09,417] Trial 8 finished with value: 5.862230443089824 and parameters: {'max_depth': 15, 'learning_rate': 0.21992487468175848, 'reg_alpha': 0.007731550026907306, 'reg_lambda': 0.23377457337376362, 'min_child_weight': 1.0357439143907545}. Best is trial 8 with value: 5.862230443089824.


[0]	validation-rmse:8.41579
[1]	validation-rmse:7.85713
[2]	validation-rmse:7.42170
[3]	validation-rmse:7.07938
[4]	validation-rmse:6.80816
[5]	validation-rmse:6.60651
[6]	validation-rmse:6.45200
[7]	validation-rmse:6.32855
[8]	validation-rmse:6.24433
[9]	validation-rmse:6.18085
[10]	validation-rmse:6.11653
[11]	validation-rmse:6.07718
[12]	validation-rmse:6.04368
[13]	validation-rmse:6.02265
[14]	validation-rmse:6.01240
[15]	validation-rmse:6.00241
[16]	validation-rmse:5.99576
[17]	validation-rmse:5.99117
[18]	validation-rmse:5.98779
[19]	validation-rmse:5.98629
[20]	validation-rmse:5.98758
[21]	validation-rmse:5.98578
[22]	validation-rmse:5.98501
[23]	validation-rmse:5.98725
[24]	validation-rmse:5.98845
[25]	validation-rmse:5.99028
[26]	validation-rmse:5.99097
[27]	validation-rmse:5.99184
[28]	validation-rmse:5.99364
[29]	validation-rmse:5.99561
[30]	validation-rmse:5.99619
[31]	validation-rmse:5.99808


  xgb_model.save_model(model_data_path)
  model.load_model(xgb_model_path)
[I 2025-10-28 20:21:44,363] Trial 9 finished with value: 5.999083279452748 and parameters: {'max_depth': 68, 'learning_rate': 0.1268351874747755, 'reg_alpha': 0.05394836382863035, 'reg_lambda': 0.03814164293595655, 'min_child_weight': 0.7706028272535065}. Best is trial 8 with value: 5.862230443089824.


🏃 View run handsome-koi-50 at: https://dbc-8a29a60a-e19d.cloud.databricks.com/ml/experiments/3813900182186722/runs/4d797ccf25c847e2975515d46d607a93
🧪 View experiment at: https://dbc-8a29a60a-e19d.cloud.databricks.com/ml/experiments/3813900182186722
[0]	validation-rmse:7.96125
[1]	validation-rmse:7.19956
[2]	validation-rmse:6.69631
[3]	validation-rmse:6.37886
[4]	validation-rmse:6.17661
[5]	validation-rmse:6.05039
[6]	validation-rmse:5.98463
[7]	validation-rmse:5.93473
[8]	validation-rmse:5.90786
[9]	validation-rmse:5.89293
[10]	validation-rmse:5.88940
[11]	validation-rmse:5.88270
[12]	validation-rmse:5.87865
[13]	validation-rmse:5.87724
[14]	validation-rmse:5.87452
[15]	validation-rmse:5.87431
[16]	validation-rmse:5.87284
[17]	validation-rmse:5.87217
[18]	validation-rmse:5.87229
[19]	validation-rmse:5.87087
[20]	validation-rmse:5.87091
[21]	validation-rmse:5.87085
[22]	validation-rmse:5.87091
[23]	validation-rmse:5.86997
[24]	validation-rmse:5.86922
[25]	validation-rmse:5.86934
[26]	va

  xgb_model.save_model(model_data_path)
  model.load_model(xgb_model_path)


KeyboardInterrupt: 

In [15]:
runs = mlflow.search_runs(
    experiment_names=[EXPERIMENT_NAME],
    order_by=["metrics.rmse ASC"], 
    output_format="list"
)

In [21]:
runs = mlflow.search_runs(
    experiment_names=[EXPERIMENT_NAME],
    order_by=["metrics.rmse ASC"],
    output_format="list"
)


In [49]:
from mlflow import MlflowClient
client = MlflowClient()

result = mlflow.register_model(
    model_uri=f"runs:/{'4d797ccf25c847e2975515d46d607a93'}/model",
    name="workspace.default.nyc-taxi-model"
)



client.set_registered_model_alias(
    name= "workspace.default.nyc-taxi-model",
    alias='champion',
    version=result.version
)

Registered model 'workspace.default.nyc-taxi-model' already exists. Creating a new version of this model...


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/8 [00:00<?, ?it/s]

Created version '5' of model 'workspace.default.nyc-taxi-model'.


In [23]:
from mlflow import MlflowClient
client = MlflowClient()

result = mlflow.register_model(
    model_uri=f"runs:/{runs[0].info.run_id}/model",
    name="workspace.default.nyc-taxi-model"
)



client.set_registered_model_alias(
    name= "workspace.default.nyc-taxi-model",
    alias='challenger',
    version=result.version
)


Registered model 'workspace.default.nyc-taxi-model' already exists. Creating a new version of this model...


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/8 [00:00<?, ?it/s]

Created version '4' of model 'workspace.default.nyc-taxi-model'.


In [41]:

df_test = read_dataframe('../data/green_tripdata_2025-03.parquet')

In [42]:

X_test = preprocess(df_test, dv)
y_test = df_test[target].values


# Create test dataset for MLflow
test_dataset = mlflow.data.from_numpy(X_test.data, targets=y_test, name="green_tripdata_2025-03")


In [50]:
model_name = "workspace.default.nyc-taxi-model"

champion_model = mlflow.pyfunc.load_model(f"models:/{model_name}@Champion")
challenger_model = mlflow.pyfunc.load_model(f"models:/{model_name}@Challenger")


client = MlflowClient()

Downloading artifacts:   0%|          | 0/8 [00:00<?, ?it/s]

  model.load_model(xgb_model_path)


Downloading artifacts:   0%|          | 0/8 [00:00<?, ?it/s]

  model.load_model(xgb_model_path)


In [51]:
import numpy as np
champion_predictions = np.array(champion_model.predict(X_test))
challenger_predictions = np.array(challenger_model.predict(X_test))

In [52]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def calculate_metrics(y_true, y_pred, model_name):
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100
    maep = np.mean(np.abs(y_true - y_pred) / y_true) * 100

    
    return {
        'model': model_name,
        'RMSE': rmse,
        'MAE': mae,
        'R²': r2,
        'MAPE': mape,
        'MAEP': maep,
    }

champion_metrics = calculate_metrics(y_test, champion_predictions, 'Champion')
challenger_metrics = calculate_metrics(y_test, challenger_predictions, 'Challenger')


metrics_df = pd.DataFrame([champion_metrics, challenger_metrics])
metrics_df


Unnamed: 0,model,RMSE,MAE,R²,MAPE,MAEP
0,Champion,6.518679,4.08402,0.549948,33.732467,33.732467
1,Challenger,6.369204,3.98651,0.570351,34.318838,34.318838
