In [27]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error, mean_squared_error
from sklearn import ensemble
import pickle

In [2]:
import mlflow
mlflow.set_tracking_uri('sqlite:///mlflow.db')
mlflow.set_experiment('nyc-taxi-experiment')

<Experiment: artifact_location='/workspaces/ml-ops/02-experiment-tracking/mlruns/1', creation_time=1749555359563, experiment_id='1', last_update_time=1749555359563, lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>

In [3]:
df = pd.read_parquet('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2025-01.parquet')

In [4]:
df.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,cbd_congestion_fee
0,1,2025-01-01 00:18:38,2025-01-01 00:26:59,1.0,1.6,1.0,N,229,237,1,10.0,3.5,0.5,3.0,0.0,1.0,18.0,2.5,0.0,0.0
1,1,2025-01-01 00:32:40,2025-01-01 00:35:13,1.0,0.5,1.0,N,236,237,1,5.1,3.5,0.5,2.02,0.0,1.0,12.12,2.5,0.0,0.0
2,1,2025-01-01 00:44:04,2025-01-01 00:46:01,1.0,0.6,1.0,N,141,141,1,5.1,3.5,0.5,2.0,0.0,1.0,12.1,2.5,0.0,0.0
3,2,2025-01-01 00:14:27,2025-01-01 00:20:01,3.0,0.52,1.0,N,244,244,2,7.2,1.0,0.5,0.0,0.0,1.0,9.7,0.0,0.0,0.0
4,2,2025-01-01 00:21:34,2025-01-01 00:25:06,3.0,0.66,1.0,N,244,116,2,5.8,1.0,0.5,0.0,0.0,1.0,8.3,0.0,0.0,0.0


In [5]:
df = df[:10000].copy()

In [6]:
df['duration'] = (df.tpep_dropoff_datetime - df.tpep_pickup_datetime).apply(lambda tz: tz.seconds / 60)

In [7]:
df = df[((df.duration >= 1) & (df.duration <= 60))]

In [8]:
categorical = ['PULocationID', 'DOLocationID']
numerical = ['trip_distance']

In [12]:
train_dicts = df[categorical + numerical].to_dict(orient = 'records')

dv = DictVectorizer()
X = dv.fit_transform(train_dicts)

In [13]:
target = 'duration'
y = df[target].values

In [14]:
X_train, X_val, y_train, y_val = train_test_split(X, y)

In [23]:
mlflow.autolog()

2025/06/10 12:34:08 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2025/06/10 12:34:08 INFO mlflow.tracking.fluent: Autologging successfully enabled for statsmodels.


In [24]:
params = {
    "n_estimators": 500,
    "max_depth": 4,
    "min_samples_split": 5,
    "learning_rate": 0.01,
    "loss": "squared_error",
}

In [28]:
reg = ensemble.GradientBoostingRegressor(**params)
reg.fit(X_train, y_train)

mse = mean_squared_error(y_val, reg.predict(X_val))
print("The mean squared error (MSE) on test set: {:.4f}".format(mse))

2025/06/10 12:37:16 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '4635d68888de4847b4d0a5eb5f1bf8e3', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


The mean squared error (MSE) on test set: 29.5108


In [33]:
from hyperopt import hp
from hyperopt import fmin, tpe, space_eval, STATUS_OK, Trials
from hyperopt.pyll import scope

In [36]:
# define an objective function
def objective(params):
    reg = ensemble.GradientBoostingRegressor(**params)
    reg.fit(X_train, y_train)
    mse = mean_squared_error(y_val, reg.predict(X_val))
    return {'loss': mse, 'status': STATUS_OK}
    
# define a search space
search_space = {
    "n_estimators": 500,
    "min_samples_split": 5,
    "loss": "squared_error",
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
    'learning_rate': hp.loguniform('learning_rate', -3, 0)
}

# minimize the objective over the space

best = fmin(objective, search_space, algo=tpe.suggest, max_evals=100, trials=Trials())

print(best)
# -> {'a': 1, 'c2': 0.01420615366247227}
print(space_eval(space, best))
# -> ('case 2', 0.01420615366247227}

  0%|                                                                      | 0/100 [00:00<?, ?trial/s, best loss=?]

2025/06/10 12:57:50 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'b387ae5f55f449e9894d95dd70df852c', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow



  1%|▍                                            | 1/100 [00:31<51:40, 31.32s/trial, best loss: 36.14766630772024]

2025/06/10 12:58:21 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'e9efe0d613544315a64d7977d8599498', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow



  2%|▊                                         | 2/100 [01:20<1:08:21, 41.85s/trial, best loss: 35.913235336609446]

2025/06/10 12:59:10 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '1aa2ae129e904961be61f5257ea156ef', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow



  3%|█▎                                        | 3/100 [02:04<1:09:13, 42.82s/trial, best loss: 35.913235336609446]

2025/06/10 12:59:54 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '78050899edbe4400ab7410dec575830c', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow



  4%|█▋                                        | 4/100 [02:55<1:13:27, 45.91s/trial, best loss: 35.913235336609446]

2025/06/10 13:00:45 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '6f35f7487fa14f39b0ca5b6f68dc533b', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow



  5%|██                                        | 5/100 [03:30<1:06:54, 42.25s/trial, best loss: 35.913235336609446]

2025/06/10 13:01:21 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '3e0c33e42f2e4c41a12e8837ecffb60b', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow



  6%|██▌                                       | 6/100 [04:17<1:08:23, 43.66s/trial, best loss: 35.913235336609446]

2025/06/10 13:02:07 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '040f837560ca415f8744a02b847173ad', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow



  7%|██▉                                       | 7/100 [04:55<1:04:58, 41.92s/trial, best loss: 35.685636111305506]

2025/06/10 13:02:46 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '522ba385a602479888fd4aec038ff6f8', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow



  8%|███▎                                      | 8/100 [05:33<1:02:18, 40.63s/trial, best loss: 35.685636111305506]

2025/06/10 13:03:23 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '34a6153c4fdf4d47a6f09e10dda3f72c', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow



  9%|███▊                                      | 9/100 [06:19<1:04:10, 42.31s/trial, best loss: 35.685636111305506]

2025/06/10 13:04:09 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'a71636ff804d499187eebc6c901ba545', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow



 10%|████                                     | 10/100 [07:15<1:09:50, 46.56s/trial, best loss: 35.685636111305506]

2025/06/10 13:05:06 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'd38effccb6c84775a262180083192957', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow



 11%|████▌                                    | 11/100 [08:02<1:09:18, 46.72s/trial, best loss: 35.685636111305506]

2025/06/10 13:05:53 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '3d76d025e5134a3db97a4e8d0acc8747', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow



 12%|█████▏                                     | 12/100 [08:11<51:42, 35.26s/trial, best loss: 25.487425272366067]

2025/06/10 13:06:02 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'f0e0d3f8210b4fa6ae8959d0d37c0fe1', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow



 12%|████▉                                    | 12/100 [08:38<1:03:25, 43.24s/trial, best loss: 25.487425272366067]


KeyboardInterrupt: 

## Pick the best one

In [37]:
params = {
    "n_estimators": 500,
    "max_depth": 5,
    "min_samples_split": 5,
    "learning_rate": 0.06657731945633238,
    "loss": "squared_error",
}
reg = ensemble.GradientBoostingRegressor(**params)
reg.fit(X_train, y_train)

mse = mean_squared_error(y_val, reg.predict(X_val))
print("The mean squared error (MSE) on test set: {:.4f}".format(mse))

2025/06/10 13:08:25 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '6a3551b07b8e4baa8d5d199fe545f454', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


The mean squared error (MSE) on test set: 25.5263
