In [1]:
!python -V

Python 3.9.23


In [2]:
import pandas as pd

In [3]:
import pickle

In [4]:
import seaborn as sns
import matplotlib.pyplot as plt

In [5]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.metrics import mean_squared_error

In [6]:
import mlflow


mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("nyc-taxi-experiment")

2025/07/08 06:54:19 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2025/07/08 06:54:19 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.


<Experiment: artifact_location='/workspaces/mlops-basic/02-experiment-tracking/mlruns/1', creation_time=1751954357296, experiment_id='1', last_update_time=1751954357296, lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>

In [7]:
def read_dataframe(filename):
    df = pd.read_csv(filename)

    df.lpep_dropoff_datetime = pd.to_datetime(df.lpep_dropoff_datetime)
    df.lpep_pickup_datetime = pd.to_datetime(df.lpep_pickup_datetime)

    df['duration'] = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)

    df = df[(df.duration >= 1) & (df.duration <= 60)]

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)
    
    return df

In [8]:
df_train = pd.read_parquet('./data/green_tripdata_2021-01.parquet')
df_val = pd.read_parquet('./data/green_tripdata_2021-02.parquet')

In [9]:
len(df_train), len(df_val)

(76518, 64572)

In [10]:
df_train['PU_DO'] = df_train['PULocationID'].astype(str) + '_' + df_train['DOLocationID'].astype(str)
df_val['PU_DO'] = df_val['PULocationID'].astype(str) + '_' + df_val['DOLocationID'].astype(str)

In [11]:
categorical = ['PU_DO'] #'PULocationID', 'DOLocationID']
numerical = ['trip_distance']

dv = DictVectorizer()

train_dicts = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

val_dicts = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [12]:
# Compute duration column if not present
if 'duration' not in df_train.columns:
	df_train['duration'] = (df_train.lpep_dropoff_datetime - df_train.lpep_pickup_datetime).dt.total_seconds() / 60
if 'duration' not in df_val.columns:
	df_val['duration'] = (df_val.lpep_dropoff_datetime - df_val.lpep_pickup_datetime).dt.total_seconds() / 60

target = 'duration'
y_train = df_train[target].values
y_val = df_val[target].values

In [13]:
lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_val)

import numpy as np
np.sqrt(mean_squared_error(y_val, y_pred))

np.float64(53.60353017829977)

In [14]:
with open('models/lin_reg.bin', 'wb') as f_out:
    pickle.dump((dv, lr), f_out)

In [15]:
with mlflow.start_run():

    mlflow.set_tag("developer", "manali")

    mlflow.log_param("train-data-path", "./data/green_tripdata_2021-01.parquet")
    mlflow.log_param("valid-data-path", "./data/green_tripdata_2021-02.parquet")

    alpha = 0.1
    mlflow.log_param("alpha", alpha)
    lr = Lasso(alpha)
    lr.fit(X_train, y_train)

    y_pred = lr.predict(X_val)
    rmse = mean_squared_error(y_val, y_pred) ** 0.5
    mlflow.log_metric("rmse", rmse)

    mlflow.log_artifact(local_path="models/lin_reg.bin", artifact_path="models_pickle")

In [16]:
import xgboost as xgb

In [17]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

In [18]:
train = xgb.DMatrix(X_train, label=y_train)
valid = xgb.DMatrix(X_val, label=y_val)

In [19]:
def objective(params):
    with mlflow.start_run():
        mlflow.set_tag("model", "xgboost")
        mlflow.log_params(params)
        booster = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=1000,
            evals=[(valid, 'validation')],
            early_stopping_rounds=50
        )
        y_pred = booster.predict(valid)
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))
        mlflow.log_metric("rmse", rmse)

    return {'loss': rmse, 'status': STATUS_OK}

In [20]:
search_space = {
    'max_depth': scope.int(hp.quniform('max_depth', 4, 30, 1)),
    'learning_rate': hp.loguniform('learning_rate', -3, -0.3),
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
    'objective': 'reg:linear',
    'seed': 42
}

best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=10,
    trials=Trials()
)

[0]	validation-rmse:52.35917                          
  0%|          | 0/10 [00:00<?, ?trial/s, best loss=?]




[1]	validation-rmse:52.21828                          
[2]	validation-rmse:52.18885                          
[3]	validation-rmse:52.17491                          
[4]	validation-rmse:52.23369                          
[5]	validation-rmse:52.27531                          
[6]	validation-rmse:52.29342                          
[7]	validation-rmse:52.33867                          
[8]	validation-rmse:52.42663                          
[9]	validation-rmse:52.51353                          
[10]	validation-rmse:52.53612                         
[11]	validation-rmse:52.55566                         
[12]	validation-rmse:52.56486                         
[13]	validation-rmse:52.57418                         
[14]	validation-rmse:52.62714                         
[15]	validation-rmse:52.65470                         
[16]	validation-rmse:52.69573                         
[17]	validation-rmse:52.71136                         
[18]	validation-rmse:52.74169                         
[19]	valid




[0]	validation-rmse:53.07181                                                   
[1]	validation-rmse:53.01167                                                   
[2]	validation-rmse:52.99856                                                   
[3]	validation-rmse:53.01790                                                   
[4]	validation-rmse:53.07160                                                   
[5]	validation-rmse:53.12883                                                   
[6]	validation-rmse:53.18156                                                   
[7]	validation-rmse:53.27621                                                   
[8]	validation-rmse:53.37766                                                   
[9]	validation-rmse:53.46660                                                   
[10]	validation-rmse:53.58154                                                  
[11]	validation-rmse:53.68277                                                  
[12]	validation-rmse:53.77412           




[0]	validation-rmse:52.44419                                                   
[1]	validation-rmse:52.58142                                                   
[2]	validation-rmse:52.72943                                                   
[3]	validation-rmse:52.78637                                                   
[4]	validation-rmse:52.77667                                                   
[5]	validation-rmse:52.85465                                                   
[6]	validation-rmse:52.87849                                                   
[7]	validation-rmse:52.94698                                                   
[8]	validation-rmse:52.98639                                                   
[9]	validation-rmse:52.96797                                                   
[10]	validation-rmse:53.02082                                                  
[11]	validation-rmse:53.03245                                                  
[12]	validation-rmse:53.06399           




[0]	validation-rmse:53.02390                                                   
[1]	validation-rmse:52.90674                                                   
[2]	validation-rmse:52.82457                                                   
[3]	validation-rmse:52.76489                                                   
[4]	validation-rmse:52.71470                                                   
[5]	validation-rmse:52.68928                                                   
[6]	validation-rmse:52.67958                                                   
[7]	validation-rmse:52.69138                                                   
[8]	validation-rmse:52.70409                                                   
[9]	validation-rmse:52.73028                                                   
[10]	validation-rmse:52.75789                                                  
[11]	validation-rmse:52.79886                                                  
[12]	validation-rmse:52.84447           




[0]	validation-rmse:55.26236                                                   
[1]	validation-rmse:56.43545                                                   
[2]	validation-rmse:57.43228                                                   
[3]	validation-rmse:57.96534                                                   
[4]	validation-rmse:58.09884                                                   
[5]	validation-rmse:58.38552                                                   
[6]	validation-rmse:58.88529                                                   
[7]	validation-rmse:59.21959                                                   
[8]	validation-rmse:59.78352                                                   
[9]	validation-rmse:59.82755                                                   
[10]	validation-rmse:60.67629                                                  
[11]	validation-rmse:60.73299                                                  
[12]	validation-rmse:60.77799           




[0]	validation-rmse:52.57126                                                   
[1]	validation-rmse:52.55724                                                   
[2]	validation-rmse:52.63187                                                   
[3]	validation-rmse:52.69670                                                   
[4]	validation-rmse:52.68458                                                   
[5]	validation-rmse:52.76753                                                   
[6]	validation-rmse:52.83083                                                   
[7]	validation-rmse:52.89463                                                   
[8]	validation-rmse:52.94698                                                   
[9]	validation-rmse:53.01452                                                   
[10]	validation-rmse:53.01698                                                  
[11]	validation-rmse:53.06011                                                  
[12]	validation-rmse:53.09845           




[2]	validation-rmse:52.65190                                                   
[3]	validation-rmse:53.03326                                                   
[4]	validation-rmse:53.13381                                                   
[5]	validation-rmse:53.29682                                                   
[6]	validation-rmse:53.41650                                                   
[7]	validation-rmse:53.74240                                                   
[8]	validation-rmse:53.87121                                                   
[9]	validation-rmse:54.07500                                                   
[10]	validation-rmse:54.17768                                                  
[11]	validation-rmse:54.30375                                                  
[12]	validation-rmse:54.34659                                                  
[13]	validation-rmse:54.37322                                                  
[14]	validation-rmse:54.55253           




[0]	validation-rmse:52.96032                                                   
[1]	validation-rmse:52.81439                                                   
[2]	validation-rmse:52.71749                                                   
[3]	validation-rmse:52.65967                                                   
[4]	validation-rmse:52.62515                                                   
[5]	validation-rmse:52.61918                                                   
[6]	validation-rmse:52.62624                                                   
[7]	validation-rmse:52.63945                                                   
[8]	validation-rmse:52.66627                                                   
[9]	validation-rmse:52.70275                                                   
[10]	validation-rmse:52.73508                                                  
[11]	validation-rmse:52.77998                                                  
[12]	validation-rmse:52.80848           




[2]	validation-rmse:52.78622                                                   
[3]	validation-rmse:52.69220                                                   
[4]	validation-rmse:52.60793                                                   
[5]	validation-rmse:52.53397                                                   
[6]	validation-rmse:52.47638                                                   
[7]	validation-rmse:52.42116                                                   
[8]	validation-rmse:52.37795                                                   
[9]	validation-rmse:52.34344                                                   
[10]	validation-rmse:52.31024                                                  
[11]	validation-rmse:52.28901                                                  
[12]	validation-rmse:52.26555                                                  
[13]	validation-rmse:52.24665                                                  
[14]	validation-rmse:52.23434           




[0]	validation-rmse:52.71207                                                   
[1]	validation-rmse:53.06822                                                   
[2]	validation-rmse:53.44285                                                   
[3]	validation-rmse:53.67069                                                   
[4]	validation-rmse:53.78371                                                   
[5]	validation-rmse:53.94671                                                   
[6]	validation-rmse:54.03036                                                   
[7]	validation-rmse:54.07436                                                   
[8]	validation-rmse:54.30625                                                   
[9]	validation-rmse:54.30815                                                   
[10]	validation-rmse:54.32207                                                  
[11]	validation-rmse:54.37296                                                  
[12]	validation-rmse:54.38092           

In [21]:
mlflow.xgboost.autolog(disable=True)

In [22]:
print("Best result:", best_result)

Best result: {'learning_rate': np.float64(0.06572486614407753), 'max_depth': np.float64(9.0), 'min_child_weight': np.float64(5.04728992821492), 'reg_alpha': np.float64(0.029449179470630867), 'reg_lambda': np.float64(0.007731792334660835)}


In [None]:
import mlflow
import mlflow.xgboost
with mlflow.start_run():
    
    train = xgb.DMatrix(X_train, label=y_train)
    valid = xgb.DMatrix(X_val, label=y_val)

    best_params = {
    'learning_rate': 0.09585,
    'max_depth': 6,  # Reduce from 10
    'min_child_weight': 1.06,
    'objective': 'reg:squarederror',  # Updated
    'reg_alpha': 0.01806,
    'reg_lambda': 0.01166,
    'tree_method': 'hist',  # Efficient memory use
    'seed': 42
}
    mlflow.set_tag("model", "xgboost")
    mlflow.log_param("train-data-path", "./data/green_tripdata_2021-01.parquet")
    mlflow.log_param("valid-data-path", "./data/green_tripdata_2021-02.parquet")
    mlflow.log_param("num-boost-rounds", 200)
    mlflow.log_param("early-stopping-rounds", 10)   


    mlflow.log_params(best_params)

    booster = xgb.train(
        params=best_params,
        dtrain=train,
        num_boost_round=200,
        evals=[(valid, 'validation')],
        early_stopping_rounds=10
    )

    y_pred = booster.predict(valid)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    mlflow.log_metric("rmse", rmse)

    with open("models/preprocessor.b", "wb") as f_out:
        pickle.dump(dv, f_out)
    mlflow.log_artifact("models/preprocessor.b", artifact_path="preprocessor")

    mlflow.xgboost.log_model(booster, name="model")


[0]	validation-rmse:52.98039




[1]	validation-rmse:52.83291
[2]	validation-rmse:52.72407
[3]	validation-rmse:52.63504
[4]	validation-rmse:52.57680
[5]	validation-rmse:52.54258
[6]	validation-rmse:52.52405
[7]	validation-rmse:52.51788
[8]	validation-rmse:52.50589
[9]	validation-rmse:52.50879
[10]	validation-rmse:52.50827
[11]	validation-rmse:52.52969
[12]	validation-rmse:52.54888
[13]	validation-rmse:52.55890
[14]	validation-rmse:52.58324
[15]	validation-rmse:52.60151
[16]	validation-rmse:52.62888
[17]	validation-rmse:52.65565




: 

In [None]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.svm import LinearSVR

mlflow.sklearn.autolog()

for model_class in (RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor, LinearSVR):

    with mlflow.start_run():

        mlflow.log_param("train-data-path", "./data/green_tripdata_2021-01.parquet")
        mlflow.log_param("valid-data-path", "./data/green_tripdata_2021-02.parquet")
        mlflow.log_artifact("models/preprocessor.b", artifact_path="preprocessor")

        mlmodel = model_class()
        mlmodel.fit(X_train, y_train)

        y_pred = mlmodel.predict(X_val)
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))
        mlflow.log_metric("rmse", rmse)
        