In [1]:
!python -V

Python 3.12.7


In [2]:
import numpy as np
import pandas as pd


In [3]:
import os
import pickle

In [4]:
import seaborn as sns
import matplotlib.pyplot as plt

In [5]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

In [6]:
import mlflow

# Point MLflow at the SQLite file "mlflow.db" and select the experiment that the
# runs started later in this notebook are logged under.
mlflow.set_tracking_uri(uri="sqlite:///mlflow.db")
mlflow.set_experiment(experiment_name="nyc-taxi-experiment")

<Experiment: artifact_location='/workspaces/mlops-zoomcamp/02-experiment-tracking/mlruns/1', creation_time=1747474969262, experiment_id='1', last_update_time=1747474969262, lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>

In [7]:
# Load the 2023-01 yellow-taxi trip file so its raw columns can be inspected below.
# NOTE(review): read_dataframe() is later called with this same URL, so the file is
# fetched a second time there.
df = pd.read_parquet('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet') 

In [8]:
df.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,2,2023-01-01 00:32:10,2023-01-01 00:40:36,1.0,0.97,1.0,N,161,141,2,9.3,1.0,0.5,0.0,0.0,1.0,14.3,2.5,0.0
1,2,2023-01-01 00:55:08,2023-01-01 01:01:27,1.0,1.1,1.0,N,43,237,1,7.9,1.0,0.5,4.0,0.0,1.0,16.9,2.5,0.0
2,2,2023-01-01 00:25:04,2023-01-01 00:37:49,1.0,2.51,1.0,N,48,238,1,14.9,1.0,0.5,15.0,0.0,1.0,34.9,2.5,0.0
3,1,2023-01-01 00:03:48,2023-01-01 00:13:25,0.0,1.9,1.0,N,138,7,1,12.1,7.25,0.5,0.0,0.0,1.0,20.85,0.0,1.25
4,2,2023-01-01 00:10:29,2023-01-01 00:21:19,1.0,1.43,1.0,N,107,79,1,11.4,1.0,0.5,3.28,0.0,1.0,19.68,2.5,0.0


In [9]:
def read_dataframe(filename):
    """Load a yellow-taxi trip file and prepare it for duration modelling.

    Parameters
    ----------
    filename : str or path-like
        Location handed straight to ``pd.read_parquet``.

    Returns
    -------
    tuple
        ``(df, categorical, numerical)``. ``df`` keeps only the trips whose
        ``duration`` lies between 1 and 60 minutes (both inclusive) and has the
        columns named in ``categorical`` cast to ``str``. ``categorical`` is
        ``['PULocationID', 'DOLocationID']`` and ``numerical`` is
        ``['trip_distance']``.
    """
    categorical = ['PULocationID', 'DOLocationID']
    numerical = ['trip_distance']

    df = pd.read_parquet(filename)

    # Trip length in minutes. The vectorised ``.dt`` accessor replaces a
    # Python-level lambda call per row.
    elapsed = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
    df['duration'] = elapsed.dt.total_seconds() / 60

    # ``.copy()`` gives the filtered rows their own frame, so the dtype cast
    # below is not an assignment into a slice of the unfiltered data.
    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()
    df[categorical] = df[categorical].astype(str)

    return df, categorical, numerical

In [10]:
# Training split: the 2023-01 trip file.
df_train, categorical_train, numerical_train = read_dataframe(
    filename='https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet'
)

In [11]:
# Feature columns used when building the DictVectorizer input further down.
categorical = ['PULocationID', 'DOLocationID']
numerical = ['trip_distance']

# Kept as the last expression of the cell so the notebook actually renders the
# preview; as a leading statement its result was silently discarded.
df_train.head()

In [12]:
# Validation split: the 2023-02 trip file.
df_val, categorical_val, numerical_val = read_dataframe(
    filename='https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet'
)

In [13]:
len(df_train), len(df_val)

(3009173, 2855951)

In [14]:
df_train['duration'].std()

9.939385620145579

In [15]:
# Combined pickup/drop-off key (e.g. '161_141') built by joining the string-typed
# location IDs with an underscore.
# NOTE(review): the feature lists used for vectorisation do not include 'PU_DO', so
# this column appears unused by the models trained below — confirm that is intended.
df_train['PU_DO'] = df_train['PULocationID'] + '_' + df_train['DOLocationID']
df_val['PU_DO'] = df_val['PULocationID'] + '_' + df_val['DOLocationID']

In [16]:
# Turn the selected columns into feature matrices. The vectorizer is fitted on the
# training records only and then reused, unchanged, for the validation records.
# NOTE(review): `categorical` / `numerical` come from an earlier cell, so the combined
# 'PU_DO' column is not part of the feature set here.
feature_columns = categorical + numerical

dv = DictVectorizer()
train_dicts = df_train[feature_columns].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

val_dicts = df_val[feature_columns].to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [17]:
# Regression target: the trip duration column, extracted from both splits.
target = 'duration'
y_train, y_val = (frame[target].values for frame in (df_train, df_val))

In [18]:
# Baseline model: plain linear regression fitted on the training matrix, then used
# to predict both the training and the validation rows.
lr = LinearRegression().fit(X_train, y_train)

y_train_pred = lr.predict(X_train)
y_pred = lr.predict(X_val)



In [19]:
(y_val.size, y_pred.size, y_train.size)

(2855951, 2855951, 3009173)

In [20]:
# RMSE of the baseline on the data it was fitted on.
mse_train = mean_squared_error(y_true=y_train, y_pred=y_train_pred)
rmse_train = np.sqrt(mse_train)
print(rmse_train)

7.656752061706073


In [21]:
np.sqrt(mean_squared_error(y_val, y_pred))

7.818625928637492

In [22]:
# Persist the fitted vectorizer together with the linear model so both can be
# restored as a pair. The directory is created first because open() does not
# create missing parent directories.
os.makedirs('models', exist_ok=True)
with open('models/lin_reg.bin', 'wb') as f_out:
    pickle.dump((dv, lr), f_out)

In [23]:
# Train a Lasso model inside an MLflow run and record its inputs and validation RMSE.
# Note: this rebinds the notebook-level `lr` and `y_pred` set by the baseline cell.
with mlflow.start_run():

    mlflow.set_tag("developer", "luis")

    mlflow.log_param("train-data-path", "/trip-data/yellow_tripdata_2023-01.parquet")
    mlflow.log_param("valid-data-path", "/trip-data/yellow_tripdata_2023-02.parquet")

    alpha = 0.1
    # alpha is an input of the run (a hyperparameter), not a measured result,
    # so it is recorded as a param rather than as a metric.
    mlflow.log_param("alpha", alpha)

    lr = Lasso(alpha=alpha)
    lr.fit(X_train, y_train)

    y_pred = lr.predict(X_val)

    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    mlflow.log_metric("rmse", rmse)

In [24]:
import xgboost as xgb

In [25]:
def objective(params):
    """Train one XGBoost candidate and report its validation RMSE to hyperopt.

    Every call opens its own MLflow run, tags it with ``model=xgboost`` and logs
    ``params`` together with the resulting ``rmse`` metric.

    Relies on notebook-level names that must exist by the time the function is
    called: ``train`` and ``valid`` (handed to ``xgb.train``), ``y_val`` and
    ``STATUS_OK``.

    Parameters
    ----------
    params : dict
        Passed unchanged to ``mlflow.log_params`` and to ``xgb.train``.

    Returns
    -------
    dict
        ``{'loss': rmse, 'status': STATUS_OK}``.
    """
    with mlflow.start_run():
        mlflow.set_tag("model", "xgboost")
        mlflow.log_params(params)

        model = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=60,
            evals=[(valid, 'validation')],
            early_stopping_rounds=20,
        )

        validation_error = np.sqrt(mean_squared_error(y_val, model.predict(valid)))
        mlflow.log_metric("rmse", validation_error)

    return {'loss': validation_error, 'status': STATUS_OK}

In [26]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

In [27]:
# Wrap the feature matrices and their targets in XGBoost DMatrix objects; these are
# the `train` / `valid` names that objective() reads.
train = xgb.DMatrix(data=X_train, label=y_train)
valid = xgb.DMatrix(data=X_val, label=y_val)

In [33]:
# Hyperparameter ranges sampled by hyperopt. hp.loguniform takes its bounds in log
# space, so e.g. learning_rate is drawn from [exp(-3), exp(0)].
search_space = {
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
    # 'reg:squarederror' is the current name of the squared-error objective;
    # 'reg:linear' is its deprecated alias.
    'objective': 'reg:squarederror',
    'seed': 42
}

# Run up to 50 TPE-guided trials; each trial calls objective(), which logs its own
# MLflow run.
best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=Trials()
)

  0%|                                                                                                                                     | 0/50 [00:00<?, ?trial/s, best loss=?]

  self.starting_round = model.num_boosted_rounds()



[0]	validation-rmse:9.50591                                                                                                                                                      
[1]	validation-rmse:8.99343                                                                                                                                                      
[2]	validation-rmse:8.52874                                                                                                                                                      
[3]	validation-rmse:8.10832                                                                                                                                                      
[4]	validation-rmse:7.72835                                                                                                                                                      
[5]	validation-rmse:7.38551                                                                                   

  self.starting_round = model.num_boosted_rounds()



[0]	validation-rmse:6.75457                                                                                                                                                      
[1]	validation-rmse:5.50262                                                                                                                                                      
[2]	validation-rmse:5.07781                                                                                                                                                      
[3]	validation-rmse:4.92264                                                                                                                                                      
[4]	validation-rmse:4.85634                                                                                                                                                      
[5]	validation-rmse:4.82043                                                                                   

  self.starting_round = model.num_boosted_rounds()



[0]	validation-rmse:6.12998                                                                                                                                                      
[1]	validation-rmse:4.99218                                                                                                                                                      
[2]	validation-rmse:4.73475                                                                                                                                                      
[3]	validation-rmse:4.67758                                                                                                                                                      
[4]	validation-rmse:4.66643                                                                                                                                                      
[5]	validation-rmse:4.66604                                                                                   

  self.starting_round = model.num_boosted_rounds()



[0]	validation-rmse:8.13997                                                                                                                                                      
[1]	validation-rmse:6.84962                                                                                                                                                      
[2]	validation-rmse:6.01434                                                                                                                                                      
[3]	validation-rmse:5.48186                                                                                                                                                      
[4]	validation-rmse:5.16235                                                                                                                                                      
[5]	validation-rmse:4.96590                                                                                   

  self.starting_round = model.num_boosted_rounds()



[0]	validation-rmse:9.45107                                                                                                                                                      
[1]	validation-rmse:8.89915                                                                                                                                                      
[2]	validation-rmse:8.40734                                                                                                                                                      
[3]	validation-rmse:7.97007                                                                                                                                                      
[4]	validation-rmse:7.58211                                                                                                                                                      
[5]	validation-rmse:7.23941                                                                                   

  self.starting_round = model.num_boosted_rounds()



[0]	validation-rmse:9.03505                                                                                                                                                      
[1]	validation-rmse:8.18688                                                                                                                                                      
[2]	validation-rmse:7.49725                                                                                                                                                      
[3]	validation-rmse:6.93961                                                                                                                                                      
[4]	validation-rmse:6.49492                                                                                                                                                      
[5]	validation-rmse:6.14088                                                                                   

  self.starting_round = model.num_boosted_rounds()



[0]	validation-rmse:9.65468                                                                                                                                                      
[1]	validation-rmse:9.26956                                                                                                                                                      
[2]	validation-rmse:8.91090                                                                                                                                                      
[3]	validation-rmse:8.57704                                                                                                                                                      
[4]	validation-rmse:8.26660                                                                                                                                                      
[5]	validation-rmse:7.97826                                                                                   

  self.starting_round = model.num_boosted_rounds()



[0]	validation-rmse:9.31043                                                                                                                                                      
[1]	validation-rmse:8.64456                                                                                                                                                      
[2]	validation-rmse:8.06301                                                                                                                                                      
[3]	validation-rmse:7.55346                                                                                                                                                      
[4]	validation-rmse:7.11435                                                                                                                                                      
[5]	validation-rmse:6.73202                                                                                   

  self.starting_round = model.num_boosted_rounds()



[0]	validation-rmse:8.60516                                                                                                                                                      
[1]	validation-rmse:7.51171                                                                                                                                                      
[2]	validation-rmse:6.70714                                                                                                                                                      
[3]	validation-rmse:6.12589                                                                                                                                                      
[4]	validation-rmse:5.70929                                                                                                                                                      
[5]	validation-rmse:5.41836                                                                                   

  self.starting_round = model.num_boosted_rounds()



[0]	validation-rmse:9.12178                                                                                                                                                      
[1]	validation-rmse:8.31729                                                                                                                                                      
[2]	validation-rmse:7.64294                                                                                                                                                      
[3]	validation-rmse:7.07380                                                                                                                                                      
[4]	validation-rmse:6.60485                                                                                                                                                      
[5]	validation-rmse:6.21350                                                                                   

  self.starting_round = model.num_boosted_rounds()



[0]	validation-rmse:7.21858                                                                                                                                                      
[1]	validation-rmse:5.83382                                                                                                                                                      
[2]	validation-rmse:5.21269                                                                                                                                                      
[3]	validation-rmse:4.94308                                                                                                                                                      
[4]	validation-rmse:4.81750                                                                                                                                                      
[5]	validation-rmse:4.74942                                                                                   

  self.starting_round = model.num_boosted_rounds()



[0]	validation-rmse:5.01599                                                                                                                                                      
[1]	validation-rmse:4.74761                                                                                                                                                      
[2]	validation-rmse:4.71070                                                                                                                                                      
[3]	validation-rmse:4.69838                                                                                                                                                      
[4]	validation-rmse:4.68964                                                                                                                                                      
[5]	validation-rmse:4.67712                                                                                   

KeyboardInterrupt: 

In [29]:
# Train XGBoost with a fixed parameter set inside one MLflow run, then log the
# metric, the fitted DictVectorizer (as a plain artifact) and the booster itself.
with mlflow.start_run():
    train = xgb.DMatrix(X_train, label=y_train)
    valid = xgb.DMatrix(X_val, label=y_val)

    selected_params = {
        "learning_rate": 0.26,
        "max_depth": 34,
        "min_child_weight": 9.692039304772168,
        # 'reg:squarederror' is the current name of the squared-error objective;
        # 'reg:linear' is its deprecated alias.
        "objective": "reg:squarederror",
        "reg_alpha": 0.046993480968278975,
        "reg_lambda": 0.014229494746605674,
        "seed": 42
    }
    mlflow.log_params(selected_params)

    booster = xgb.train(
        params=selected_params,
        dtrain=train,
        num_boost_round=60,
        evals=[(valid, 'validation')],
        early_stopping_rounds=20
    )
    y_pred = booster.predict(valid)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    mlflow.log_metric("rmse", rmse)

    # open() does not create missing parent directories, so make sure the
    # target directory exists before writing the preprocessor.
    os.makedirs("models", exist_ok=True)
    with open("models/preprocessor.b", "wb") as f_out:
        pickle.dump(dv, f_out)

    mlflow.log_artifact("models/preprocessor.b", artifact_path="preprocessor")

    mlflow.xgboost.log_model(booster, artifact_path="models_mlflow")

  self.starting_round = model.num_boosted_rounds()


[0]	validation-rmse:8.15805
[1]	validation-rmse:6.87305
[2]	validation-rmse:6.03719
[3]	validation-rmse:5.50058
[4]	validation-rmse:5.17528
[5]	validation-rmse:4.97317
[6]	validation-rmse:4.85050
[7]	validation-rmse:4.77633
[8]	validation-rmse:4.72091
[9]	validation-rmse:4.68974
[10]	validation-rmse:4.66866
[11]	validation-rmse:4.65416
[12]	validation-rmse:4.64301
[13]	validation-rmse:4.63290
[14]	validation-rmse:4.62694
[15]	validation-rmse:4.62069
[16]	validation-rmse:4.61663
[17]	validation-rmse:4.61310
[18]	validation-rmse:4.60904
[19]	validation-rmse:4.60729
[20]	validation-rmse:4.60468
[21]	validation-rmse:4.60334
[22]	validation-rmse:4.60145
[23]	validation-rmse:4.59835
[24]	validation-rmse:4.59618
[25]	validation-rmse:4.59423
[26]	validation-rmse:4.59350
[27]	validation-rmse:4.58978
[28]	validation-rmse:4.58815
[29]	validation-rmse:4.58753
[30]	validation-rmse:4.57913
[31]	validation-rmse:4.57844
[32]	validation-rmse:4.57719
[33]	validation-rmse:4.57535
[34]	validation-rmse:4.5

  xgb_model.save_model(model_data_path)


Predict with the model

In [30]:
# URI of the model stored under the "models_mlflow" artifact path of one specific run.
# NOTE(review): the run id is hard-coded, so it must exist in the configured
# tracking store for this cell to work.
logged_model = 'runs:/dcf79c2a35f54616b26240f54ece4c27/models_mlflow'

# Load it through the generic pyfunc flavor.
loaded_model = mlflow.pyfunc.load_model(model_uri=logged_model)


In [31]:
loaded_model

mlflow.pyfunc.loaded_model:
  artifact_path: models_mlflow
  flavor: mlflow.xgboost
  run_id: dcf79c2a35f54616b26240f54ece4c27

In [32]:
xgboost_model = mlflow.xgboost.load_model(logged_model)

In [33]:
xgboost_model

<xgboost.core.Booster at 0x7868d1e322d0>

In [34]:
t_t_pred = xgboost_model.predict(valid)

In [35]:
t_t_pred[:10]

array([ 3.142294 , 43.56321  , 15.531823 , 22.173428 , 23.68785  ,
        7.1937356, 18.131138 ,  4.094047 ,  5.4557652,  9.429992 ],
      dtype=float32)

Model Deployment

In [2]:
from mlflow.tracking import MlflowClient

# Same SQLite tracking store that the training cells above were configured with.
MLFLOW_TRACKING_URI = "sqlite:///mlflow.db"

# Client object used by the experiment, run and model-registry calls below.
client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)

In [34]:
# create_experiment raises when an experiment with this name already exists, so look
# it up first. This keeps the cell re-runnable against an existing mlflow.db; either
# way the cell displays the experiment id.
experiment = client.get_experiment_by_name("my-experiment")
experiment.experiment_id if experiment is not None else client.create_experiment(name="my-experiment")

'2'

In [38]:
# Fetch the five runs of experiment '1' with the lowest logged rmse.
# run_view_type is left at its default (active runs only, per the MLflow docs).
runs = client.search_runs(
    experiment_ids=['1'],
    filter_string="",
    max_results=5,
    order_by=["metrics.rmse ASC"],
)

In [40]:
# Print the id and rmse metric of each returned run (the search above already
# ordered them by ascending rmse).
# NOTE(review): indexing metrics['rmse'] assumes every returned run logged that metric.
for run in runs:
    print(f"run id: {run.info.run_id}, rmse: {run.data.metrics['rmse']:.4f}")

run id: dcf79c2a35f54616b26240f54ece4c27, rmse: 4.5405
run id: b4d1d772afab4b2f9c3824023dc0f1b6, rmse: 4.5409
run id: cf662cbaa4d849598a41d9c56533960b, rmse: 4.5512
run id: c209e08994e0426691ad6ed06603f9b5, rmse: 4.5513
run id: 724ff8e81efa4075b82186544075c7bc, rmse: 4.5565


In [42]:
import mlflow

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

In [43]:
# Register the model stored under the "model" artifact path of the chosen run as a
# new version of "nyc-taxi-regressor".
# NOTE(review): the XGBoost run in this notebook logs its model under "models_mlflow",
# not "model" — confirm this run really has a "model" artifact.
# NOTE(review): this run id is the last of the five listed above (highest rmse of
# the five) — confirm it is the one meant to be registered.
run_id = "724ff8e81efa4075b82186544075c7bc"
model_uri = f"runs:/{run_id}/model"

mlflow.register_model(model_uri=model_uri, name="nyc-taxi-regressor")

Registered model 'nyc-taxi-regressor' already exists. Creating a new version of this model...
Created version '4' of model 'nyc-taxi-regressor'.


<ModelVersion: aliases=[], creation_timestamp=1748202072261, current_stage='None', description=None, last_updated_timestamp=1748202072261, name='nyc-taxi-regressor', run_id='724ff8e81efa4075b82186544075c7bc', run_link=None, source='/workspaces/mlops-zoomcamp/02-experiment-tracking/mlruns/1/724ff8e81efa4075b82186544075c7bc/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=4>

In [44]:
model_uri

'runs:/724ff8e81efa4075b82186544075c7bc/model'

In [50]:
# Show the version number(s) the registry reports as latest for the registered model.
# NOTE(review): the recorded output shows a warning pointing at the
# get_latest_versions call; this API is presumably deprecated in the installed MLflow
# release — check the MLflow docs (search_model_versions) before relying on it.
model_name = "nyc-taxi-regressor"
latest_versions = client.get_latest_versions(name=model_name)
for version in latest_versions:
    print(f"version: {version.version}")

version: 4


  latest_versions = client.get_latest_versions(name=model_name)
