In [1]:
from mlflow.tracking import MlflowClient

In [2]:
TRACKING_URI = 'sqlite:///mlflow.db'
client = MlflowClient(tracking_uri=TRACKING_URI)

In [5]:
all_registered_models = client.search_experiments()
print(f"There are {len(all_registered_models)} registered models: \n{all_registered_models}")

There are 2 registered models: 
[<Experiment: artifact_location='/Users/nabe/Desktop/Research/Formations/MLOps/DataTalks_MLOPS/mlops-zoomcamp/02_experiment_tracking/mlruns/1', creation_time=1746695003354, experiment_id='1', last_update_time=1746695003354, lifecycle_stage='active', name='NYC-taxi-experiment', tags={}>, <Experiment: artifact_location='/Users/nabe/Desktop/Research/Formations/MLOps/DataTalks_MLOPS/mlops-zoomcamp/02_experiment_tracking/mlruns/0', creation_time=1746695003352, experiment_id='0', last_update_time=1746695003352, lifecycle_stage='active', name='Default', tags={}>]


In [10]:
from mlflow.entities import ViewType

runs = client.search_runs(
    experiment_ids='1',
    filter_string='',
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=5,
    order_by=['metrics.rmse ASC']
)

In [11]:
for run in runs:
    print(f"run id = {run.info.run_id}, RMSE = {run.data.metrics['rmse']:.2f}")

run id = 2109e9a00c9a4a3bacc87ec9386738a3, RMSE = 5.18
run id = ed98e9ccacde4701899ddf0778355329, RMSE = 5.18
run id = bc87f716daff4e288724316810d2b6c9, RMSE = 5.18
run id = d3b4a9008ffc4266bf1e736780de579d, RMSE = 5.18
run id = d1483b43d6964e2c8975eee84b2b97fb, RMSE = 5.18


In [12]:
import mlflow

mlflow.set_tracking_uri(TRACKING_URI)

In [13]:
run_id = 'd3b4a9008ffc4266bf1e736780de579d'
model_uri = f"runs:/{run_id}/model"
mlflow.register_model(
    model_uri=model_uri,
    name="nyc-taxi-duration-prediction"
)

Registered model 'nyc-taxi-duration-prediction' already exists. Creating a new version of this model...
Created version '3' of model 'nyc-taxi-duration-prediction'.


<ModelVersion: aliases=[], creation_timestamp=1747941826466, current_stage='None', description=None, last_updated_timestamp=1747941826466, name='nyc-taxi-duration-prediction', run_id='d3b4a9008ffc4266bf1e736780de579d', run_link=None, source='/Users/nabe/Desktop/Research/Formations/MLOps/DataTalks_MLOPS/mlops-zoomcamp/02_experiment_tracking/mlruns/1/d3b4a9008ffc4266bf1e736780de579d/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=3>

In [29]:
model_name = 'nyc-taxi-duration-prediction'
latest_versions = client.search_registered_models()

for version in latest_versions:
    print(version)
    # print(f"version {version.version}, stage = {version.status}")

<RegisteredModel: aliases={'version1': 1, 'version2': 2, 'version3': 3}, creation_timestamp=1747939632445, description='', last_updated_timestamp=1747941826466, latest_versions=[<ModelVersion: aliases=[], creation_timestamp=1747941826466, current_stage='None', description=None, last_updated_timestamp=1747941826466, name='nyc-taxi-duration-prediction', run_id='d3b4a9008ffc4266bf1e736780de579d', run_link=None, source='/Users/nabe/Desktop/Research/Formations/MLOps/DataTalks_MLOPS/mlops-zoomcamp/02_experiment_tracking/mlruns/1/d3b4a9008ffc4266bf1e736780de579d/artifacts/model', status='READY', status_message=None, tags={'regression': 'version3'}, user_id=None, version=3>], name='nyc-taxi-duration-prediction', tags={}>


In [40]:
from sklearn.metrics import root_mean_squared_error
import pandas as pd


def read_dataframe(fn, categorical_cols):
    df = pd.read_parquet(fn)
    # create duration (in minutes) feature
    df['duration'] = (
        df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']
    ).dt.total_seconds().div(60.)
    # filter out outliers: trips should be between 1 and 60 minutes.
    df = df[
        (1 <= df.duration) & (df.duration <= 60.)
    ]
    # convert categorical columns to string data type
    df[categorical_cols] = df[categorical_cols].astype(str)
    return df


def preprocess(df, dv):
    df['PU_DO'] = df['PULocationID'] + '_' + df['DOLocationID']
    categorical = ['PU_DO']
    numerical = ['trip_distance']
    train_dicts = df[categorical + numerical].to_dict(orient='records')
    return dv.transform(train_dicts)


def test_model(name, model_version, X_test, y_test):
    model = mlflow.pyfunc.load_model(f"models:/{name}/{model_version}")
    y_pred = model.predict(X_test)
    return {"rmse": root_mean_squared_error(y_test, y_pred)}

In [34]:
cat_cols = ['PULocationID', 'DOLocationID']
df = read_dataframe("../data/yellow_tripdata_2023-03.parquet", cat_cols)
df.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,duration
0,2,2023-03-01 00:06:43,2023-03-01 00:16:43,1.0,0.0,1.0,N,238,42,2,8.6,1.0,0.5,0.0,0.0,1.0,11.1,0.0,0.0,10.0
1,2,2023-03-01 00:08:25,2023-03-01 00:39:30,2.0,12.4,1.0,N,138,231,1,52.7,6.0,0.5,12.54,0.0,1.0,76.49,2.5,1.25,31.083333
2,1,2023-03-01 00:15:04,2023-03-01 00:29:26,0.0,3.3,1.0,N,140,186,1,18.4,3.5,0.5,4.65,0.0,1.0,28.05,2.5,0.0,14.366667
3,1,2023-03-01 00:49:37,2023-03-01 01:01:05,1.0,2.9,1.0,N,140,43,1,15.6,3.5,0.5,4.1,0.0,1.0,24.7,2.5,0.0,11.466667
4,2,2023-03-01 00:08:04,2023-03-01 00:11:06,1.0,1.23,1.0,N,79,137,1,7.2,1.0,0.5,2.44,0.0,1.0,14.64,2.5,0.0,3.033333


In [36]:
run_id = "7ca1f33e2fd04763b67260411adeba42"
client.download_artifacts(
    run_id=run_id,
    path='preprocessor',
    dst_path='.'
)

Downloading artifacts: 100%|██████████| 1/1 [00:00<00:00, 275.49it/s]


'/Users/nabe/Desktop/Research/Formations/MLOps/DataTalks_MLOPS/mlops-zoomcamp/02_experiment_tracking/preprocessor'

In [37]:
import pickle

with open('preprocessor/preprocessor.pkl', 'rb') as fr:
    dv = pickle.load(fr)

In [38]:
X_test = preprocess(df, dv)

In [39]:
target = 'duration'
y_test = df[target].values

In [42]:
%time test_model(name=model_name, model_version=2, X_test=X_test, y_test=y_test)

CPU times: user 4min 20s, sys: 1.49 s, total: 4min 22s
Wall time: 36.3 s


{'rmse': 10.639156209204334}

In [43]:
mlflow.__version__

'2.22.0'