# Model Registry

In [1]:
from mlflow.tracking import MlflowClient

# SQLite-backed tracking store. Reusing the database from the earlier steps keeps
# the previously created ML models accessible.
# NOTE(review): the constant is spelled "MFLOW" (missing "L"); the name is kept
# because a later cell reads it.
MFLOW_TRACKING_URI = "sqlite:///mlflow.db"

# Client object used by the following cells for experiment, run and registry calls.
client = MlflowClient(tracking_uri=MFLOW_TRACKING_URI)

In [28]:
# List all the created experiments.
# (client.list_experiments() has been removed; search_experiments() is used instead.)
client.search_experiments()

[<Experiment: artifact_location='/workspaces/mlops-zoomcamp-2024/02-experiment-tracking/mlruns/2', creation_time=1716238410274, experiment_id='2', last_update_time=1716238410274, lifecycle_stage='active', name='my-cool-experiment', tags={}>,
 <Experiment: artifact_location='/workspaces/mlops-zoomcamp-2024/02-experiment-tracking/mlruns/1', creation_time=1716042712622, experiment_id='1', last_update_time=1716042712622, lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>,
 <Experiment: artifact_location='mlflow-artifacts:/0', creation_time=1716040800824, experiment_id='0', last_update_time=1716040800824, lifecycle_stage='active', name='Default', tags={}>]

In [4]:
# Create a new experiment (before we used the UI, now we use the API).
# create_experiment() raises when the name is already taken, so look the
# experiment up first; this keeps the cell working on Restart & Run All.
experiment_name = "my-cool-experiment"
existing_experiment = client.get_experiment_by_name(experiment_name)

if existing_experiment is None:
    experiment_id = client.create_experiment(name=experiment_name)
else:
    experiment_id = existing_experiment.experiment_id

# Last expression of the cell: display the experiment id, as before.
experiment_id

'2'

In [29]:
# Find the best runs for a given experiment: the five lowest-RMSE active runs
# of experiment "1" whose RMSE is below 6.4.

from mlflow.entities import ViewType

runs = client.search_runs(
    experiment_ids=["1"],  # experiment ids are passed as strings
    filter_string="metrics.rmse < 6.4",  # filtering condition for the returned runs
    run_view_type=ViewType.ACTIVE_ONLY,  # keep only active runs
    max_results=5,  # max number of runs to return
    order_by=["metrics.rmse ASC"]  # ordering criteria (several may be given); best run first
)

In [30]:
# Print each selected run id together with its RMSE, rounded to four decimals.
for run in runs:
    print(f"run id: {run.info.run_id}, rmse: {run.data.metrics['rmse']:.4f}")

run id: ced582d1bf854f7eb8aa7857a2b3a10c, rmse: 6.3124
run id: 3a944c17b6b74ae7b726495bf7cfbdcc, rmse: 6.3124
run id: 94c4456831f1404091dede64d90daa37, rmse: 6.3124
run id: a15d9b677ee641a3847812a61d9901e5, rmse: 6.3274
run id: b5cde2b603c2427084d208f4ffd2c2ee, rmse: 6.3495


# Promote selected models to the Model Registry

In [9]:
# Promote some of the models to the Model Registry
import mlflow 

# Point the top-level mlflow API at the same tracking URI the client was created with.
mlflow.set_tracking_uri(MFLOW_TRACKING_URI)

In [12]:
# Register the model logged by one specific run under a registry name.
run_id = "1623fa04b82749d4998597a633c05ea9"
model_uri = f"runs:/{run_id}/model"  # "model" is the artifact sub-path inside the run

# Last expression of the cell, so the resulting ModelVersion is displayed.
mlflow.register_model(model_uri=model_uri, name="nyc-taxi-regressor")

Registered model 'nyc-taxi-regressor' already exists. Creating a new version of this model...
Created version '3' of model 'nyc-taxi-regressor'.


<ModelVersion: aliases=[], creation_timestamp=1716239365081, current_stage='None', description=None, last_updated_timestamp=1716239365081, name='nyc-taxi-regressor', run_id='1623fa04b82749d4998597a633c05ea9', run_link=None, source='/workspaces/mlops-zoomcamp-2024/02-experiment-tracking/mlruns/1/1623fa04b82749d4998597a633c05ea9/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=3>

In [38]:
# Display the run-relative URI of the model that was registered.
model_uri

'runs:/1623fa04b82749d4998597a633c05ea9/model'

# Transition a registered model to a New Stage

In [39]:
# Look up the latest version(s) of the registered model and show their stages.
model_name = 'nyc-taxi-regressor'
latest_versions = client.get_latest_versions(name=model_name)

for version in latest_versions:
    print(f"version: {version.version}, stage: {version.current_stage}")



version: 3, stage: Staging


  latest_versions = client.get_latest_versions(name=model_name)


In [40]:
# Move one specific model version into the target stage.
model_version = 3
new_stage = "Staging"

# archive_existing_versions=False: versions already sitting in the target stage are not archived.
client.transition_model_version_stage(
    name=model_name, version=model_version, stage=new_stage, archive_existing_versions=False
)

  client.transition_model_version_stage(


<ModelVersion: aliases=[], creation_timestamp=1716239365081, current_stage='Staging', description='The model version 3 was transitioned to Staging stage on 2024-05-20.', last_updated_timestamp=1716242517629, name='nyc-taxi-regressor', run_id='1623fa04b82749d4998597a633c05ea9', run_link=None, source='/workspaces/mlops-zoomcamp-2024/02-experiment-tracking/mlruns/1/1623fa04b82749d4998597a633c05ea9/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=3>

In [41]:
# Annotate the model version that was transitioned with a human-readable note.
from datetime import datetime
date = datetime.today().date()

client.update_model_version(
    name=model_name,
    # Use the same variable as the description so the note always lands on the
    # version it talks about (previously a hard-coded 3).
    version=model_version,
    description=f"The model version {model_version} was transitioned to {new_stage} stage on {date}."
)

<ModelVersion: aliases=[], creation_timestamp=1716239365081, current_stage='Staging', description='The model version 3 was transitioned to Staging stage on 2024-05-20.', last_updated_timestamp=1716242520396, name='nyc-taxi-regressor', run_id='1623fa04b82749d4998597a633c05ea9', run_link=None, source='/workspaces/mlops-zoomcamp-2024/02-experiment-tracking/mlruns/1/1623fa04b82749d4998597a633c05ea9/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=3>

# Comparing the registered models

Datasets used: 
- Green taxi data for January 2021 was used for training
- February 2021 data was used for validation
- March 2021 data will be used for testing (selecting the model we want to promote to production)

In [25]:
from sklearn.metrics import root_mean_squared_error
import pandas as pd


def read_dataframe(filename):
    """
    Read a green-taxi trip parquet file and prepare it for modelling.

    filename: path (or anything pd.read_parquet accepts) of the parquet file

    Returns a new DataFrame that
    - has both lpep_* timestamp columns converted to datetimes,
    - has a 'duration' column holding the trip length in minutes,
    - keeps only trips lasting between 1 and 60 minutes (both inclusive),
    - has PULocationID / DOLocationID converted to strings.
    """
    df = pd.read_parquet(filename)

    df['lpep_dropoff_datetime'] = pd.to_datetime(df['lpep_dropoff_datetime'])
    df['lpep_pickup_datetime'] = pd.to_datetime(df['lpep_pickup_datetime'])

    # Vectorised timedelta -> minutes conversion (no per-row Python lambda).
    trip_time = df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime']
    df['duration'] = trip_time.dt.total_seconds() / 60

    # .copy() makes the filtered rows an independent frame, so the column
    # assignment below does not write into a slice of the original
    # (which would raise SettingWithCopyWarning).
    df = df[(df['duration'] >= 1) & (df['duration'] <= 60)].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df


def preprocess(df, dv):
    """
    Encode a trips dataframe with an already fitted DictVectorizer-style preprocessor.

    df: dataframe with string 'PULocationID' / 'DOLocationID' columns and a
        'trip_distance' column; it is NOT modified by this function
    dv: fitted preprocessor exposing transform(list_of_dicts)

    Returns whatever dv.transform returns for the feature dictionaries.
    """
    categorical = ['PU_DO']
    numerical = ['trip_distance']

    # assign() returns a new frame, so the combined pickup/drop-off feature is
    # added without writing a column into the caller's dataframe.
    features = df.assign(PU_DO=df['PULocationID'] + '_' + df['DOLocationID'])
    feature_dicts = features[categorical + numerical].to_dict(orient='records')

    # Only transform: the preprocessor was fitted earlier and is re-used as is.
    return dv.transform(feature_dicts)


def test_model(name, stage, X_test, y_test):
    """
    Score a registered model on a held-out test set.

    name: name of the registered model (e.g., nyc-taxi-regressor)
    stage: registry stage to load the model from (e.g., Staging, Production)
    X_test: features handed to the model's predict()
    y_test: true target values the predictions are compared against

    Returns a dict with a single key, "rmse".
    """
    # Load the model from the Model Registry as a generic Python function.
    registry_uri = f"models:/{name}/{stage}"
    loaded_model = mlflow.pyfunc.load_model(registry_uri)

    predictions = loaded_model.predict(X_test)
    rmse = root_mean_squared_error(y_test, predictions)

    return {"rmse": rmse}

In [26]:
%%time

# Load and clean the March 2021 green-taxi trips (the month used for testing).
df = read_dataframe("./data/green_tripdata_2021-03.parquet")

CPU times: user 300 ms, sys: 73.5 ms, total: 374 ms
Wall time: 536 ms


In [31]:
# Download the logged DictVectorizer preprocessor for a given run_id.
# NOTE: this rebinds run_id, which an earlier cell set to the registered model's run.
run_id = "ced582d1bf854f7eb8aa7857a2b3a10c"
# The artifact is downloaded into the "preprocessor" folder of the current directory;
# the call returns the local path, which is displayed as the cell output.
client.download_artifacts(run_id=run_id, path="preprocessor", dst_path=".")

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

'/workspaces/mlops-zoomcamp-2024/02-experiment-tracking/preprocessor'

In [32]:
import pickle

# Unpickle the fitted preprocessor that was downloaded into ./preprocessor.
# pickle can execute arbitrary code while loading, so only open artifacts
# that come from a tracking store you trust.
with open("./preprocessor/preprocessor.b", "rb") as f_in:
    dv = pickle.load(f_in)

In [33]:
%%time 

# Create the test set by encoding the test dataframe with the loaded preprocessor.
X_test = preprocess(df, dv)

CPU times: user 190 ms, sys: 16.5 ms, total: 206 ms
Wall time: 231 ms


In [42]:
# Create the target variable: the trip duration in minutes that
# read_dataframe() computed for every test row.

target = "duration"
y_test = df[target].values

In [43]:
# Display the registered model name that the evaluation cells will load.
model_name

'nyc-taxi-regressor'

In [45]:
# Compare the Production and Staging models on the same test set.
# Make sure the model you are going to load has an associated run in the
# mlruns/1 folder, otherwise its artifacts cannot be found.
%time test_model(name=model_name, stage="Production", X_test=X_test, y_test=y_test)

In [None]:
# Same evaluation (and timing) for the model currently in the Staging stage.
%time test_model(name=model_name, stage="Staging", X_test=X_test, y_test=y_test)

In [46]:
# If you find that the Staging model is better than the Production model, you can transition the Staging model to Production
# Use the code from above to do that