In [28]:
import pickle

In [6]:
import mlflow
from mlflow.tracking import MlflowClient
from mlflow.entities import ViewType

In [7]:
# tracking uri: a SQLite database file named mlflow.db (relative path)
# NOTE(review): whether this equals MLflow's built-in default depends on the installed version — confirm
MLFLOW_TRACKING_URI = 'sqlite:///mlflow.db'

In [8]:
# client bound explicitly to MLFLOW_TRACKING_URI; used below for experiments, runs and the model registry
client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)

In [9]:
# create the experiment, or reuse it when it already exists:
# create_experiment() raises on a duplicate name, which would make this cell
# fail on every re-run of the notebook
experiment_name = 'my_cool_experiment_11'
experiment = client.get_experiment_by_name(experiment_name)  # None when absent
experiment_id = (
    experiment.experiment_id
    if experiment is not None
    else client.create_experiment(experiment_name)
)
# last expression -> the cell still displays the experiment id
experiment_id

'5'

In [10]:
# up to five active runs of experiment '2', ordered by ascending rmse metric
runs = client.search_runs(
    experiment_ids=['2'],  # the API documents a list of experiment ids
    filter_string='',      # no filtering
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=5,
    order_by=['metric.rmse ASC']
)

In [11]:
# check the runs
for run in runs:
    # a run that never logged 'rmse' has no such key; show 'n/a' instead of
    # aborting the whole listing with a KeyError
    rmse = run.data.metrics.get('rmse')
    rmse_text = 'n/a' if rmse is None else f'{rmse:.4f}'
    print(f"run id: {run.info.run_id}, rmse: {rmse_text}")

run id: fad9c946c9d64fabbaa9f138b1a79120, rmse: 6.4249
run id: 853e49ef71004c159997bfea58e99af4, rmse: 6.4531
run id: bd3a9ef67002412eaaff9c0bfc6812cd, rmse: 6.4531
run id: e1ca3b083e704f6c98bf2848d938c6eb, rmse: 6.5588
run id: 4094b6c79f0247a48d1f2e1750a01f53, rmse: 7.3441


In [12]:
# configure the module-level `mlflow.*` API with the same URI that `client` was given
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

In [26]:
# add the model to the registry
# NOTE: the run id is hard-coded; it is also reused later to download the
# run's 'preprocessor' artifact.
run_id = '853e49ef71004c159997bfea58e99af4'
# 'model' is the artifact path of the model inside that run
model_uri = f'runs:/{run_id}/model'
# every execution of this cell registers a NEW version under the same name
# (see the "Creating a new version" message in the output)
mlflow.register_model(model_uri=model_uri, name='nyc-taxi-xgboost')

Registered model 'nyc-taxi-xgboost' already exists. Creating a new version of this model...
Created version '5' of model 'nyc-taxi-xgboost'.


<ModelVersion: aliases=[], creation_timestamp=1723246964694, current_stage='None', description=None, last_updated_timestamp=1723246964694, name='nyc-taxi-xgboost', run_id='853e49ef71004c159997bfea58e99af4', run_link=None, source='/home/penscola/Project/MLflow-Experiment-Tracking/mlruns/2/853e49ef71004c159997bfea58e99af4/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=5>

In [14]:
# checking registered models (the displayed objects include each model's latest_versions)
client.search_registered_models()

[<RegisteredModel: aliases={}, creation_timestamp=1723130230109, description='New York city time predictor', last_updated_timestamp=1723145453753, latest_versions=[<ModelVersion: aliases=[], creation_timestamp=1723130243131, current_stage='Staging', description='', last_updated_timestamp=1723130318090, name='nyc-taxi-xgboost', run_id='853e49ef71004c159997bfea58e99af4', run_link='', source='/home/penscola/Project/MLflow-Experiment-Tracking/mlruns/2/853e49ef71004c159997bfea58e99af4/artifacts/xgboost-model', status='READY', status_message=None, tags={}, user_id=None, version=2>,
  <ModelVersion: aliases=[], creation_timestamp=1723145453753, current_stage='None', description=None, last_updated_timestamp=1723145453753, name='nyc-taxi-xgboost', run_id='e1ca3b083e704f6c98bf2848d938c6eb', run_link=None, source='/home/penscola/Project/MLflow-Experiment-Tracking/mlruns/2/e1ca3b083e704f6c98bf2848d938c6eb/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=4>], na

In [15]:
# registered model inspected and updated in the following cells
model_name = 'nyc-taxi-xgboost'
# NOTE(review): this call emits a warning when executed (visible in the cell
# output) — presumably MLflow's deprecation notice for model stages; confirm
# against the installed MLflow version.
latest_version = client.get_latest_versions(name = model_name)
# one line per returned version with the stage it is currently in
for version in latest_version:
    print(f'version: {version.version}, stage: {version.current_stage}')

  latest_version = client.get_latest_versions(name = model_name)


version: 2, stage: Staging
version: 4, stage: None


In [17]:
# version to move and the stage it should end up in; both values are reused
# by the description update further down
model_version = 4
new_stage = 'Staging'

# NOTE(review): this call also emits a warning when executed (see output) —
# presumably the same stage-deprecation notice; confirm.
client.transition_model_version_stage(
    name=model_name,
    version=model_version,
    stage=new_stage,
    # False: do not archive versions that are already in the target stage
    archive_existing_versions=False
)

  client.transition_model_version_stage(


<ModelVersion: aliases=[], creation_timestamp=1723145453753, current_stage='Staging', description=None, last_updated_timestamp=1723185379889, name='nyc-taxi-xgboost', run_id='e1ca3b083e704f6c98bf2848d938c6eb', run_link=None, source='/home/penscola/Project/MLflow-Experiment-Tracking/mlruns/2/e1ca3b083e704f6c98bf2848d938c6eb/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=4>

In [18]:
from datetime import datetime

In [19]:
# record in the version's description when the stage transition happened
date = datetime.today().date()  # local calendar date, no time component
client.update_model_version(
    name=model_name,
    version=model_version,
    description=f'The model version {model_version} was transitioned to {new_stage} on {date}'
)

<ModelVersion: aliases=[], creation_timestamp=1723145453753, current_stage='Staging', description='The model version 4 was transitioned to Staging on 2024-08-09', last_updated_timestamp=1723185535001, name='nyc-taxi-xgboost', run_id='e1ca3b083e704f6c98bf2848d938c6eb', run_link=None, source='/home/penscola/Project/MLflow-Experiment-Tracking/mlruns/2/e1ca3b083e704f6c98bf2848d938c6eb/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=4>

##### Note: the model registry doesn't actually deploy the model to production when you transition a model to the "Production" stage; it just assigns a label to that model version. You should complement the registry with some CI/CD code that does the actual deployment.

In [21]:
from sklearn.metrics import mean_squared_error
import pandas as pd


def read_dataframe(filename):
    """Load a trip parquet file and prepare it for evaluation.

    Adds a ``duration`` column holding the trip length in minutes (dropoff
    minus pickup, from the ``lpep_*`` timestamp columns), keeps only trips
    lasting between 1 and 60 minutes inclusive, and casts the
    ``PULocationID`` / ``DOLocationID`` columns to strings.

    Args:
        filename: location of the parquet file, passed to ``pd.read_parquet``.

    Returns:
        A new ``DataFrame`` with the filtered trips, independent of the frame
        that was read.
    """
    df = pd.read_parquet(filename)

    # vectorised timedelta -> minutes; avoids one Python call per row and
    # yields a float column even when the frame is empty
    trip_time = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df['duration'] = trip_time.dt.total_seconds() / 60

    # .copy() detaches the filtered rows from the frame that was read, so the
    # column assignment below works on a real frame instead of a slice
    # (no SettingWithCopyWarning, no write to a temporary)
    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df


def preprocess(df, dv):
    """Build the model input for the rows of ``df``.

    A ``PU_DO`` column (pickup and dropoff location IDs joined by an
    underscore) is written onto ``df`` itself, so the caller's frame is
    modified. The ``PU_DO`` and ``trip_distance`` values of every row are then
    handed, as a list of per-row dicts, to ``dv.transform``; its result is
    returned unchanged.
    """
    df['PU_DO'] = df['PULocationID'] + '_' + df['DOLocationID']
    feature_columns = ['PU_DO', 'trip_distance']
    records = df[feature_columns].to_dict(orient='records')
    return dv.transform(records)


def test_model(name, stage, X_test, y_test):
    model = mlflow.pyfunc.load_model(f"models:/{name}/{stage}")
    y_pred = model.predict(X_test)
    return {"rmse": mean_squared_error(y_test, y_pred, squared=False)}

In [23]:
# trips used to evaluate the registered model (features and target are derived from this frame below)
df = read_dataframe('./data/green_tripdata_2021-03.parquet')

In [27]:
# download the 'preprocessor' artifact of the run identified by `run_id` into the current directory
client.download_artifacts(run_id=run_id, path='preprocessor', dst_path='.')

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

'/home/penscola/Project/MLflow-Experiment-Tracking/preprocessor'

In [29]:
# load the preprocessor object that was downloaded from the run's artifacts
# SECURITY: pickle.load executes arbitrary code contained in the file — only
# load artifacts from a tracking store you trust.
with open('preprocessor/preprocessor.b', 'rb') as f_in:
    dv = pickle.load(f_in)

In [30]:
# feature matrix for the evaluation data; NOTE: preprocess() also adds a 'PU_DO' column to df in place
X_test = preprocess(df, dv)

In [31]:
# ground truth: the 'duration' column computed by read_dataframe (minutes)
target = 'duration'
y_test = df[target].values

In [32]:
# evaluate the version resolved by the "Production" stage; %time reports how long the call takes
# NOTE(review): no visible cell moves a version to "Production" — confirm one exists in the registry
%time test_model(name=model_name, stage="Production", X_test=X_test, y_test=y_test)

  latest = client.get_latest_versions(name, None if stage is None else [stage])


CPU times: user 3.56 s, sys: 544 ms, total: 4.1 s
Wall time: 5.49 s




{'rmse': 6.39343658848217}