In [42]:
import mlflow
import pandas as pd
import pickle

from mlflow.tracking import MlflowClient
from mlflow.entities import ViewType
from datetime import datetime

from sklearn.metrics import mean_squared_error

In [9]:
# SQLite-backed MLflow tracking store; this URI is reused below for both the
# MlflowClient instance and the module-level mlflow API.
MLFLOW_TRACKING_URI = "sqlite:///mlflow.db"

In [10]:
# Client bound explicitly to MLFLOW_TRACKING_URI; used for run search and
# model-registry operations in the cells below.
client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)

In [13]:
# Fetch up to five active runs of experiment 1, ordered by ascending logged RMSE.
runs = client.search_runs(
    experiment_ids=['1'],  # documented type is a list of experiment-ID strings
    filter_string='',  # example: metrics.rmse < 6
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=5,
    order_by=["metrics.rmse ASC"],
)

In [14]:
# Show the raw Run objects returned by the search (metrics, params, tags, info).
runs

[<Run: data=<RunData: metrics={'rmse': 4.953331649346428}, params={'learning_rate': '0.052171063680732786',
  'max_depth': '43',
  'min_child_weight': '3.873525382081908',
  'objective': 'reg:linear',
  'reg_alpha': '0.02908096307849183',
  'reg_lambda': '0.005592226647424518',
  'seed': '42'}, tags={'mlflow.runName': 'bouncy-moth-75',
  'mlflow.source.git.commit': '87ebffcf1a94f4645767717b8d2774afc2aa1ef0',
  'mlflow.source.name': 'c:\\Users\\marcospp\\Documents\\MarcosPaulo\\Estudos\\MLOps-Zoomcamp\\.venv\\Lib\\site-packages\\ipykernel_launcher.py',
  'mlflow.source.type': 'LOCAL',
  'mlflow.user': 'marcospp',
  'model': 'xgboost'}>, info=<RunInfo: artifact_uri='file:///c:/Users/marcospp/Documents/MarcosPaulo/Estudos/MLOps-Zoomcamp/02-experiment-tracking/mlruns/1/3bcfa63a5c164e5da6e8a9c98b327ee4/artifacts', end_time=1684865078715, experiment_id='1', lifecycle_stage='active', run_id='3bcfa63a5c164e5da6e8a9c98b327ee4', run_name='bouncy-moth-75', run_uuid='3bcfa63a5c164e5da6e8a9c98b327e

In [18]:
# Print each returned run's id next to its RMSE, formatted to four decimals.
for run in runs:
    print(f'run id: {run.info.run_id}, rmse: {run.data.metrics["rmse"]:.4f}')

run id: 3bcfa63a5c164e5da6e8a9c98b327ee4, rmse: 4.9533
run id: eeb204e0081e48b1a31074fa16bcb4ae, rmse: 4.9725
run id: 71c8e57657f046519fc007e11d7e33cf, rmse: 5.0379
run id: 27c14687bceb401bad73fc5d2085d735, rmse: 5.4090
run id: 4f7c2ffd37154f9a88e760c8da0cfa6d, rmse: 9.3749


In [20]:
# Point the module-level mlflow API (register_model, pyfunc loading) at the
# tracking store named by MLFLOW_TRACKING_URI.
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

In [22]:
# Register the artifact stored under 'model' in this run as a version of the
# 'nyc-taxi-regressor' registered model.
# NOTE(review): the run id is hard-coded; in the saved output it is the
# lowest-RMSE run listed by the search above.
run_id = '3bcfa63a5c164e5da6e8a9c98b327ee4'
model_uri = f'runs:/{run_id}/model'
mlflow.register_model(model_uri=model_uri, name='nyc-taxi-regressor')

Registered model 'nyc-taxi-regressor' already exists. Creating a new version of this model...
2023/05/26 11:21:43 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: nyc-taxi-regressor, version 3
Created version '3' of model 'nyc-taxi-regressor'.


<ModelVersion: aliases=[], creation_timestamp=1685096503410, current_stage='None', description=None, last_updated_timestamp=1685096503410, name='nyc-taxi-regressor', run_id='3bcfa63a5c164e5da6e8a9c98b327ee4', run_link=None, source='file:///c:/Users/marcospp/Documents/MarcosPaulo/Estudos/MLOps-Zoomcamp/02-experiment-tracking/mlruns/1/3bcfa63a5c164e5da6e8a9c98b327ee4/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=3>

In [58]:
# Name of the registered model used by the registry cells below.
model_name = 'nyc-taxi-regressor'
# Called without a stage filter; `latest_version` is iterated as a collection
# of model versions (the saved output shows one entry per stage).
latest_version = client.get_latest_versions(name=model_name)

In [59]:
# List the version number and current stage of each returned model version.
for version in latest_version:
    print(f'version: {version.version}, stage: {version.current_stage}')

version: 1, stage: Production
version: 3, stage: Staging


In [37]:
# Version/stage pair consumed by the stage-transition and description-update
# cells that follow.
model_version = 3
new_stage = 'Staging'

In [32]:

# Move `model_version` of `model_name` into `new_stage`.
# archive_existing_versions=False is passed, so this call does not request
# archiving of versions already in the target stage.
client.transition_model_version_stage(
    name = model_name,
    version = model_version,
    stage = new_stage,
    archive_existing_versions=False
)

<ModelVersion: aliases=[], creation_timestamp=1685096503410, current_stage='Staging', description=None, last_updated_timestamp=1685114054356, name='nyc-taxi-regressor', run_id='3bcfa63a5c164e5da6e8a9c98b327ee4', run_link=None, source='file:///c:/Users/marcospp/Documents/MarcosPaulo/Estudos/MLOps-Zoomcamp/02-experiment-tracking/mlruns/1/3bcfa63a5c164e5da6e8a9c98b327ee4/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=3>

In [41]:
# Record in the version's description which stage it was moved to and when.
# datetime.today() yields a naive local timestamp (no timezone information).
client.update_model_version(
    name=model_name,
    version=model_version,
    description=f'The model version {model_version} was transitioned to {new_stage} on {datetime.today()}'
)

<ModelVersion: aliases=[], creation_timestamp=1685096503410, current_stage='Staging', description='The model version 3 was transitioned to Staging on 2023-05-26 16:22:23.124409', last_updated_timestamp=1685114543125, name='nyc-taxi-regressor', run_id='3bcfa63a5c164e5da6e8a9c98b327ee4', run_link=None, source='file:///c:/Users/marcospp/Documents/MarcosPaulo/Estudos/MLOps-Zoomcamp/02-experiment-tracking/mlruns/1/3bcfa63a5c164e5da6e8a9c98b327ee4/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=3>

In [46]:
def read_dataframe(filename: str) -> pd.DataFrame:
    """Load a trip parquet file and prepare it for modelling.

    Adds a ``duration`` column (drop-off minus pick-up, in minutes), keeps
    only rows whose duration lies between 1 and 60 minutes inclusive, and
    casts ``PULocationID`` / ``DOLocationID`` to strings.

    Args:
        filename: Location of the parquet file, passed to ``pd.read_parquet``.

    Returns:
        The filtered DataFrame including the ``duration`` column.
    """
    df = pd.read_parquet(filename)

    # Vectorised timedelta -> minutes conversion (replaces a per-row lambda).
    trip_time = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
    df['duration'] = trip_time.dt.total_seconds() / 60

    # .copy() detaches the filtered rows from the original frame, so the
    # column assignment below does not trigger SettingWithCopyWarning.
    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

def preprocess(df, dv):
    """Build model features from a trips frame.

    Adds a combined ``PU_DO`` column to ``df`` (the passed frame is modified
    in place), converts the ``PU_DO`` and ``trip_distance`` columns into one
    dict per row, and returns the result of ``dv.transform`` on those dicts.

    Args:
        df: Frame with ``PULocationID``, ``DOLocationID`` and ``trip_distance``.
        dv: Object exposing ``transform(list_of_dicts)``.

    Returns:
        Whatever ``dv.transform`` returns for the per-row feature dicts.
    """
    df['PU_DO'] = df['PULocationID'] + '_' + df['DOLocationID']
    feature_columns = ['PU_DO', 'trip_distance']
    records = df[feature_columns].to_dict(orient='records')
    return dv.transform(records)

def test_model(name, stage, X_test, y_test):
    """Score a registered model from a given registry stage on a test set.

    Args:
        name: Registered model name.
        stage: Registry stage; the model is loaded from ``models:/{name}/{stage}``.
        X_test: Features passed to the loaded model's ``predict``.
        y_test: True target values (a single 1-D target).

    Returns:
        ``{"rmse": <root mean squared error of the predictions>}``.
    """
    model = mlflow.pyfunc.load_model(f"models:/{name}/{stage}")
    y_pred = model.predict(X_test)
    # `squared=False` is deprecated in scikit-learn 1.4 and removed in 1.6;
    # taking the square root of the MSE works on every version and is
    # equivalent for a single target.
    rmse = mean_squared_error(y_test, y_pred) ** 0.5
    return {"rmse": rmse}

In [48]:
# Load and filter the 2022-03 yellow-taxi trip file; this frame is used below
# to build X_test and y_test.
df = read_dataframe('../data/yellow_tripdata_2022-03.parquet')

In [50]:
# Download the 'preprocessor' artifact directory logged by this run into the
# current directory. MlflowClient.download_artifacts is deprecated, so the
# mlflow.artifacts API is used instead; it resolves the run through the
# tracking URI configured with mlflow.set_tracking_uri.
run_id = '27c14687bceb401bad73fc5d2085d735'
mlflow.artifacts.download_artifacts(
    run_id=run_id,
    artifact_path='preprocessor',
    dst_path='.',
)

  client.download_artifacts(run_id=run_id, path='preprocessor', dst_path='.')


'C:\\Users\\marcospp\\Documents\\MarcosPaulo\\Estudos\\MLOps-Zoomcamp\\02-experiment-tracking\\preprocessor'

In [52]:
# Load the pickled preprocessor object (`dv`) from the downloaded artifact
# directory; it is passed to preprocess(), which calls dv.transform.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only unpickle artifacts that come from trusted runs.
with open('preprocessor/preprocessor.b', 'rb') as f_in:
    dv = pickle.load(f_in)

In [53]:
# Build the evaluation features. Note that preprocess() also adds a 'PU_DO'
# column to `df` as a side effect.
X_test = preprocess(df, dv)

In [54]:
# Ground-truth values: the 'duration' column (minutes) computed in read_dataframe.
target = 'duration'
y_test = df[target].values

In [55]:
# Time loading and scoring of the model currently in the 'Production' stage.
%time test_model(name=model_name, stage='Production', X_test=X_test, y_test=y_test)

 - mlflow (current: 2.3.2, required: mlflow==2.3)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.


CPU times: total: 7min 34s
Wall time: 1min 7s


{'rmse': 5.037930337985029}

In [61]:
# Time loading and scoring of the model currently in the 'Staging' stage.
%time test_model(name=model_name, stage='Staging', X_test=X_test, y_test=y_test)

 - mlflow (current: 2.3.2, required: mlflow==2.3)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.


CPU times: total: 3min 1s
Wall time: 24 s


{'rmse': 5.408967941450919}

In [63]:
# Simulando transição de stage
client.transition_model_version_stage(
    name=model_name,
    version=2,
    stage='Production',
    archive_existing_versions=True,
)

<ModelVersion: aliases=[], creation_timestamp=1684944978027, current_stage='Production', description='\n', last_updated_timestamp=1685117129555, name='nyc-taxi-regressor', run_id='27c14687bceb401bad73fc5d2085d735', run_link='', source='file:///c:/Users/marcospp/Documents/MarcosPaulo/Estudos/MLOps-Zoomcamp/02-experiment-tracking/mlruns/1/27c14687bceb401bad73fc5d2085d735/artifacts/models_mlflow', status='READY', status_message=None, tags={'model': 'xgboost-regressor'}, user_id=None, version=2>