## Data
- We will be using NYC taxi data
- These have recently been changed to **parquet** files
- We will be using Pandas to read this data in via `pd.read_parquet()` command
    - This requires instaling PyArrow via `pip install pyarrow` on the VM
    - Also potentiall have to `pip install seaborn` and `pip install scikit-learn`
- Download the Green January and February 2021 parquet data file via `wget https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-01.parquet` and `wget https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-02.parquet`

In [1]:
import os
import pandas as pd
import pickle
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.svm import LinearSVR

import mlflow
import xgboost as xgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials  # metrics
from hyperopt.pyll import scope

from config import mlflow_model_uri

In [2]:
# FIRST START THE UI ON THE COMMAND LINE WHILE IN THE WEEK 2 DIRECTORY VIA
# !mlflow ui --backend-store-uri sqlite:///mlflow.db

# THEN OPEN THE UI AT http://127.0.0.1:5000

In [3]:
# set the MLFlow URI to our backend
mlflow.set_tracking_uri('sqlite:///mlflow.db')

# set up to assign/append runs to our experiment (and create if it doesn't exist)
mlflow.set_experiment('nyc_taxi_experiment_1')

<Experiment: artifact_location='file:///C:/Users/nimz/Documents/mlops_zoomcamp/week2_experiment_tracking/mlruns/1', creation_time=1684358646144, experiment_id='1', last_update_time=1684358646144, lifecycle_stage='active', name='nyc_taxi_experiment_1', tags={}>

## 1. Load, inspect, and clean data

In [4]:
# create helper function to read and clean data
def read_dataframe(filename):
    if filename.endswith('.csv'):
        df = pd.read_csv(filename)
        
        # do some data conversion if CSV
        df.lpep_dropoff_datetime = pd.to_datetime(df.lpep_dropoff_datetime)
        df.lpep_pickup_datetime = pd.to_datetime(df.lpep_pickup_datetime)
        
    elif filename.endswith('.parquet'):
        df = pd.read_parquet(filename)
    
    # create duration in minutes column
    df['duration'] = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)
    
    # filter to only trips between 1 minute and 1 hour
    df = df[(df.duration >= 1) & (df.duration <= 60)]
    
    # specify categorical input features and convert to String
    # for one-hot encoding via Dictionary Vectorizer
    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)
    
    return df

In [5]:
# create DataFrames for training and validation
df_train = read_dataframe('./data/green_tripdata_2021-01.parquet')
df_val = read_dataframe('./data/green_tripdata_2021-02.parquet')

# see how many samples we have
len(df_train), len(df_val)

(73908, 61921)

In [6]:
# create a feature combining pickup and drop-off locations (feature engineering)
df_train['PU_DO'] = df_train['PULocationID'] + '_' + df_train['DOLocationID']
df_val['PU_DO'] = df_val['PULocationID'] + '_' + df_val['DOLocationID']

In [7]:
# specify our input features
categorical = ['PU_DO'] #'PULocationID', 'DOLocationID']
numerical = ['trip_distance']

## 2. Create training and validation sets

In [8]:
# create training and validation input feature sets
dv = DictVectorizer()

# turn each row into dictionary and create the training set
train_dicts = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

# turn each row into dictionary and create the validation set
val_dicts = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [9]:
# create training and validation label sets
target = 'duration'
y_train = df_train[target].values
y_val = df_val[target].values

## 3. MLFLow Model Management

**Add more models to further explore the MFLow Model Registry**

In [11]:
# # turn on autologging
# mlflow.sklearn.autolog()

# # loop through a set of models to train on the same data for future model comparison and registering
# for model_class in (RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor, LinearSVR):
    
#     # make sure each training session is an experiment run
#     with mlflow.start_run():
        
#         # log some stuff
#         mlflow.log_param("train-data-path", "./data/green_tripdata_2021-01.parquet")
#         mlflow.log_param("valid-data-path", "./data/green_tripdata_2021-02.parquet")
#         mlflow.log_artifact("models/preprocessor.b", artifact_path="preprocessor")

#         # train the model
#         mlmodel = model_class()
#         print(f'Running model {mlmodel}')
        
#         print(f'Training model {mlmodel}...')
#         mlmodel.fit(X_train, y_train)

#         # make predictions
#         print(f'Making predictions with {mlmodel} model...')
#         y_pred = mlmodel.predict(X_val)
        
#         # get the error and log it
#         print(f'Getting error with {mlmodel} model...')
#         rmse = mean_squared_error(y_val, y_pred, squared=False)
#         mlflow.log_metric("rmse", rmse)

Running model RandomForestRegressor()
Training model RandomForestRegressor()...




Making predictions with RandomForestRegressor() model...
Gettint error with RandomForestRegressor() model...
Running model GradientBoostingRegressor()
Training model GradientBoostingRegressor()...
Making predictions with GradientBoostingRegressor() model...
Gettint error with GradientBoostingRegressor() model...
Running model ExtraTreesRegressor()
Training model ExtraTreesRegressor()...
Making predictions with ExtraTreesRegressor() model...
Gettint error with ExtraTreesRegressor() model...
Running model LinearSVR()
Training model LinearSVR()...




Making predictions with LinearSVR() model...
Gettint error with LinearSVR() model...


## 4. MLFlow Client

In [10]:
import mlflow
from mlflow.tracking import MlflowClient
from mlflow.entities import ViewType

from datetime import datetime

import pandas as pd
from sklearn.metrics import mean_squared_error

import pickle

#### Interacting with the MLflow tracking server

The `MlflowClient` object allows us to interact with:
- An MLflow **Tracking Server** that creates and manages *experiments* and *runs*
- An MLflow **Registry Server** that creates and manages *registered models* and *model versions*

In [13]:
# To instantiate it we need to pass a tracking URI and/or a registry URI
# set the Tracking URI
MLFLOW_TRACKING_URI = "sqlite:///mlflow.db"

# instantiate the object
client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)

# list out all the experiments from the MLFlow Client as of right now
# client.list_experiments()  # DEPRECATED

# look for ALL (view_type=3) experiments
# https://mlflow.org/docs/latest/python_api/mlflow.client.html#mlflow.client.MlflowClient.search_experiments
client.search_experiments(view_type=3)

[<Experiment: artifact_location='file:///C:/Users/nimz/Documents/mlops_zoomcamp/week2_experiment_tracking/mlruns/1', creation_time=1684358646144, experiment_id='1', last_update_time=1684358646144, lifecycle_stage='active', name='nyc_taxi_experiment_1', tags={}>,
 <Experiment: artifact_location='mlflow-artifacts:/0', creation_time=1684357861329, experiment_id='0', last_update_time=1684357861329, lifecycle_stage='active', name='Default', tags={}>]

In [14]:
# create a new experiment
client.create_experiment(name="my-test-experiment")

'2'

**The above result is the ID of the new experiment, which you can see at http://127.0.0.1:5000/#/experiments/**

In [18]:
# check the latest AND BEST (in terms of RMSE) versions for the experiment with id = 1
# show 5 runs for this experiment ID, filtered for a specific RMSE range for
#   runs that are active, ordered by RMSE
runs = client.search_runs(
    experiment_ids='1',
    filter_string="metrics.rmse < 7",
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=5,
    order_by=["metrics.rmse ASC"]
)

In [20]:
# the above returns a list of runs
runs[0]

<Run: data=<RunData: metrics={'rmse': 6.296879486713688}, params={'learning_rate': '0.13642747972651512',
 'max_depth': '21',
 'min_child_weight': '1.5655550191042376',
 'objective': 'reg:linear',
 'reg_alpha': '0.009403472263570046',
 'reg_lambda': '0.005322134643445022',
 'seed': '42'}, tags={'mlflow.log-model.history': '[{"run_id": "d4535c4d635f4305940cfbd71b0eb91a", '
                             '"artifact_path": "models_mlflow", '
                             '"utc_time_created": "2023-05-17 '
                             '23:18:09.589669", "flavors": {"python_function": '
                             '{"loader_module": "mlflow.xgboost", '
                             '"python_version": "3.9.16", "data": "model.xgb", '
                             '"env": {"conda": "conda.yaml", "virtualenv": '
                             '"python_env.yaml"}}, "xgboost": {"xgb_version": '
                             '"1.7.5", "data": "model.xgb", "model_class": '
                             '"

In [21]:
# print out some info for the runs, like the run ID and the RMSE of the run
for run in runs:
    print(f"Run id: {run.info.run_id}, rmse: {run.data.metrics['rmse']:.4f}")

Run id: d4535c4d635f4305940cfbd71b0eb91a, rmse: 6.2969
Run id: 445337b1adb94550ad6afc5e63987194, rmse: 6.2969
Run id: dcdc2317771246e2ab3f934f726f9e1f, rmse: 6.3052
Run id: bdec3badd6d040d686368344b9443db4, rmse: 6.3084
Run id: e679ce9c68034a44bf967cc8b084e25b, rmse: 6.3246


### 5. MLFlow Model Registry

We can use the `MlflowClient` instance to:
- Register a new version for an experiment that we will name `nyc-taxi-regressor`
- Retrieve the latest versions of the model `nyc-taxi-regressor` and check that a new version (v4) was created
- Transition model v4 to "Staging" and add annotations to it

In [22]:
# set the tracking URI (again) for MLFlow this time, not the Client
# or else it will look for the model in the local directory
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

# take one of the above run ID's, get that model (via its URI), and 
#   register it as an `nyc-taxi-regressor`
model_name = 'nyc-taxi-regressor'
run_id = 'd4535c4d635f4305940cfbd71b0eb91a'
model_uri = f'runs:/{run_id}/model'
mlflow.register_model(model_uri=model_uri, name=model_name)

Registered model 'nyc-taxi-regressor' already exists. Creating a new version of this model...
2023/05/28 19:49:52 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: nyc-taxi-regressor, version 1
Created version '1' of model 'nyc-taxi-regressor'.


<ModelVersion: aliases=[], creation_timestamp=1685317792828, current_stage='None', description=None, last_updated_timestamp=1685317792828, name='nyc-taxi-regressor', run_id='d4535c4d635f4305940cfbd71b0eb91a', run_link=None, source='file:///C:/Users/nimz/Documents/mlops_zoomcamp/week2_experiment_tracking/mlruns/1/d4535c4d635f4305940cfbd71b0eb91a/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=1>

#### After the above is run, go to http://127.0.0.1:5000/#/models to see that the model was indeed registered

In [26]:
# # check the list of registered models
# client.list_registered_models()  # DEPRECATED

In [23]:
# get the latest version of the model and look at what Stage each is in
latest_versions = client.get_latest_versions(name=model_name)

for version in latest_versions:
    print(f"Model version: {version.version}, Stage: {version.current_stage}")

Model version: 1, Stage: None


In [28]:
# select v1 of our model
model_version = 1
new_stage = "Staging"

# move the v1 model we just registered to Staging
client.transition_model_version_stage(
    name=model_name,
    version=model_version,
    stage=new_stage,
    archive_existing_versions=False
)

<ModelVersion: aliases=[], creation_timestamp=1685317792828, current_stage='Staging', description=None, last_updated_timestamp=1685318225747, name='nyc-taxi-regressor', run_id='d4535c4d635f4305940cfbd71b0eb91a', run_link=None, source='file:///C:/Users/nimz/Documents/mlops_zoomcamp/week2_experiment_tracking/mlruns/1/d4535c4d635f4305940cfbd71b0eb91a/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=1>

#### Now refresh http://127.0.0.1:5000/#/models, and see that the v1 model is tagged as "Staging"

In [29]:
# update the v1 model to be in a new Stage and show what date that it happened on
date = datetime.today().date()

client.update_model_version(
    name=model_name,
    version=model_version,
    description=f"The model version {model_version} was transitioned to {new_stage} on {date}"
)

<ModelVersion: aliases=[], creation_timestamp=1685317792828, current_stage='Staging', description='The model version 1 was transitioned to Staging on 2023-05-28', last_updated_timestamp=1685318708015, name='nyc-taxi-regressor', run_id='d4535c4d635f4305940cfbd71b0eb91a', run_link=None, source='file:///C:/Users/nimz/Documents/mlops_zoomcamp/week2_experiment_tracking/mlruns/1/d4535c4d635f4305940cfbd71b0eb91a/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=1>

#### Should now see a description at http://127.0.0.1:5000/#/models/nyc-taxi-regressor/versions/1

### 6. Comparing Versions to Selecting a New "Production" Model

Now, we will retrieve models registered in the model registry and compare their performance *on an unseen test set*
- The idea is to simulate the scenario in which a deployment engineer has to interact with the model registry to decide whether to update the model version that is in PROD or not

These are the steps:
- Load the test dataset (March 2021 NYC Green Taxi data)
- Download the `DictVectorizer` that was fitted using the training data and saved to MLflow as an artifact, and load it with `pickle`
- Preprocess the test set using the `DictVectorizer` so we can properly feed the regressors
- Make predictions on the test set using the model versions that are currently in the "Staging" and "Production" stages, and compare their performance
- Based on the results, update the "Production" model version accordingly.
- **NOTE**: *The model registry doesn't actually deploy the model to production when you transition a model to the "Production" stage*, it just **assigns a label** to that model version 
    - You should **complement the registry with some CI/CD code that does the actual deployment**

In [30]:
# first register some more models for comparisons
# set the tracking URI (again) for MLFlow this time, not the Client
# or else it will look for the model in the local directory
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

# take one of the above run ID's, get that model (via its URI), and 
#   register it as an `nyc-taxi-regressor`
model_name = 'nyc-taxi-regressor'
run_id = 'dcdc2317771246e2ab3f934f726f9e1f'
model_uri = f'runs:/{run_id}/model'
mlflow.register_model(model_uri=model_uri, name=model_name)

# take one of the above run ID's, get that model (via its URI), and 
#   register it as an `nyc-taxi-regressor`
model_name = 'nyc-taxi-regressor'
run_id = 'e679ce9c68034a44bf967cc8b084e25b'
model_uri = f'runs:/{run_id}/model'
mlflow.register_model(model_uri=model_uri, name=model_name)

Registered model 'nyc-taxi-regressor' already exists. Creating a new version of this model...
2023/05/28 20:10:10 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: nyc-taxi-regressor, version 2
Created version '2' of model 'nyc-taxi-regressor'.
Registered model 'nyc-taxi-regressor' already exists. Creating a new version of this model...
2023/05/28 20:10:10 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: nyc-taxi-regressor, version 3
Created version '3' of model 'nyc-taxi-regressor'.


<ModelVersion: aliases=[], creation_timestamp=1685319010516, current_stage='None', description=None, last_updated_timestamp=1685319010516, name='nyc-taxi-regressor', run_id='e679ce9c68034a44bf967cc8b084e25b', run_link=None, source='file:///C:/Users/nimz/Documents/mlops_zoomcamp/week2_experiment_tracking/mlruns/1/e679ce9c68034a44bf967cc8b084e25b/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=3>

#### Check http://127.0.0.1:5000/#/models/nyc-taxi-regressor to see the new model versions registered

In [31]:
# add that third one with the tied-for-lowest-RMSE
# take one of the above run ID's, get that model (via its URI), and 
#   register it as an `nyc-taxi-regressor`
model_name = 'nyc-taxi-regressor'
run_id = '445337b1adb94550ad6afc5e63987194'
model_uri = f'runs:/{run_id}/model'
mlflow.register_model(model_uri=model_uri, name=model_name)

Registered model 'nyc-taxi-regressor' already exists. Creating a new version of this model...
2023/05/28 20:11:34 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: nyc-taxi-regressor, version 4
Created version '4' of model 'nyc-taxi-regressor'.


<ModelVersion: aliases=[], creation_timestamp=1685319094385, current_stage='None', description=None, last_updated_timestamp=1685319094385, name='nyc-taxi-regressor', run_id='445337b1adb94550ad6afc5e63987194', run_link=None, source='file:///C:/Users/nimz/Documents/mlops_zoomcamp/week2_experiment_tracking/mlruns/1/445337b1adb94550ad6afc5e63987194/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=4>

In [32]:
# move the initial model to Production and the other 3 to staging
# select v1 of our model
model_version = 1
new_stage = "Production"

# move the v1 model we just registered to Production
client.transition_model_version_stage(
    name=model_name,
    version=model_version,
    stage=new_stage,
    archive_existing_versions=False
)

# select v2 of our model
model_version = 2
new_stage = "Staging"

# move the v2 model we just registered to Staging
client.transition_model_version_stage(
    name=model_name,
    version=model_version,
    stage=new_stage,
    archive_existing_versions=False
)

# select v3 of our model
model_version = 3
new_stage = "Staging"

# move the v1 model we just registered to Staging
client.transition_model_version_stage(
    name=model_name,
    version=model_version,
    stage=new_stage,
    archive_existing_versions=False
)

# select v4 of our model
model_version = 4
new_stage = "Staging"

# move the v1 model we just registered to Staging
client.transition_model_version_stage(
    name=model_name,
    version=model_version,
    stage=new_stage,
    archive_existing_versions=False
)

<ModelVersion: aliases=[], creation_timestamp=1685319094385, current_stage='Staging', description=None, last_updated_timestamp=1685319147333, name='nyc-taxi-regressor', run_id='445337b1adb94550ad6afc5e63987194', run_link=None, source='file:///C:/Users/nimz/Documents/mlops_zoomcamp/week2_experiment_tracking/mlruns/1/445337b1adb94550ad6afc5e63987194/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=4>

#### Check http://127.0.0.1:5000/#/models/nyc-taxi-regressor to see that everything lines up. Then move on to creating the test set

In [100]:
# set up functions to read and preprocess the test set
def read_dataframe(filename):
    df = pd.read_parquet(filename)

    df.lpep_dropoff_datetime = pd.to_datetime(df.lpep_dropoff_datetime)
    df.lpep_pickup_datetime = pd.to_datetime(df.lpep_pickup_datetime)

    df['duration'] = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)

    df = df[(df.duration >= 1) & (df.duration <= 60)]

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)
    
    return df


def preprocess(df, dv):
    df['PU_DO'] = str(df['PULocationID']) + '_' + str(df['DOLocationID'])
    
    categorical = ['PU_DO']
    numerical = ['trip_distance']
    
    train_dicts = df[categorical + numerical].to_dict(orient='records')
    
    return dv.transform(train_dicts)


def test_model(name, stage, X_test, y_test):
    print(f'MODEL NAME: {name}')
    model = mlflow.pyfunc.load_model(f"models:/{name}/{stage}")
    y_pred = model.predict(X_test)
    
    return {"rmse": mean_squared_error(y_test, y_pred, squared=False)}

In [34]:
# get the data
# Make the directory to hold the data if it doesn't exist
# https://www.tutorialspoint.com/python/os_makedirs.htm
os.makedirs(os.path.dirname('./data/'), mode=0o755, exist_ok=True)

# remove files if they are already there, then get files
# !rm ./data/green_tripdata_2021-03.parquet

# windows
# https://www.freecodecamp.org/news/how-to-check-if-a-file-exists-in-python/
if os.path.isfile('./data/green_tripdata_2021-03.parquet'):
    os.remove("./data/green_tripdata_2021-03.parquet")

!wget https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-03.parquet -P ./data

--2023-05-28 20:13:17--  https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-03.parquet
Resolving d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)... 18.238.11.82, 18.238.11.65, 18.238.11.110, ...
Connecting to d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)|18.238.11.82|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1474538 (1.4M) [binary/octet-stream]
Saving to: './data/green_tripdata_2021-03.parquet'

     0K .......... .......... .......... .......... ..........  3% 6.77M 0s
    50K .......... .......... .......... .......... ..........  6% 8.97M 0s
   100K .......... .......... .......... .......... .......... 10% 7.92M 0s
   150K .......... .......... .......... .......... .......... 13% 14.2M 0s
   200K .......... .......... .......... .......... .......... 17% 8.78M 0s
   250K .......... .......... .......... .......... .......... 20% 8.50M 0s
   300K .......... .......... .......... .......... .......... 24%

In [54]:
# read in the data parquet file
df = read_dataframe('./data/green_tripdata_2021-03.parquet')
# df.columns

In [85]:
# get the preprocessor from the best run
run_id = 'd4535c4d635f4305940cfbd71b0eb91a'

client.download_artifacts(run_id=run_id, path='preprocessor', dst_path='.')

  client.download_artifacts(run_id=run_id, path='preprocessor', dst_path='.')


'C:\\Users\\nimz\\Documents\\mlops_zoomcamp\\week2_experiment_tracking\\preprocessor'

In [56]:
# load in the preprocessor
with open("preprocessor/preprocessor.b", "rb") as f_in:
    dv = pickle.load(f_in)
    
# preprocess the test data
X_test = preprocess(df, dv)

# set up the labels
target = "duration"
y_test = df[target].values

In [99]:
# run the model in PROD on the test set and see how long it takes
%time test_model(name=model_name, stage="Production", X_test=X_test, y_test=y_test)

# mlflow.pyfunc.load_model(model_uri: str, suppress_warnings: bool = False, dst_path: Optional[str] = None) 

# from mlflow.tracking.artifact_utils import _download_artifact_from_uri

# def _download_artifact_from_uri(artifact_uri, output_path=None):
#     """
#     :param artifact_uri: The *absolute* URI of the artifact to download.
#     :param output_path: The local filesystem path to which to download the artifact. If unspecified,
#                         a local output path will be created.
#     """
#     root_uri, artifact_path = _get_root_uri_and_artifact_path(artifact_uri)
#     return get_artifact_repository(artifact_uri=root_uri).download_artifacts(
#         artifact_path=artifact_path, dst_path=output_path
#     )

MODEL NAME: nyc-taxi-regressor


OSError: No such file or directory: 'W:\nyc-taxi-regressor\Production'

In [107]:
model_name = 'nyc-taxi-regressor'
run_id = '03232258e2574763aeec2e8c6e6b36d9'
model_uri = f'runs:/{run_id}/model'
mlflow.register_model(model_uri=model_uri, name=model_name)

Registered model 'nyc-taxi-regressor' already exists. Creating a new version of this model...
2023/05/31 17:18:53 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: nyc-taxi-regressor, version 5
Created version '5' of model 'nyc-taxi-regressor'.


<ModelVersion: aliases=[], creation_timestamp=1685567933583, current_stage='None', description=None, last_updated_timestamp=1685567933583, name='nyc-taxi-regressor', run_id='03232258e2574763aeec2e8c6e6b36d9', run_link=None, source='file:///C:/Users/nimz/Documents/mlops_zoomcamp/week2_experiment_tracking/mlruns/1/03232258e2574763aeec2e8c6e6b36d9/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=5>

In [108]:
# try the above on the next-best model, something about the directory on the best one is off

# archive the current one in the UI

# move v4 to Prod
model_version = 5
new_stage = ""

# move the v1 model we just registered to Staging
client.transition_model_version_stage(
    name=model_name,
    version=model_version,
    stage=new_stage,
    archive_existing_versions=True
)

<ModelVersion: aliases=[], creation_timestamp=1685567933583, current_stage='Production', description=None, last_updated_timestamp=1685567953625, name='nyc-taxi-regressor', run_id='03232258e2574763aeec2e8c6e6b36d9', run_link=None, source='file:///C:/Users/nimz/Documents/mlops_zoomcamp/week2_experiment_tracking/mlruns/1/03232258e2574763aeec2e8c6e6b36d9/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=5>

In [109]:
# get the preprocessor from the best run
run_id = '03232258e2574763aeec2e8c6e6b36d9'

client.download_artifacts(run_id=run_id, path='preprocessor', dst_path='.')

# load in the preprocessor
with open("preprocessor/preprocessor.b", "rb") as f_in:
    dv = pickle.load(f_in)
    
# preprocess the test data
X_test = preprocess(df, dv)

# set up the labels
target = "duration"
y_test = df[target].values

  client.download_artifacts(run_id=run_id, path='preprocessor', dst_path='.')


In [110]:
# run the model in PROD on the test set and see how long it takes
%time test_model(name=model_name, stage="Production", X_test=X_test, y_test=y_test)

MODEL NAME: nyc-taxi-regressor


 - mlflow (current: 2.3.2, required: mlflow==2.3)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.


CPU times: total: 500 ms
Wall time: 2.81 s


{'rmse': 6.733905988481068}

In [112]:
# # run the model in STAGING on the test set and see how long it takes
# %time test_model(name=model_name, stage="Staging", X_test=X_test, y_test=y_test)

In [113]:
# client.transition_model_version_stage(
#     name=model_name,
#     version=4,
#     stage="Production",
#     archive_existing_versions=True
# )