In [26]:
import pandas as pd
import numpy as np
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
import os
from pathlib import Path

from sklearn.feature_extraction import DictVectorizer

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.svm import LinearSVR

from sklearn.metrics import mean_squared_error

import mlflow
mlflow.sklearn.autolog()

In [8]:
# Work from the 02-training folder so the relative locations used later in the
# notebook (sqlite:///mlflow.sqlite.db, ./mlruns/, models/preprocessor.b) resolve against it.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
os.chdir("/Users/sahelimukherjee/Documents/Personal/Learning/MLOps/projects/NYC_Ride_Duration_Prediction/Predicting-Ride-Duration/02-training")

In [9]:
# SQLite-backed tracking store; the file path is relative to the current working directory.
tracking_uri = "sqlite:///mlflow.sqlite.db"

mlflow.set_tracking_uri(tracking_uri)
# Kept as the last expression so the resulting Experiment object is displayed by the cell.
mlflow.set_experiment("nyc-taxi-duration-experiment-0528")

<Experiment: artifact_location='/Users/sahelimukherjee/Documents/Personal/Learning/MLOps/projects/NYC_Ride_Duration_Prediction/Predicting-Ride-Duration/02-training/mlruns/2', creation_time=1685279313985, experiment_id='2', last_update_time=1685279313985, lifecycle_stage='active', name='nyc-taxi-duration-experiment-0528', tags={}>

In [159]:
def read_dataframe(filename):
    """Load a trip file and return the cleaned trips with a ``duration`` column.

    Parameters
    ----------
    filename : str or path-like
        Path to a ``.csv`` or ``.parquet`` file containing the columns
        ``lpep_pickup_datetime``, ``lpep_dropoff_datetime``, ``PULocationID``
        and ``DOLocationID``.

    Returns
    -------
    pandas.DataFrame
        Only the trips lasting between 1 and 60 minutes (both inclusive), with
        ``duration`` expressed in minutes and the two location-ID columns cast
        to ``str``.

    Raises
    ------
    ValueError
        If ``filename`` ends with neither ``.csv`` nor ``.parquet``.
    """
    path = str(filename)

    if path.endswith('.csv'):
        df = pd.read_csv(path)
        # CSV carries the timestamps as text, so they are parsed explicitly.
        df['lpep_dropoff_datetime'] = pd.to_datetime(df['lpep_dropoff_datetime'])
        df['lpep_pickup_datetime'] = pd.to_datetime(df['lpep_pickup_datetime'])
    elif path.endswith('.parquet'):
        df = pd.read_parquet(path)
    else:
        # Previously this fell through and failed later with an UnboundLocalError.
        raise ValueError(
            f"Unsupported file type for {path!r}: expected a .csv or .parquet file"
        )

    # Vectorised timedelta -> minutes conversion (no per-row Python lambda).
    trip_time = df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime']
    df['duration'] = trip_time.dt.total_seconds() / 60

    # .copy() makes the filtered frame independent of the unfiltered one, so the
    # column assignment below does not write into a slice (SettingWithCopyWarning).
    df = df[(df['duration'] >= 1) & (df['duration'] <= 60)].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df


def preprocess(df, dv):
    """Turn a cleaned trips frame into a feature matrix and a target vector.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``duration``, ``PULocationID``, ``DOLocationID`` (the two
        IDs as strings, since they are concatenated) and ``trip_distance``.
        The frame is not modified.
    dv : object
        Already-fitted vectorizer exposing ``transform(list_of_dicts)``.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is whatever ``dv.transform`` returns for the
        ``PU_DO`` / ``trip_distance`` records and ``y`` is the array of
        ``duration`` values.
    """
    # creating target vector
    y = df["duration"].values

    categorical = ['PU_DO']
    numerical = ['trip_distance']

    # assign() builds a new frame, so the caller's DataFrame does not silently
    # gain a PU_DO column. Selecting only the feature columns also keeps the
    # target out of the records (the old df.drop(...) discarded its result).
    features = df.assign(PU_DO=df['PULocationID'] + '_' + df['DOLocationID'])
    records = features[categorical + numerical].to_dict(orient='records')
    return dv.transform(records), y




In [12]:
# Trip files used for fitting (2021-01) and for validation (2021-02).
train_data_path = "/Users/sahelimukherjee/Documents/Personal/Learning/MLOps/projects/NYC_Ride_Duration_Prediction/Predicting-Ride-Duration/01-intro/data/train/green_tripdata_2021-01.parquet"
val_data_path = "/Users/sahelimukherjee/Documents/Personal/Learning/MLOps/projects/NYC_Ride_Duration_Prediction/Predicting-Ride-Duration/01-intro/data/val/green_tripdata_2021-02.parquet"

df_train, df_val = map(read_dataframe, (train_data_path, val_data_path))

# Pickup and drop-off IDs are combined into a single categorical feature.
for trips in (df_train, df_val):
    trips['PU_DO'] = trips['PULocationID'] + '_' + trips['DOLocationID']

categorical = ['PU_DO']  # used instead of separate 'PULocationID' / 'DOLocationID' features
numerical = ['trip_distance']
target = 'duration'

dv = DictVectorizer()

train_dicts = df_train[categorical + numerical].to_dict(orient='records')
val_dicts = df_val[categorical + numerical].to_dict(orient='records')

# The vectorizer is fitted on the training records only and reused for validation.
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

y_train, y_val = df_train[target].values, df_val[target].values

In [71]:
# Resolve the experiment by name rather than hard-coding its numeric id ('2'),
# which is only valid for the tracking database this notebook was first run on.
baseline_experiment = mlflow.set_experiment("nyc-taxi-duration-experiment-0528")

# One MLflow run per estimator class, each fitted with its default settings.
for model_class in (LinearRegression, DecisionTreeRegressor, RandomForestRegressor,
                     GradientBoostingRegressor, ExtraTreesRegressor, LinearSVR):

    with mlflow.start_run(experiment_id=baseline_experiment.experiment_id):
        mlflow.log_params({"train_data": train_data_path,
                           "validation_data": val_data_path})
        # Tag with the class name; passing the class object itself stores its
        # repr ("<class '...'>") as the tag value.
        mlflow.set_tags({"model": model_class.__name__, "developer": "Saheli"})
        mlflow.log_artifact(local_path="models/preprocessor.b",
                            artifact_path="preprocessor")

        model = model_class()
        model.fit(X_train, y_train)

        y_pred = model.predict(X_val)
        # sqrt(MSE) works on every scikit-learn version; the `squared=False`
        # argument is deprecated and removed in newer releases.
        rmse = float(np.sqrt(mean_squared_error(y_val, y_pred)))
        mlflow.log_metric("rmse", rmse)




## Create an experiment and train models using `MlflowClient`

In [35]:
from mlflow import MlflowClient

# Same SQLite URI string as the one given to mlflow.set_tracking_uri earlier in the notebook.
MLFLOW_TRACKING_URI = "sqlite:///mlflow.sqlite.db"
# `client` is reused by the experiment, search and registry cells that follow.
client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)

In [75]:
## Creating experiment using MLflow client
experiment_name = "nyc-taxi-duration-0528-v2"

# create_experiment raises when the name is already taken, so look the
# experiment up first; this keeps the cell re-runnable against the same database.
existing_experiment = client.get_experiment_by_name(experiment_name)
if existing_experiment is None:
    experiment_id = client.create_experiment(
        experiment_name,
        artifact_location="./mlruns/",
        tags={"version": "v1", "priority": "P1"},
    )
else:
    experiment_id = existing_experiment.experiment_id

client.set_experiment_tag(experiment_id, "regression.framework", "Duration prediction")

# Fetch experiment metadata information
experiment = client.get_experiment(experiment_id)
print(f"Name: {experiment.name}")
print(f"Experiment_id: {experiment.experiment_id}")
print(f"Artifact Location: {experiment.artifact_location}")
print(f"Tags: {experiment.tags}")
print(f"Lifecycle_stage: {experiment.lifecycle_stage}")

Name: nyc-taxi-duration-0528-v2
Experiment_id: 5
Artifact Location: /Users/sahelimukherjee/Documents/Personal/Learning/MLOps/projects/NYC_Ride_Duration_Prediction/Predicting-Ride-Duration/02-training/mlruns
Tags: {'version': 'v1', 'priority': 'P1', 'regression.framework': 'Duration prediction'}
Lifecycle_stage: active


In [77]:

# Two baseline estimators logged into the experiment identified by `experiment_id`.
for model_class in (LinearRegression, DecisionTreeRegressor):

    with mlflow.start_run(experiment_id=experiment_id):

        mlflow.log_params({"train_data": train_data_path,
                           "validation_data": val_data_path})
        # Tag with the class name; passing the class object itself stores its
        # repr ("<class '...'>") as the tag value.
        mlflow.set_tags({"model": model_class.__name__, "developer": "Saheli"})
        mlflow.log_artifact(local_path="models/preprocessor.b",
                            artifact_path="preprocessor")

        model = model_class()
        model.fit(X_train, y_train)

        y_pred = model.predict(X_val)
        # sqrt(MSE) works on every scikit-learn version; the `squared=False`
        # argument is deprecated and removed in newer releases.
        rmse = float(np.sqrt(mean_squared_error(y_val, y_pred)))
        mlflow.log_metric("validation-rmse", rmse)


In [105]:
from mlflow.entities import ViewType

# At most five active runs from experiment '2' tagged developer = 'Saheli'
# whose rmse is below 7, lowest rmse first.
search_options = dict(
    experiment_ids=['2'],
    filter_string="metric.rmse < 7 and tags.developer = 'Saheli'",
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=5,
    order_by=["metric.rmse ASC"],
)
runs = client.search_runs(**search_options)

for run in runs:
    tags, metrics = run.data.tags, run.data.metrics
    print(f"run_id : {run.info.run_id}, model : {tags['estimator_name']}, rmse : {metrics['rmse']:.4f}")



run_id : f023a5ae4210482dbfc9e3f2d175879e, model : GradientBoostingRegressor, rmse : 6.7423
run_id : 3aafc3e2048f4633ba8b00bcf4996d4f, model : RandomForestRegressor, rmse : 6.8993
run_id : 6019128a3a054f60b7e4a537aed45f06, model : ExtraTreesRegressor, rmse : 6.9410


## Register models

In [106]:
# Run chosen from the search results above (the GradientBoostingRegressor run).
run_id = "f023a5ae4210482dbfc9e3f2d175879e"
version_name = "GradientBoostingRegressor"
model_uri = f'runs:/{run_id}/model'

# Left as the last expression so the created ModelVersion is displayed.
mlflow.register_model(
    model_uri,
    name="NYC-Ride-Duration-Regressor",
    tags={"model": version_name},
)

Registered model 'NYC-Ride-Duration-Regressor' already exists. Creating a new version of this model...
2023/05/29 00:23:30 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: NYC-Ride-Duration-Regressor, version 5
Created version '5' of model 'NYC-Ride-Duration-Regressor'.


<ModelVersion: aliases=[], creation_timestamp=1685300010974, current_stage='None', description=None, last_updated_timestamp=1685300010974, name='NYC-Ride-Duration-Regressor', run_id='f023a5ae4210482dbfc9e3f2d175879e', run_link=None, source='/Users/sahelimukherjee/Documents/Personal/Learning/MLOps/projects/NYC_Ride_Duration_Prediction/Predicting-Ride-Duration/02-training/mlruns/2/f023a5ae4210482dbfc9e3f2d175879e/artifacts/model', status='READY', status_message=None, tags={'model': 'GradientBoostingRegressor'}, user_id=None, version=5>

### Register a model using `MlflowClient`

In [109]:
from mlflow.exceptions import MlflowException

registered_model_name = "NYC-Ride-Duration-GBMRegressor"

# create_registered_model raises if the name already exists. Tolerate exactly
# that case so the cell can be re-run, and let every other error propagate.
try:
    client.create_registered_model(registered_model_name)
except MlflowException as exc:
    if exc.error_code != "RESOURCE_ALREADY_EXISTS":
        raise

# Adds a new version of the registered model pointing at the chosen run's model.
result = client.create_model_version(
    name=registered_model_name,
    source=model_uri,
    run_id=run_id,
    description="GradientBoosting regression model",
)

## delete registered model
# client.delete_registered_model(name="NYC-Ride-Duration-GBMRegressor")

2023/05/29 00:32:37 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: NYC-Ride-Duration-GBMRegressor, version 1


## Stage models in registry

In [126]:
from datetime import date

# This cell reads `model_name`, which the notebook otherwise assigns only in a
# later cell; defining it here avoids a NameError on a fresh top-to-bottom run.
model_name = "NYC-Ride-Duration-Regressor"
model_version = 6
new_stage = "staging"

# archive_existing_versions=False leaves versions already in the target stage untouched.
client.transition_model_version_stage(
    name=model_name,
    version=model_version,
    stage=new_stage,
    archive_existing_versions=False,
)

# Record the transition in the version's description. Kept as the last
# expression so the updated ModelVersion is displayed.
client.update_model_version(
    name=model_name,
    version=model_version,
    description=f"This model version {model_version} was trasitioned to {new_stage} on {date.today()}",
)

<ModelVersion: aliases=[], creation_timestamp=1685301982376, current_stage='Staging', description='This model version 6 was trasitioned to staging on 2023-05-29', last_updated_timestamp=1685302000096, name='NYC-Ride-Duration-Regressor', run_id='12feba662000405f9284453a12bf67f6', run_link='', source='/Users/sahelimukherjee/Documents/Personal/Learning/MLOps/projects/NYC_Ride_Duration_Prediction/Predicting-Ride-Duration/02-training/mlruns/2/12feba662000405f9284453a12bf67f6/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=6>

In [None]:
model_name = "NYC-Ride-Duration-Regressor"

latest_versions = client.get_latest_versions(name=model_name)

# One summary line per model version returned by the registry.
for version in latest_versions:
    summary = f"version: {version.version}, run_id: {version.run_id}, stage: {version.current_stage}"
    print(summary)

## Prediction using registered model

In [146]:

latest_versions = client.get_latest_versions(name=model_name)

# These stay empty strings when no version is in the "Production" stage.
prod_run_id = prod_version = prod_artifacts_path = ''

for version in latest_versions:
    print(f"version: {version.version}, stage: {version.current_stage}")
    if version.current_stage != "Production":
        continue
    prod_run_id, prod_version = version.run_id, version.version
    # Strip the last path component of the model source to get its parent folder,
    # under which the preprocessor artifact is looked up later.
    prod_artifacts_path = version.source.rsplit("/", 1)[0]


version: 2, stage: Archived
version: 6, stage: Staging
version: 4, stage: Production


In [167]:
def test_model(X_test, y_test, name, stage):
    """Score a registered model on a held-out set.

    Parameters
    ----------
    X_test :
        Feature matrix passed unchanged to the loaded model's ``predict``.
    y_test :
        True target values aligned with ``X_test``.
    name : str
        Registered model name.
    stage : str
        Registry stage to load, used to build the ``models:/{name}/{stage}`` URI.

    Returns
    -------
    dict
        ``{"rmse": <root mean squared error as a float>}``.
    """
    model = mlflow.pyfunc.load_model(f"models:/{name}/{stage}")
    y_pred = model.predict(X_test)
    # sqrt(MSE) works on every scikit-learn version; the `squared=False`
    # argument is deprecated and removed in newer releases.
    rmse = float(np.sqrt(mean_squared_error(y_test, y_pred)))
    return {"rmse": rmse}

In [161]:
test_data_path = "/Users/sahelimukherjee/Documents/Personal/Learning/MLOps/projects/NYC_Ride_Duration_Prediction/Predicting-Ride-Duration/01-intro/data/test/green_tripdata_2021-03.parquet"
prod_preprocess_uri = f"{prod_artifacts_path}/preprocessor/preprocessor.b"

# NOTE(review): pickle.load executes arbitrary code from the file; only load
# artifacts written by a trusted run.
with open(prod_preprocess_uri, 'rb') as artifact_file:
    dv = pickle.load(artifact_file)

# Featurise the test trips with the vectorizer stored next to the Production model.
df = read_dataframe(test_data_path)
X_test, y_test = preprocess(df, dv)

In [170]:
# Time a single evaluation of the model currently in the "Production" stage on the test set.
%time test_model(X_test, y_test, name = model_name, stage = "Production")

 - mlflow (current: 2.3.2, required: mlflow==2.3)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.


CPU times: user 121 ms, sys: 4.93 ms, total: 126 ms
Wall time: 125 ms


{'rmse': 7.403930760220898}