In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import mean_squared_error
import pickle
import mlflow

In [2]:
mlflow.set_tracking_uri('sqlite:///mlflow.db') # set the database
mlflow.set_experiment('my-brand-new-experiment') # create the experiment if not exists

In [3]:
def read_dataframe(path):
  df = pd.read_parquet(path)
  df.lpep_pickup_datetime = pd.to_datetime(df.lpep_pickup_datetime)
  df.lpep_dropoff_datetime = pd.to_datetime(df.lpep_dropoff_datetime)
  df['duration'] = (df.lpep_dropoff_datetime - df.lpep_pickup_datetime).dt.seconds / 60

  df = df[((df.duration >=1) & (df.duration <=60))]
  return df

In [4]:
df_train = read_dataframe('green_tripdata_2021-01.parquet')
df_val = read_dataframe('green_tripdata_2021-02.parquet')

In [5]:
categorical = ['PULocationID', 'DOLocationID']
numerical = ['trip_distance']

df_train[categorical] = df_train[categorical].astype(str)
df_val[categorical] = df_val[categorical].astype(str)

dv = DictVectorizer()

train_dicts = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

valid_dicts = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(valid_dicts)

dv_path = 'models/preprocessor.b'
with open(dv_path, 'wb') as f_out:
    pickle.dump(dv, f_out)

In [6]:
target = 'duration'
y_train = df_train[target].values
y_val = df_val[target].values

In [7]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
#from sklearn.linear_model import LinearRegression
from sklearn.svm import LinearSVR

mlflow.sklearn.autolog()

for model_class in (RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor, LinearSVR):

    with mlflow.start_run():

        mlflow.log_param("train-data-path", "./data/green_tripdata_2021-01.csv")
        mlflow.log_param("valid-data-path", "./data/green_tripdata_2021-02.csv")
        mlflow.log_artifact("models/preprocessor.b", artifact_path="preprocessor")

        mlmodel = model_class()
        mlmodel.fit(X_train, y_train)

        y_pred = mlmodel.predict(X_val)
        rmse = mean_squared_error(y_val, y_pred, squared=False)
        mlflow.log_metric("rmse", rmse)



Model Registry

In [8]:
from mlflow.tracking import MlflowClient
MLFLOW_TRACKING_URI = "sqlite:///mlflow.db"

Interacting with the MLflow tracking server
The MlflowClient object allows us to interact with...

an MLflow Tracking Server that creates and manages experiments and runs.
an MLflow Registry Server that creates and manages registered models and model versions.
To instantiate it we need to pass a tracking URI and/or a registry URI

In [9]:
client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)

client.list_experiments()

[<Experiment: artifact_location='./mlruns/0', experiment_id='0', lifecycle_stage='active', name='Default', tags={}>,
 <Experiment: artifact_location='./mlruns/1', experiment_id='1', lifecycle_stage='active', name='my-brand-new-experiment', tags={}>]

Checking the latest versions for the experiment with id 1:

In [11]:
from mlflow.entities import ViewType

runs = client.search_runs(
    experiment_ids='1',
    #filter_string="metrics.rmse < 7",
    filter_string="",
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=5,
    order_by=["metrics.rmse ASC"]
)

In [13]:
for run in runs:
    try:
        print(f"run id: {run.info.run_id}, rmse: {run.data.metrics['rmse']:.4f}")
    except:
        pass

run id: 93fe37ad1a1d4c7ea68bad783ebe595f, rmse: 6.3872
run id: dedf5c1891ae4c748a79c227a3add23c, rmse: 6.5865
run id: e1a420289c3d4b83bb724cc44b1db35d, rmse: 6.8434
run id: 629324bc2f9840b3a9677a9acd3ef5b6, rmse: 953.1990


### Interacting with the Model Registry

Interacting with the Model Registry
In this section We will use the MlflowClient instance to:

- Register a new version for the experiment nyc-taxi-regressor
- Retrieve the latests versions of the model nyc-taxi-regressor and check that a new version 4 was created.
- Transition the version 4 to "Staging" and adding annotations to it.

In [16]:
run_id = "e1a420289c3d4b83bb724cc44b1db35d"
model_uri = f"runs:/{run_id}/model"
mlflow.register_model(model_uri=model_uri, name="nyc-taxi-regressor")

Registered model 'nyc-taxi-regressor' already exists. Creating a new version of this model...
2022/05/24 18:43:37 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: nyc-taxi-regressor, version 3
Created version '3' of model 'nyc-taxi-regressor'.


<ModelVersion: creation_timestamp=1653428617471, current_stage='None', description=None, last_updated_timestamp=1653428617471, name='nyc-taxi-regressor', run_id='e1a420289c3d4b83bb724cc44b1db35d', run_link=None, source='./mlruns/1/e1a420289c3d4b83bb724cc44b1db35d/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=3>

In [32]:
model_name = "nyc-taxi-regressor"
latest_versions = client.get_latest_versions(name=model_name)

for version in latest_versions:
    print(f"version: {version.version}, stage: {version.current_stage}")

version: 1, stage: Production
version: 3, stage: Staging


In [31]:
model_version = 3
new_stage = "Staging"
#new_stage = "Production"
client.transition_model_version_stage(
    name=model_name,
    version=model_version,
    stage=new_stage,
    archive_existing_versions=False
)

<ModelVersion: creation_timestamp=1653428617471, current_stage='Staging', description=None, last_updated_timestamp=1653428835352, name='nyc-taxi-regressor', run_id='e1a420289c3d4b83bb724cc44b1db35d', run_link=None, source='./mlruns/1/e1a420289c3d4b83bb724cc44b1db35d/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=3>

In [34]:
from datetime import datetime

date = datetime.today().date()
client.update_model_version(
    name=model_name,
    version=model_version,
    description=f"The model version {model_version} was transitioned to {new_stage} on {date}"
)

<ModelVersion: creation_timestamp=1653428617471, current_stage='Staging', description='The model version 3 was transitioned to Staging on 2022-05-24', last_updated_timestamp=1653428869617, name='nyc-taxi-regressor', run_id='e1a420289c3d4b83bb724cc44b1db35d', run_link=None, source='./mlruns/1/e1a420289c3d4b83bb724cc44b1db35d/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=3>

Note: the model registry doesn't actually deploy the model to production when you transition a model to the "Production" stage, it just assign a label to that model version. You should complement the registry with some CI/CD code that does the actual deployment.

In [38]:
from sklearn.metrics import mean_squared_error
import pandas as pd


def read_dataframe(filename):
    #df = pd.read_csv(filename)

    df = pd.read_parquet(filename)
    df.lpep_dropoff_datetime = pd.to_datetime(df.lpep_dropoff_datetime)
    df.lpep_pickup_datetime = pd.to_datetime(df.lpep_pickup_datetime)

    df['duration'] = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)

    df = df[(df.duration >= 1) & (df.duration <= 60)]

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)
    
    return df


def preprocess(df, dv):
    df['PU_DO'] = df['PULocationID'] + '_' + df['DOLocationID']
    categorical = ['PU_DO']
    numerical = ['trip_distance']
    train_dicts = df[categorical + numerical].to_dict(orient='records')
    return dv.transform(train_dicts)


def test_model(name, stage, X_test, y_test):
    model = mlflow.pyfunc.load_model(f"models:/{name}/{stage}")
    y_pred = model.predict(X_test)
    return {"rmse": mean_squared_error(y_test, y_pred, squared=False)}

In [39]:
#df = read_dataframe("data/green_tripdata_2021-03.csv")
df = read_dataframe("green_tripdata_2021-02.parquet")

In [40]:
client.download_artifacts(run_id=run_id, path='preprocessor', dst_path='.')

'C:\\Users\\Usuario\\Desktop\\ML Zoomcamp\\2.ML Flow\\preprocessor'

In [41]:
import pickle

with open("preprocessor/preprocessor.b", "rb") as f_in:
    dv = pickle.load(f_in)

In [42]:
X_test = preprocess(df, dv)

In [43]:
target = "duration"
y_test = df[target].values

In [44]:
%time test_model(name=model_name, stage="Production", X_test=X_test, y_test=y_test)

Wall time: 4.9 s


{'rmse': 7.648698200870478}

In [45]:
%time test_model(name=model_name, stage="Staging", X_test=X_test, y_test=y_test)

Wall time: 10.2 s


{'rmse': 8.8022433874608}