In [1]:
import mlflow
from mlflow.tracking import MlflowClient
from mlflow.entities import ViewType
from datetime import datetime

from sklearn.utils import resample
from imblearn.over_sampling import SMOTE
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, log_loss, f1_score
import pandas as pd
import numpy as np

import pickle
import os

# Tracking store location (a local SQLite database file); the same constant is
# passed to MlflowClient so both APIs talk to one store.
MLFLOW_TRACKING_URI = "sqlite:///mlflow.db"
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

In [2]:
# Client bound to the tracking URI configured in the first cell.
client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)

# Display the experiments the tracking store returns.
client.list_experiments()

[<Experiment: artifact_location='./mlruns/0', experiment_id='0', lifecycle_stage='active', name='Default', tags={}>,
 <Experiment: artifact_location='./mlruns/1', experiment_id='1', lifecycle_stage='active', name='prediction-001', tags={}>,
 <Experiment: artifact_location='./mlruns/2', experiment_id='2', lifecycle_stage='active', name='prediction-002', tags={}>,
 <Experiment: artifact_location='./mlruns/5', experiment_id='5', lifecycle_stage='active', name='prediction-003', tags={}>]

In [4]:
# Create the "registry" experiment only when it does not exist yet, so that
# re-running this cell (e.g. Restart & Run All) does not fail on a duplicate
# experiment name. Either way the cell displays the experiment id.
registry_experiment = client.get_experiment_by_name("registry")
(
    registry_experiment.experiment_id
    if registry_experiment is not None
    else client.create_experiment(name="registry")
)

'6'

In [5]:
# Fetch at most 10 active runs of experiment 5 whose loss_metric is below
# 0.090, ordered so the lowest loss comes first.
runs = client.search_runs(
    experiment_ids='5', # prediction-003
    filter_string="metrics.loss_metric < 0.090",
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=10,
    order_by=["metrics.loss_metric ASC"]
)

In [6]:
# Run id of the first search result (lowest loss_metric given the ASC ordering).
runs[0].info.run_id

'c04aab12c0ae4c5985025ebbe908ac75'

In [7]:
# One line per returned run: its id and loss_metric rounded to four decimals.
for run in runs:
    print(f"run id: {run.info.run_id}, loss_metric: {run.data.metrics['loss_metric']:.4f}")

run id: c04aab12c0ae4c5985025ebbe908ac75, loss_metric: 0.0852
run id: f38107682ffd4200a5822e6bceaa0ff5, loss_metric: 0.0852
run id: 4830e57742c5484797cfc0fce6b93d60, loss_metric: 0.0852
run id: da140e2116104b3d834c69ecff426976, loss_metric: 0.0852
run id: 163f2d4fb5bd41499a77b287e39dc8f1, loss_metric: 0.0855
run id: 07fea5af56624531a5f31e82b3b641f5, loss_metric: 0.0855
run id: 800c92eb1fff4ea2ba06c36615bfb6c7, loss_metric: 0.0856
run id: f913690687a04a4a93c4bdefa50dc12e, loss_metric: 0.0857
run id: 73bf7e1c379749b0addab61c1459f97b, loss_metric: 0.0860
run id: cad786ee565c45e5bb529ec8d1951404, loss_metric: 0.0861


In [29]:
# Parameters logged for the first search result.
runs[0].data.params

{'colsample_bytree': '0.32',
 'gamma': '0.48',
 'learning_rate': '0.32',
 'max_depth': '7.600000000000005',
 'min_child_weight': '4.2',
 'n_estimators': '45',
 'subsample': '0.96',
 'seed': '42'}

In [30]:
# Pinned run to register; this is the id displayed for runs[0] earlier in the
# notebook. NOTE(review): hard-coded, so it will not follow a new best run.
run_id = "c04aab12c0ae4c5985025ebbe908ac75"
# URI of the "models_mlflow" artifact logged under that run.
model_uri = f"runs:/{run_id}/models_mlflow"

In [31]:
# Register the run's model under the name "model2"; when that name already
# exists this creates a new version (see the log output below).
mlflow.register_model(model_uri=model_uri, name="model2")

Registered model 'model2' already exists. Creating a new version of this model...
2022/08/10 20:22:34 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: model2, version 2
Created version '2' of model 'model2'.


<ModelVersion: creation_timestamp=1660184554770, current_stage='None', description=None, last_updated_timestamp=1660184554770, name='model2', run_id='c04aab12c0ae4c5985025ebbe908ac75', run_link=None, source='./mlruns/5/c04aab12c0ae4c5985025ebbe908ac75/artifacts/models_mlflow', status='READY', status_message=None, tags={}, user_id=None, version=2>

In [32]:
# Show the URI that was registered.
model_uri

'runs:/c04aab12c0ae4c5985025ebbe908ac75/models_mlflow'

## Transitioning a model

In [33]:
# Registered model whose versions are inspected and transitioned below.
model_name = "model2"
# Latest version(s) the registry reports for this model.
latest_versions = client.get_latest_versions(name=model_name)

# Print each reported version together with its current stage.
for version in latest_versions:
    print(f"version: {version.version}, stage: {version.current_stage}")

version: 1, stage: Staging
version: 2, stage: None


In [34]:
# Version to move and the stage to move it to; both names are reused by the
# description cell that follows.
model_version = 2
new_stage = "Staging"
# archive_existing_versions=False: versions already in the target stage are
# not archived by this call.
client.transition_model_version_stage(
    name=model_name,
    version=model_version,
    stage=new_stage,
    archive_existing_versions=False
)

<ModelVersion: creation_timestamp=1660184554770, current_stage='Staging', description=None, last_updated_timestamp=1660184570743, name='model2', run_id='c04aab12c0ae4c5985025ebbe908ac75', run_link=None, source='./mlruns/5/c04aab12c0ae4c5985025ebbe908ac75/artifacts/models_mlflow', status='READY', status_message=None, tags={}, user_id=None, version=2>

In [35]:
# Record in the version's description when it was transitioned.
# Relies on `model_name` and `new_stage` defined in earlier cells.
model_version = 2
date = datetime.today().date()  # local calendar date, no time component
client.update_model_version(
    name=model_name,
    version=model_version,
    description=f"The model version {model_version} was transitioned to {new_stage} on {date}"
)

<ModelVersion: creation_timestamp=1660184554770, current_stage='Staging', description='The model version 2 was transitioned to Staging on 2022-08-10', last_updated_timestamp=1660184574846, name='model2', run_id='c04aab12c0ae4c5985025ebbe908ac75', run_link=None, source='./mlruns/5/c04aab12c0ae4c5985025ebbe908ac75/artifacts/models_mlflow', status='READY', status_message=None, tags={}, user_id=None, version=2>

In [36]:
def read_data(filepath, upsample: bool = True):
    """Load an income CSV and return encoded features plus a binary target.

    The CSV is read with ``pd.read_csv``, the ``nativeCountry`` column is
    dropped, ``incomeTarget`` is encoded as 0 for the exact string
    ``' <=50K'`` (leading space included) and 1 for every other value, and
    the remaining columns are one-hot encoded with ``pd.get_dummies``.

    Args:
        filepath: Path or file-like object, passed straight to ``pd.read_csv``.
        upsample: When True (the default, which matches the previous
            behaviour) the classes are balanced with ``SMOTE(random_state=12)``.
            Pass False to get the encoded data without synthetic rows,
            e.g. for an evaluation split.

    Returns:
        A ``(X, y)`` tuple: the feature DataFrame and the 0/1 target.
    """
    target = 'incomeTarget'

    df = pd.read_csv(filepath)

    # Vectorized replacement for the former Series.iteritems() loop
    # (iteritems was removed in pandas 2.0). Anything that is not exactly
    # ' <=50K' -- including missing values -- maps to 1, as before.
    y = (df[target] != ' <=50K').astype(int)

    # Build the feature frame without mutating `df` in place.
    features = df.drop(columns=['nativeCountry', target])

    # NOTE(review): dummy columns are derived from each file independently,
    # so files with different category values yield different column sets.
    X = pd.get_dummies(features)

    if not upsample:
        return X, y

    # Upsample using SMOTE
    sm = SMOTE(random_state=12)
    X_train_sm, y_train_sm = sm.fit_resample(X, y)

    df_new = pd.DataFrame(X_train_sm, columns=X.columns)

    return df_new, y_train_sm


def scale_data(df: pd.DataFrame, scaler: StandardScaler, fit_scaler: bool = False):
    """Scale ``df`` with ``scaler`` and return the result as a DataFrame.

    Args:
        df: Frame to scale; its column labels are reused for the result.
        scaler: Scaler to apply. It must already be fitted unless
            ``fit_scaler`` is True.
        fit_scaler: When True, fit ``scaler`` on ``df`` before transforming.

    Returns:
        A new DataFrame holding the scaled values under ``df``'s columns.
    """
    if fit_scaler:
        X = scaler.fit_transform(df)
    else:
        # Only transform when no fit was requested. Previously this line ran
        # unconditionally, so the fitted case transformed the data twice and
        # discarded the first result.
        X = scaler.transform(df)
    return pd.DataFrame(X, columns=df.columns)


def preprocess_data(df: pd.DataFrame, dv: DictVectorizer, fit_dv: bool = False):
    """Vectorize the rows of ``df`` with a ``DictVectorizer``.

    Each row is converted to a dict (``orient='records'``) and passed to ``dv``.

    Args:
        df: Frame whose rows are vectorized.
        dv: Vectorizer to apply. It must already be fitted unless ``fit_dv``
            is True.
        fit_dv: When True, fit ``dv`` on the rows before transforming.

    Returns:
        A ``(X, dv)`` tuple: the vectorized matrix and the vectorizer that
        was passed in (fitted in place when ``fit_dv`` is True).
    """
    dicts = df.to_dict(orient='records')

    if fit_dv:
        X = dv.fit_transform(dicts)
    else:
        # Previously transform() ran unconditionally, repeating the work that
        # fit_transform() had just done and discarding its result.
        X = dv.transform(dicts)

    return X, dv

def test_model(name, stage, X_test, y_test):
    """Load a pyfunc model and score its predictions with F1.

    Args:
        name: Value handed directly to ``mlflow.pyfunc.load_model``.
        stage: Accepted for interface compatibility; not read by the body.
        X_test: Features passed to the model's ``predict``.
        y_test: True labels compared against the predictions.

    Returns:
        A dict with the single key ``"f1_score"``.
    """
    predictions = mlflow.pyfunc.load_model(name).predict(X_test)
    score = f1_score(y_test, predictions)
    return {"f1_score": score}

In [37]:
# Download the run's "preprocessor" artifact into the current directory; the
# call returns the local path it was written to.
client.download_artifacts(run_id=run_id, path='preprocessor', dst_path='.')

'/home/ovokpus/Income-Prediction-Pipeline/phase-02-experiment-tracking/preprocessor'

In [38]:
# Load the pickled preprocessor downloaded in the previous cell.
# NOTE: pickle.load can execute arbitrary code, so only load artifacts that
# come from trusted runs.
# NOTE(review): presumably a fitted DictVectorizer, judging by the name `dv`
# -- confirm against the training notebook.
with open("preprocessor/preprocessor.b", "rb") as f_in:
    dv = pickle.load(f_in)

In [39]:
# Load both splits through read_data. read_data one-hot encodes each file
# separately and applies SMOTE, so the test split is oversampled as well.
X_train, y_train = read_data("../data/adult-train.csv")
X_test, y_test = read_data("../data/adult-test.csv")

In [40]:
# Time one evaluation on the test split. `model_uri` is passed as `name`, so
# the model is loaded from the run URI; test_model does not use `stage`.
%time test_model(name=model_uri, stage="Staging", X_test=X_test, y_test=y_test)

CPU times: user 183 ms, sys: 0 ns, total: 183 ms
Wall time: 63.3 ms


{'f1_score': 0.6725941989699106}

In [41]:
# Promote the evaluated version to Production, using the shared
# `model_version` variable like the earlier registry cells rather than a
# hard-coded literal. archive_existing_versions=True asks the registry to
# archive versions already in the target stage.
# NOTE: the description written earlier still says the version was
# transitioned to Staging; call update_model_version again if it should
# reflect this promotion.
client.transition_model_version_stage(
    name=model_name,
    version=model_version,
    stage="Production",
    archive_existing_versions=True
)

<ModelVersion: creation_timestamp=1660184554770, current_stage='Production', description='The model version 2 was transitioned to Staging on 2022-08-10', last_updated_timestamp=1660184680918, name='model2', run_id='c04aab12c0ae4c5985025ebbe908ac75', run_link=None, source='./mlruns/5/c04aab12c0ae4c5985025ebbe908ac75/artifacts/models_mlflow', status='READY', status_message=None, tags={}, user_id=None, version=2>