In [27]:
import mlflow
from mlflow.tracking import MlflowClient
from mlflow.entities import ViewType
from datetime import datetime

from sklearn.utils import resample
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, log_loss
import pandas as pd
import numpy as np

import pickle
import os

MLFLOW_TRACKING_URI = "sqlite:///mlflow.db"
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

In [2]:
client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)

client.list_experiments()

[<Experiment: artifact_location='./mlruns/0', experiment_id='0', lifecycle_stage='active', name='Default', tags={}>,
 <Experiment: artifact_location='./mlruns/4', experiment_id='4', lifecycle_stage='active', name='prediction-001', tags={}>,
 <Experiment: artifact_location='./mlruns/5', experiment_id='5', lifecycle_stage='active', name='prediction-002', tags={}>,
 <Experiment: artifact_location='./mlruns/6', experiment_id='6', lifecycle_stage='active', name='prediction-003', tags={}>,
 <Experiment: artifact_location='./mlruns/7', experiment_id='7', lifecycle_stage='active', name='prediction-004', tags={}>,
 <Experiment: artifact_location='./mlruns/8', experiment_id='8', lifecycle_stage='active', name='prediction-005', tags={}>,
 <Experiment: artifact_location='./mlruns/9', experiment_id='9', lifecycle_stage='active', name='prediction-006', tags={}>,
 <Experiment: artifact_location='./mlruns/10', experiment_id='10', lifecycle_stage='active', name='prediction-007', tags={}>]

In [3]:
client.create_experiment(name="model-registry")

'11'

In [6]:
runs = client.search_runs(
    experiment_ids='10', # nyc_taxi_experiment
    filter_string="metrics.loss_metric < 0.16",
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=10,
    order_by=["metrics.loss_metric ASC"]
)

In [7]:
runs[0].info.run_id

'0a274d15c1e74c49be45d3a16c36f827'

In [8]:
for run in runs:
    print(f"run id: {run.info.run_id}, loss_metric: {run.data.metrics['loss_metric']:.4f}")

run id: 0a274d15c1e74c49be45d3a16c36f827, loss_metric: 0.1500
run id: 33493216395247d996488244771aa528, loss_metric: 0.1500
run id: 7f42d99424c042bcb7ba0263be10c868, loss_metric: 0.1502
run id: d9ee1514fc6a49ce92025acfbb7ce8c9, loss_metric: 0.1503
run id: 91d846959a3d45638de633390dcc90d9, loss_metric: 0.1503
run id: 89e02dd7dab84d179bde271a10c3b127, loss_metric: 0.1506
run id: 17d02df59ee34841851d98322d39df23, loss_metric: 0.1507
run id: ae336b5df8774658903458d5c93b26e9, loss_metric: 0.1507
run id: e006bcec05c74923b0bd453518367a18, loss_metric: 0.1507
run id: d342925c35db4bf48d8c3dda32778239, loss_metric: 0.1508


In [9]:
runs[0].data.params

{'colsample_bytree': '0.53',
 'gamma': '0.33',
 'learning_rate': '0.23',
 'max_depth': '5.9',
 'min_child_weight': '1.6',
 'n_estimators': '45',
 'subsample': '0.93',
 'seed': '42'}

In [10]:
run_id = "0a274d15c1e74c49be45d3a16c36f827"
model_uri = f"runs:/{run_id}/model"

In [11]:
mlflow.register_model(model_uri=model_uri, name="income-prediction")

Registered model 'income-prediction' already exists. Creating a new version of this model...
2022/08/09 22:16:18 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: income-prediction, version 2
Created version '2' of model 'income-prediction'.


<ModelVersion: creation_timestamp=1660104978404, current_stage='None', description=None, last_updated_timestamp=1660104978404, name='income-prediction', run_id='0a274d15c1e74c49be45d3a16c36f827', run_link=None, source='./mlruns/10/0a274d15c1e74c49be45d3a16c36f827/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=2>

In [12]:
model_uri

'runs:/0a274d15c1e74c49be45d3a16c36f827/model'

## Transitioning a model

In [13]:
model_name = "income-prediction"
latest_versions = client.get_latest_versions(name=model_name)

for version in latest_versions:
    print(f"version: {version.version}, stage: {version.current_stage}")

version: 1, stage: Staging
version: 2, stage: None


In [17]:
model_version = 2
new_stage = "Staging"
client.transition_model_version_stage(
    name=model_name,
    version=model_version,
    stage=new_stage,
    archive_existing_versions=False
)

<ModelVersion: creation_timestamp=1660104978404, current_stage='Staging', description=None, last_updated_timestamp=1660105109022, name='income-prediction', run_id='0a274d15c1e74c49be45d3a16c36f827', run_link=None, source='./mlruns/10/0a274d15c1e74c49be45d3a16c36f827/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=2>

In [18]:
model_version = 2
date = datetime.today().date()
client.update_model_version(
    name=model_name,
    version=model_version,
    description=f"The model version {model_version} was transitioned to {new_stage} on {date}"
)

<ModelVersion: creation_timestamp=1660104978404, current_stage='Staging', description='The model version 2 was transitioned to Staging on 2022-08-09', last_updated_timestamp=1660105123835, name='income-prediction', run_id='0a274d15c1e74c49be45d3a16c36f827', run_link=None, source='./mlruns/10/0a274d15c1e74c49be45d3a16c36f827/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=2>

In [19]:
def read_data(filepath):
    columns = ['age', 'workClass', 'financialWeight', 'education', 'educationNum', 'maritalStatus', 'occupation',
               'relationship', 'race', 'sex', 'capitalGain', 'capitalLoss', 'hoursPerWeek', 'nativeCountry', 'incomeTarget']

    target = 'incomeTarget'

    df = pd.read_csv(filepath, names=columns)
    transformed_target = []

    for _, value in df['incomeTarget'].iteritems():
        if value == ' <=50K':
            transformed_target.append(0)
        else:
            transformed_target.append(1)
    df['incomeTarget'] = transformed_target

    df.drop('nativeCountry', axis=1, inplace=True)

    y = df[target]
    X = df.drop('incomeTarget', axis=1, inplace=True)
    X = pd.get_dummies(df)

    # Upsampling
    X_upsampled, y_upsampled = resample(X[y == 1],
                                        y[y == 1],
                                        replace=True,
                                        n_samples=X[y == 0].shape[0],
                                        random_state=1)

    X_upsampled = np.concatenate((X[y == 0], X_upsampled))
    y_upsampled = np.concatenate((y[y == 0], y_upsampled))

    df_new = pd.DataFrame(X_upsampled, columns=X.columns)

    return df_new, y_upsampled


def scale_data(df: pd.DataFrame, scaler: StandardScaler, fit_scaler: bool = False):
    if fit_scaler:
        X = scaler.fit_transform(df)
    X = scaler.transform(df)
    return pd.DataFrame(X, columns=df.columns)


def preprocess_data(df: pd.DataFrame, dv: DictVectorizer, fit_dv: bool = False):
    dicts = df.to_dict(orient='records')

    if fit_dv:
        df = dv.fit_transform(dicts)
    df = dv.transform(dicts)

    return df, dv

def test_model(name, stage, X_test, y_test):
    model = mlflow.pyfunc.load_model(f"models:/{name}/{stage}")
    y_pred = model.predict(X_test)
    return {"auc": auc(y_test, y_pred)}

In [21]:
client.download_artifacts(run_id=run_id, path='preprocessor', dst_path='.')

'/home/ovokpus/Income-Prediction-Pipeline/phase-02-experiment-tracking/preprocessor'

In [23]:
with open("preprocessor/preprocessor.b", "rb") as f_in:
    dv = pickle.load(f_in)

In [28]:
X_train, y_train = read_data(os.path.join(raw_data_path, 'adult-train.csv'))
X_val, y_val = read_data(os.path.join(raw_data_path, 'adult-test.csv'))

NameError: name 'raw_data_path' is not defined