# MLflow: model registry

In [None]:
import pandas as pd
import sklearn.pipeline as pipeline
import sklearn.compose as compose
import sklearn.preprocessing as preprocessing
import sklearn.linear_model as linear_model
import sklearn.model_selection as model_selection 
import sklearn.metrics as metrics
import mlflow
import mlflow.tracking as tracking
import mlflow.sklearn
import numpy as np

In [None]:
# Name of the MLflow experiment that this notebook looks up, creates if
# missing, and activates before logging runs.
# NOTE(review): "registy" looks like a typo for "registry". It is left as-is
# because renaming it would point the notebook at a different experiment.
MLFLOW_EXPERIMENT = "Model registy demo"

## Read in data

In [None]:
# Load the processed training split into a DataFrame. The path is relative,
# so it resolves against the notebook's working directory.
df = pd.read_csv("../data/processed/train.csv")

Create features and target.

In [None]:
# Target: the raw "Attrition" column (binarised in a later cell).
y = df["Attrition"]
# Features: every remaining column.
X = df.drop(columns="Attrition")

## Preprocessing pipeline

In [None]:
# Partition feature columns by dtype: "object" columns are treated as
# categorical, everything else as numerical (the data only has these two kinds).
column_dtypes = list(X.dtypes.items())
categorical_cols = [name for name, dtype in column_dtypes if dtype == "object"]
numerical_cols = [name for name, dtype in column_dtypes if not dtype == "object"]

print(f"Categorical: {', '.join(categorical_cols)}")
print(f"Numerical: {', '.join(numerical_cols)}")

In [None]:
# Standardise numerical columns; one-hot encode categorical columns, dropping
# the first level of each category.
scaler = preprocessing.StandardScaler()
oh_encoder = preprocessing.OneHotEncoder(categories="auto", drop="first")

# Each entry is (name, transformer, columns). Columns that are not listed
# are dropped from the output.
column_transformers = [
    ("scaler", scaler, numerical_cols),
    ("one_hot_encode", oh_encoder, categorical_cols),
]
preprocess_pipeline = compose.ColumnTransformer(
    transformers=column_transformers,
    remainder="drop",
)

Fit the pipeline.

In [None]:
# Fit the ColumnTransformer on the training features and rebind X to the
# transformed output; from here on X is no longer the original DataFrame.
# NOTE(review): because X is overwritten, re-running this cell on its own
# would feed the already-transformed data back in — confirm before re-executing.
X = preprocess_pipeline.fit_transform(X)

Get column names.

In [None]:
# Recover the names of the one-hot encoded columns from the fitted encoder.
fitted_oh_encoder = preprocess_pipeline.named_transformers_["one_hot_encode"]

# OneHotEncoder.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; get_feature_names_out() is its replacement. Fall back to the
# legacy method only on old versions. Passing the original column names gives
# "<column>_<level>" names instead of the generic "x0_<level>" ones.
if hasattr(fitted_oh_encoder, "get_feature_names_out"):
    oh_encoded_categorical_colnames = fitted_oh_encoder.get_feature_names_out(
        categorical_cols
    ).tolist()
else:
    oh_encoded_categorical_colnames = fitted_oh_encoder.get_feature_names(
        categorical_cols
    ).tolist()

# Matches the ColumnTransformer output order: scaled numerical columns first,
# then the one-hot encoded categorical columns.
transformed_colnames = numerical_cols + oh_encoded_categorical_colnames

Make target binary.

In [None]:
def _attrition_to_int(label):
    """Return 1 for the "Yes" label and 0 for any other value."""
    return 1 if label == "Yes" else 0


# Encode the target as 0/1 integers.
y = y.apply(_attrition_to_int)

## Train a model

### Set up MLflow experiment

Check if an experiment already exists.

In [None]:
# Client for the tracking server / model registry; reused by later cells.
client = tracking.MlflowClient()

# Look the experiment up by name instead of listing all experiments:
# MlflowClient.list_experiments() was removed in MLflow 2.x, whereas
# get_experiment_by_name() is available across versions and returns None
# when no experiment with that name exists.
if client.get_experiment_by_name(MLFLOW_EXPERIMENT) is not None:
    print("Experiment already exists.")
else:
    print("Experiment does not exist. Creating it.")
    mlflow.create_experiment(MLFLOW_EXPERIMENT)

Set experiment.

In [None]:
# Make MLFLOW_EXPERIMENT the active experiment so that the runs started
# below with mlflow.start_run() are recorded under it.
mlflow.set_experiment(MLFLOW_EXPERIMENT)

### Train a model and save it

In [None]:
# Hyperparameters for this run.
alpha = 0.1
l1_ratio = 0.5

with mlflow.start_run():
    # Build the estimator and fit it on the full preprocessed training set.
    # NOTE(review): ElasticNet is a regressor, so the scores below are computed
    # on raw regression outputs rather than probabilities — confirm intended.
    model = linear_model.ElasticNet(alpha=alpha, l1_ratio=l1_ratio)
    model.fit(X, y)

    # 5-fold cross-validation with two scorers; results are returned under
    # the keys "test_roc_auc" and "test_log_loss".
    cv_scoring = {
        "roc_auc": metrics.make_scorer(metrics.roc_auc_score),
        "log_loss": metrics.make_scorer(metrics.log_loss),
    }
    scores = model_selection.cross_validate(model, X, y, cv=5, scoring=cv_scoring)

    # Record the hyperparameters.
    run_params = {"alpha": alpha, "l1_ratio": l1_ratio}
    mlflow.log_params(run_params)

    # Record the mean validation score across folds.
    mean_roc_auc = scores["test_roc_auc"].mean()
    mean_log_loss = scores["test_log_loss"].mean()
    mlflow.log_metrics({"val_roc_auc": mean_roc_auc, "val_log_loss": mean_log_loss})

    # Tag the run with the model type.
    mlflow.set_tags({"model": "ElasticNet"})

    # Store the fitted model and the preprocessing pipeline as run artifacts.
    mlflow.sklearn.log_model(model, artifact_path="model")
    mlflow.sklearn.log_model(preprocess_pipeline, artifact_path="preprocess_pipeline")

Now go to the MLflow UI and register the logged preprocessing pipeline and model from this run as `Pipeline` and `Model`, respectively.

## Load registered model and predict

Load test data.

In [None]:
# Read the processed test split and separate features from the target.
df_test = pd.read_csv("../data/processed/test.csv")
X_test = df_test.drop(columns="Attrition")

# Encode the target as 0/1 integers ("Yes" -> 1, anything else -> 0).
y_test = df_test["Attrition"].apply(lambda label: 1 if label == "Yes" else 0)

Load pipeline and model

In [None]:
# Remove the in-memory objects to show that the following cells really
# load them back from the registry.
del model
del preprocess_pipeline

In [None]:
# Address registered models with "models:/<name>/<version>" URIs.
# MlflowClient.get_model_version_details() no longer exists in current MLflow
# releases, and the registry URI scheme lets load_model() resolve the stored
# artifact location itself.
pipeline_uri = "models:/Pipeline/1"
preprocess_pipeline = mlflow.sklearn.load_model(pipeline_uri)

model_uri = "models:/Model/1"
model = mlflow.sklearn.load_model(model_uri)

Make predictions

In [None]:
# Apply the loaded preprocessing to the test features, then predict.
X_test_transformed = preprocess_pipeline.transform(X_test)
y_hat = model.predict(X_test_transformed)

# Report test-set scores.
test_roc_auc = metrics.roc_auc_score(y_test, y_hat)
print(f"ROC AUC:  {test_roc_auc:.3f}")
test_log_loss = metrics.log_loss(y_test, y_hat)
print(f"Log loss: {test_log_loss:.3f}")

## Train another model

In [None]:
# Hyperparameters for the second run (same alpha, different l1_ratio).
alpha = 0.1
l1_ratio = 0.3

with mlflow.start_run():
    # Build the estimator and fit it on the full preprocessed training set.
    # NOTE(review): ElasticNet is a regressor, so the scores below are computed
    # on raw regression outputs rather than probabilities — confirm intended.
    model = linear_model.ElasticNet(alpha=alpha, l1_ratio=l1_ratio)
    model.fit(X, y)

    # 5-fold cross-validation with two scorers; results are returned under
    # the keys "test_roc_auc" and "test_log_loss".
    cv_scoring = {
        "roc_auc": metrics.make_scorer(metrics.roc_auc_score),
        "log_loss": metrics.make_scorer(metrics.log_loss),
    }
    scores = model_selection.cross_validate(model, X, y, cv=5, scoring=cv_scoring)

    # Record the hyperparameters.
    run_params = {"alpha": alpha, "l1_ratio": l1_ratio}
    mlflow.log_params(run_params)

    # Record the mean validation score across folds.
    mean_roc_auc = scores["test_roc_auc"].mean()
    mean_log_loss = scores["test_log_loss"].mean()
    mlflow.log_metrics({"val_roc_auc": mean_roc_auc, "val_log_loss": mean_log_loss})

    # Tag the run with the model type.
    mlflow.set_tags({"model": "ElasticNet"})

    # Store the fitted model and the preprocessing pipeline as run artifacts.
    mlflow.sklearn.log_model(model, artifact_path="model")
    mlflow.sklearn.log_model(preprocess_pipeline, artifact_path="preprocess_pipeline")

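Go to the MLflow UI again and register the newly logged model as a new version of `Model` (the pipeline is unchanged, so it does not need to be registered again).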

## Load registered model and predict

Load pipeline and model

In [None]:
# Remove the in-memory objects to show that the following cells really
# load them back from the registry.
del model
del preprocess_pipeline

**Note that it is version 2 for the model now.**

In [None]:
# Address registered models with "models:/<name>/<version>" URIs.
# MlflowClient.get_model_version_details() no longer exists in current MLflow
# releases, and the registry URI scheme lets load_model() resolve the stored
# artifact location itself.
# The pipeline stays at version 1; only the model moves to version 2.
pipeline_uri = "models:/Pipeline/1"
preprocess_pipeline = mlflow.sklearn.load_model(pipeline_uri)

model_uri = "models:/Model/2"
model = mlflow.sklearn.load_model(model_uri)

Make predictions

In [None]:
# Apply the loaded preprocessing to the test features, then predict.
X_test_transformed = preprocess_pipeline.transform(X_test)
y_hat = model.predict(X_test_transformed)

# Report test-set scores.
test_roc_auc = metrics.roc_auc_score(y_test, y_hat)
print(f"ROC AUC:  {test_roc_auc:.3f}")
test_log_loss = metrics.log_loss(y_test, y_hat)
print(f"Log loss: {test_log_loss:.3f}")