In [3]:
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
import mlflow

np.random.seed(0)

Load data from https://www.openml.org/d/40945

In [4]:
X, y = fetch_openml("titanic", version=1, as_frame=True, return_X_y=True)

# Alternatively X and y can be obtained directly from the frame attribute:
# X = titanic.frame.drop('survived', axis=1)
# y = titanic.frame['survived']

Use ColumnTransformer by selecting column by names

We will train our classifier with the following features:

Numeric Features:

    age: float;

    fare: float.

Categorical Features:

    embarked: categories encoded as strings {'C', 'S', 'Q'};

    sex: categories encoded as strings {'female', 'male'};

    pclass: ordinal integers {1, 2, 3}.

We create the preprocessing pipelines for both numeric and categorical data. Note that pclass could either be treated as a categorical or numeric feature.

In [3]:
numeric_features = ["age", "fare"]
numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="mean")), ("scaler", StandardScaler())]
)

categorical_features = ["embarked", "sex", "pclass"]
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

Append classifier to preprocessing pipeline. Now we have a full prediction pipeline.

### Set a tracking server on localhost for the registry :      
       
       mlflow.set_tracking_uri("sqlite:///mlruns.db")

### Create an experiment (will be stored in mlruns directory  wherever you run the  code)      
       
       EXPERIMENT_ID = mlflow.create_experiment(EXPERIMENT_NAME)
       
       mlflow.set_experiment(EXPERIMENT_NAME)
For each different parameter of the model: 
    
    - train a different model (classifier):           
           clf.fit(X_train, y_train)
    
    - predict and get a score:        
           clf.score(X_test, y_test)
    
####         - start a run where to log parameters, metrics and model
           with mlflow.start_run(experiment_id=EXPERIMENT_ID, run_name=RUN_NAME)

In [7]:
params= [0.1, 1.0, 10, 100]

# Use sqlite:///mlruns.db as the local store for tracking and registery
mlflow.set_tracking_uri("sqlite:///mlruns.db")

#Set experiment as active if it exists or it will be created
EXPERIMENT_NAME = "titanic-exper1"
experiment = mlflow.set_experiment(EXPERIMENT_NAME)
EXPERIMENT_ID=experiment.experiment_id
        
for idx, param in enumerate(params):
    
    # train model with different parameters
    clf = Pipeline(
        steps=[("preprocessor", preprocessor), ("classifier", LogisticRegression(C=param))]
        )
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
    
    clf.fit(X_train, y_train)
    
    accuracy_score = clf.score(X_test, y_test)
    
    # logging runs with mlflow
    RUN_NAME=f"run_{idx}"
    with mlflow.start_run(experiment_id=EXPERIMENT_ID, run_name=RUN_NAME) as run:
        # Retrieve run id
        RUN_ID = run.info.run_id
        
        # mlflow log parameters
        mlflow.log_param("classifier_C",param)
        
        # mlflow log metrics
        mlflow.log_metric("pipeline_test_score",accuracy_score)
        
        # mlflow log model
        mlflow.sklearn.log_model(clf, "classifier_titanic")
        
        
        print("RUN_ID: ",RUN_ID," model score: %.3f" % accuracy_score)

2022/10/06 11:40:02 INFO mlflow.tracking.fluent: Experiment with name 'titanic-exper1' does not exist. Creating a new experiment.


RUN_ID:  52945918c0884114ae08126d7a36af0f  model score: 0.798
RUN_ID:  850db276b0d54c78b477aad9496c9aaf  model score: 0.790
RUN_ID:  196285b9a54541b28cde3541e8c489bd  model score: 0.790
RUN_ID:  bd3995237f7a48818a74e51f8cca0774  model score: 0.790


In [8]:
display(clf)

Using the prediction pipeline in a grid search

Grid search can also be performed on the different preprocessing steps defined in the ColumnTransformer object, together with the classifier’s hyperparameters as part of the Pipeline. We will search for both the imputer strategy of the numeric preprocessing and the regularization parameter of the logistic regression using GridSearchCV.

In [9]:
param_grid = {
    "preprocessor__num__imputer__strategy": ["mean", "median"],
    "classifier__C": [0.1, 1.0, 10, 100],
}

grid_search = GridSearchCV(clf, param_grid, cv=10)
grid_search

Calling ‘fit’ triggers the cross-validated search for the best hyper-parameters combination:

Calling autolog before fitting model

#### mlflow.autolog(log_model_signatures=False)

whithout signature because we'll get this:  

WARNING mlflow.sklearn: Failed to infer model signature: Unable to map 'object' type to MLflow DataType. object canbe mapped iff all values have identical data type which is one of (string, (bytes or byterray),  int, float).

our raw data contains rows with object as type, which is not supported by ModelSignatures, then it is processed by the pipeline

In [10]:
from mlflow import MlflowClient
    
EXPERIMENT_NAME = "titanic-autolog-exper1"

experiment = mlflow.set_experiment(EXPERIMENT_NAME)
EXPERIMENT_ID=experiment.experiment_id

RUN_NAME="run_Auto_1"
with mlflow.start_run(experiment_id=EXPERIMENT_ID,run_name=RUN_NAME) as run_auto:
    
    mlflow.sklearn.autolog(log_model_signatures=False)
        
    grid_search.fit(X_train, y_train)

print("Best params:  ",grid_search.best_params_)




2022/10/06 11:42:11 INFO mlflow.tracking.fluent: Experiment with name 'titanic-autolog-exper1' does not exist. Creating a new experiment.
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['age', 'fare']),
           ...`
2022/10/06 11:42:27 INFO mlflow.sklearn.utils: Logging the 5 best runs, 3 runs will be omitted.


Best params:   {'classifier__C': 0.1, 'preprocessor__num__imputer__strategy': 'mean'}


In [11]:
print(f"Internal CV score: {grid_search.best_score_:.3f}")

Internal CV score: 0.783


In [12]:
import pandas as pd

cv_results = pd.DataFrame(grid_search.cv_results_)
cv_results = cv_results.sort_values("mean_test_score", ascending=False)
cv_results[
    [
        "mean_test_score",
        "std_test_score",
        "param_preprocessor__num__imputer__strategy",
        "param_classifier__C",
    ]
]

Unnamed: 0,mean_test_score,std_test_score,param_preprocessor__num__imputer__strategy,param_classifier__C
0,0.783223,0.038166,mean,0.1
1,0.781319,0.038468,median,0.1
2,0.78131,0.032086,mean,1.0
4,0.780357,0.032223,mean,10.0
6,0.780357,0.032223,mean,100.0
5,0.779396,0.030362,median,10.0
7,0.779396,0.030362,median,100.0
3,0.778434,0.029904,median,1.0


The best hyper-parameters have be used to re-fit a final model on the full training set. We can evaluate that final model on held out test data that was not used for hyperparameter tuning.

In [13]:
print(
    (
        "best logistic regression from grid search: %.3f"
        % grid_search.score(X_test, y_test)
    )
)

best logistic regression from grid search: 0.798


### Create a registry and register a model version 

In [38]:
RUN_ID = run_auto.info.run_id
EXPERIMENT_ID = experiment.experiment_id

client = MlflowClient()
#client.create_registered_model("gridSerchCV-titanic-reg-model-3")
result = client.create_model_version(
    name="gridSerchCV-titanic-reg-model-3",
    source=f"mlruns/{EXPERIMENT_ID}/{RUN_ID}/artifacts/best_estimator",
    run_id="{}".format(run_auto.info.run_id)
)

2022/10/06 15:01:47 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: gridSerchCV-titanic-reg-model-2, version 3


### Transitioning an MLflow Model’s Stage 

In [34]:
client.transition_model_version_stage(
    name="gridSerchCV-titanic-reg-model-3",
    version=1,
    stage="Staging"
)

<ModelVersion: creation_timestamp=1665060356622, current_stage='Staging', description=None, last_updated_timestamp=1665060818421, name='gridSerchCV-titanic-reg-model-3', run_id='4d5d7473287e415d9d8a85869eea49a3', run_link=None, source='mlruns/2/4d5d7473287e415d9d8a85869eea49a3/artifacts/best_estimator', status='READY', status_message=None, tags={}, user_id=None, version=1>

### Deleting MLflow Models

In [37]:
versions=[1, 2, 3]
for version in versions:
    client.delete_model_version(name="gridSerchCV-titanic-reg-model-3", version=version)

# Delete a registered model along with all its versions
client.delete_registered_model(name="gridSerchCV-titanic-reg-model-3")

###   Fetch a specific model version

Fetch the latest model version in a specific stage: stage(Production, Staging ..) instead of model_version

In [7]:
#import mlflow.pyfunc
#mlflow.set_tracking_uri("sqlite:///mlruns.db")
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
#data = X_test

model_name = "gridSerchCV-titanic-reg-model-2"
model_version = 1

model = mlflow.pyfunc.load_model(
    model_uri=f"models:/{model_name}/{model_version}"
)


model.predict(data)

array(['0', '1', '0', '0', '0', '1', '0', '0', '0', '0', '0', '0', '0',
       '1', '1', '0', '1', '0', '1', '0', '0', '0', '0', '1', '0', '1',
       '0', '0', '0', '1', '1', '1', '1', '0', '1', '0', '0', '0', '0',
       '0', '0', '0', '0', '0', '1', '0', '0', '1', '0', '0', '1', '0',
       '0', '1', '0', '0', '1', '0', '1', '1', '0', '0', '1', '0', '1',
       '1', '0', '0', '0', '1', '1', '0', '0', '0', '1', '1', '0', '1',
       '1', '0', '0', '1', '0', '0', '1', '1', '0', '0', '0', '0', '0',
       '0', '0', '0', '0', '0', '0', '1', '0', '1', '0', '0', '0', '1',
       '0', '0', '1', '0', '0', '0', '0', '0', '0', '0', '1', '0', '0',
       '0', '0', '1', '0', '0', '0', '1', '0', '0', '0', '0', '0', '0',
       '0', '0', '0', '1', '0', '0', '0', '1', '0', '1', '0', '1', '0',
       '0', '0', '0', '1', '0', '1', '0', '1', '0', '0', '1', '0', '0',
       '1', '1', '0', '0', '0', '0', '0', '1', '0', '1', '0', '0', '0',
       '0', '0', '0', '1', '0', '1', '0', '0', '1', '1', '1', '1