**1. Start Local MLflow tracking server:** <br> `mlflow server --backend-store-uri sqlite:///mlflow.db --default-artifact-root ./mlruns`

The MLflow model registry does not work unless your metadata is stored in a SQL database

extra parameters: <br>
`--host 0.0.0.0 -p 5000 --gunicorn-opts "--timeout 180"`

`backend-store-uri` represents the location and type of database we want to use to store high level metadata associated with our runs. <br> `default-artifact-root` specifies a separate path where artifacts should be stored. A separate path is provided for artifacts because artifacts can be very large and therefore may need to be stored in a cloud-based data store such as S3 for some projects. 

**2. Visit MLflow UI url:** <br> `http://127.0.0.1:5000` <br>
There, only the Default experiment will exist.

**3. Set Tracking URI:** <br>
`mlflow.set_tracking_uri('http://127.0.0.1:5000')` <br>
A very important step to tell MLflow where the model tracking server is.

**4. Create experiment or use existing one**: <br>
`mlflow.set_experiment(_experiment_name)`

**5. Track things:** <br>
E.g. `mlflow.log_metrics(metrics)`

**6. Save model on MLflow Model Registry:** <br>
`model_uri = mlflow.get_artifact_uri("logistic_regression_model")` <br>
`MODEL_NAME = "logistic_regression_model"` <br>
`mv = mlflow.register_model(model_uri, MODEL_NAME)`

**7. Load a model and predict:** <br>
`mlflow.set_tracking_uri('http://127.0.0.1:5000')` <br>
`_model_name = "logistic_regression_model"` <br>
`version = "4"` <br>
`my_clf = mlflow.pyfunc.load_model(f"models:/{MODEL_NAME}/{version}")`

In [1]:
import mlflow
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score
from imblearn.over_sampling import SMOTE

from sklearn.metrics import (
    accuracy_score,
    f1_score,
    RocCurveDisplay
)

from feature_engine.encoding import (
    OrdinalEncoder,
    OneHotEncoder,
)

from feature_engine.transformation import (
    YeoJohnsonTransformer,
)

from sklearn.linear_model import LogisticRegression

pd.set_option('display.max_columns', 25)

import matplotlib.pyplot as plt

In [2]:
data = pd.read_csv("../data/input_data/telco_customer_churn_1.csv")
data.head(3)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes


In [3]:
# replace NaNs of TotalCharges with '-1' and covert col to float (from string)
data['TotalCharges'] = data['TotalCharges'].str.replace(' ', '-1').astype(float)

# Train-Test Split

In [4]:
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(['customerID', 'Churn'], axis=1),
    data['Churn'],
    test_size=0.2,
    random_state=0,
)

# Categorical Features

In [5]:
cat_vars_onehot = ['gender', 'Partner', 'Dependents', 'PhoneService', 'PaperlessBilling']
cat_vars_ordinal_arbitrary = ['MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup',
                    'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaymentMethod']

ordinal_encoder_arbitrary = OrdinalEncoder(encoding_method='arbitrary', variables=cat_vars_ordinal_arbitrary)
ordinal_encoder_arbitrary.fit(X_train, y_train)

onehot_encoder = OneHotEncoder(variables=cat_vars_onehot)
onehot_encoder.fit(X_train)

X_train = ordinal_encoder_arbitrary.transform(X_train)
X_test = ordinal_encoder_arbitrary.transform(X_test)

X_train = onehot_encoder.transform(X_train)
X_test = onehot_encoder.transform(X_test)

# Numerical Features

In [6]:
num_vars_yeo_johnson = ['TotalCharges']

yeo_transformer = YeoJohnsonTransformer(variables=num_vars_yeo_johnson)
yeo_transformer.fit(X_train)

X_train = yeo_transformer.transform(X_train)
X_test = yeo_transformer.transform(X_test)

# Target

In [7]:
le = LabelEncoder()
le.fit(y_train)

y_train = le.transform(y_train)
y_test = le.transform(y_test)

# Scaling

In [8]:
min_max_scaler = MinMaxScaler()
min_max_scaler.fit(X_train) 

X_train = pd.DataFrame(min_max_scaler.transform(X_train), columns=X_train.columns)
X_test = pd.DataFrame(min_max_scaler.transform(X_test), columns=X_train.columns)

# Oversampling with SMOTE

In [9]:
print(f"Size of X_train before oversampling: {X_train.shape[0]}")
X_train, y_train = SMOTE(random_state=0).fit_resample(X_train, y_train)
print(f"Size of X_train after oversampling: {X_train.shape[0]}")

Size of X_train before oversampling: 4800
Size of X_train after oversampling: 7050


# ML

In [10]:
# MLflow: model name
MODEL_NAME = "dev_logreg_model"

# MLflow: tell MLflow where the model tracking server is
# The following command should be a comment if run by docker-compose
# mlflow.set_tracking_uri('http://127.0.0.1:5000')

# MLflow: experiment name
_experiment_name = "dev-churn-prediction-1"
mlflow.set_experiment(_experiment_name)

2023/08/01 04:36:05 INFO mlflow.tracking.fluent: Experiment with name 'dev-churn-prediction-1' does not exist. Creating a new experiment.


<Experiment: artifact_location='s3://datapath-mlops-l3/1', creation_time=1690864565157, experiment_id='1', last_update_time=1690864565157, lifecycle_stage='active', name='dev-churn-prediction-1', tags={}>

In [11]:
with mlflow.start_run() as run:

    # MLflow: print run specific info
    run_id = run.info.run_id
    print(f"\nActive run_id: {run_id}")

    # Choose parameters
    param_C = 0.8
    param_max_iter = 200
    clf = LogisticRegression(C=param_C, max_iter=param_max_iter, random_state=0)

    # Train on all set and evaluate on test
    clf.fit(X_train, y_train)
    y_test_pred = clf.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    test_f1 = f1_score(y_test, y_test_pred)

    # Cross validation on train set
    cv_accuracy = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy').mean()
    cv_f1 = cross_val_score(clf, X_train, y_train, cv=5, scoring='f1_macro').mean()

    print(f"CV accuracy: {cv_accuracy:.2f}, Test accuracy: {test_accuracy:.2f}\n"
          f"CV f1: {cv_f1:.2f}, Test f1: {test_f1:.2f}")

    # MLflow: Log the metrics
    metrics = {"cv_accuracy": cv_accuracy, "cv_f1": cv_f1, "test_accuracy": test_accuracy, "test_f1": test_f1}
    mlflow.log_metrics(metrics)

    # MLflow: Log the parameters
    params = {"C": param_C, "max_iter": param_max_iter}
    mlflow.log_params(params)

    # MLflow log the model
    mlflow.sklearn.log_model(clf, MODEL_NAME)
    
    model_uri = mlflow.get_artifact_uri(MODEL_NAME)
    
    # MLflow: save model on MLflow Model Registry
    #mv = mlflow.register_model(model_uri, MODEL_NAME)


Active run_id: 1cf610fd1a88486e98101a50c62f0afc
CV accuracy: 0.78, Test accuracy: 0.76
CV f1: 0.78, Test f1: 0.61


In [12]:
model_uri

's3://datapath-mlops-l3/1/1cf610fd1a88486e98101a50c62f0afc/artifacts/dev_logreg_model'

In [13]:
mv = mlflow.register_model(model_uri, MODEL_NAME)

Successfully registered model 'dev_logreg_model'.
2023/08/01 04:36:14 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: dev_logreg_model, version 1
Created version '1' of model 'dev_logreg_model'.


In [14]:
# Save an image as artifact
RocCurveDisplay.from_predictions(
    y_test,
    y_test_pred,
    name="ROC Curve",
    color="darkorange",
)

plt.plot([0, 1], [0, 1], "k--", label="chance level (AUC = 0.5)")
plt.axis("square")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend()

plt.savefig("ROCcurve.png")
mlflow.log_artifact("ROCcurve.png")
plt.close()

## Load Model Registry

In [15]:
# Load a model and predict
# The following command should be a comment if run by docker-compose
# MODEL_NAME = "testing"
version = "latest"
my_clf = mlflow.pyfunc.load_model(f"models:/{MODEL_NAME}/{version}")

In [16]:
print(my_clf.predict(X_test))

[0 1 1 ... 0 0 0]


In [17]:
import os

In [18]:
if not os.path.exists('./data'):
    raise('No existe la ruta')

TypeError: exceptions must derive from BaseException