**1. Start Local MLflow tracking server:** <br> `mlflow server --backend-store-uri sqlite:///mlflow.db --default-artifact-root ./mlruns`

The MLflow model registry does not work unless your metadata is stored in a SQL database

extra parameters: <br>
`--host 0.0.0.0 -p 5000 --gunicorn-opts "--timeout 180"`

`backend-store-uri` represents the location and type of database we want to use to store high level metadata associated with our runs. <br> `default-artifact-root` specifies a separate path where artifacts should be stored. A separate path is provided for artifacts because artifacts can be very large and therefore may need to be stored in a cloud-based data store such as S3 for some projects. 

**2. Visit MLflow UI url:** <br> `http://127.0.0.1:5000` <br>
There, only the Default experiment will exist.

**3. Set Tracking URI:** <br>
`mlflow.set_tracking_uri('http://127.0.0.1:5000')` <br>
A very important step to tell MLflow where the model tracking server is.

**4. Create experiment or use existing one**: <br>
`mlflow.set_experiment(_experiment_name)`

**5. Track things:** <br>
E.g. `mlflow.log_metrics(metrics)`

**6. Save model on MLflow Model Registry:** <br>
`model_uri = mlflow.get_artifact_uri("logistic_regression_model")` <br>
`MODEL_NAME = "logistic_regression_model"` <br>
`mv = mlflow.register_model(model_uri, MODEL_NAME)`

**7. Load a model and predict:** <br>
`mlflow.set_tracking_uri('http://127.0.0.1:5000')` <br>
`_model_name = "logistic_regression_model"` <br>
`version = "4"` <br>
`my_clf = mlflow.pyfunc.load_model(f"models:/{MODEL_NAME}/{version}")`

In [10]:
!pip install -U pyopenssl cryptography -q

[0m

In [1]:
import mlflow
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score

from sklearn.metrics import (
    accuracy_score,
    f1_score,
)

from feature_engine.encoding import (
    OrdinalEncoder,
    OneHotEncoder,
)

from feature_engine.transformation import (
    YeoJohnsonTransformer,
)

from sklearn.linear_model import LogisticRegression

pd.set_option('display.max_columns', 25)


* 'schema_extra' has been renamed to 'json_schema_extra'


In [2]:
data = pd.read_csv("../data/input_data/telco_customer_churn_1.csv")
data

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5995,2193-SFWQW,Male,0,Yes,Yes,72,Yes,No,Fiber optic,Yes,Yes,Yes,Yes,Yes,Yes,Two year,No,Bank transfer (automatic),111.95,8033.1,No
5996,5656-JAMLX,Male,0,No,No,62,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Bank transfer (automatic),19.85,1253.65,No
5997,3462-BJQQA,Female,0,No,No,6,Yes,Yes,Fiber optic,No,Yes,No,No,Yes,No,Month-to-month,Yes,Electronic check,89.75,552.65,No
5998,0442-TDYUO,Male,0,Yes,No,48,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,Yes,Mailed check,20.05,1036,No


In [3]:
# replace NaNs of TotalCharges with '-1' and covert col to float (from string)
data['TotalCharges'] = data['TotalCharges'].str.replace(' ', '-1').astype(float)

# Train-Test Split

In [4]:
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(['customerID', 'Churn'], axis=1),
    data['Churn'],
    test_size=0.2,
    random_state=0,
)

# Categorical Features

In [5]:
cat_vars_onehot = ['gender', 'Partner', 'Dependents', 'PhoneService', 'PaperlessBilling']
cat_vars_ordinal_arbitrary = ['MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup',
                    'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaymentMethod']

ordinal_encoder_arbitrary = OrdinalEncoder(encoding_method='arbitrary', variables=cat_vars_ordinal_arbitrary)
ordinal_encoder_arbitrary.fit(X_train, y_train)

onehot_encoder = OneHotEncoder(variables=cat_vars_onehot)
onehot_encoder.fit(X_train)

X_train = ordinal_encoder_arbitrary.transform(X_train)
X_test = ordinal_encoder_arbitrary.transform(X_test)

X_train = onehot_encoder.transform(X_train)
X_test = onehot_encoder.transform(X_test)

# Numerical Features

In [6]:
num_vars_yeo_johnson = ['TotalCharges']

yeo_transformer = YeoJohnsonTransformer(variables=num_vars_yeo_johnson)

X_train = yeo_transformer.fit_transform(X_train)
X_test = yeo_transformer.transform(X_test)

# Target

In [7]:
le = LabelEncoder()
le.fit(y_train)

y_train = le.transform(y_train)
y_test = le.transform(y_test)

# Scaling

In [8]:
min_max_scaler = MinMaxScaler()
min_max_scaler.fit(X_train) 

X_train = pd.DataFrame(min_max_scaler.transform(X_train), columns=X_train.columns)
X_test = pd.DataFrame(min_max_scaler.transform(X_test), columns=X_train.columns)

# ML

In [11]:
# MLflow: tell MLflow where the model tracking server is
mlflow.set_tracking_uri('http://host.docker.internal:5001')

# MLflow: experiment name
_experiment_name = "churn-prediction"
mlflow.set_experiment(_experiment_name)

with mlflow.start_run() as run:

    # MLflow: print run specific info
    print(f"\nActive run_id: {run.info.run_id}")

    # Choose parameters
    param_C = 0.8
    param_max_iter = 200
    clf = LogisticRegression(C=param_C, max_iter=param_max_iter, random_state=0)

    # Train on all set and evaluate on test
    clf.fit(X_train, y_train)
    y_test_pred = clf.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    test_f1 = f1_score(y_test, y_test_pred)

    # Cross validation on train set
    cv_accuracy = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy').mean()
    cv_f1 = cross_val_score(clf, X_train, y_train, cv=5, scoring='f1_macro').mean()

    print(f"CV accuracy: {cv_accuracy:.2f}, Test accuracy: {test_accuracy:.2f}\n"
          f"CV f1: {cv_f1:.2f}, Test f1: {test_f1:.2f}")

    # MLflow: Log the metrics
    metrics = {"cv_accuracy": cv_accuracy, "cv_f1": cv_f1, "test_accuracy": test_accuracy, "test_f1": test_f1}
    mlflow.log_metrics(metrics)

    # MLflow: Log the parameters
    params = {"C": param_C, "max_iter": param_max_iter}
    mlflow.log_params(params)

    # MLflow log the model
    mlflow.sklearn.log_model(clf, "logistic_regression_model")

    # MLflow: save model on MLflow Model Registry
    model_uri = mlflow.get_artifact_uri("logistic_regression_model")
    MODEL_NAME = "logistic_regression_model"
    mv = mlflow.register_model(model_uri, MODEL_NAME)


Active run_id: 64819e5e136c403698a8ee48a37f8991
CV accuracy: 0.80, Test accuracy: 0.79
CV f1: 0.72, Test f1: 0.56


Registered model 'logistic_regression_model' already exists. Creating a new version of this model...
2023/11/30 09:52:18 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: logistic_regression_model, version 3
Created version '3' of model 'logistic_regression_model'.


In [12]:
# Load a model and predict
mlflow.set_tracking_uri('http://host.docker.internal:5001')
_model_name = "logistic_regression_model"
version = "1"
my_clf = mlflow.pyfunc.load_model(f"models:/{MODEL_NAME}/{version}")
print(my_clf.predict(X_test))

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

[0 0 1 ... 0 0 0]
