### This notebook shows integration with an MLflow server deployed on OCI (Oracle Cloud Infrastructure)

In [1]:
import os
import pandas as pd
import lightgbm as lgb
import mlflow

from sklearn.metrics import classification_report
from sklearn.metrics import (get_scorer, make_scorer, f1_score, roc_auc_score, accuracy_score, 
                             confusion_matrix, ConfusionMatrixDisplay)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder

from utils import (cat_cols_selector,
                  num_cols_selector,
                  plot_cm,
                  evaluate_metrics)

#### Set the configuration for MLflow

In [2]:
from config import (
    MLFLOW_TRACKING_URI,
    MLFLOW_TRACKING_USERNAME,
    MLFLOW_TRACKING_PASSWORD,
)

# Export the tracking-server settings (kept out of the notebook, in config.py)
# as environment variables of the current process.
os.environ.update(
    {
        'MLFLOW_TRACKING_URI': MLFLOW_TRACKING_URI,
        'MLFLOW_TRACKING_USERNAME': MLFLOW_TRACKING_USERNAME,
        'MLFLOW_TRACKING_PASSWORD': MLFLOW_TRACKING_PASSWORD,
    }
)

#### Train a model

In [3]:
# Coordinates of the dataset: namespace, bucket and name of the CSV file
NAMESPACE = "frqap2zhtzbe"
BUCKET = "WORKSHOP"
NOME_FILE = "customer_churn_data.csv"

# URL layout: oci://<bucket>@<namespace>/<file name>
URL = f"oci://{BUCKET}@{NAMESPACE}/{NOME_FILE}"

# Read the CSV straight from the URL into a DataFrame
orig_df = pd.read_csv(filepath_or_buffer=URL)

# Preview the first five rows
orig_df.head(n=5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [4]:
# Drop the columns that are left out of the modelling DataFrame
cols_to_drop = ['customerID', 'TotalCharges']

df = orig_df.drop(columns=cols_to_drop)

TARGET = "Churn"

# Every column except the target, in the DataFrame's own column order.
# Building this list through a set difference would yield an arbitrary order
# that can change between kernel restarts (string hashing is randomized per
# process), which hurts reproducibility wherever `features` is used.
features = [col for col in df.columns if col != TARGET]

In [5]:
# Identify the categorical and the continuous (numerical) columns with the
# helper selectors imported from utils.
# NOTE(review): TARGET is passed in, presumably so the helpers leave the
# target column out of both lists — confirm in utils.
cat_cols = cat_cols_selector(df, TARGET)
num_cols = num_cols_selector(df, TARGET)

In [6]:
# Encode the categorical columns as integer codes for LightGBM
enc = OrdinalEncoder().fit(df[cat_cols])

# Replace the original values with their ordinal codes
df[cat_cols] = enc.transform(df[cat_cols])

# Flag the encoded columns with the pandas "category" dtype
df = df.astype({col: "category" for col in cat_cols})

In [7]:
# Train/test split, done directly on the DataFrame so that the features and
# the target of each row stay together
SEED = 42
TEST_FRAC = 0.2

df_train, df_test = train_test_split(
    df,
    test_size=TEST_FRAC,
    random_state=SEED,
    shuffle=True,
)

# Separate the target column from the features in each partition
X_train = df_train.drop(columns=[TARGET])
y_train = df_train[TARGET]
X_test = df_test.drop(columns=[TARGET])
y_test = df_test[TARGET]

# Short report: partition sizes and the column lists by type
print()
print("Summary on train/test dataset:")
print()
print("# of samples in train set", len(df_train))
print("# of samples in test set", len(df_test))
print()
print(f'Numerical columns: {num_cols} ({len(num_cols)})')
print()
print(f'Categorical columns: {cat_cols} ({len(cat_cols)})')


Summary on train/test dataset:

# of samples in train set 5634
# of samples in test set 1409

Numerical columns: ['tenure', 'MonthlyCharges'] (2)

Categorical columns: ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod'] (16)


In [13]:
# LightGBM hyper-parameters; this same dict is later logged to MLflow
params = dict(
    learning_rate=0.055,
    max_depth=4,
    n_estimators=80,
    num_leaves=13,
)

model = lgb.LGBMClassifier(**params)

# Fit on the training partition (kept as the last expression of the cell)
model.fit(X_train, y_train)

#### Evaluate on test set

In [16]:
# Score the fitted model on the test partition with the helper from utils;
# the result is what gets passed to mlflow.log_metrics further down.
# NOTE(review): presumably a dict of metric name -> value — confirm in utils.
metrics = evaluate_metrics(model, X_test, y_test)

Validation set result:
{'accuracy': 0.8126, 'roc_auc': 0.8619, 'recall': 0.5496}


### MLflow

In [19]:
# First connection test: log one run (hyper-parameters + metrics) to the
# tracking server
EXP_NAME = "exp-workshop-marzo-002"
RUN_NAME = "run01"

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

# Select the experiment by name.
# NOTE(review): despite its name, exp_id holds whatever set_experiment()
# returns (an Experiment object in recent MLflow versions), not a bare id —
# confirm before using it as an id.
exp_id = mlflow.set_experiment(EXP_NAME)

# Start a run inside the experiment. The context manager ends the run when the
# block exits, so no explicit mlflow.end_run() call is needed.
with mlflow.start_run(run_name=RUN_NAME):
    # log the hyper-parameters being tracked
    mlflow.log_params(params)

    # log the evaluation metrics
    mlflow.log_metrics(metrics)

2023/03/23 09:54:56 INFO mlflow.tracking.fluent: Experiment with name 'exp-workshop-marzo-002' does not exist. Creating a new experiment.
