21/3/2021: modified to add callbacks (MLflow integration).

In [None]:
!pip install mlflow >> /dev/null

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time

import torch
import pytorch_tabnet

from pytorch_tabnet.tab_model import TabNetClassifier
from pytorch_tabnet.callbacks import Callback

import mlflow

from sklearn.preprocessing import LabelEncoder

import ads
import oci
import os
import hashlib

from ads.dataset.factory import DatasetFactory

from utils import get_hash_from_catalog

In [2]:
# setting security: using Resource Principal
# NOTE(review): presumably this has to run before any ADS call that reaches
# OCI services (Data Catalog, Object Storage) - confirm against the ADS docs.
ads.set_auth(auth='resource_principal')

In [3]:
# getting information from Data Catalog
# name of the dataset file; it is also used further down to build the
# Object Storage URI the dataset is read from
FILE_NAME = "cs-test.csv"

# md5 read from Catalog
# (get_hash_from_catalog is a project helper imported from utils; the value
# it returns is later compared with the hash computed on the downloaded file)
md5_cat = get_hash_from_catalog(FILE_NAME)

print('File hash from Data Catalog is:', md5_cat)

File hash from Data Catalog is: credit_scoring/tabnet2_optuna


In [None]:
# Get the dataset from Object Storage and verify its integrity: the MD5 hash
# of its CSV dump must match the hash registered in the Data Catalog.
BUCKET_NAME = "credit_scoring"
TMP_FILE = 'temp.csv'

ds = DatasetFactory.open(f"ocis://{BUCKET_NAME}/{FILE_NAME}")

print('The dataset contains:', ds.shape[0], 'records')

try:
    # dump to a tmp file: the hash is computed on the serialized CSV
    ds.to_csv(TMP_FILE, index=None)

    # the context manager guarantees the file handle is closed
    with open(TMP_FILE, 'rb') as tmp_fh:
        md5_computed = hashlib.md5(tmp_fh.read()).hexdigest()
finally:
    # never leave the temporary dump behind, even if dumping/hashing failed
    if os.path.exists(TMP_FILE):
        os.remove(TMP_FILE)

# MD5 hash expected see above
print()
print('MD5 hash of the file is: ', md5_computed)

# check with assertion (the message makes a mismatch easy to diagnose)
assert md5_computed == md5_cat, (
    f"MD5 mismatch for {FILE_NAME}: computed {md5_computed}, "
    f"Data Catalog has {md5_cat}"
)
print("MD5 hash check OK")

In [None]:
# moving to a pandas dataframe
# (df_orig is kept as the untouched reference; later cells work on a copy)
df_orig = ds.to_pandas_dataframe()

In [None]:
# preview the first rows of the original data
df_orig.head()

In [None]:
# list the available columns (the column lists defined below refer to these names)
df_orig.columns

In [None]:
# work on a copy: the encoding step below overwrites columns of `data`,
# while df_orig stays as loaded
data = df_orig.copy()

In [None]:
# Column roles used throughout the notebook.

# target column
PREDICTOR = 'SeriousDlqin2yrs'

# columns that are never fed to the model
unused_feat = ['id']

# features treated as numerical
num_col_list = [
    'RevolvingUtilizationOfUnsecuredLines',
    'DebtRatio',
    'MonthlyIncome',
]

# features treated as categorical (label-encoded before training)
cat_col_list = [
    'age',
    'NumberOfTime30-59DaysPastDueNotWorse',
    'NumberOfOpenCreditLinesAndLoans',
    'NumberOfTimes90DaysLate',
    'NumberRealEstateLoansOrLines',
    'NumberOfTime60-89DaysPastDueNotWorse',
    'NumberOfDependents',
]

# model input columns: numerical first, then categorical
all_col_list = [*num_col_list, *cat_col_list]

In [None]:
# Label-encode the categorical features in place on `data`.
nunique = data.nunique()
types = data.dtypes

categorical_columns = cat_col_list

# column name -> number of distinct classes seen by its encoder
categorical_dims = {}

# fitted encoders, in the same order as cat_col_list
# (kept so the test set can be processed with the same mapping)
enc_list = []

for col in cat_col_list:
    print(col, data[col].nunique())

    encoder = LabelEncoder()
    encoded_values = encoder.fit_transform(data[col].values)
    data[col] = encoded_values

    categorical_dims[col] = len(encoder.classes_)
    enc_list.append(encoder)

In [None]:
# Split data in train / validation sets.
FRAC = 0.8          # fraction of the records used for training
SPLIT_SEED = 42     # fixed seed: Restart & Run All yields the same split

N_TRAIN = int(data.shape[0] * FRAC)
N_VALID = data.shape[0] - N_TRAIN

# before splitting, shuffle (seeded, so it is deterministic for a given input order)
data = data.sample(frac=1, random_state=SPLIT_SEED)

# .iloc makes it explicit that the split is positional, not label-based
df_train = data.iloc[:N_TRAIN]
df_valid = data.iloc[N_TRAIN:]

# sanity check: the two parts cover the whole dataset exactly once
assert len(df_train) == N_TRAIN and len(df_valid) == N_VALID

print('Number of records in train dataset:', N_TRAIN)
print('Number of records in validation dataset:', N_VALID)

In [None]:
# Separate the target from the model inputs for both splits.
# Labels must be extracted first: the frames are then narrowed to the
# feature columns only, which drops the target column.
label_train, label_valid = (
    split[PREDICTOR].values for split in (df_train, df_valid)
)

df_train, df_valid = df_train[all_col_list], df_valid[all_col_list]

In [None]:
# Feature names in the column order of df_train, then - in a single pass -
# the position and cardinality of each categorical feature.
excluded_cols = unused_feat + [PREDICTOR]
features = [name for name in df_train.columns if name not in excluded_cols]

cat_idxs = []
cat_dims = []
for position, feat_name in enumerate(features):
    if feat_name in categorical_columns:
        cat_idxs.append(position)
        cat_dims.append(categorical_dims[feat_name])

In [None]:
# positions of the categorical features inside `features` (passed to TabNet as cat_idxs)
print('Index of cat colums:', cat_idxs)

In [None]:
# number of classes per categorical feature, aligned with cat_idxs (passed to TabNet as cat_dims)
print('Cardinality of cat columns:', cat_dims)

In [None]:
# MLFLOW, configuration, for the callback

# Tracking server URI: taken from the MLFLOW_TRACKING_URI environment variable
# when it is set, so the notebook can be pointed at another server without
# editing the code; otherwise the original hard-coded endpoint is used.
MLF_TRACK_URI = os.environ.get('MLFLOW_TRACKING_URI', 'http://130.61.20.111:5000')

# experiment and run names under which the training is registered
MLF_EXP_NAME = 'tabnet66gpu'
MLF_RUN_NAME = 'tabnet66gpu-1'

In [None]:
# Hyper-parameters: the single source of truth for the training cell,
# and the values registered on MLflow by the callback.
params = dict(
    epochs=80,
    batch_size=2048,
    n_steps=1,
    n_d=36,
    cat_emb_dim=1,
)

In [None]:
# callback for MLFlow integration
class MLCallback(Callback):
    """Training callback that mirrors a fit to an MLflow run.

    On train begin it selects the tracking server and experiment, opens a run
    and logs the hyper-parameters; after every epoch it logs the tracked
    metrics; on train end it closes the run.

    Configuration is read from the module-level names ``MLF_TRACK_URI``,
    ``MLF_EXP_NAME``, ``MLF_RUN_NAME`` and ``params``.
    """

    # keys of the per-epoch logs that are forwarded to MLflow
    TRACKED_METRICS = ("train_auc", "valid_auc", "loss")

    def on_train_begin(self, logs=None):
        """Open the MLflow run and register the hyper-parameters."""
        mlflow.set_tracking_uri(MLF_TRACK_URI)
        mlflow.set_experiment(MLF_EXP_NAME)

        # A fit interrupted before on_train_end leaves its run active, and
        # start_run refuses to open a second (non-nested) run: close the
        # stale one first so the cell can simply be re-run.
        if mlflow.active_run() is not None:
            mlflow.end_run(status="KILLED")

        print('Train begin...')
        mlflow.start_run(run_name=MLF_RUN_NAME)

        mlflow.log_params(params)

    def on_train_end(self, logs=None):
        """Close the MLflow run opened in on_train_begin."""
        mlflow.end_run()
        print('Train end...')

    def on_epoch_end(self, epoch, logs=None):
        """Send the tracked metrics of this epoch to MLflow.

        Raises KeyError if one of TRACKED_METRICS is missing from ``logs``
        (e.g. eval names/metrics that do not produce these keys).
        """
        epoch_metrics = {name: logs[name] for name in self.TRACKED_METRICS}

        # step=epoch puts the metrics on an epoch axis in the MLflow UI;
        # without it every value is recorded at step 0
        mlflow.log_metrics(epoch_metrics, step=epoch)

In [None]:
# Build and train the TabNet classifier.

# hyper-parameters come from `params`, the same dict that is logged to MLflow
EPOCHS, BATCH_SIZE, N_STEPS, CAT_EMB_DIM, N_D = (
    params[key] for key in ('epochs', 'batch_size', 'n_steps', 'cat_emb_dim', 'n_d')
)
# attention width is tied to the decision width
N_A = N_D

# callback for MLFlow integration
mlcbck = MLCallback()

model_kwargs = dict(
    # categorical features: positions, cardinalities, embedding size
    cat_idxs=cat_idxs,
    cat_dims=cat_dims,
    cat_emb_dim=CAT_EMB_DIM,
    # optimizer
    optimizer_fn=torch.optim.Adam,
    optimizer_params={"lr": 1e-2},
    # learning rate scheduler
    scheduler_fn=torch.optim.lr_scheduler.StepLR,
    scheduler_params=dict(step_size=30, gamma=0.9),
    # architecture
    mask_type='sparsemax',
    n_steps=N_STEPS,
    n_d=N_D,
    n_a=N_A,
)
clf = TabNetClassifier(**model_kwargs)

X_train = df_train.values
X_valid = df_valid.values

# both splits are evaluated every epoch; the names given in eval_name
# produce the 'train_auc' / 'valid_auc' entries used by the callback and plots
clf.fit(
    X_train, label_train,
    eval_set=[(X_train, label_train), (X_valid, label_valid)],
    eval_name=['train', 'valid'],
    eval_metric=['auc'],
    max_epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    patience=15,
    callbacks=[mlcbck],
)

In [None]:
# AUC per epoch, training vs validation, read from the fit history
fig, ax = plt.subplots(figsize=(10, 6))
ax.set_title('AUC')
for history_key, curve_label in (('train_auc', 'Training AUC'),
                                 ('valid_auc', 'Validation AUC')):
    ax.plot(clf.history[history_key], label=curve_label)
ax.legend(loc='lower right')
ax.set_ylabel('AUC')
ax.set_xlabel('epoch')
ax.grid(True)
plt.show();

### Model interpretability

In [None]:
# Feature importance of the trained model, one bar per input column
# (all_col_list is the column order the model was trained on)
fig, ax = plt.subplots(figsize=(10, 6))
plt.xticks(rotation=90)
ax.set_title('Feature importance')
ax.bar(x=all_col_list, height=clf.feature_importances_)
ax.grid(True);