### Employee Attrition using **CatBoost**

* L.S, ottobre 2021
* gestione delle **feature categoriche**
* importanza del **Learning rate**
* integrazione con **MLflow**
* salvataggio del modello nel **Model Catalog**
* uso di **Optuna**

In [1]:
import pandas as pd
import numpy as np

# for reading data from Object Storage
import ocifs
from ads import set_auth

# usero' catboost
import catboost as cat

# per la confusion matrix
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score

# grafici
import matplotlib.pyplot as plt
import seaborn as sns

# yes, we have optuna
import optuna

In [2]:
# Configure ADS to authenticate through the resource principal;
# this is what enables access to Object Storage in the cells below.
set_auth(auth="resource_principal")

In [3]:
# Notebook-wide constants (the utility functions follow right after).

# Random seed used when shuffling the rows before the train/test split.
SEED = 42

# Any truthy value turns on the additional diagnostic prints.
DEBUG = 0

# Figure size for plots -- presumably (width, height); TODO confirm where it is used.
FIGSIZE = (9, 6)

def plot_cm(model, x_test, y_test):
    """Plot the confusion matrix of ``model``'s predictions on a dataset.

    Args:
        model: fitted classifier exposing ``predict``.
        x_test: features passed to ``model.predict``.
        y_test: true labels the predictions are compared against.
    """
    predicted = model.predict(x_test)
    ConfusionMatrixDisplay(
        confusion_matrix=confusion_matrix(y_test, predicted)
    ).plot()
    
def compute_auc(model, x_test, y_test):
    """Return the ROC AUC of ``model`` on the given data, rounded to 4 decimals.

    The score is computed from the second column of ``predict_proba``
    (presumably the probability of class 1).
    """
    scores = model.predict_proba(x_test)[:, 1]
    return round(roc_auc_score(y_test, scores), 4)

def compute_prec_rec(model, x_test, y_test):
    """Return ``(precision, recall)`` of ``model``'s predicted labels.

    Both values are rounded to 4 decimals.
    """
    predicted = model.predict(x_test)
    recall = recall_score(y_test, predicted)
    precision = precision_score(y_test, predicted)
    return round(precision, 4), round(recall, 4)

def compute_accuracy(model, x_test, y_test):
    """Return the accuracy of ``model``'s predicted labels, rounded to 3 decimals."""
    return round(accuracy_score(y_test, model.predict(x_test)), 3)

def set_as_categorical(dataf, cat_columns):
    """Cast the listed columns of ``dataf`` to the pandas ``category`` dtype.

    The columns are reassigned on ``dataf`` itself, and the same frame is
    returned.

    Args:
        dataf: the DataFrame to update.
        cat_columns: iterable of column names to convert.
    """
    for col_name in cat_columns:
        if DEBUG:
            # extra trace, only when the DEBUG global is enabled
            print('Setting:', col_name)
        dataf[col_name] = dataf[col_name].astype('category')

    return dataf

#
# my split in train, test set
#
def my_train_test_split(df, frac, seed=None):
    """Shuffle ``df`` and split it into a train and a test DataFrame.

    Args:
        df: the DataFrame to split.
        frac: fraction of the rows assigned to the train set, in [0, 1].
        seed: random state used for the shuffle; when ``None`` the
            module-level ``SEED`` is used (the original behaviour).

    Returns:
        Tuple ``(data_train, data_test)``: the first ``int(frac * n)``
        shuffled rows and the remaining ones.

    Raises:
        ValueError: if ``frac`` is outside [0, 1]. Such values would
            otherwise produce a silently wrong split.
    """
    if not 0.0 <= frac <= 1.0:
        raise ValueError('frac must be in [0, 1], got {!r}'.format(frac))

    if seed is None:
        seed = SEED

    # shuffle before split
    shuffled = df.sample(frac=1.0, random_state=seed)

    tot_rec = shuffled.shape[0]
    num_train = int(frac * tot_rec)

    # explicit positional slicing
    data_train = shuffled.iloc[:num_train]
    data_test = shuffled.iloc[num_train:]

    print()
    print('Numero totale di campioni:', tot_rec)
    print('Numero di campioni nel TRAIN SET:', data_train.shape[0])
    print('Numero di campioni nel TEST SET:', data_test.shape[0])

    return data_train, data_test

# Read a CSV file from Object Storage and return it as a pandas DataFrame.
def read_from_object_storage(prefix, file_name):
    """Load ``prefix + file_name`` from Object Storage with ``pd.read_csv``.

    Args:
        prefix: leading part of the object path (concatenated as-is).
        file_name: name of the CSV object appended to ``prefix``.

    Returns:
        The DataFrame parsed from the CSV content.
    """
    # Object Storage exposed as a filesystem; per the original note, an
    # empty config assumes resource-principal authentication.
    object_store = ocifs.OCIFileSystem(config={})

    with object_store.open(prefix + file_name, 'rb') as handle:
        return pd.read_csv(handle)

In [4]:
# Location of the input file in Object Storage.
PREFIX = "oci://data_input@fr95jjtqbdhh/"
FILE_NAME = "orcl_attrition.csv"

# Load the full dataset with the helper defined among the utility functions.
data_orig = read_from_object_storage(PREFIX, FILE_NAME)

# Not every column is needed: this is the list of the columns that will be used.
my_columns = [
    'Age',
    'Attrition',
    'EnvironmentSatisfaction',
    'MaritalStatus',
    'TravelForWork',
    'SalaryLevel',
    'JobFunction',
    'CommuteLength',
    'EducationalLevel',
    'EducationField',
    'MonthlyIncome',
    'OverTime',
    'StockOptionLevel',
    'TrainingTimesLastYear',
    'YearsSinceLastPromotion',
    'WorkLifeBalance',
]

# Dataset restricted to the selected columns.
data = data_orig[my_columns]

data.head()

Unnamed: 0,Age,Attrition,EnvironmentSatisfaction,MaritalStatus,TravelForWork,SalaryLevel,JobFunction,CommuteLength,EducationalLevel,EducationField,MonthlyIncome,OverTime,StockOptionLevel,TrainingTimesLastYear,YearsSinceLastPromotion,WorkLifeBalance
0,42,Yes,2,Single,infrequent,5054,Product Management,2,L2,Life Sciences,5993,Yes,0,0,0,1
1,50,No,3,Married,often,1278,Software Developer,9,L1,Life Sciences,5130,No,1,3,1,3
2,38,Yes,4,Single,infrequent,6296,Software Developer,3,L2,Other,2090,Yes,0,3,0,3
3,34,No,4,Married,often,6384,Software Developer,4,L4,Life Sciences,2909,Yes,0,3,3,3
4,28,No,1,Married,infrequent,2710,Software Developer,3,L1,Medical,3468,No,1,3,2,3


In [5]:
# Kinds of features and column bookkeeping.
TARGET = 'Attrition'

# Derive the column lists automatically from the dataframe.
all_columns = sorted(data.columns)
features = sorted(set(all_columns) - {TARGET})

# Columns treated as categorical (to decide, look at the statistics in Notebook 1).
cat_columns = sorted([
    'Age',
    'CommuteLength',
    'EnvironmentSatisfaction',
    'MaritalStatus',
    'TravelForWork',
    'JobFunction',
    'EducationalLevel',
    'EducationField',
    'OverTime',
    'StockOptionLevel',
    'TrainingTimesLastYear',
    'YearsSinceLastPromotion',
    'WorkLifeBalance',
])

# Continuous numeric columns: everything that is neither categorical nor the target.
num_columns = sorted(set(all_columns) - set(cat_columns) - {TARGET})

print('Colonna Target:', TARGET)
print()
print('Tutte le features:', features, len(features))
print()
print('Colonne categorical:', cat_columns, len(cat_columns))
print()
print('Colonne numeriche:', num_columns, len(num_columns))

# TRAIN / TEST split; the rows are shuffled before splitting.
FRAC = 0.90

data_train, data_test = my_train_test_split(data, frac=FRAC)

Colonna Target: Attrition

Tutte le features: ['Age', 'CommuteLength', 'EducationField', 'EducationalLevel', 'EnvironmentSatisfaction', 'JobFunction', 'MaritalStatus', 'MonthlyIncome', 'OverTime', 'SalaryLevel', 'StockOptionLevel', 'TrainingTimesLastYear', 'TravelForWork', 'WorkLifeBalance', 'YearsSinceLastPromotion'] 15

Colonne categorical: ['Age', 'CommuteLength', 'EducationField', 'EducationalLevel', 'EnvironmentSatisfaction', 'JobFunction', 'MaritalStatus', 'OverTime', 'StockOptionLevel', 'TrainingTimesLastYear', 'TravelForWork', 'WorkLifeBalance', 'YearsSinceLastPromotion'] 13

Colonne numeriche: ['MonthlyIncome', 'SalaryLevel'] 2

Numero totale di campioni: 1470
Numero di campioni nel TRAIN SET: 1323
Numero di campioni nel TEST SET: 147


In [6]:
# Separate the features (X) from the target (y).
x_train, y_train = data_train[features], data_train[TARGET]
x_test, y_test = data_test[features], data_test[TARGET]

# Encode the labels as integers; the encoder is fitted on the train labels only.
le = LabelEncoder()
le.fit(y_train.values)

# Apply the same encoding to train and test.
y_train = le.transform(y_train.values)
y_test = le.transform(y_test.values)

# CatBoost is given the positional indexes of the categorical columns.
cat_columns_idxs = [
    idx for idx, name in enumerate(x_train.columns) if name in cat_columns
]

### Training

In [7]:
%%time

# Weight class 1 more heavily to address the data imbalance.
class_weights = {0: 1, 1: 5.5}

params = dict(
    iterations=1500,
    learning_rate=0.005,
    depth=10,
    class_weights=class_weights,
    use_best_model=True,
    l2_leaf_reg=50,
)

model = cat.CatBoostClassifier()
model.set_params(**params)

# NOTE(review): the test set is passed as eval_set, so early stopping and
# use_best_model select the iteration using test data -- metrics computed
# on the test set afterwards are likely optimistic.
model.fit(
    x_train,
    y_train,
    cat_features=cat_columns_idxs,
    verbose=False,
    early_stopping_rounds=100,
    eval_set=[(x_test, y_test)],
)

CPU times: user 4min 3s, sys: 52.4 s, total: 4min 56s
Wall time: 17.9 s


<catboost.core.CatBoostClassifier at 0x7f3b1bceb940>

In [8]:
# AUC on the train set first ...
auc = compute_auc(model=model, x_test=x_train, y_test=y_train)
print('AUC computed on the train set is:', auc)

# ... then on the test set (``auc`` ends up holding the test value).
auc = compute_auc(model=model, x_test=x_test, y_test=y_test)
print('AUC computed on the test set is:', auc)

AUC computed on the train set is: 0.9256
AUC computed on the test set is: 0.8133


### Evaluate on test set

In [None]:
# Confusion matrix on the test set.
plot_cm(model=model, x_test=x_test, y_test=y_test)

In [None]:
# Confusion matrix on the train set: there is some overfitting...
plot_cm(model=model, x_test=x_train, y_test=y_train)

In [None]:
# Precision and recall on the test set.
prec, rec = compute_prec_rec(model=model, x_test=x_test, y_test=y_test)
print(
    'precision and recall computed on the test set are:',
    'prec:', prec,
    'rec:', rec,
)

# Accuracy on the test set.
acc = compute_accuracy(model=model, x_test=x_test, y_test=y_test)
print('Accuracy on test set is:', acc)

### Facciamo una sessione di ottimizzazione con Optuna, variando depth e l2_leaf_reg

In [None]:
# NOTE(review): this list of candidate l2_leaf_reg values is not referenced
# in the visible cells (objective samples the value from a continuous
# range); kept in case another cell uses it.
list_l2 = [1e-1, 1, 3, 5, 7, 1e1, 3e1, 5e1, 1e2, 3e2, 5e2, 1e3]


def objective(trial):
    """Optuna objective: validation AUC of a CatBoost model for one trial.

    The trial samples ``depth`` (integer in 6..14) and ``l2_leaf_reg``
    (log-scaled float in 0.1..1e4). Early stopping, best-iteration
    selection and the returned score all use a validation split carved
    out of the training data, so the test set is not used for tuning and
    remains an unbiased hold-out for the final evaluation.

    Args:
        trial: the Optuna trial used to sample the hyper-parameters.

    Returns:
        The AUC on the validation split, as computed by ``compute_auc``.
    """
    # local import: only this function needs it
    from sklearn.model_selection import train_test_split

    # Fixed random_state: every trial is scored on the same validation rows.
    # Stratified so the minority class keeps the same share in both parts.
    x_fit, x_val, y_fit, y_val = train_test_split(
        x_train,
        y_train,
        test_size=0.2,
        stratify=y_train,
        random_state=SEED,
    )

    # to address data imbalance
    class_weights = {0: 1, 1: 5.5}

    params = {
        'iterations': 1500,
        'learning_rate': 0.005,
        'depth': trial.suggest_int('depth', 6, 14, step=1),
        'class_weights': class_weights,
        'use_best_model': True,
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-1, 1e4, log=True),
    }

    model = cat.CatBoostClassifier()
    model.set_params(**params)

    # the column order is unchanged by the split, so the categorical
    # indexes computed on x_train are still valid
    model.fit(
        x_fit,
        y_fit,
        cat_columns_idxs,
        verbose=False,
        early_stopping_rounds=100,
        eval_set=[(x_val, y_val)],
    )

    return compute_auc(model, x_val, y_val)

In [None]:
# Maximize the objective (AUC). The sampler is seeded with the notebook
# SEED so that re-running the search proposes the same sequence of trials.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=SEED),
)

In [None]:
# Run 100 trials of the objective function.
study.optimize(objective, n_trials=100)

In [None]:
# Optuna plotting helpers (only some of them are used in the cells shown here).
from optuna.visualization import (
    plot_contour,
    plot_edf,
    plot_intermediate_values,
    plot_optimization_history,
    plot_parallel_coordinate,
    plot_param_importances,
    plot_slice,
)

# History of the objective value across the trials of the study.
plot_optimization_history(study)

In [None]:
# Contour plot over the parameters sampled in the study.
plot_contour(study)

In [None]:
# Show the parameters of the best trial; they are reused for the final training below.
study.best_params

In [None]:
%%time

# Weight class 1 more heavily to address the data imbalance.
class_weights = {0: 1, 1: 5.5}

# Reuse the best depth and l2_leaf_reg found by the Optuna trials.
best = study.best_params
params = dict(
    iterations=1500,
    learning_rate=0.005,
    depth=best['depth'],
    class_weights=class_weights,
    use_best_model=True,
    l2_leaf_reg=best['l2_leaf_reg'],
)

model = cat.CatBoostClassifier()
model.set_params(**params)

# NOTE(review): the test set is passed as eval_set, so early stopping and
# use_best_model select the iteration using test data -- metrics computed
# on the test set afterwards are likely optimistic.
model.fit(
    x_train,
    y_train,
    cat_features=cat_columns_idxs,
    verbose=False,
    early_stopping_rounds=100,
    eval_set=[(x_test, y_test)],
)

In [None]:
# Confusion matrix of the tuned model on the test set.
plot_cm(model=model, x_test=x_test, y_test=y_test)

In [None]:
# OK: we see a clear improvement in true positives (TP from 15 -> 18, +20%)

In [None]:
# Confusion matrix of the tuned model on the train set.
plot_cm(model=model, x_test=x_train, y_test=y_train)

In [None]:
# I managed to mitigate the overfitting, getting a good result on the positives of the test set