## Explore Errors Mitigations

In [5]:
import sys
sys.path.append('../../../ResponsibleAIToolBox-Mitigation/')
from errorsmitigation.dataprocessing import DataSplit, DataTransformer, DataRebalance, DataSample

%matplotlib inline
import matplotlib
from matplotlib import pyplot as plt
from matplotlib import cm

import pandas as pd
import numpy as np
from lightgbm import LGBMClassifier

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report


from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, scale, MinMaxScaler, PowerTransformer


from imblearn.under_sampling import TomekLinks
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"


### Load dataset

In [6]:
# Shared random seed, reused by the split / transform / rebalance cells below.
seed = 42

# Read the HR promotion training file relative to this notebook.
data_dir = '../../datasets/hr_promotion'
dataset = pd.read_csv(f'{data_dir}/train.csv')

# Quick look at the size and the first rows of the raw frame.
dataset.shape
dataset.head()

(54808, 14)

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
0,65438,Sales & Marketing,region_7,Master's & above,f,sourcing,1,35,5.0,8,1,0,49,0
1,65141,Operations,region_22,Bachelor's,m,other,1,30,5.0,4,0,0,60,0
2,7513,Sales & Marketing,region_19,Bachelor's,m,sourcing,1,34,3.0,7,0,0,50,0
3,2542,Sales & Marketing,region_23,Bachelor's,m,other,2,39,1.0,10,0,0,50,0
4,48945,Technology,region_26,Bachelor's,m,other,1,45,3.0,2,0,0,73,0


### Random Sample

In [7]:
# Draw a random sample of the full dataset with the project's DataSample helper.
dataset_target = dataset.columns.get_loc('is_promoted')  # integer position of the label column

# Positional arguments: the frame, the label column index, then 0.8 -- the output of this
# cell has 43846 of the 54808 input rows, i.e. an 80% sample.
# NOTE(review): the meaning of the four trailing booleans is not visible from this notebook --
# confirm against the DataSample signature. No seed is passed in this call.
data_sample =  DataSample(dataset, dataset_target, 0.8, False, False, False, True)
random_sample = data_sample.RandomSample()

random_sample.shape
random_sample.head()

(43846, 14)

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
15839,37222,Sales & Marketing,region_32,Bachelor's,m,other,1,42,3.0,4,0,0,45,0
22951,36465,Technology,region_16,Bachelor's,m,sourcing,1,26,3.0,3,0,0,78,0
40696,42119,Technology,region_7,Bachelor's,m,other,1,29,3.329256,1,0,0,85,0
12167,8727,Analytics,region_29,Bachelor's,m,other,1,26,3.0,4,0,0,85,0
30247,41572,Sales & Marketing,region_34,Bachelor's,m,other,1,32,3.0,4,0,0,50,0


### Split Dataset 

In [8]:
# Split the random sample into train and test frames with the project's DataSplit helper.
#
# DataSplit parameters (as described by the toolbox):
# dataset - pandas DataFrame to split.
# target - A string with the name of the label column, or the label column's integer index (zero-based).
# train_size - The training split size. The default is 0.9, which splits the dataset into 90% training and 10% testing.
#     Training and test split sizes add up to 1.
# random_state - Controls the randomization of the algorithm.
#     'None': the random number generator is the RandomState instance used by np.random.
# categorical_features - A Boolean flag that indicates the presence of categorical features. Default is True.
# drop_null - If set to True, records with null values are dropped; otherwise they are replaced by the mean.
#     Default is True.
# drop_duplicates - If set to True, duplicate records are dropped. Default is False.
# stratify - If not None, data is split in a stratified fashion, using this as the class labels. Default is False.

random_sample_target = random_sample.columns.get_loc('is_promoted')
# The arguments are positional. If they follow the order documented above, this call means
# train_size=0.9, random_state=seed, categorical_features=True, drop_null=False,
# drop_duplicates=False, stratify=True -- TODO confirm against the DataSplit signature.
data_split =  DataSplit(random_sample, random_sample_target, 0.9, seed, True, False, False, True)

train_data, test_data = data_split.Split()


print(train_data.shape)
print(test_data.shape) 

train_data.head()

(39461, 65)
(4385, 65)


Unnamed: 0,employee_id,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted,department_Analytics,...,education_Below Secondary,education_Master's & above,education_nan,gender_f,gender_m,gender_nan,recruitment_channel_other,recruitment_channel_referred,recruitment_channel_sourcing,recruitment_channel_nan
10003,55305,1,29,1.0,2,1,0,62,0,0,...,0,0,0,0,1,0,0,0,1,0
24612,14432,1,37,5.0,10,1,0,60,0,0,...,0,0,0,0,1,0,1,0,0,0
54547,74825,1,26,3.0,5,0,0,60,0,0,...,0,0,0,1,0,0,0,0,1,0
3576,60565,1,38,5.0,9,0,0,83,1,0,...,0,1,0,0,1,0,0,0,1,0
30393,54030,1,38,2.0,9,0,0,77,0,0,...,0,1,0,1,0,0,1,0,0,0


### Transform Dataset

In [9]:
# Data Transformation
#
# Standardize the train and test frames with the project's DataTransformer helper.
#
# DataTransformer parameters, in the order listed by the toolbox (with defaults):
# dataset,
# target,
# transformer_type,
# transform_features = None,
# random_state = None,
# method = 'yeo-johnson',
# output_distribution = 'uniform'
#
# DataTransformer.TransformerType values:
# StandardScaler = 1
# MinMaxScaler = 2
# RobustScaler = 3
# PowerTransformer = 4
# QuantileTransformer = 5
# Normalizer = 6

train_data_label = train_data.columns.get_loc('is_promoted')

dt_train = DataTransformer(train_data, train_data_label, DataTransformer.TransformerType.StandardScaler, None, seed)
train_data_t = dt_train.Transform()

# Look the label position up on the test frame itself instead of reusing the index computed
# from train_data, so the call stays correct even if the two frames ever differ in column order.
# NOTE(review): a separate DataTransformer is built for the test frame; presumably Transform()
# fits the scaler on the frame it is given, so train and test would be scaled with different
# statistics -- confirm against the DataTransformer implementation.
test_data_label = test_data.columns.get_loc('is_promoted')
dt_test = DataTransformer(test_data, test_data_label, DataTransformer.TransformerType.StandardScaler, None, seed)
test_data_t = dt_test.Transform()

# Show the frame before and after the transform. In the output of this cell the transformed
# frame has integer column labels with the 0.0/1.0 label values in column 0.
train_data.head()
train_data_t.head()


Unnamed: 0,employee_id,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted,department_Analytics,...,education_Below Secondary,education_Master's & above,education_nan,gender_f,gender_m,gender_nan,recruitment_channel_other,recruitment_channel_referred,recruitment_channel_sourcing,recruitment_channel_nan
10003,55305,1,29,1.0,2,1,0,62,0,0,...,0,0,0,0,1,0,0,0,1,0
24612,14432,1,37,5.0,10,1,0,60,0,0,...,0,0,0,0,1,0,1,0,0,0
54547,74825,1,26,3.0,5,0,0,60,0,0,...,0,0,0,1,0,0,0,0,1,0
3576,60565,1,38,5.0,9,0,0,83,1,0,...,0,1,0,0,1,0,0,0,1,0
30393,54030,1,38,2.0,9,0,0,77,0,0,...,0,1,0,1,0,0,1,0,0,0


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,55,56,57,58,59,60,61,62,63,64
0,0.0,0.707294,-0.415962,-0.754239,-1.919123,-0.901636,1.358028,-0.151553,-0.098335,-0.326766,...,-0.121279,-0.610335,-0.215934,-0.65401,0.65401,0.0,-1.118506,-0.145129,1.166479,0.0
1,0.0,-1.102848,-0.415962,0.289597,1.379982,0.966813,1.358028,-0.151553,-0.24837,-0.326766,...,-0.121279,-0.610335,-0.215934,-0.65401,0.65401,0.0,0.89405,-0.145129,-0.857281,0.0
2,0.0,1.571776,-0.415962,-1.145678,-0.26957,-0.200968,-0.736362,-0.151553,-0.24837,-0.326766,...,-0.121279,-0.610335,-0.215934,1.529029,-1.529029,0.0,-1.118506,-0.145129,1.166479,0.0
3,1.0,0.940244,-0.415962,0.420076,1.379982,0.733257,-0.736362,-0.151553,1.477024,-0.326766,...,-0.121279,1.638445,-0.215934,-0.65401,0.65401,0.0,-1.118506,-0.145129,1.166479,0.0
4,0.0,0.650828,-0.415962,0.420076,-1.094347,0.733257,-0.736362,-0.151553,1.026921,-0.326766,...,-0.121279,1.638445,-0.215934,1.529029,-1.529029,0.0,0.89405,-0.145129,-0.857281,0.0


### Accuracy Results

In [10]:
def split_label_index(dataset):
    """Split a frame whose label is stored in the column labelled ``0``.

    Used in this notebook for frames returned by ``DataTransformer.Transform()``,
    which are split on column ``0`` rather than on a column name.

    Returns a ``(x, y)`` tuple: the frame without column ``0``, and column ``0`` itself.
    The input frame is not modified.
    """
    features = dataset.drop(columns=[0])
    labels = dataset[0]
    return features, labels

def split_label(dataset):
    """Split a frame into features and the ``is_promoted`` label column.

    Returns a ``(x, y)`` tuple: the frame without the ``is_promoted`` column, and the
    ``is_promoted`` column itself. The input frame is not modified.
    """
    features = dataset.drop(columns=['is_promoted'])
    labels = dataset['is_promoted']
    return features, labels

# splitting the training data (the transformed frame is split on column 0, see split_label_index)
x_train, y_train = split_label_index(train_data_t)

# splitting the test data
x_test, y_test = split_label_index(test_data_t)

# LGBMClassifier Model
# Baseline classifier with 50 estimators; its predictions feed the "No Mitigation" row of the
# comparison tables later in the notebook.
clf = LGBMClassifier(n_estimators=50)
model = clf.fit(x_train, y_train)

# Hard class predictions for the test split.
pred = model.predict(x_test)

def conf_matrix(y, pred):
    """Build a labelled 2x2 confusion-matrix table for a binary classifier.

    Each cell shows the raw count followed by the rate within its actual class
    (``normalize='true'`` normalizes over the true labels, i.e. per row).

    y: true labels.
    pred: predicted labels.

    Returns a DataFrame with rows ``'True'`` / ``'False'`` (actual class) and
    columns ``'Pred 1'`` / ``'Pred 0'`` (predicted class).
    """
    counts = metrics.confusion_matrix(y, pred)
    rates = metrics.confusion_matrix(y, pred, normalize='true')

    # Flattened row-major order of a binary confusion matrix is tn, fp, fn, tp.
    tn, fp, fn, tp = counts.ravel()
    tnr, fpr, fnr, tpr = rates.ravel()

    cells = {
        'Pred 1': [f'TP = {tp} ({tpr:1.2%})', f'FP = {fp} ({fpr:1.2%})'],
        'Pred 0': [f'FN = {fn} ({fnr:1.2%})', f'TN = {tn} ({tnr:1.2%})'],
    }
    return pd.DataFrame(cells, index=['True', 'False'])

# Number of misclassified test rows, counted with a vectorised comparison.
print(f"number of errors on test dataset: {(pred != y_test).sum()}")

# Labelled confusion matrix (displayed as cell output).
conf_matrix(y_test, pred)

# Per-class precision / recall / F1 summary.
print(classification_report(y_test, pred))

number of errors on test dataset: 258


Unnamed: 0,Pred 1,Pred 0
True,TP = 126 (33.78%),FN = 247 (66.22%)
False,FP = 11 (0.27%),TN = 4001 (99.73%)


              precision    recall  f1-score   support

         0.0       0.94      1.00      0.97      4012
         1.0       0.92      0.34      0.49       373

    accuracy                           0.94      4385
   macro avg       0.93      0.67      0.73      4385
weighted avg       0.94      0.94      0.93      4385



### Mitigations (Nulls, Duplicates)

In [11]:
# Mitigation: split again with the null / duplicate flags switched on, then train and
# evaluate the same model configuration as the baseline.
#
# split_label, split_label_index and conf_matrix are reused from the first
# "Accuracy Results" cell above; identical copies used to be redefined here and silently
# shadowed the earlier definitions.

# Same call as the baseline split except that the 6th and 7th positional arguments are now
# True. If the arguments follow the order documented in the "Split Dataset" cell these are
# drop_null and drop_duplicates -- TODO confirm against the DataSplit signature.
data_split2 = DataSplit(random_sample, random_sample.columns.get_loc('is_promoted'), 0.9, seed, True, True, True, True)
train_data2, test_data2 = data_split2.Split()

# Standardize each split, looking the label position up on the frame being transformed.
dt_train2 = DataTransformer(train_data2, train_data2.columns.get_loc('is_promoted'), DataTransformer.TransformerType.StandardScaler, None, seed)
train_data_t2 = dt_train2.Transform()

dt_test2 = DataTransformer(test_data2, test_data2.columns.get_loc('is_promoted'), DataTransformer.TransformerType.StandardScaler, None, seed)
test_data_t2 = dt_test2.Transform()

# The transformed frames are split on column 0, as in the baseline cell.
x_train2, y_train2 = split_label_index(train_data_t2)
x_test2, y_test2 = split_label_index(test_data_t2)

clf2 = LGBMClassifier(n_estimators=50)
model2 = clf2.fit(x_train2, y_train2)
pred2 = model2.predict(x_test2)

print("number of errors on test dataset: " + str(sum(pred2 != y_test2)))
conf_matrix(y_test2, pred2)
print(classification_report(y_test2, pred2))

number of errors on test dataset: 240


Unnamed: 0,Pred 1,Pred 0
True,TP = 130 (35.71%),FN = 234 (64.29%)
False,FP = 6 (0.16%),TN = 3820 (99.84%)


              precision    recall  f1-score   support

         0.0       0.94      1.00      0.97      3826
         1.0       0.96      0.36      0.52       364

    accuracy                           0.94      4190
   macro avg       0.95      0.68      0.74      4190
weighted avg       0.94      0.94      0.93      4190



### Compare Results

In [12]:
# compare results before and after removing nulls

from sklearn.metrics import roc_auc_score

# Baseline model ("No Mitigation"): precision / recall from the confusion-matrix counts,
# ROC AUC from the predicted probability of class 1.
tn, fp, fn, tp = metrics.confusion_matrix(y_test, pred).ravel()
precision = round(tp / (tp + fp), 5)
recall = round(tp / (tp + fn), 5)

preda = model.predict_proba(x_test)[:, 1]
roc_auc = round(roc_auc_score(y_test, preda), 5)

# Mitigated model ("With Mitigation"): same metrics on its own test split.
tn2, fp2, fn2, tp2 = metrics.confusion_matrix(y_test2, pred2).ravel()
precision2 = round(tp2 / (tp2 + fp2), 5)
recall2 = round(tp2 / (tp2 + fn2), 5)

preda2 = model2.predict_proba(x_test2)[:, 1]
roc_auc2 = round(roc_auc_score(y_test2, preda2), 5)

               
def compare_results():
    """Tabulate precision, recall and ROC AUC for the baseline and mitigated models.

    Reads the notebook-level variables ``precision``/``recall``/``roc_auc`` and
    ``precision2``/``recall2``/``roc_auc2`` and returns a DataFrame with one row
    per model; every value is stored as its string representation.
    """
    rows = {
        'No Mitigation': (precision, recall, roc_auc),
        'With Mitigation': (precision2, recall2, roc_auc2),
    }
    table = [[f'{value}' for value in values] for values in rows.values()]
    return pd.DataFrame(table,
                        columns=['Precision', 'Recall', 'roc_auc'],
                        index=list(rows))


compare_results()

Unnamed: 0,Precision,Recall,roc_auc
No Mitigation,0.91971,0.3378,0.90407
With Mitigation,0.95588,0.35714,0.91382


### Mitigation (Rebalance Dataset)

In [13]:
# Rebalance data

# Sampler objects from imbalanced-learn. Only `smote` is passed to DataRebalance below;
# `tomek` and `smote_tomek` are constructed but not used in this cell.
tomek = TomekLinks(sampling_strategy='auto')    
smote = SMOTE(sampling_strategy='auto', random_state= seed)    
smote_tomek = SMOTETomek(sampling_strategy='auto', random_state=seed)

# DataRebalance parameters (as described by the toolbox):
#
# dataset - A pandas DataFrame representing the data to rebalance.
#
# target - A string with the name, or the integer index (zero-based), of the label column
#     to use as the classes for rebalancing the data.
#
# sampling_strategy
#     'minority': resample only the minority class.
#     'not minority': resample all classes but the minority class.
#     'not majority': resample all classes but the majority class.
#     'all': resample all classes.
#     'auto': equivalent to 'not majority'.
#
# random_state - Controls the randomization of the algorithm.
#     'None': the random number generator is the RandomState instance used by np.random.
#     'If Int': random_state is the seed used by the random number generator.
#
# smote_tomek - The SMOTETomek object to use (imblearn.combine.SMOTETomek). If not given by
#     the caller, a SMOTE object with default parameters will be given.
#     NOTE(review): "SMOTE object" is the original wording; presumably a SMOTETomek default
#     is meant -- confirm against the DataRebalance implementation.
#
# smote - The SMOTE object to use (imblearn.over_sampling.SMOTE). If not given by the caller,
#     a SMOTE object with default parameters will be given.
#
# tomek - The TomekLinks object to use (imblearn.under_sampling.TomekLinks). If not given by
#     the caller, a TomekLinks object with sampling_strategy='all' will be given.


# Positional arguments: frame, label index, 'auto', seed, None, smote. If they follow the
# order documented above, None is smote_tomek and `smote` is the SMOTE sampler -- TODO confirm.
train_data_rebalance3 =  DataRebalance(train_data2, train_data2.columns.get_loc('is_promoted'), 'auto', seed, None, smote)
train_data_r = train_data_rebalance3.Rebalance()

# Only the training split is rebalanced; the test split (test_data2) is used as-is.
# Both frames here derive from train_data2 / test_data2 rather than the standardized
# train_data_t2 / test_data_t2, so they are split by the 'is_promoted' column name
# (split_label) instead of by column 0.
x_train3, y_train3 = split_label(train_data_r)
x_test3, y_test3 = split_label(test_data2)

# Training-set shape before and after rebalancing.
train_data2.shape
train_data_r.shape


(37702, 65)

(68852, 65)

### Accuracy Results

In [14]:

# Train the same LightGBM configuration on the rebalanced training data and evaluate it on
# the un-rebalanced test split.
#
# conf_matrix is reused from the first "Accuracy Results" cell above; an identical copy used
# to be redefined here and silently shadowed the earlier definition.
clf3 = LGBMClassifier(n_estimators=50)
model3 = clf3.fit(x_train3, y_train3)
pred3 = model3.predict(x_test3)

print("number of errors on test dataset: " + str(sum(pred3 != y_test3)))
conf_matrix(y_test3, pred3)
print(classification_report(y_test3, pred3))


number of errors on test dataset: 242


Unnamed: 0,Pred 1,Pred 0
True,TP = 131 (35.99%),FN = 233 (64.01%)
False,FP = 9 (0.24%),TN = 3817 (99.76%)


              precision    recall  f1-score   support

           0       0.94      1.00      0.97      3826
           1       0.94      0.36      0.52       364

    accuracy                           0.94      4190
   macro avg       0.94      0.68      0.74      4190
weighted avg       0.94      0.94      0.93      4190



### Compare Results

In [15]:
# Compare the mitigated model with and without rebalancing of the training data.
# The "No Mitigation" row of the table below reuses precision, recall and roc_auc computed
# in the earlier "Compare Results" cell.

from sklearn.metrics import roc_auc_score

# Mitigated model ("With Mitigation"). The counts are unpacked into the *2 names so that the
# baseline tn / fp / fn / tp are not overwritten and precision2 / recall2 are both derived
# from this one confusion matrix.
((tn2, fp2), (fn2, tp2)) = metrics.confusion_matrix(y_test2, pred2)
precision2 = round(tp2 / (tp2 + fp2), 5)
recall2 = round(tp2 / (tp2 + fn2), 5)

# Mitigated + rebalanced model ("With Mitigation & Rebalance").
((tn3, fp3), (fn3, tp3)) = metrics.confusion_matrix(y_test3, pred3)
precision3 = round(tp3 / (tp3 + fp3), 5)
recall3 = round(tp3 / (tp3 + fn3), 5)

# ROC AUC from the predicted probability of class 1 for each model.
preda3 = model3.predict_proba(x_test3)[:, 1]
roc_auc3 = round(roc_auc_score(y_test3, preda3), 5)

preda2 = model2.predict_proba(x_test2)[:, 1]
roc_auc2 = round(roc_auc_score(y_test2, preda2), 5)

               
def compare_results():
    """Tabulate precision, recall and ROC AUC for all three models.

    Reads the notebook-level metric variables for the baseline (``precision``,
    ``recall``, ``roc_auc``), the mitigated model (``*2``) and the mitigated +
    rebalanced model (``*3``) and returns a DataFrame with one row per model;
    every value is stored as its string representation.
    """
    rows = {
        'No Mitigation': (precision, recall, roc_auc),
        'With Mitigation': (precision2, recall2, roc_auc2),
        'With Mitigation & Rebalance': (precision3, recall3, roc_auc3),
    }
    table = [[f'{value}' for value in values] for values in rows.values()]
    return pd.DataFrame(table,
                        columns=['Precision', 'Recall', 'roc_auc'],
                        index=list(rows))


compare_results()

Unnamed: 0,Precision,Recall,roc_auc
No Mitigation,0.91971,0.3378,0.90407
With Mitigation,0.95588,0.35714,0.91382
With Mitigation & Rebalance,0.93571,0.35989,0.90248
