In [46]:
import sys

import pandas as pd
from imblearn.combine import SMOTETomek
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks
from IPython.core.interactiveshell import InteractiveShell
from lightgbm import LGBMClassifier
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

# The path must be extended before the project-local imports below.
sys.path.append('../../ResponsibleAIToolbox-Mitigation/')
from errorsmitigation.dataprocessing import DataRebalance
from errorsmitigation.dataprocessing import DataSplit
from databalanceanalysis.databalanceanalysis.aggregate_measures import AggregateBalanceMeasure
from databalanceanalysis.databalanceanalysis.distribution_measures import DistributionBalanceMeasure
from databalanceanalysis.databalanceanalysis.feature_measures import FeatureBalanceMeasure
from databalanceanalysis.databalanceanalysis.utils import undummify

# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

In [47]:
   
# Notebook-level configuration
data_dir = '../datasets/hr_promotion'
cols_of_interest = ['education', 'gender']
label_col = 'is_promoted'
seed = 42

# Load the training set, discard the identifier column, then remove
# duplicate rows and rows containing missing values.
df = (
    pd.read_csv(data_dir + '/train.csv')
    .drop(['employee_id'], axis=1)
    .drop_duplicates()
    .dropna()
)

df.shape

(48607, 13)

In [48]:
## Train a model and get accuracy numbers

# data prep
def split_label(dataset, label_col='is_promoted'):
    """Separate a DataFrame into its feature columns and its label column.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame holding both the features and the label column.
    label_col : str, default 'is_promoted'
        Name of the label column. The default keeps the original behaviour,
        so existing one-argument calls are unaffected.

    Returns
    -------
    tuple
        ``(x, y)`` where ``x`` is ``dataset`` without ``label_col`` and ``y``
        is the ``label_col`` column.

    Raises
    ------
    KeyError
        If ``label_col`` is not a column of ``dataset``.
    """
    x = dataset.drop(columns=[label_col])
    y = dataset[label_col]
    return x, y

# One-hot encode the categorical columns, keeping every level.
dataset = pd.get_dummies(df, drop_first=False)
target_index = dataset.columns.get_loc(label_col)
# Positional arguments are unchanged except that the literal 42 now comes from
# the notebook-level `seed` constant (presumably the split's random seed).
data_split = DataSplit(dataset, target_index, 0.9, seed, True, False, False, True)
train_data, test_data = data_split.Split()
# splitting the training data
x_train, y_train = split_label(train_data)
# splitting the test data
x_test, y_test = split_label(test_data)

# LGBMClassifier Model
clf = LGBMClassifier(n_estimators=50)
model = clf.fit(x_train, y_train)

pred = model.predict(x_test)

def conf_matrix(y, pred, labels=(0, 1)):
    """Build a labelled 2x2 confusion-matrix table with counts and row rates.

    Parameters
    ----------
    y : array-like
        True labels.
    pred : array-like
        Predicted labels.
    labels : sequence of two labels, default (0, 1)
        ``(negative, positive)`` label values. Pinning the labels guarantees a
        2x2 matrix even when ``y`` and ``pred`` contain a single class, which
        previously made the tuple unpacking fail.

    Returns
    -------
    pandas.DataFrame
        Row 'True' holds the actual-positive cells (TP, FN) and row 'False'
        the actual-negative cells (FP, TN). Each cell shows the count and the
        rate normalised over the true class.

    Raises
    ------
    ValueError
        If ``labels`` does not contain exactly two values.
    """
    negative, positive = labels
    label_order = [negative, positive]
    ((tn, fp), (fn, tp)) = metrics.confusion_matrix(y, pred, labels=label_order)
    # normalize='true' divides each row by the number of samples of that true class.
    ((tnr, fpr), (fnr, tpr)) = metrics.confusion_matrix(
        y, pred, labels=label_order, normalize='true'
    )
    return pd.DataFrame([[f'TP = {tp} ({tpr:1.2%})', f'FN = {fn} ({fnr:1.2%})'],
                         [f'FP = {fp} ({fpr:1.2%})', f'TN = {tn} ({tnr:1.2%})']],
                        index=['True', 'False'],
                        columns=['Pred 1', 'Pred 0'])

# Number of test rows where the prediction disagrees with the true label.
print("number of errors on test dataset: " + str((pred != y_test).sum()))

# Confusion matrix (rendered as a table) followed by the per-class report.
conf_matrix(y_test, pred)

print(classification_report(y_test, pred))


number of errors on test dataset: 293


Unnamed: 0,Pred 1,Pred 0
True,TP = 142 (33.57%),FN = 281 (66.43%)
False,FP = 12 (0.27%),TN = 4426 (99.73%)


              precision    recall  f1-score   support

           0       0.94      1.00      0.97      4438
           1       0.92      0.34      0.49       423

    accuracy                           0.94      4861
   macro avg       0.93      0.67      0.73      4861
weighted avg       0.94      0.94      0.93      4861



In [49]:
# Feature balance measures on the original (pre-rebalancing) data.
feature_measures = FeatureBalanceMeasure(cols_of_interest, label_col)

feat_measures1 = feature_measures.measures(df)
feat_measures1

Unnamed: 0,classA,classB,feature_name,Measures.DEMOGRAPHIC_PARITY,Measures.POINTWISE_MUTUAL_INFO,Measures.SD_COEF,Measures.JACCARD_INDEX,Measures.KR_CORRELATION,Measures.LOG_LIKELIHOOD,Measures.TTEST,Measures.TTEST_PVALUE
0,Master's & above,Bachelor's,education,0.016985,0.188441,0.004212,0.004921,4.952491,-0.624545,0.260262,0.409503
1,Master's & above,Below Secondary,education,0.020289,0.229618,0.069221,0.075568,-27.201913,3.704725,-2.63875,0.059302
2,Bachelor's,Below Secondary,education,0.003304,0.041177,0.065009,0.070647,-32.154404,4.32927,-2.899012,0.05062
0,f,m,gender,0.007242,0.081906,-0.003805,-0.004433,5.032009,-0.745009,0.264452,0.417706


In [50]:
# Distribution balance measures on the original (pre-rebalancing) data.
dist_measures = DistributionBalanceMeasure(cols_of_interest)
dist_measures1 = dist_measures.measures(df)
dist_measures1


f_obs
          education  count
0        Bachelor's  33355
1   Below Secondary    458
2  Master's & above  14794
f_obs
  gender  count
0      f  14791
1      m  33816


Unnamed: 0,feature_name,Measures.KL_DIVERGENCE,Measures.JS_DISTANCE,Measures.WS_DISTANCE,Measures.INF_NORM_DISTANCE,Measures.TOTAL_VARIANCE_DISTANCE,Measures.CHISQ_PVALUE,Measures.CHISQ
0,education,0.434208,0.358772,0.235256,0.352885,0.352885,0.0,33580.435452
1,gender,0.078685,0.141664,0.195702,0.195702,0.195702,0.0,7446.471187


In [51]:
# Aggregate balance measures on the original (pre-rebalancing) data.
agg_measures = AggregateBalanceMeasure(cols_of_interest)
agg_measures1 = agg_measures.measures(df)
agg_measures1

Unnamed: 0,Measures.THEIL_L_INDEX,Measures.THEIL_T_INDEX,Measures.ATKINSON_INDEX
0,1.045845,0.513274,0.648605


In [52]:
# Seed the stochastic samplers so the synthetic rows are reproducible across
# runs. TomekLinks is left with its default constructor, as before.
smote_tomek = SMOTETomek(random_state=seed)
smote = SMOTE(random_state=seed)
tomek = TomekLinks()

# One-hot encode with "-" as separator so the columns can be collapsed again
# later with undummify(..., prefix_sep="-").
dummy_df = pd.get_dummies(df, prefix_sep="-")
dummy_df.head()

Unnamed: 0,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted,department-Analytics,department-Finance,...,region-region_8,region-region_9,education-Bachelor's,education-Below Secondary,education-Master's & above,gender-f,gender-m,recruitment_channel-other,recruitment_channel-referred,recruitment_channel-sourcing
0,1,35,5.0,8,1,0,49,0,0,0,...,0,0,0,0,1,1,0,0,0,1
1,1,30,5.0,4,0,0,60,0,0,0,...,0,0,1,0,0,0,1,1,0,0
2,1,34,3.0,7,0,0,50,0,0,0,...,0,0,1,0,0,0,1,0,0,1
3,2,39,1.0,10,0,0,50,0,0,0,...,0,0,1,0,0,0,1,1,0,0
4,1,45,3.0,2,0,0,73,0,0,0,...,0,0,1,0,0,0,1,1,0,0


In [53]:
# Collapse the one-hot gender-f / gender-m columns back into a single 'gender'
# column so it can serve as the rebalancing target. In the displayed rows the
# result is integer-coded (f -> 0, m -> 1).
gender_df = undummify(dummy_df, prefix_sep = "-", col = 'gender')
gender_df

Unnamed: 0,gender,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted,department-Analytics,...,region-region_6,region-region_7,region-region_8,region-region_9,education-Bachelor's,education-Below Secondary,education-Master's & above,recruitment_channel-other,recruitment_channel-referred,recruitment_channel-sourcing
0,0,1,35,5.0,8,1,0,49,0,0,...,0,1,0,0,0,0,1,0,0,1
1,1,1,30,5.0,4,0,0,60,0,0,...,0,0,0,0,1,0,0,1,0,0
2,1,1,34,3.0,7,0,0,50,0,0,...,0,0,0,0,1,0,0,0,0,1
3,1,2,39,1.0,10,0,0,50,0,0,...,0,0,0,0,1,0,0,1,0,0
4,1,1,45,3.0,2,0,0,73,0,0,...,0,0,0,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54802,1,2,31,1.0,2,0,0,49,0,0,...,0,0,0,0,1,0,0,1,0,0
54803,1,1,48,3.0,17,0,0,78,0,0,...,0,0,0,0,1,0,0,0,0,1
54804,0,1,37,2.0,6,0,0,56,0,0,...,0,0,0,0,0,0,1,1,0,0
54805,1,1,27,5.0,3,1,0,79,0,1,...,0,0,0,0,1,0,0,1,0,0


In [54]:
# Oversample with SMOTE using 'gender' as the target column. The literal 42 is
# replaced by the notebook-level `seed` constant (same value).
data_balance_smote = DataRebalance(gender_df, 'gender', 'auto', seed, None, smote, None)

# Shapes before and after rebalancing.
print(gender_df.shape)
smote_df = data_balance_smote.Rebalance()
print(smote_df.shape)


(48607, 58)
(67632, 58)


In [55]:
# Inspect the gender-rebalanced frame, then collapse the one-hot education-*
# columns into a single 'education' column for the next rebalancing step.
smote_df.head()
education_df = undummify(smote_df, prefix_sep = "-", col = 'education')
education_df.head()

# Class counts of the collapsed education column after the gender rebalance.
education_df['education'].value_counts()


Unnamed: 0,gender,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted,department-Analytics,...,region-region_6,region-region_7,region-region_8,region-region_9,education-Bachelor's,education-Below Secondary,education-Master's & above,recruitment_channel-other,recruitment_channel-referred,recruitment_channel-sourcing
0,0,1,35,5.0,8,1,0,49,0,0,...,0,1,0,0,0,0,1,0,0,1
1,1,1,30,5.0,4,0,0,60,0,0,...,0,0,0,0,1,0,0,1,0,0
2,1,1,34,3.0,7,0,0,50,0,0,...,0,0,0,0,1,0,0,0,0,1
3,1,2,39,1.0,10,0,0,50,0,0,...,0,0,0,0,1,0,0,1,0,0
4,1,1,45,3.0,2,0,0,73,0,0,...,0,0,0,0,1,0,0,1,0,0


Unnamed: 0,education,gender,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted,...,region-region_34,region-region_4,region-region_5,region-region_6,region-region_7,region-region_8,region-region_9,recruitment_channel-other,recruitment_channel-referred,recruitment_channel-sourcing
0,0,0,1,35,5.0,8,1,0,49,0,...,0,0,0,0,1,0,0,0,0,1
1,1,1,1,30,5.0,4,0,0,60,0,...,0,0,0,0,0,0,0,1,0,0
2,1,1,1,34,3.0,7,0,0,50,0,...,0,0,0,0,0,0,0,0,0,1
3,1,1,2,39,1.0,10,0,0,50,0,...,0,0,0,0,0,0,0,1,0,0
4,1,1,1,45,3.0,2,0,0,73,0,...,0,0,0,0,0,0,0,1,0,0


1    47803
0    19253
2      576
Name: education, dtype: int64

In [56]:
# Second SMOTE pass, this time using 'education' as the target column. The
# literal 42 is replaced by the notebook-level `seed` constant (same value).
data_balance_smote_2 = DataRebalance(education_df, 'education', 'auto', seed, None, smote, None)

# Shapes before and after rebalancing.
print(education_df.shape)
smote_df_2 = data_balance_smote_2.Rebalance()
print(smote_df_2.shape)

(67632, 56)
(143409, 56)


In [62]:

# Retrain the same classifier on the rebalanced data.
# NOTE(review): the train/test split happens *after* SMOTE, so this held-out
# set contains synthetic rows and is a different test set from the first
# model's. The before/after metrics are therefore not directly comparable;
# consider rebalancing only the training portion of the original split.
target_index = smote_df_2.columns.get_loc(label_col)
data_split = DataSplit(smote_df_2, target_index, 0.9, seed, False, False, False, True)
train_data, test_data = data_split.Split()
# splitting the training data
x_train2, y_train2 = split_label(train_data)
# splitting the test data
x_test2, y_test2 = split_label(test_data)

# LGBMClassifier Model
clf2 = LGBMClassifier(n_estimators=50)
model2 = clf2.fit(x_train2, y_train2)

pred2 = model2.predict(x_test2)

# NOTE(review): conf_matrix is also defined in an earlier cell; this
# redefinition shadows it for every later cell. Consider keeping one copy.
def conf_matrix(y, pred, labels=(0, 1)):
    """Build a labelled 2x2 confusion-matrix table with counts and row rates.

    Parameters
    ----------
    y : array-like
        True labels.
    pred : array-like
        Predicted labels.
    labels : sequence of two labels, default (0, 1)
        ``(negative, positive)`` label values. Pinning the labels guarantees a
        2x2 matrix even when ``y`` and ``pred`` contain a single class, which
        previously made the tuple unpacking fail.

    Returns
    -------
    pandas.DataFrame
        Row 'True' holds the actual-positive cells (TP, FN) and row 'False'
        the actual-negative cells (FP, TN). Each cell shows the count and the
        rate normalised over the true class.

    Raises
    ------
    ValueError
        If ``labels`` does not contain exactly two values.
    """
    negative, positive = labels
    label_order = [negative, positive]
    ((tn, fp), (fn, tp)) = metrics.confusion_matrix(y, pred, labels=label_order)
    # normalize='true' divides each row by the number of samples of that true class.
    ((tnr, fpr), (fnr, tpr)) = metrics.confusion_matrix(
        y, pred, labels=label_order, normalize='true'
    )
    return pd.DataFrame([[f'TP = {tp} ({tpr:1.2%})', f'FN = {fn} ({fnr:1.2%})'],
                         [f'FP = {fp} ({fpr:1.2%})', f'TN = {tn} ({tnr:1.2%})']],
                        index=['True', 'False'],
                        columns=['Pred 1', 'Pred 0'])


In [100]:
# Compare Results
class color:
    """ANSI escape sequences used to style the printed comparison report."""

    # Foreground colours
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'

    # Text attributes
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # Resets every colour/attribute set above
    END = '\033[0m'


# Shared output fragments (same bytes as the inline expressions they replace).
section_rule = "-----------------------------------------------------------------------"
headline_style = color.PURPLE + color.BOLD
count_style = color.BOLD + color.GREEN
error_style = color.BOLD + color.RED
matrix_style = color.BLUE + color.BOLD
report_style = color.YELLOW + color.BOLD

# --- Test-set size and error count, before vs. after rebalancing ---
print()
print(headline_style + "BEFORE: " + color.END + "number of test dataset instances: "
      + count_style + str(len(y_test)) + color.END)
print("      : number of errors on test dataset: "
      + error_style + str(sum(pred != y_test)) + color.END)
print()
print(headline_style + "AFTER:  " + color.END + "number of test dataset instances: "
      + count_style + str(len(y_test2)) + color.END)
print("     :  number of errors on test dataset: "
      + error_style + str(sum(pred2 != y_test2)) + color.END)
print()
print(section_rule)
print(section_rule)
print()

# --- Confusion matrices (bare expressions so they render as tables) ---
print(matrix_style + "BEFORE: conf_matrix:" + color.END)
print("--------------------")
conf_matrix(y_test, pred)
print()
print(matrix_style + "AFTER: conf_matrix:" + color.END)
print("-------------------")
conf_matrix(y_test2, pred2)
print(section_rule)
print(section_rule)
print()

# --- Per-class precision / recall / f1 ---
print(report_style + "BEFORE: classification_report:" + color.END)
print("--------------------------------")
print(classification_report(y_test, pred))
print(report_style + "AFTER: classification_report:" + color.END)
print("--------------------------------")
print(classification_report(y_test2, pred2))
      


[95m[1mBEFORE: [0mnumber of test dataset instances: [1m[92m4861[0m
      : number of errors on test dataset: [1m[91m293[0m

[95m[1mAFTER:  [0mnumber of test dataset instances: [1m[92m14341[0m
     :  number of errors on test dataset: [1m[91m363[0m

-----------------------------------------------------------------------
-----------------------------------------------------------------------

[94m[1mBEFORE: conf_matrix:[0m
--------------------


Unnamed: 0,Pred 1,Pred 0
True,TP = 142 (33.57%),FN = 281 (66.43%)
False,FP = 12 (0.27%),TN = 4426 (99.73%)



[94m[1mAFTER: conf_matrix:[0m
-------------------


Unnamed: 0,Pred 1,Pred 0
True,TP = 288 (44.93%),FN = 353 (55.07%)
False,FP = 10 (0.07%),TN = 13690 (99.93%)


-----------------------------------------------------------------------
-----------------------------------------------------------------------

[93m[1mBEFORE: classification_report:[0m
--------------------------------
              precision    recall  f1-score   support

           0       0.94      1.00      0.97      4438
           1       0.92      0.34      0.49       423

    accuracy                           0.94      4861
   macro avg       0.93      0.67      0.73      4861
weighted avg       0.94      0.94      0.93      4861

[93m[1mAFTER: classification_report:[0m
--------------------------------
              precision    recall  f1-score   support

           0       0.97      1.00      0.99     13700
           1       0.97      0.45      0.61       641

    accuracy                           0.97     14341
   macro avg       0.97      0.72      0.80     14341
weighted avg       0.97      0.97      0.97     14341



In [None]:
# Feature balance measures on the rebalanced data, followed by the baseline
# table (feat_measures1, computed on the original df) for comparison.
# NOTE(review): education_df is the frame *before* the education SMOTE step;
# smote_df_2 is the fully rebalanced frame. Confirm which one is intended.
feature_measures.measures(education_df)
feat_measures1

In [None]:
# Distribution balance measures on the rebalanced data, followed by the
# baseline table (dist_measures1, computed on the original df) for comparison.
# NOTE(review): education_df is the frame *before* the education SMOTE step;
# smote_df_2 is the fully rebalanced frame. Confirm which one is intended.
dist_measures.measures(education_df)
dist_measures1

In [None]:
# Aggregate balance measures on the rebalanced data, followed by the baseline
# table (agg_measures1, computed on the original df) for comparison.
# NOTE(review): education_df is the frame *before* the education SMOTE step;
# smote_df_2 is the fully rebalanced frame. Confirm which one is intended.
agg_measures.measures(education_df)
agg_measures1

## IGNORE BELOW FOR NOW

In [None]:
# NOTE(review): this cell looks like a leftover from a different dataset. The
# 13-column hr_promotion frame loaded above does not appear to have a 'race'
# column. Confirm before running.
# NOTE(review): undummify is called with `col_list=` here but with `col=` in
# the earlier cells. Confirm which keyword the helper accepts.
race_df = undummify(dummy_df, prefix_sep = "-", col_list = ['race'])
# Only the first rebalancer is given a sampler (`smote`). The other two pass
# None for every optional argument, so the `tomek` and `smote_tomek` objects
# created earlier are never used. TODO confirm the intended arguments.
data_rebalance_smote =  DataRebalance(race_df, 'race', None, None, None, smote, None)
data_rebalance_tomek = DataRebalance(race_df, 'race', None, None, None, None, None)
data_rebalance_smote_tomek = DataRebalance(race_df, 'race', None, None, None, None, None)
smote_df = data_rebalance_smote.Rebalance()
tomek_df = data_rebalance_tomek.Rebalance()
smote_tomek_df = data_rebalance_smote_tomek.Rebalance()


In [None]:
smote_df

In [None]:
# df = undummify(df, "-")
# smote_df = undummify(smote_df, "-")
# smote_tomek_df = undummify(smote_tomek_df, "-")
# tomek_df = undummify(tomek_df, "-")

In [None]:
# Re-encode smote_df as one-hot columns, then collapse the "sex-*" columns.
# NOTE(review): no 'sex' column appears in the hr_promotion data used above
# (the equivalent column there is 'gender'). Confirm before running.
smote_sex_df = undummify( pd.get_dummies(smote_df, prefix_sep= "-"), prefix_sep = "-", col_list = ["sex"]) 

In [None]:
# Same re-encode-and-collapse step for the Tomek and SMOTE-Tomek frames.
# NOTE(review): no 'sex' column appears in the hr_promotion data used above
# (the equivalent column there is 'gender'). Confirm before running.
tomek_sex_df = undummify( pd.get_dummies(tomek_df, prefix_sep= "-"), prefix_sep = "-", col_list = ["sex"]) 
smote_tomek_sex_df = undummify( pd.get_dummies(smote_tomek_df, prefix_sep= "-"), prefix_sep = "-", col_list = ["sex"]) 

In [None]:
# Second rebalancing pass with 'sex' as the target column.
# NOTE(review): only the first rebalancer is given a sampler (`smote`). The
# other two pass None for every optional argument, so `tomek` and `smote_tomek`
# are never used. TODO confirm the intended arguments.
data_rebalance_smote =  DataRebalance(smote_sex_df, 'sex', None, None, None, smote, None)
data_rebalance_tomek = DataRebalance(tomek_sex_df, 'sex', None, None, None, None, None)
data_rebalance_smote_tomek = DataRebalance(smote_tomek_sex_df, 'sex', None, None, None, None, None)
smote_df = data_rebalance_smote.Rebalance()
tomek_df = data_rebalance_tomek.Rebalance()
# The SMOTE-Tomek rebalance is disabled here, so smote_tomek_df keeps its value
# from the earlier 'race' cell.
# smote_tomek_df = data_rebalance_smote_tomek.Rebalance()

In [None]:
# NOTE(review): undummify is called with no arguments here, unlike every other
# call in this notebook (which passes a DataFrame and prefix_sep). This cell
# looks unfinished and will presumably raise. TODO supply the arguments.
smote_df = undummify()
tomek_df = undummify()
# smote_tomek_df = undummify()

In [None]:
# Baseline feature balance measures on the original data. The duplicate import
# was dropped: FeatureBalanceMeasure is not used in this cell, which reuses the
# `feature_measures` object created earlier.
feat_measures = feature_measures.measures(df)

First, look at the feature balance measures for the dataset before applying SMOTE or Tomek.

In [None]:
feat_measures

After applying the SMOTE method, these are the feature balance measures.

In [None]:
feature_measures.measures(smote_df)

After applying the SMOTE-Tomek method, these are the feature balance measures.

In [None]:
feature_measures.measures(smote_tomek_df)

After applying the Tomek method, these are the feature balance measures.

In [None]:
feature_measures.measures(tomek_df)

In [None]:
# Distribution balance measures on the original data. The duplicate import was
# dropped: DistributionBalanceMeasure is already imported by an earlier cell.
dist_measures = DistributionBalanceMeasure(cols_of_interest)
dist_measures.measures(df)


In [None]:
dist_measures.measures(smote_df)

In [None]:
dist_measures.measures(smote_tomek_df)

In [None]:
dist_measures.measures(tomek_df)

In [None]:
# Aggregate balance measures on the original data. The duplicate import was
# dropped: AggregateBalanceMeasure is already imported by an earlier cell.
agg_measures = AggregateBalanceMeasure(cols_of_interest)
agg_measures.measures(df)

In [None]:
agg_measures.measures(smote_df)

In [None]:
agg_measures.measures(smote_tomek_df)

In [None]:
agg_measures.measures(tomek_df)