In [None]:
import pandas as pd
import sys
sys.path.append('../../ResponsibleAIToolbox-Mitigation/')
from imblearn.under_sampling import TomekLinks
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek

from errorsmitigation.dataprocessing import DataRebalance
from errorsmitigation.dataprocessing import DataSplit
from databalanceanalysis.databalanceanalysis.utils import undummify

from lightgbm import LGBMClassifier

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

from IPython.core.interactiveshell import InteractiveShell
# Show the value of every top-level expression in a cell, not just the last one.
# Later cells rely on this to display several tables from a single cell.
InteractiveShell.ast_node_interactivity = "all"

In [None]:
   
# Notebook-wide settings: dataset location, sensitive columns, label and seed.
data_dir = '../datasets/hr_promotion'
cols_of_interest = ['education', 'gender']
label_col = 'is_promoted'
seed = 42

# Load the training CSV, drop the identifier column, then handle duplicates
# and remove rows that contain missing values.
df = (
    pd.read_csv(data_dir + '/train.csv')
    .drop(['employee_id'], axis=1)
    .drop_duplicates()
    .dropna()
)

df.shape

In [None]:
## Train a model and get accuracy numbers

# data prep
def split_label(dataset, label_col='is_promoted'):
    """Split a frame into its feature columns and its label column.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame holding the features together with the label column.
    label_col : str, default 'is_promoted'
        Name of the label column. The default keeps existing one-argument
        calls working unchanged.

    Returns
    -------
    tuple
        ``(x, y)`` where ``x`` is ``dataset`` without ``label_col`` and ``y``
        is the ``label_col`` Series. ``dataset`` itself is not modified.

    Raises
    ------
    KeyError
        If ``label_col`` is not a column of ``dataset``.
    """
    x = dataset.drop(columns=[label_col])
    y = dataset[label_col]
    return x, y

# One-hot encode every categorical column so the classifier receives numeric input.
dataset = pd.get_dummies(df, drop_first=False)
# Use the notebook-level label_col / seed instead of repeating the literals.
target_index = dataset.columns.get_loc(label_col)
# NOTE(review): DataSplit takes its options positionally; 0.9 is presumably the
# train fraction and `seed` the random state -- confirm against the DataSplit
# signature, along with the meaning of the four boolean flags.
data_split = DataSplit(dataset, target_index, 0.9, seed, True, False, False, True)
train_data, test_data = data_split.Split()
# splitting the training data
x_train, y_train = split_label(train_data)
# splitting the test data
x_test, y_test = split_label(test_data)

# LGBMClassifier Model (baseline, trained on the un-rebalanced data)
clf = LGBMClassifier(n_estimators=50)
model = clf.fit(x_train, y_train)

pred = model.predict(x_test)

def conf_matrix(y, pred):
    """Build a labelled 2x2 confusion table for a binary classifier.

    Each cell shows the raw count followed by the rate relative to the number
    of samples with that true label (``normalize='true'``). The first row
    holds TP / FN and the second row FP / TN.

    The nested unpacking requires exactly two classes; any other confusion
    matrix shape raises ``ValueError``.
    """
    counts = metrics.confusion_matrix(y, pred)
    rates = metrics.confusion_matrix(y, pred, normalize='true')

    # Rows follow the true label, columns the predicted label.
    (tn, fp), (fn, tp) = counts
    (tnr, fpr), (fnr, tpr) = rates

    cells = [
        [f'TP = {tp} ({tpr:1.2%})', f'FN = {fn} ({fnr:1.2%})'],
        [f'FP = {fp} ({fpr:1.2%})', f'TN = {tn} ({tnr:1.2%})'],
    ]
    return pd.DataFrame(cells, index=['True', 'False'], columns=['Pred 1', 'Pred 0'])

# Raw misclassification count on the held-out split.
print(f"number of errors on test dataset: {sum(pred != y_test)}")

# Count/rate table; displayed because ast_node_interactivity is set to "all".
conf_matrix(y_test, pred)

# Per-class precision, recall and F1.
print(classification_report(y_test, pred))


In [None]:
from databalanceanalysis.databalanceanalysis.feature_measures import FeatureBalanceMeasure

# Feature balance measures over the sensitive columns, relative to the label column.
feature_measures = FeatureBalanceMeasure( cols_of_interest, label_col)

# Baseline computed on the cleaned, un-rebalanced frame; kept for the later comparison.
feat_measures1 = feature_measures.measures(df)
feat_measures1

In [None]:
from databalanceanalysis.databalanceanalysis.distribution_measures import DistributionBalanceMeasure

# Distribution balance measures over the sensitive columns (no label involved).
dist_measures = DistributionBalanceMeasure( cols_of_interest)
# Baseline on the un-rebalanced frame; kept for the later comparison.
dist_measures1 = dist_measures.measures(df)
dist_measures1


In [None]:
from databalanceanalysis.databalanceanalysis.aggregate_measures import AggregateBalanceMeasure

# Aggregate balance measures over the sensitive columns.
agg_measures = AggregateBalanceMeasure( cols_of_interest)
# Baseline on the un-rebalanced frame; kept for the later comparison.
agg_measures1 = agg_measures.measures(df)
agg_measures1

In [None]:
# Seed the stochastic samplers so a fresh "Restart & Run All" generates the
# same synthetic rows. TomekLinks only removes existing samples and accepts
# no random_state.
smote_tomek = SMOTETomek(random_state=seed)
smote = SMOTE(random_state=seed)
tomek = TomekLinks()

# One-hot encode with "-" as separator so individual columns can later be
# collapsed back with undummify(..., prefix_sep="-").
dummy_df = pd.get_dummies(df, prefix_sep = "-")
dummy_df.head()

In [None]:
# Presumably collapses the one-hot "gender-*" columns back into a single
# 'gender' column so it can be used as the rebalancing target -- confirm
# against undummify's implementation.
gender_df = undummify(dummy_df, prefix_sep = "-", col = 'gender')
gender_df

In [None]:
# Rebalance the frame on 'gender' using the SMOTE sampler.
# The notebook-level `seed` replaces the repeated literal 42.
# NOTE(review): arguments are positional; the slot holding `smote` is presumably
# the SMOTE sampler and the two None slots the other samplers -- confirm against
# the DataRebalance signature.
data_balance_smote = DataRebalance(gender_df, 'gender', 'auto', seed, None, smote, None)

# Shapes before and after, to show how many rows the rebalance added.
print(gender_df.shape)
smote_df = data_balance_smote.Rebalance()
print(smote_df.shape)


In [None]:
# The gender-rebalanced frame is used as-is: per the original author's note,
# a second get_dummies pass on smote_df is not required.
smote_df.head()
# Presumably collapses the one-hot "education-*" columns into a single
# 'education' column, the next rebalancing target -- confirm against undummify.
education_df = undummify(smote_df, prefix_sep = "-", col = 'education')
education_df.head()

# Class counts per education level before the second rebalance.
education_df['education'].value_counts()


In [None]:
# Second pass: rebalance the gender-balanced frame on 'education' with SMOTE.
# The notebook-level `seed` replaces the repeated literal 42.
data_balance_smote_2 = DataRebalance(education_df, 'education', 'auto', seed, None, smote, None)

# Shapes before and after, to show how many rows the rebalance added.
print(education_df.shape)
smote_df_2 = data_balance_smote_2.Rebalance()
print(smote_df_2.shape)

In [None]:

# Retrain on the frame rebalanced for both gender and education.
# The notebook-level label_col / seed replace the repeated literals.
target_index = smote_df_2.columns.get_loc(label_col)
# NOTE(review): the first boolean flag is False here but True in the baseline
# split -- confirm this difference is intended.
# NOTE(review): the test split is taken from the rebalanced frame, so the
# "after" metrics are not measured on the same rows as the baseline.
data_split = DataSplit(smote_df_2, target_index, 0.9, seed, False, False, False, True)
train_data, test_data = data_split.Split()
# splitting the training data
x_train2, y_train2 = split_label(train_data)
# splitting the test data
x_test2, y_test2 = split_label(test_data)

# LGBMClassifier Model (same hyper-parameters as the baseline)
clf2 = LGBMClassifier(n_estimators=50)
model2 = clf2.fit(x_train2, y_train2)

pred2 = model2.predict(x_test2)

# NOTE(review): this repeats the conf_matrix definition from the baseline
# training cell and silently shadows it; the two bodies are identical.
def conf_matrix(y, pred):
    """Return a 2x2 table of TP/FN/FP/TN counts with per-true-class rates.

    Binary labels only: the confusion matrix is unpacked as two rows of two
    values, so any other shape raises ``ValueError``.
    """
    raw = metrics.confusion_matrix(y, pred)
    normalized = metrics.confusion_matrix(y, pred, normalize='true')

    # Rows follow the true label, columns the predicted label.
    (tn, fp), (fn, tp) = raw
    (tnr, fpr), (fnr, tpr) = normalized

    positive_row = [f'TP = {tp} ({tpr:1.2%})', f'FN = {fn} ({fnr:1.2%})']
    negative_row = [f'FP = {fp} ({fpr:1.2%})', f'TN = {tn} ({tnr:1.2%})']
    return pd.DataFrame([positive_row, negative_row],
                        index=['True', 'False'],
                        columns=['Pred 1', 'Pred 0'])


In [None]:
# Compare Results
class color:
    """ANSI escape sequences used to colour and emphasise printed text."""

    # Foreground colours
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'

    # Text attributes
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # Resets all colours and attributes
    END = '\033[0m'


# --- Test-set size and misclassification count, before vs. after rebalancing ---
# NOTE(review): y_test and y_test2 come from different splits (original vs.
# rebalanced frame), so the error counts are not over the same rows.
print('')
print(color.PURPLE + color.BOLD + "BEFORE: " + color.END + "number of test dataset instances: " + color.BOLD   + color.GREEN + str(len(y_test)) + color.END)
print("      : number of errors on test dataset: " + color.BOLD   + color.RED + str(sum(pred != y_test)) + color.END)
print('')
print(color.PURPLE + color.BOLD + "AFTER:  " + color.END + "number of test dataset instances: " + color.BOLD   + color.GREEN + str(len(y_test2)) + color.END)
print("     :  number of errors on test dataset: " + color.BOLD  + color.RED + str(sum(pred2 != y_test2)) + color.END)
print('')
print("-----------------------------------------------------------------------")
print("-----------------------------------------------------------------------")
print('')
# --- Confusion matrices ---
# The bare conf_matrix(...) expressions are displayed only because
# ast_node_interactivity is set to "all" at the top of the notebook.
print(color.BLUE + color.BOLD +"BEFORE: conf_matrix:" + color.END)
print("--------------------")
conf_matrix(y_test,pred) 
print('')
print(color.BLUE + color.BOLD +"AFTER: conf_matrix:" + color.END)
print("-------------------")
conf_matrix(y_test2,pred2)
print("-----------------------------------------------------------------------")
print("-----------------------------------------------------------------------")
print('')
# --- Per-class precision / recall / F1 ---
print(color.YELLOW + color.BOLD +"BEFORE: classification_report:" + color.END)
print("--------------------------------")
print(classification_report(y_test, pred)) 
print(color.YELLOW + color.BOLD +"AFTER: classification_report:" + color.END)
print("--------------------------------")
print(classification_report(y_test2, pred2)) 
      

In [None]:
# Feature balance measures after rebalancing, followed by the baseline for comparison.
# NOTE(review): education_df is the frame after the gender rebalance only; the
# education rebalance produced smote_df_2 -- confirm which frame is intended.
feature_measures.measures(education_df)
feat_measures1

In [None]:
# Distribution balance measures after rebalancing, followed by the baseline.
# NOTE(review): measured on education_df, not on smote_df_2 -- confirm intent.
dist_measures.measures(education_df)
dist_measures1

In [None]:
# Aggregate balance measures after rebalancing, followed by the baseline.
# NOTE(review): measured on education_df, not on smote_df_2 -- confirm intent.
agg_measures.measures(education_df)
agg_measures1

## IGNORE BELOW FOR NOW

In [None]:
# NOTE(review): 'race' is not in cols_of_interest and looks like a column from
# a different dataset -- confirm it exists in dummy_df.
# NOTE(review): undummify is called with col_list=[...] here but with col=...
# earlier in the notebook -- confirm which keyword the function accepts.
race_df = undummify(dummy_df, prefix_sep = "-", col_list = ['race'])
data_rebalance_smote =  DataRebalance(race_df, 'race', None, None, None, smote, None)
# NOTE(review): the next two rebalancers receive None in every sampler slot;
# the `tomek` and `smote_tomek` objects created earlier are never passed.
data_rebalance_tomek = DataRebalance(race_df, 'race', None, None, None, None, None)
data_rebalance_smote_tomek = DataRebalance(race_df, 'race', None, None, None, None, None)
# Rebinds smote_df, replacing the gender-rebalanced frame produced earlier.
smote_df = data_rebalance_smote.Rebalance()
tomek_df = data_rebalance_tomek.Rebalance()
smote_tomek_df = data_rebalance_smote_tomek.Rebalance()


In [None]:
# Displays the whole rebalanced frame (no .head() limit).
smote_df

In [None]:
# df = undummify(df, "-")
# smote_df = undummify(smote_df, "-")
# smote_tomek_df = undummify(smote_tomek_df, "-")
# tomek_df = undummify(tomek_df, "-")

In [None]:
# Re-encode the SMOTE result, then collapse the "sex" columns back into one.
# NOTE(review): the earlier cells use a 'gender' column, not 'sex' -- confirm
# the column name.
smote_sex_df = undummify( pd.get_dummies(smote_df, prefix_sep= "-"), prefix_sep = "-", col_list = ["sex"]) 

In [None]:
# Same re-encode / collapse step for the Tomek and SMOTE-Tomek results.
tomek_sex_df = undummify( pd.get_dummies(tomek_df, prefix_sep= "-"), prefix_sep = "-", col_list = ["sex"]) 
smote_tomek_sex_df = undummify( pd.get_dummies(smote_tomek_df, prefix_sep= "-"), prefix_sep = "-", col_list = ["sex"]) 

In [None]:
# Second rebalancing pass, this time on 'sex'.
data_rebalance_smote =  DataRebalance(smote_sex_df, 'sex', None, None, None, smote, None)
# NOTE(review): as in the 'race' pass, every sampler slot is None for the
# Tomek and SMOTE-Tomek rebalancers.
data_rebalance_tomek = DataRebalance(tomek_sex_df, 'sex', None, None, None, None, None)
data_rebalance_smote_tomek = DataRebalance(smote_tomek_sex_df, 'sex', None, None, None, None, None)
smote_df = data_rebalance_smote.Rebalance()
tomek_df = data_rebalance_tomek.Rebalance()
# The SMOTE-Tomek rebalance is disabled, so smote_tomek_df keeps its value
# from the 'race' pass.
# smote_tomek_df = data_rebalance_smote_tomek.Rebalance()

In [None]:
# NOTE(review): undummify is called without arguments here, while every other
# call in the notebook passes a frame and a prefix separator. These look like
# unfinished placeholders and will presumably raise TypeError -- fill in the
# arguments or remove the cell.
smote_df = undummify()
tomek_df = undummify()
# smote_tomek_df = undummify()

In [None]:
from databalanceanalysis.databalanceanalysis.feature_measures import FeatureBalanceMeasure
# Feature balance measures on the original, un-rebalanced frame.
feat_measures = feature_measures.measures(df)

First look at the feature balance measures for the dataset without applying SMOTE or TOMEK

In [None]:
# Display the measures computed on the original frame.
feat_measures

After applying SMOTE method, these are the feature balance measures

In [None]:
# Feature balance measures on the SMOTE-rebalanced frame.
feature_measures.measures(smote_df)

After applying the SMOTE-Tomek method, these are the feature balance measures.

In [None]:
# Feature balance measures on the SMOTE-Tomek frame.
feature_measures.measures(smote_tomek_df)

After applying the Tomek method

In [None]:
# Feature balance measures on the Tomek frame.
feature_measures.measures(tomek_df)

In [None]:
from databalanceanalysis.databalanceanalysis.distribution_measures import DistributionBalanceMeasure
# Rebinds dist_measures to a new instance built with the same arguments as the
# earlier cell, then measures the original frame.
dist_measures = DistributionBalanceMeasure(cols_of_interest)
dist_measures.measures(df)


In [None]:
# Distribution balance measures on the SMOTE-rebalanced frame.
dist_measures.measures(smote_df)

In [None]:
# Distribution balance measures on the SMOTE-Tomek frame.
dist_measures.measures(smote_tomek_df)

In [None]:
# Distribution balance measures on the Tomek frame.
dist_measures.measures(tomek_df)

In [None]:
from databalanceanalysis.databalanceanalysis.aggregate_measures import AggregateBalanceMeasure
# Rebinds agg_measures to a new instance built with the same arguments as the
# earlier cell, then measures the original frame.
agg_measures = AggregateBalanceMeasure(cols_of_interest)
agg_measures.measures(df)

In [None]:
# Aggregate balance measures on the SMOTE-rebalanced frame.
agg_measures.measures(smote_df)

In [None]:
# Aggregate balance measures on the SMOTE-Tomek frame.
agg_measures.measures(smote_tomek_df)

In [None]:
# Aggregate balance measures on the Tomek frame.
agg_measures.measures(tomek_df)