# Bias Mitigation Experiments - Adult

In [1]:
# Dataset identifier; it is interpolated into the '../data/<name>/' and
# './<name>_results/' paths used by the cells below.
data_name = 'Adult'

In [2]:
# to ignore warnings
import warnings
# warnings.filterwarnings('ignore')

In [3]:
import numpy as np
import pandas as pd
import tensorflow.compat.v1 as tf
# tf.disable_eager_execution() # comment out when running TabTransformer models

## [1.](#Table-of-Contents) Prepare Dataset

In [4]:
# Load the Adult train and test files and stack them into one data frame.
# Column names are taken from adult.names.
column_names = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship',
    'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week',
    'native-country', 'income-per-year',
]
# Parsing options shared by both files: skip the space that follows each
# delimiter and read '?' as a missing value.
csv_options = dict(names=column_names, skipinitialspace=True, na_values=['?'])

train = pd.read_csv("../data/Adult/adult.data", header=None, **csv_options)
# header=0 makes pandas replace the first line of adult.test with column_names
# instead of parsing it as a data row.
test = pd.read_csv("../data/Adult/adult.test", header=0, **csv_options)
df = pd.concat([train, test], ignore_index=True)

# check columns (None = show every column when a frame is displayed)
pd.set_option('display.max_columns', None)
print(df.shape)
df.head()

(48842, 15)


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income-per-year
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [5]:
# Frequency of each category of the protected feature 'race'
# in the combined train + test frame.
df['race'].value_counts()

White                 41762
Black                  4685
Asian-Pac-Islander     1519
Amer-Indian-Eskimo      470
Other                   406
Name: race, dtype: int64

In [6]:
# data pre-processing
# NOTE: this cell overwrites `df`, so it is not re-runnable on its own; re-run the
# loading cell first (a second run would fail when dropping 'fnlwgt' again).
import re
label = 'income-per-year'
protected_attribute = 'race'
# 1. select features of interest
df = df.drop(columns=['fnlwgt'])

# 2. make sure label from train, test has same formats: e.g. test: '>50K.' vs train: '>50K'
df[label] = df[label].apply(lambda x: re.sub(r'\.+$', '', x))

# 3. encode label and potential protected features
label_mapping = {'>50K': 1, '<=50K': 0}
race_mapping = {'White': 1, 'Black': 0}
sex_mapping = {'Male': 1, 'Female': 0}

# only include samples with certain race values; .copy() yields an independent
# frame so the column assignments below do not write into a slice of the original
df = df[df.race.isin(race_mapping.keys())].copy()
# Assign the encoded columns back explicitly. Calling replace(..., inplace=True) on
# `df.race` / `df[label]` mutates a temporary Series: pandas warns about this chained
# in-place update, and with copy-on-write enabled the frame is left unchanged.
df['race'] = df['race'].replace(race_mapping)
df['sex'] = df['sex'].replace(sex_mapping)
df[label] = df[label].replace(label_mapping)
df = df.reset_index(drop=True)

print(df.shape)
df.head()

(46447, 14)


Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income-per-year
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,1,1,2174,0,40,United-States,0
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,1,1,0,0,13,United-States,0
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,1,1,0,0,40,United-States,0
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,0,1,0,0,40,United-States,0
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,0,0,0,0,40,Cuba,0


In [7]:
# Split the frame into descriptive features (X) and target (y).
# X: every column except the label.
X = df.drop(columns=[label])
# y: the label together with the protected attribute; the original note says the
# protected feature is carried along because AIF360 needs it.
y = df.loc[:, [protected_attribute, label]]

In [8]:
# Append the protected attribute as an extra index level on both objects.
# X also keeps it as a regular column (drop=False).
X = X.set_index(protected_attribute, append=True, drop=False)

# Reduce y to the 1-d label Series for modelling; the index, including the
# protected-attribute level, is preserved.
y = y.set_index(protected_attribute, append=True)[label]

In [9]:
# Persist the prepared features and target under ../data/<data_name>/ so the
# experiment section can reload them with pd.read_pickle from the same paths.
X.to_pickle('../data/{}/{}_X'.format(data_name, data_name))
y.to_pickle('../data/{}/{}_y'.format(data_name, data_name))

In [10]:
# Missing values per column ('?' entries were parsed as NaN when the CSVs were read).
# NOTE(review): X was pickled in the previous cell with these NaNs still present;
# presumably FairGridSearch handles them downstream - confirm.
df.isna().sum()

age                   0
workclass          2637
education             0
education-num         0
marital-status        0
occupation         2647
relationship          0
race                  0
sex                   0
capital-gain          0
capital-loss          0
hours-per-week        0
native-country      705
income-per-year       0
dtype: int64

## [2.](#Table-of-Contents) Proposed GridSearch Approach 

The grid search covers three dimensions: model hyperparameters, classification thresholds, and bias-mitigation methods.

In [11]:
from FairGridSearch import *
%load_ext autoreload
%autoreload 2
# allow automatic reloading of changes in FairGridSearch file

  from .autonotebook import tqdm as notebook_tqdm
pip install 'aif360[FairAdapt]'


## 3. Case Study: Adult

In [12]:
# Column names of the prediction target and of the protected attribute.
label = 'income-per-year'
protected_attribute = 'race'

# Encoded values produced by the pre-processing cell:
# label 1 is '>50K' (label_mapping) and race 1 is 'White' (race_mapping).
pos_label = 1
priv_group = 1

# Both values are forwarded to fair_GridsearchCV as cv= and n_jobs=
# (presumably the number of CV folds and of parallel workers - confirm in FairGridSearch).
cv=10
# n_jobs=multiprocessing.cpu_count()-1
n_jobs=5

In [13]:
# Reload the prepared features and target written by the data-preparation section.
# NOTE: unpickling can execute arbitrary code; only load files produced by this notebook.
X = pd.read_pickle('../data/{}/{}_X'.format(data_name, data_name))
y = pd.read_pickle('../data/{}/{}_y'.format(data_name, data_name))

In [14]:
# define desired metric for the use case
# NOTE(review): neither constant is passed to fair_GridsearchCV in the cells below;
# presumably they are consumed elsewhere (e.g. when ranking the results tables) - confirm.
ACC_METRIC = 'avg_norm_mcc_score'
FAIR_METRIC = 'abs_avg_eod_score'

### Logistic Regression

In [16]:
# Fair grid search with Logistic Regression as base estimator (base='LR').
# The grid spans LR hyperparameters, five decision thresholds and ten
# bias-mitigation settings. The recorded run of this cell took about 107 minutes.
import timeit

start = timeit.default_timer()

param_grid = {
    'hyperp_grid': {
        'C': [1, 10],
        'solver': ['liblinear', 'saga'],
        'penalty': ['l2'],
    },
    'threshold': np.linspace(0.3, 0.7, 5),
    'Bias_Mitigation': [None, 'RW', 'LFR_pre', 'LFR_in', 'AD', 'EGR', 'ROC', 'CEO', 'RW+ROC', 'RW+CEO'],
}

clf_lr = fair_GridsearchCV(
    base='LR',
    param_grid=param_grid,
    prot_attr=protected_attribute,
    pos_label=pos_label,
    priv_group=priv_group,
    cv=cv,
    n_jobs=n_jobs,
)
if __name__ == '__main__':
    clf_lr.fit(X=X, y=y)

results_lr = clf_lr.output_table
print(clf_lr._best_param)

# Report the elapsed wall-clock time: minutes from 60 s upwards, seconds below.
stop = timeit.default_timer()
runtime = stop - start
if runtime >= 60:
    print('Time: ', runtime/60, 'min')
else:
    print('Time: ', runtime, 'sec')

# Save the full results table, then show it as the cell output.
results_lr.to_pickle('./{}_results/{}_results_LR'.format(data_name,data_name))
style_table(results_lr)

  0%|          | 0/4 [00:00<?, ?it/s]

{'C': 1, 'penalty': 'l2', 'solver': 'liblinear'}
------------------------------------------------------------------------------------------
start multiprocessing
------------------------------------------------------------------------------------------


 25%|██▌       | 1/4 [24:24<1:13:12, 1464.25s/it]

{'C': 1, 'penalty': 'l2', 'solver': 'saga'}
------------------------------------------------------------------------------------------
start multiprocessing
------------------------------------------------------------------------------------------


 50%|█████     | 2/4 [53:03<53:48, 1614.40s/it]  

{'C': 10, 'penalty': 'l2', 'solver': 'liblinear'}
------------------------------------------------------------------------------------------
start multiprocessing
------------------------------------------------------------------------------------------


 75%|███████▌  | 3/4 [1:18:08<26:04, 1564.54s/it]

{'C': 10, 'penalty': 'l2', 'solver': 'saga'}
------------------------------------------------------------------------------------------
start multiprocessing
------------------------------------------------------------------------------------------


100%|██████████| 4/4 [1:46:59<00:00, 1604.85s/it]


base_estimator                                                   LR
param              {'C': 1, 'penalty': 'l2', 'solver': 'liblinear'}
Bias_Mitigation                                                  AD
threshold                                                       0.5
Name: 22, dtype: object
Time:  106.9954618 min


Unnamed: 0,base_estimator,param,Bias_Mitigation,threshold,avg_acc_score,avg_bacc_score,avg_f1_score,avg_auc_score,avg_mcc_score,avg_norm_mcc_score,avg_spd_score,avg_aod_score,avg_eod_score,avg_ford_score,avg_ppvd_score,avg_(1-consistency_score),avg_gei_score,avg_ti_score,cost
0,LR,"{'C': 1, 'penalty': 'l2', 'solver': 'liblinear'}",,0.3,0.828622,0.814825,0.688789,0.906375,0.581954,0.790977,-0.191015,-0.111049,-0.115312,-0.032799,-0.018943,0.075884,0.07278,0.088909,0.400038
1,LR,"{'C': 1, 'penalty': 'l2', 'solver': 'liblinear'}",,0.4,0.846836,0.794773,0.685639,0.906375,0.584592,0.792296,-0.160238,-0.103517,-0.132392,-0.044665,-0.009235,0.073572,0.075612,0.103611,0.367942
2,LR,"{'C': 1, 'penalty': 'l2', 'solver': 'liblinear'}",,0.5,0.850927,0.764566,0.65866,0.906375,0.569445,0.784723,-0.131249,-0.094916,-0.140627,-0.056435,-0.019154,0.070153,0.080521,0.121287,0.346527
3,LR,"{'C': 1, 'penalty': 'l2', 'solver': 'liblinear'}",,0.6,0.847568,0.729333,0.612766,0.906375,0.544357,0.772179,-0.10732,-0.078318,-0.121593,-0.070686,0.026495,0.062032,0.086931,0.140859,0.335141
4,LR,"{'C': 1, 'penalty': 'l2', 'solver': 'liblinear'}",,0.7,0.836545,0.683978,0.534304,0.906375,0.501633,0.750817,-0.080301,-0.068118,-0.117692,-0.083294,0.023542,0.054569,0.096814,0.165934,0.329485
5,LR,"{'C': 1, 'penalty': 'l2', 'solver': 'liblinear'}",RW,0.3,0.82776,0.812271,0.68611,0.904716,0.578224,0.789112,-0.097973,0.023797,0.07572,-0.059758,-0.15312,0.077723,0.073542,0.090309,0.308861
6,LR,"{'C': 1, 'penalty': 'l2', 'solver': 'liblinear'}",RW,0.4,0.846212,0.792895,0.683431,0.904716,0.582003,0.791001,-0.079222,0.036889,0.088004,-0.073713,-0.150786,0.076853,0.076149,0.10464,0.28822
7,LR,"{'C': 1, 'penalty': 'l2', 'solver': 'liblinear'}",RW,0.5,0.850152,0.762649,0.655961,0.904716,0.566636,0.783318,-0.068268,0.03534,0.080324,-0.084377,-0.136474,0.071712,0.081091,0.122332,0.28495
8,LR,"{'C': 1, 'penalty': 'l2', 'solver': 'liblinear'}",RW,0.6,0.846018,0.725316,0.606169,0.904716,0.538722,0.769361,-0.053239,0.039685,0.083479,-0.096114,-0.123982,0.063857,0.088088,0.143078,0.283879
9,LR,"{'C': 1, 'penalty': 'l2', 'solver': 'liblinear'}",RW,0.7,0.834284,0.67983,0.525884,0.904716,0.493205,0.746602,-0.035392,0.040269,0.077105,-0.106493,-0.14312,0.056128,0.09839,0.168308,0.28879


### Random Forest

In [None]:
# Fair grid search with Random Forest as base estimator (base='RF').
# The grid spans RF hyperparameters, five decision thresholds and eight
# bias-mitigation settings.
import timeit

start = timeit.default_timer()

param_grid = {
    'hyperp_grid': {
        'n_estimators': [10, 50],
        'criterion': ['gini', 'entropy'],
        'max_depth': [16],
    },
    'threshold': np.linspace(0.3, 0.7, 5),
    'Bias_Mitigation': [None, 'RW', 'LFR_pre', 'EGR', 'ROC', 'CEO', 'RW+ROC', 'RW+CEO'],
}

clf_rf = fair_GridsearchCV(
    base='RF',
    param_grid=param_grid,
    prot_attr=protected_attribute,
    pos_label=pos_label,
    priv_group=priv_group,
    cv=cv,
    n_jobs=n_jobs,
)
if __name__ == '__main__':
    clf_rf.fit(X=X, y=y)

results_rf = clf_rf.output_table
print(clf_rf._best_param)

# Report the elapsed wall-clock time: minutes from 60 s upwards, seconds below.
stop = timeit.default_timer()
runtime = stop - start
if runtime >= 60:
    print('Time: ', runtime/60, 'min')
else:
    print('Time: ', runtime, 'sec')

# Save the full results table, then show it as the cell output.
results_rf.to_pickle('./{}_results/{}_results_RF'.format(data_name, data_name))
style_table(results_rf)

### Gradient Boosting

In [17]:
# Fair grid search with Gradient Boosting as base estimator (base='GB').
# The grid spans GB hyperparameters, five decision thresholds and eight
# bias-mitigation settings. The recorded run of this cell took about 217 minutes.
import timeit

start = timeit.default_timer()

param_grid = {
    'hyperp_grid': {
        'n_estimators': [10, 50],
        'criterion': ['friedman_mse'],
        'max_depth': [8, 32],
    },
    'threshold': np.linspace(0.3, 0.7, 5),
    'Bias_Mitigation': [None, 'RW', 'LFR_pre', 'EGR', 'ROC', 'CEO', 'RW+ROC', 'RW+CEO'],
}

clf_gb = fair_GridsearchCV(
    base='GB',
    param_grid=param_grid,
    prot_attr=protected_attribute,
    pos_label=pos_label,
    priv_group=priv_group,
    cv=cv,
    n_jobs=n_jobs,
)
if __name__ == '__main__':
    clf_gb.fit(X=X, y=y)

results_gb = clf_gb.output_table
print(clf_gb._best_param)

# Report the elapsed wall-clock time: minutes from 60 s upwards, seconds below.
stop = timeit.default_timer()
runtime = stop - start
if runtime >= 60:
    print('Time: ', runtime/60, 'min')
else:
    print('Time: ', runtime, 'sec')

# Save the full results table; displaying it is disabled for this model.
results_gb.to_pickle('./{}_results/{}_results_GB'.format(data_name, data_name))
# style_table(results_gb)

  0%|          | 0/4 [00:00<?, ?it/s]

{'criterion': 'friedman_mse', 'max_depth': 8, 'n_estimators': 10}
------------------------------------------------------------------------------------------
start multiprocessing
------------------------------------------------------------------------------------------


 25%|██▌       | 1/4 [15:47<47:22, 947.57s/it]

{'criterion': 'friedman_mse', 'max_depth': 8, 'n_estimators': 50}
------------------------------------------------------------------------------------------
start multiprocessing
------------------------------------------------------------------------------------------


 50%|█████     | 2/4 [49:54<53:08, 1594.46s/it]

{'criterion': 'friedman_mse', 'max_depth': 32, 'n_estimators': 10}
------------------------------------------------------------------------------------------
start multiprocessing
------------------------------------------------------------------------------------------


 75%|███████▌  | 3/4 [1:21:55<29:03, 1743.31s/it]

{'criterion': 'friedman_mse', 'max_depth': 32, 'n_estimators': 50}
------------------------------------------------------------------------------------------
start multiprocessing
------------------------------------------------------------------------------------------


100%|██████████| 4/4 [3:37:09<00:00, 3257.41s/it]


base_estimator                                                    GB
param              {'criterion': 'friedman_mse', 'max_depth': 8, ...
Bias_Mitigation                                               RW+ROC
threshold                                                        0.4
Name: 71, dtype: object
Time:  217.16786271666666 min


### Support Vector Machine

In [None]:
# import timeit
# start = timeit.default_timer()

# param_grid = {'hyperp_grid': {'kernel':['rbf','linear','poly','sigmoid'],'gamma':['scale']},
#               'threshold': np.linspace(0.3, 0.7, 5),
#               'Bias_Mitigation':[None,'RW','LFR_pre','EGR','ROC','CEO','RW+ROC','RW+CEO']}

# clf_svm = fair_GridsearchCV(base='SVM', param_grid=param_grid,
#                             prot_attr=protected_attribute, pos_label=pos_label, priv_group=priv_group,
#                             cv=cv, n_jobs=n_jobs)
# if __name__ == '__main__':
#     clf_svm.fit(X=X, y=y)
    
# results_svm = clf_svm.output_table
# print(clf_svm._best_param)

# stop = timeit.default_timer()
# runtime = stop - start
# if runtime < 60:
#     print('Time: ', runtime, 'sec')
# else: print('Time: ', runtime/60, 'min')
# results_svm.to_pickle('./{}_results/{}_results_SVM'.format(data_name, data_name))
# style_table(results_svm)

In [None]:
# SVM fair grid search without bias mitigation (Bias_Mitigation=[None]).
# The SVM experiments are split into one cell per mitigation setting; each cell
# reassigns clf_svm / results_svm and pickles its table under its own file name.
import timeit

start = timeit.default_timer()

param_grid = {
    'hyperp_grid': {
        'kernel': ['rbf', 'linear', 'poly', 'sigmoid'],
        'gamma': ['scale'],
    },
    'threshold': np.linspace(0.3, 0.7, 5),
    'Bias_Mitigation': [None],
}

clf_svm = fair_GridsearchCV(
    base='SVM',
    param_grid=param_grid,
    prot_attr=protected_attribute,
    pos_label=pos_label,
    priv_group=priv_group,
    cv=cv,
    n_jobs=n_jobs,
)
if __name__ == '__main__':
    clf_svm.fit(X=X, y=y)

results_svm = clf_svm.output_table
print(clf_svm._best_param)

# Report the elapsed wall-clock time: minutes from 60 s upwards, seconds below.
stop = timeit.default_timer()
runtime = stop - start
if runtime >= 60:
    print('Time: ', runtime/60, 'min')
else:
    print('Time: ', runtime, 'sec')

# Save this run's results table, then show it as the cell output.
results_svm.to_pickle('./{}_results/{}_results_SVM_None'.format(data_name, data_name))
style_table(results_svm)

In [None]:
# SVM fair grid search restricted to the 'RW' bias-mitigation setting.
# clf_svm / results_svm are reassigned by every SVM cell; only the pickle written
# at the end keeps this run's table.
import timeit

start = timeit.default_timer()

param_grid = {
    'hyperp_grid': {
        'kernel': ['rbf', 'linear', 'poly', 'sigmoid'],
        'gamma': ['scale'],
    },
    'threshold': np.linspace(0.3, 0.7, 5),
    'Bias_Mitigation': ['RW'],
}

clf_svm = fair_GridsearchCV(
    base='SVM',
    param_grid=param_grid,
    prot_attr=protected_attribute,
    pos_label=pos_label,
    priv_group=priv_group,
    cv=cv,
    n_jobs=n_jobs,
)
if __name__ == '__main__':
    clf_svm.fit(X=X, y=y)

results_svm = clf_svm.output_table
print(clf_svm._best_param)

# Report the elapsed wall-clock time: minutes from 60 s upwards, seconds below.
stop = timeit.default_timer()
runtime = stop - start
if runtime >= 60:
    print('Time: ', runtime/60, 'min')
else:
    print('Time: ', runtime, 'sec')

# Save this run's results table, then show it as the cell output.
results_svm.to_pickle('./{}_results/{}_results_SVM_RW'.format(data_name, data_name))
style_table(results_svm)

In [None]:
# SVM fair grid search for 'LFR_pre', first half of the kernels ('rbf', 'linear').
# The 'poly' and 'sigmoid' kernels are run in a separate cell that writes
# ..._SVM_LFR_pre2.
import timeit

start = timeit.default_timer()

param_grid = {
    'hyperp_grid': {
        'kernel': ['rbf', 'linear'],
        'gamma': ['scale'],
    },
    'threshold': np.linspace(0.3, 0.7, 5),
    'Bias_Mitigation': ['LFR_pre'],
}

clf_svm = fair_GridsearchCV(
    base='SVM',
    param_grid=param_grid,
    prot_attr=protected_attribute,
    pos_label=pos_label,
    priv_group=priv_group,
    cv=cv,
    n_jobs=n_jobs,
)
if __name__ == '__main__':
    clf_svm.fit(X=X, y=y)

results_svm = clf_svm.output_table
print(clf_svm._best_param)

# Report the elapsed wall-clock time: minutes from 60 s upwards, seconds below.
stop = timeit.default_timer()
runtime = stop - start
if runtime >= 60:
    print('Time: ', runtime/60, 'min')
else:
    print('Time: ', runtime, 'sec')

# Save this run's results table, then show it as the cell output.
results_svm.to_pickle('./{}_results/{}_results_SVM_LFR_pre'.format(data_name, data_name))
style_table(results_svm)

##### the code above took around 5 hours to run

In [None]:
# SVM fair grid search for 'LFR_pre', second half of the kernels ('poly', 'sigmoid').
# The table is pickled as ..._SVM_LFR_pre2, separate from the 'rbf'/'linear' run.
import timeit

start = timeit.default_timer()

param_grid = {
    'hyperp_grid': {
        'kernel': ['poly', 'sigmoid'],
        'gamma': ['scale'],
    },
    'threshold': np.linspace(0.3, 0.7, 5),
    'Bias_Mitigation': ['LFR_pre'],
}

clf_svm = fair_GridsearchCV(
    base='SVM',
    param_grid=param_grid,
    prot_attr=protected_attribute,
    pos_label=pos_label,
    priv_group=priv_group,
    cv=cv,
    n_jobs=n_jobs,
)
if __name__ == '__main__':
    clf_svm.fit(X=X, y=y)

results_svm = clf_svm.output_table
print(clf_svm._best_param)

# Report the elapsed wall-clock time: minutes from 60 s upwards, seconds below.
stop = timeit.default_timer()
runtime = stop - start
if runtime >= 60:
    print('Time: ', runtime/60, 'min')
else:
    print('Time: ', runtime, 'sec')

# Save this run's results table, then show it as the cell output.
results_svm.to_pickle('./{}_results/{}_results_SVM_LFR_pre2'.format(data_name, data_name))
style_table(results_svm)

##### the code above took around 11 hours to run

In [None]:
# EGR: 17 hr 47 min

In [None]:
# SVM fair grid search for 'EGR' with the 'rbf' kernel only.
# The EGR experiments are run one kernel per cell; this one writes ..._SVM_EGR.
import timeit

start = timeit.default_timer()

param_grid = {
    'hyperp_grid': {
        'kernel': ['rbf'],
        'gamma': ['scale'],
    },
    'threshold': np.linspace(0.3, 0.7, 5),
    'Bias_Mitigation': ['EGR'],
}

clf_svm = fair_GridsearchCV(
    base='SVM',
    param_grid=param_grid,
    prot_attr=protected_attribute,
    pos_label=pos_label,
    priv_group=priv_group,
    cv=cv,
    n_jobs=n_jobs,
)
if __name__ == '__main__':
    clf_svm.fit(X=X, y=y)

results_svm = clf_svm.output_table
print(clf_svm._best_param)

# Report the elapsed wall-clock time: minutes from 60 s upwards, seconds below.
stop = timeit.default_timer()
runtime = stop - start
if runtime >= 60:
    print('Time: ', runtime/60, 'min')
else:
    print('Time: ', runtime, 'sec')

# Save this run's results table, then show it as the cell output.
results_svm.to_pickle('./{}_results/{}_results_SVM_EGR'.format(data_name, data_name))
style_table(results_svm)

In [None]:
# EGR2 starts at 13:48, ends at 19:30, with around 5.5 hr pause
# took around 72 hr

In [None]:
# SVM fair grid search for 'EGR' with the 'linear' kernel only.
# The EGR experiments are run one kernel per cell; this one writes ..._SVM_EGR2.
import timeit

start = timeit.default_timer()

param_grid = {
    'hyperp_grid': {
        'kernel': ['linear'],
        'gamma': ['scale'],
    },
    'threshold': np.linspace(0.3, 0.7, 5),
    'Bias_Mitigation': ['EGR'],
}

clf_svm = fair_GridsearchCV(
    base='SVM',
    param_grid=param_grid,
    prot_attr=protected_attribute,
    pos_label=pos_label,
    priv_group=priv_group,
    cv=cv,
    n_jobs=n_jobs,
)
if __name__ == '__main__':
    clf_svm.fit(X=X, y=y)

results_svm = clf_svm.output_table
print(clf_svm._best_param)

# Report the elapsed wall-clock time: minutes from 60 s upwards, seconds below.
stop = timeit.default_timer()
runtime = stop - start
if runtime >= 60:
    print('Time: ', runtime/60, 'min')
else:
    print('Time: ', runtime, 'sec')

# Save this run's results table, then show it as the cell output.
results_svm.to_pickle('./{}_results/{}_results_SVM_EGR2'.format(data_name, data_name))
style_table(results_svm)

In [None]:
# EGR3 started 11:01, ended 4:06, paused for around 2 hr, took around 15 hr

In [None]:
# SVM fair grid search for 'EGR' with the 'poly' kernel only.
# The EGR experiments are run one kernel per cell; this one writes ..._SVM_EGR3.
import timeit

start = timeit.default_timer()

param_grid = {
    'hyperp_grid': {
        'kernel': ['poly'],
        'gamma': ['scale'],
    },
    'threshold': np.linspace(0.3, 0.7, 5),
    'Bias_Mitigation': ['EGR'],
}

clf_svm = fair_GridsearchCV(
    base='SVM',
    param_grid=param_grid,
    prot_attr=protected_attribute,
    pos_label=pos_label,
    priv_group=priv_group,
    cv=cv,
    n_jobs=n_jobs,
)
if __name__ == '__main__':
    clf_svm.fit(X=X, y=y)

results_svm = clf_svm.output_table
print(clf_svm._best_param)

# Report the elapsed wall-clock time: minutes from 60 s upwards, seconds below.
stop = timeit.default_timer()
runtime = stop - start
if runtime >= 60:
    print('Time: ', runtime/60, 'min')
else:
    print('Time: ', runtime, 'sec')

# Save this run's results table, then show it as the cell output.
results_svm.to_pickle('./{}_results/{}_results_SVM_EGR3'.format(data_name, data_name))
style_table(results_svm)

In [None]:
# EGR4 started 20:08, ended 9:13 (+1), took around 13 hr

In [None]:
# SVM fair grid search for 'EGR' with the 'sigmoid' kernel only.
# The EGR experiments are run one kernel per cell; this one writes ..._SVM_EGR4.
import timeit

start = timeit.default_timer()

param_grid = {
    'hyperp_grid': {
        'kernel': ['sigmoid'],
        'gamma': ['scale'],
    },
    'threshold': np.linspace(0.3, 0.7, 5),
    'Bias_Mitigation': ['EGR'],
}

clf_svm = fair_GridsearchCV(
    base='SVM',
    param_grid=param_grid,
    prot_attr=protected_attribute,
    pos_label=pos_label,
    priv_group=priv_group,
    cv=cv,
    n_jobs=n_jobs,
)
if __name__ == '__main__':
    clf_svm.fit(X=X, y=y)

results_svm = clf_svm.output_table
print(clf_svm._best_param)

# Report the elapsed wall-clock time: minutes from 60 s upwards, seconds below.
stop = timeit.default_timer()
runtime = stop - start
if runtime >= 60:
    print('Time: ', runtime/60, 'min')
else:
    print('Time: ', runtime, 'sec')

# Save this run's results table, then show it as the cell output.
results_svm.to_pickle('./{}_results/{}_results_SVM_EGR4'.format(data_name, data_name))
style_table(results_svm)

In [None]:
# SVM fair grid search restricted to the 'ROC' bias-mitigation setting.
# clf_svm / results_svm are reassigned by every SVM cell; only the pickle written
# at the end keeps this run's table.
import timeit

start = timeit.default_timer()

param_grid = {
    'hyperp_grid': {
        'kernel': ['rbf', 'linear', 'poly', 'sigmoid'],
        'gamma': ['scale'],
    },
    'threshold': np.linspace(0.3, 0.7, 5),
    'Bias_Mitigation': ['ROC'],
}

clf_svm = fair_GridsearchCV(
    base='SVM',
    param_grid=param_grid,
    prot_attr=protected_attribute,
    pos_label=pos_label,
    priv_group=priv_group,
    cv=cv,
    n_jobs=n_jobs,
)
if __name__ == '__main__':
    clf_svm.fit(X=X, y=y)

results_svm = clf_svm.output_table
print(clf_svm._best_param)

# Report the elapsed wall-clock time: minutes from 60 s upwards, seconds below.
stop = timeit.default_timer()
runtime = stop - start
if runtime >= 60:
    print('Time: ', runtime/60, 'min')
else:
    print('Time: ', runtime, 'sec')

# Save this run's results table, then show it as the cell output.
results_svm.to_pickle('./{}_results/{}_results_SVM_ROC'.format(data_name, data_name))
style_table(results_svm)

In [None]:
# SVM fair grid search restricted to the 'CEO' bias-mitigation setting.
# clf_svm / results_svm are reassigned by every SVM cell; only the pickle written
# at the end keeps this run's table.
import timeit

start = timeit.default_timer()

param_grid = {
    'hyperp_grid': {
        'kernel': ['rbf', 'linear', 'poly', 'sigmoid'],
        'gamma': ['scale'],
    },
    'threshold': np.linspace(0.3, 0.7, 5),
    'Bias_Mitigation': ['CEO'],
}

clf_svm = fair_GridsearchCV(
    base='SVM',
    param_grid=param_grid,
    prot_attr=protected_attribute,
    pos_label=pos_label,
    priv_group=priv_group,
    cv=cv,
    n_jobs=n_jobs,
)
if __name__ == '__main__':
    clf_svm.fit(X=X, y=y)

results_svm = clf_svm.output_table
print(clf_svm._best_param)

# Report the elapsed wall-clock time: minutes from 60 s upwards, seconds below.
stop = timeit.default_timer()
runtime = stop - start
if runtime >= 60:
    print('Time: ', runtime/60, 'min')
else:
    print('Time: ', runtime, 'sec')

# Save this run's results table, then show it as the cell output.
results_svm.to_pickle('./{}_results/{}_results_SVM_CEO'.format(data_name, data_name))
style_table(results_svm)

In [None]:
# SVM fair grid search restricted to the combined 'RW+ROC' bias-mitigation setting.
# clf_svm / results_svm are reassigned by every SVM cell; only the pickle written
# at the end keeps this run's table.
import timeit

start = timeit.default_timer()

param_grid = {
    'hyperp_grid': {
        'kernel': ['rbf', 'linear', 'poly', 'sigmoid'],
        'gamma': ['scale'],
    },
    'threshold': np.linspace(0.3, 0.7, 5),
    'Bias_Mitigation': ['RW+ROC'],
}

clf_svm = fair_GridsearchCV(
    base='SVM',
    param_grid=param_grid,
    prot_attr=protected_attribute,
    pos_label=pos_label,
    priv_group=priv_group,
    cv=cv,
    n_jobs=n_jobs,
)
if __name__ == '__main__':
    clf_svm.fit(X=X, y=y)

results_svm = clf_svm.output_table
print(clf_svm._best_param)

# Report the elapsed wall-clock time: minutes from 60 s upwards, seconds below.
stop = timeit.default_timer()
runtime = stop - start
if runtime >= 60:
    print('Time: ', runtime/60, 'min')
else:
    print('Time: ', runtime, 'sec')

# Save this run's results table, then show it as the cell output.
results_svm.to_pickle('./{}_results/{}_results_SVM_RWROC'.format(data_name, data_name))
style_table(results_svm)

In [None]:
# SVM fair grid search restricted to the combined 'RW+CEO' bias-mitigation setting.
# clf_svm / results_svm are reassigned by every SVM cell; only the pickle written
# at the end keeps this run's table.
import timeit

start = timeit.default_timer()

param_grid = {
    'hyperp_grid': {
        'kernel': ['rbf', 'linear', 'poly', 'sigmoid'],
        'gamma': ['scale'],
    },
    'threshold': np.linspace(0.3, 0.7, 5),
    'Bias_Mitigation': ['RW+CEO'],
}

clf_svm = fair_GridsearchCV(
    base='SVM',
    param_grid=param_grid,
    prot_attr=protected_attribute,
    pos_label=pos_label,
    priv_group=priv_group,
    cv=cv,
    n_jobs=n_jobs,
)
if __name__ == '__main__':
    clf_svm.fit(X=X, y=y)

results_svm = clf_svm.output_table
print(clf_svm._best_param)

# Report the elapsed wall-clock time: minutes from 60 s upwards, seconds below.
stop = timeit.default_timer()
runtime = stop - start
if runtime >= 60:
    print('Time: ', runtime/60, 'min')
else:
    print('Time: ', runtime, 'sec')

# Save this run's results table, then show it as the cell output.
results_svm.to_pickle('./{}_results/{}_results_SVM_RWCEO'.format(data_name, data_name))
style_table(results_svm)

### Gaussian Naive Bayes

In [15]:
# Fair grid search with Gaussian Naive Bayes as base estimator (base='NB').
# var_smoothing takes the four log-spaced values 1, 1e-3, 1e-6 and 1e-9.
# The recorded run of this cell took about 46 minutes.
import timeit

start = timeit.default_timer()

param_grid = {
    'hyperp_grid': {
        'var_smoothing': np.logspace(0,-9, num=4),
    },
    'threshold': np.linspace(0.3, 0.7, 5),
    'Bias_Mitigation': [None, 'RW', 'LFR_pre', 'EGR', 'ROC', 'CEO', 'RW+ROC', 'RW+CEO'],
}

clf_nb = fair_GridsearchCV(
    base='NB',
    param_grid=param_grid,
    prot_attr=protected_attribute,
    pos_label=pos_label,
    priv_group=priv_group,
    cv=cv,
    n_jobs=n_jobs,
)
if __name__ == '__main__':
    clf_nb.fit(X=X, y=y)

results_nb = clf_nb.output_table
# NOTE(review): the disabled print below refers to `clf`, which this cell does not
# define; it would presumably need to be clf_nb if re-enabled.
# print(clf._best_param)

# Report the elapsed wall-clock time: minutes from 60 s upwards, seconds below.
stop = timeit.default_timer()
runtime = stop - start
if runtime >= 60:
    print('Time: ', runtime/60, 'min')
else:
    print('Time: ', runtime, 'sec')

# Save the full results table, then show it as the cell output.
results_nb.to_pickle('./{}_results/{}_results_NB'.format(data_name, data_name))
style_table(results_nb)

  0%|          | 0/4 [00:00<?, ?it/s]

{'var_smoothing': 1.0}
------------------------------------------------------------------------------------------
start multiprocessing
------------------------------------------------------------------------------------------


 25%|██▌       | 1/4 [12:04<36:14, 724.82s/it]

{'var_smoothing': 0.001}
------------------------------------------------------------------------------------------
start multiprocessing
------------------------------------------------------------------------------------------


 50%|█████     | 2/4 [23:13<23:03, 691.69s/it]

{'var_smoothing': 1e-06}
------------------------------------------------------------------------------------------
start multiprocessing
------------------------------------------------------------------------------------------


 75%|███████▌  | 3/4 [34:27<11:23, 683.66s/it]

{'var_smoothing': 1e-09}
------------------------------------------------------------------------------------------
start multiprocessing
------------------------------------------------------------------------------------------


100%|██████████| 4/4 [45:34<00:00, 683.54s/it]


Time:  45.578280836666664 min


Unnamed: 0,base_estimator,param,Bias_Mitigation,threshold,avg_acc_score,avg_bacc_score,avg_f1_score,avg_auc_score,avg_mcc_score,avg_norm_mcc_score,avg_spd_score,avg_aod_score,avg_eod_score,avg_ford_score,avg_ppvd_score,avg_(1-consistency_score),avg_gei_score,avg_ti_score,cost
0,NB,{'var_smoothing': 1.0},,0.3,0.543587,0.684619,0.502264,0.85614,0.337281,0.668641,-0.290092,-0.165623,-0.061645,-0.013621,-0.084306,0.036472,0.064667,0.0691,0.621451
1,NB,{'var_smoothing': 1.0},,0.4,0.567292,0.69885,0.514578,0.85614,0.356569,0.678284,-0.284695,-0.160247,-0.06149,-0.013986,-0.083345,0.037505,0.066517,0.070683,0.60641
2,NB,{'var_smoothing': 1.0},,0.5,0.588778,0.711498,0.52612,0.85614,0.373933,0.686966,-0.28599,-0.159382,-0.062306,-0.014637,-0.077687,0.038013,0.068104,0.072077,0.599024
3,NB,{'var_smoothing': 1.0},,0.6,0.608887,0.723391,0.537519,0.85614,0.390716,0.695358,-0.289978,-0.163355,-0.070589,-0.013543,-0.071501,0.03887,0.06937,0.07313,0.59462
4,NB,{'var_smoothing': 1.0},,0.7,0.629599,0.735499,0.54984,0.85614,0.40828,0.70414,-0.289704,-0.164627,-0.078453,-0.012475,-0.06656,0.038668,0.070483,0.074057,0.585564
5,NB,{'var_smoothing': 1.0},RW,0.3,0.557646,0.69308,0.509524,0.856047,0.348709,0.674355,-0.19367,-0.095382,-0.030634,-0.017693,-0.126761,0.036368,0.065776,0.070047,0.519316
6,NB,{'var_smoothing': 1.0},RW,0.4,0.581286,0.706993,0.521934,0.856047,0.367616,0.683808,-0.199201,-0.095702,-0.028938,-0.018976,-0.12405,0.036769,0.067622,0.071693,0.515393
7,NB,{'var_smoothing': 1.0},RW,0.5,0.601223,0.719018,0.53321,0.856047,0.384578,0.692289,-0.201676,-0.096058,-0.030851,-0.018585,-0.123166,0.037152,0.068852,0.072648,0.509387
8,NB,{'var_smoothing': 1.0},RW,0.6,0.621159,0.730951,0.54502,0.856047,0.401804,0.700902,-0.200505,-0.092731,-0.028989,-0.019098,-0.122514,0.037225,0.069904,0.073441,0.499603
9,NB,{'var_smoothing': 1.0},RW,0.7,0.641225,0.742205,0.55702,0.856047,0.418275,0.709137,-0.20219,-0.092688,-0.030658,-0.019906,-0.12045,0.03766,0.071019,0.074514,0.493053


### TabTransformer

In [None]:
# TabTrans
# None: 5 hr
# RW: 6hr
# ROC: 5hr
# CEO: 6hr
# RW+ROC: 6.5hr
# RW+CEO: 6hr

In [15]:
import os
import timeit

# Run one grid search per bias-mitigation setting, so that each setting's
# result table is pickled as soon as it is available instead of only after
# the whole sweep has finished.
Bias_Mitigation=[None,'RW','ROC','CEO','RW+ROC','RW+CEO']
# Bias_Mitigation=['RW+ROC','RW+CEO']  # alternative: run only a subset of the settings

# Create the output folder up front: a missing directory would otherwise only
# surface in to_pickle, after a full grid search has already been spent.
results_dir = './{}_results'.format(data_name)
os.makedirs(results_dir, exist_ok=True)

for BiasM in Bias_Mitigation:
    start = timeit.default_timer()

    # NOTE(review): 'learing_rate' looks like a misspelling of 'learning_rate',
    # but the key is handed to fair_GridsearchCV unchanged -- confirm which
    # spelling the TabTrans wrapper expects before renaming it.
    param_grid = {'hyperp_grid': {'epochs':[20, 30],'learing_rate':[1e-04, 1e-05]},
                  'threshold': np.linspace(0.3, 0.7, 5),
                  'Bias_Mitigation':[BiasM]}

    clf_tab = fair_GridsearchCV(base='TabTrans',param_grid=param_grid,
                                prot_attr=protected_attribute, pos_label=pos_label, priv_group=priv_group,
                                cv=cv, n_jobs=n_jobs)

    clf_tab.fit(X=X, y=y)
    results_tab = clf_tab.output_table
    # print(clf._best_param)

    # Wall-clock time for this setting, reported in seconds or minutes.
    stop = timeit.default_timer()
    runtime = stop - start
    if runtime < 60:
        print('Time: ', runtime, 'sec')
    else:
        print('Time: ', runtime/60, 'min')

    # One pickle per setting; for BiasM=None the file name ends in '_None'.
    results_tab.to_pickle('{}/{}_results_TabTrans_{}'.format(results_dir, data_name, BiasM))

    # Jupyter only auto-renders the last expression of a cell, so a bare
    # style_table(...) call inside the loop shows nothing. Display it explicitly.
    # Presumably style_table returns a styled table; the None guard keeps the
    # cell quiet if it renders the table itself instead -- TODO confirm.
    styled = style_table(results_tab)
    if styled is not None:
        display(styled)

  0%|          | 0/4 [00:00<?, ?it/s]

{'epochs': 20, 'learing_rate': 0.0001}
------------------------------------------------------------------------------------------
start multiprocessing
------------------------------------------------------------------------------------------


 25%|██▌       | 1/4 [30:52<1:32:36, 1852.24s/it]

{'epochs': 20, 'learing_rate': 1e-05}
------------------------------------------------------------------------------------------
start multiprocessing
------------------------------------------------------------------------------------------


 50%|█████     | 2/4 [58:12<57:35, 1727.66s/it]  

{'epochs': 30, 'learing_rate': 0.0001}
------------------------------------------------------------------------------------------
start multiprocessing
------------------------------------------------------------------------------------------


 75%|███████▌  | 3/4 [1:42:17<35:46, 2146.47s/it]

{'epochs': 30, 'learing_rate': 1e-05}
------------------------------------------------------------------------------------------
start multiprocessing
------------------------------------------------------------------------------------------


100%|██████████| 4/4 [2:24:10<00:00, 2162.72s/it]


Time:  144.18575866166665 min


  0%|          | 0/4 [00:00<?, ?it/s]

{'epochs': 20, 'learing_rate': 0.0001}
------------------------------------------------------------------------------------------
start multiprocessing
------------------------------------------------------------------------------------------


 25%|██▌       | 1/4 [30:19<1:30:59, 1819.85s/it]

{'epochs': 20, 'learing_rate': 1e-05}
------------------------------------------------------------------------------------------
start multiprocessing
------------------------------------------------------------------------------------------


 50%|█████     | 2/4 [59:01<58:44, 1762.05s/it]  

{'epochs': 30, 'learing_rate': 0.0001}
------------------------------------------------------------------------------------------
start multiprocessing
------------------------------------------------------------------------------------------


 75%|███████▌  | 3/4 [1:42:59<36:02, 2162.18s/it]

{'epochs': 30, 'learing_rate': 1e-05}
------------------------------------------------------------------------------------------
start multiprocessing
------------------------------------------------------------------------------------------


100%|██████████| 4/4 [2:27:05<00:00, 2206.33s/it]


Time:  147.09125131166667 min


  0%|          | 0/4 [00:00<?, ?it/s]

{'epochs': 20, 'learing_rate': 0.0001}
------------------------------------------------------------------------------------------
start multiprocessing
------------------------------------------------------------------------------------------


 25%|██▌       | 1/4 [30:00<1:30:01, 1800.62s/it]

{'epochs': 20, 'learing_rate': 1e-05}
------------------------------------------------------------------------------------------
start multiprocessing
------------------------------------------------------------------------------------------


 50%|█████     | 2/4 [1:00:25<1:00:29, 1814.92s/it]

{'epochs': 30, 'learing_rate': 0.0001}
------------------------------------------------------------------------------------------
start multiprocessing
------------------------------------------------------------------------------------------


 75%|███████▌  | 3/4 [1:45:17<36:55, 2215.41s/it]  

{'epochs': 30, 'learing_rate': 1e-05}
------------------------------------------------------------------------------------------
start multiprocessing
------------------------------------------------------------------------------------------


100%|██████████| 4/4 [2:29:24<00:00, 2241.02s/it]


Time:  149.40282366499997 min


  0%|          | 0/4 [00:00<?, ?it/s]

{'epochs': 20, 'learing_rate': 0.0001}
------------------------------------------------------------------------------------------
start multiprocessing
------------------------------------------------------------------------------------------


 25%|██▌       | 1/4 [30:26<1:31:19, 1826.64s/it]

{'epochs': 20, 'learing_rate': 1e-05}
------------------------------------------------------------------------------------------
start multiprocessing
------------------------------------------------------------------------------------------


 50%|█████     | 2/4 [59:17<59:00, 1770.01s/it]  

{'epochs': 30, 'learing_rate': 0.0001}
------------------------------------------------------------------------------------------
start multiprocessing
------------------------------------------------------------------------------------------


 75%|███████▌  | 3/4 [1:44:00<36:27, 2187.32s/it]

{'epochs': 30, 'learing_rate': 1e-05}
------------------------------------------------------------------------------------------
start multiprocessing
------------------------------------------------------------------------------------------


100%|██████████| 4/4 [2:28:35<00:00, 2228.86s/it]


Time:  148.59232773833338 min


  0%|          | 0/4 [00:00<?, ?it/s]

{'epochs': 20, 'learing_rate': 0.0001}
------------------------------------------------------------------------------------------
start multiprocessing
------------------------------------------------------------------------------------------


 25%|██▌       | 1/4 [30:02<1:30:06, 1802.26s/it]

{'epochs': 20, 'learing_rate': 1e-05}
------------------------------------------------------------------------------------------
start multiprocessing
------------------------------------------------------------------------------------------


 50%|█████     | 2/4 [1:00:13<1:00:15, 1807.64s/it]

{'epochs': 30, 'learing_rate': 0.0001}
------------------------------------------------------------------------------------------
start multiprocessing
------------------------------------------------------------------------------------------


 75%|███████▌  | 3/4 [1:44:37<36:38, 2198.76s/it]  

{'epochs': 30, 'learing_rate': 1e-05}
------------------------------------------------------------------------------------------
start multiprocessing
------------------------------------------------------------------------------------------


100%|██████████| 4/4 [2:28:55<00:00, 2233.95s/it]


Time:  148.93107813 min


  0%|          | 0/4 [00:00<?, ?it/s]

{'epochs': 20, 'learing_rate': 0.0001}
------------------------------------------------------------------------------------------
start multiprocessing
------------------------------------------------------------------------------------------


 25%|██▌       | 1/4 [29:58<1:29:54, 1798.25s/it]

{'epochs': 20, 'learing_rate': 1e-05}
------------------------------------------------------------------------------------------
start multiprocessing
------------------------------------------------------------------------------------------


 50%|█████     | 2/4 [1:00:08<1:00:10, 1805.19s/it]

{'epochs': 30, 'learing_rate': 0.0001}
------------------------------------------------------------------------------------------
start multiprocessing
------------------------------------------------------------------------------------------


 75%|███████▌  | 3/4 [1:45:02<36:51, 2211.16s/it]  

{'epochs': 30, 'learing_rate': 1e-05}
------------------------------------------------------------------------------------------
start multiprocessing
------------------------------------------------------------------------------------------


100%|██████████| 4/4 [2:26:23<00:00, 2195.90s/it]


Time:  146.3956986516667 min


In [None]:
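# Original single-call version: all Bias_Mitigation settings in one grid search,
# saved to a single pickle. Superseded by the per-setting loop above; kept for reference.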
# import timeit
# start = timeit.default_timer()

# param_grid = {'hyperp_grid': {'epochs':[20, 30],'learing_rate':[1e-04, 1e-05]},
#               'threshold': np.linspace(0.3, 0.7, 5),
#               'Bias_Mitigation':[None,'RW','ROC','CEO','RW+ROC','RW+CEO']}

# clf_tab = fair_GridsearchCV(base='TabTrans',param_grid=param_grid, 
#                             prot_attr=protected_attribute, pos_label=pos_label, priv_group=priv_group,
#                             cv=cv, n_jobs=n_jobs)

# clf_tab.fit(X=X, y=y)
# results_tab = clf_tab.output_table
# # print(clf._best_param)

# stop = timeit.default_timer()
# runtime = stop - start
# if runtime < 60:
#     print('Time: ', runtime, 'sec')
# else: print('Time: ', runtime/60, 'min')
# results_tab.to_pickle('./{}_results/{}_results_TabTrans'.format(data_name, data_name))

# style_table(results_tab)