In [1]:
import numpy as np
import pandas as pd
import sklearn 
import imblearn
import matplotlib.pyplot as plt 
plt.rc("font", size=14)
import seaborn as sns
sns.set(style="white")
sns.set(style="whitegrid", color_codes=True)
from sklearn.metrics import classification_report, confusion_matrix
import math 

In [2]:
# Sensitive attribute for the fairness analysis.
# Must be a column of the dataset: either 'SEX' or 'Age Group'.
sen_att = "SEX"

In [3]:
# Read the dataset from the CSV file next to the notebook and print it
# (pandas truncates the printed frame on its own).
csv_path = 'taiwan_data.csv'
taiwan_data = pd.read_csv(csv_path)
print(taiwan_data)

       LIMIT_BAL  SEX  EDUCATION  MARRIAGE  Age Group  PAY_0  PAY_2  PAY_3  \
0          20000    1          2         1          0      2      2     -1   
1         120000    1          2         2          0     -1      2      0   
2          90000    1          2         2          0      0      0      0   
3          50000    1          2         1          0      0      0      0   
4          50000    0          2         1          1     -1      0     -1   
...          ...  ...        ...       ...        ...    ...    ...    ...   
29995     220000    0          3         1          0      0      0      0   
29996     150000    0          3         2          0     -1     -1     -1   
29997      30000    0          2         2          0      4      3      2   
29998      80000    0          3         1          0      1     -1      0   
29999      50000    0          2         1          0      0      0      0   

       PAY_4  PAY_5  ...  BILL_AMT4  BILL_AMT5  BILL_AMT6  PAY_

In [4]:
# Split the data into a train and a test set, then balance the training set.
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

# Features are every column except the target; the target is kept as a
# single-column DataFrame so its column name survives later concatenation.
# (drop / [[...]] raise a KeyError if the column is missing instead of
# silently producing an empty selection.)
X = taiwan_data.drop(columns=['Creditworthiness'])
y = taiwan_data[['Creditworthiness']]

# Stratify on the target so both splits keep the original class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=0
)

# Remove the sensitive attribute from the model inputs, but keep the test-set
# values so the fairness metrics can be computed on the predictions later.
X_test_sen = X_test[[sen_att]]
X_test = X_test.drop(columns=[sen_att])
X_train = X_train.drop(columns=[sen_att])

# Apply SMOTE to the training set only; the test set stays untouched.
# NOTE: the name `os` shadows the standard-library module of the same name;
# it is kept here because other cells may refer to it.
os = SMOTE(random_state=0)

columns = X_train.columns
# `fit_resample` replaces `fit_sample`, which was deprecated and then removed
# from imbalanced-learn.
os_data_X, os_data_y = os.fit_resample(X_train, y_train)
# Re-wrap as DataFrames so the column names are present regardless of whether
# the installed imbalanced-learn returns arrays or DataFrames.
os_data_X = pd.DataFrame(data=os_data_X, columns=columns)
os_data_y = pd.DataFrame(data=os_data_y, columns=['Creditworthiness'])



In [5]:
#print(os_data_X)


In [6]:
# Check the size and class balance of the oversampled training data.
n_oversampled = len(os_data_X)
target_col = os_data_y['Creditworthiness']
n_no_approval = len(os_data_y[target_col == 0])
n_approval = len(os_data_y[target_col == 1])

print("length of oversampled data is ", n_oversampled)
print(len(os_data_y))
print("Number of no approval in oversampled data", n_no_approval)
print("Number of approval", n_approval)
print("Proportion of no approval data in oversampled data is ", n_no_approval / n_oversampled)
print("Proportion of approval data in oversampled data is ", n_approval / n_oversampled)

length of oversampled data is  37382
37382
Number of no approval in oversampled data 18691
Number of approval 18691
Proportion of no approval data in oversampled data is  0.5
Proportion of approval data in oversampled data is  0.5


In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_curve
from sklearn.preprocessing import minmax_scale
from sklearn.preprocessing import MaxAbsScaler
from sklearn.metrics import classification_report

# The estimator expects a 1-D target, so flatten the single-column frame.
os_data_y_2 = os_data_y.to_numpy().ravel()

# L2-regularised logistic regression, trained on the oversampled data.
logit = LogisticRegression(penalty='l2', solver='liblinear', max_iter=5000)
logit.fit(os_data_X, os_data_y_2)

# Predict on the untouched test set (sensitive attribute already removed).
y_pred = logit.predict(X_test)

# Confusion matrix: rows are true classes, columns are predicted classes.
matrix = sklearn.metrics.confusion_matrix(y_test, y_pred)
# Keep and report the accuracy; previously the value was computed and dropped.
accuracy = sklearn.metrics.accuracy_score(y_test, y_pred)
print(matrix)
print("Accuracy = %.4f" % accuracy)

[[ 461  866]
 [ 988 3685]]


In [8]:
# Per-class precision / recall / F1 of the predictions on the test set.
report_text = classification_report(y_test, y_pred)
print(report_text)

              precision    recall  f1-score   support

           0       0.32      0.35      0.33      1327
           1       0.81      0.79      0.80      4673

    accuracy                           0.69      6000
   macro avg       0.56      0.57      0.57      6000
weighted avg       0.70      0.69      0.70      6000



In [9]:
# Group definitions for the fairness tooling, keyed by the sensitive attribute:
# value 1 is treated as the unprivileged group, value 0 as the privileged one.
unprivileged_groups = [{sen_att: 1}]
privileged_groups = [{sen_att: 0}]

# Neither setting below is read by the cells visible here.
# NOTE(review): presumably consumed by a later post-processing step - confirm.
randseed = 12345679
cost_constraint = 'fnr'

In [10]:
# Rebuild the full test frame for the fairness metrics: model features, then
# the held-back sensitive attribute, then the true labels, aligned on the index.
fair_test_parts = [X_test, X_test_sen, y_test]
fair_test_df = pd.concat(fair_test_parts, axis=1, join="inner")

In [11]:
# Fairness metrics function
from collections import OrderedDict
from aif360.datasets import StandardDataset
from aif360.metrics import BinaryLabelDatasetMetric, ClassificationMetric

# Wrap the test frame in an aif360 dataset: label 1 is the favorable outcome
# and value 0 of the sensitive attribute marks the privileged group.
standard_dataset_args = dict(
    label_name='Creditworthiness',
    favorable_classes=[1],
    protected_attribute_names=[sen_att],
    privileged_classes=[[0]],
)
dataset = StandardDataset(fair_test_df, **standard_dataset_args)

def fair_metrics(dataset, y_pred, disp = True):
    """Compute accuracy and group-fairness metrics for a set of predictions.

    Only the first protected attribute of ``dataset`` is evaluated.

    Parameters
    ----------
    dataset : aif360 dataset
        Holds the ground-truth labels and the protected attribute(s).
    y_pred : array-like
        Predicted labels, one per row of ``dataset`` and in the same order.
    disp : bool, default True
        When true, print every metric as ``name = value`` with four decimals.

    Returns
    -------
    collections.OrderedDict
        Maps metric name to value, in this order: "Balanced accuracy",
        "Statistical parity difference", "Disparate impact",
        "Average odds difference", "Equal opportunity difference".
    """
    # Work on a copy so the ground-truth dataset keeps its own labels.
    dataset_pred = dataset.copy()
    # aif360 keeps labels as an (n, 1) column; give the predictions the same
    # layout instead of assigning a flat array.
    dataset_pred.labels = np.asarray(y_pred).reshape(-1, 1)

    # Build the group definitions from the first protected attribute.
    attr = dataset_pred.protected_attribute_names[0]
    privileged_groups = [{attr: dataset_pred.privileged_protected_attributes[0][0]}]
    unprivileged_groups = [{attr: dataset_pred.unprivileged_protected_attributes[0][0]}]

    # Compares ground truth (dataset) against predictions (dataset_pred).
    classified_metric_pred = ClassificationMetric(
        dataset,
        dataset_pred,
        unprivileged_groups=unprivileged_groups,
        privileged_groups=privileged_groups,
    )

    metrics = OrderedDict()
    metrics["Balanced accuracy"] = 0.5*(classified_metric_pred.true_positive_rate()+
                                             classified_metric_pred.true_negative_rate())
    metrics["Statistical parity difference"] = classified_metric_pred.statistical_parity_difference()
    metrics["Disparate impact"] = classified_metric_pred.disparate_impact()
    metrics["Average odds difference"] = classified_metric_pred.average_odds_difference()
    metrics["Equal opportunity difference"] = classified_metric_pred.equal_opportunity_difference()

    if disp:
        for k in metrics:
            print("%s = %.4f" % (k, metrics[k]))

    # Return the values so callers can use them, also when disp is False.
    return metrics


# Capture the result so the cell shows only the printed metrics, not a dict echo.
test_fair_metrics = fair_metrics(dataset, y_pred)

Balanced accuracy = 0.5680
Statistical parity difference = 0.0142
Disparate impact = 1.0190
Average odds difference = -0.0000
Equal opportunity difference = 0.0128
