# Preferential sampling 

In [1]:
import numpy as np
import pandas as pd
import sklearn 
import imblearn
import matplotlib.pyplot as plt 
plt.rc("font", size=14)
import seaborn as sns
sns.set(style="white")
sns.set(style="whitegrid", color_codes=True)
from sklearn.metrics import classification_report, confusion_matrix
import math 

In [2]:
german_data = pd.read_csv('german_data.csv')
print(german_data)

     stat_check_acc  duration_month  credit_history  purpose  credit_amount  \
0                 1               6               5        4           1169   
1                 2              48               3        4           5951   
2                 4              12               5        7           2096   
3                 1              42               3        3           7882   
4                 1              24               4        1           4870   
..              ...             ...             ...      ...            ...   
995               4              12               3        3           1736   
996               1              30               3        2           3857   
997               4              12               3        4            804   
998               1              45               3        4           1845   
999               2              45               5        2           4576   

     Age Group  savings_bonds  employment_since  in

In [3]:
### set sensitive attribute equal to 'SEX' or 'Age Group' ###

sen_att= 'Age Group'

In [4]:
#split data in a test and train set

from sklearn.model_selection import train_test_split
X = german_data.loc[:, german_data.columns != 'approval']
y = german_data.loc[:, german_data.columns == 'approval']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify = y, random_state=0)

#apply SMOTE to the trainingset
from imblearn.over_sampling import SMOTE
os = SMOTE(random_state=0)

columns = X_train.columns
os_data_X, os_data_y=os.fit_sample(X_train, y_train)
os_data_X = pd.DataFrame(data=os_data_X,columns=columns )
os_data_y= pd.DataFrame(data=os_data_y,columns=['approval'])


In [5]:
# we can Check the numbers of our data
print("length of oversampled data is ",len(os_data_X))
print (len(os_data_y))
print("Number of no approval in oversampled data",len(os_data_y[os_data_y['approval']==0]))
print("Number of approval",len(os_data_y[os_data_y['approval']==1]))
print("Proportion of no approval data in oversampled data is ",len(os_data_y[os_data_y['approval']==0])/len(os_data_X))
print("Proportion of approval data in oversampled data is ",len(os_data_y[os_data_y['approval']==1])/len(os_data_X))

length of oversampled data is  1120
1120
Number of no approval in oversampled data 560
Number of approval 560
Proportion of no approval data in oversampled data is  0.5
Proportion of approval data in oversampled data is  0.5


In [6]:
#ranker 

from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import KNeighborsClassifier

#Create KNN Classifier
knn = KNeighborsClassifier(n_neighbors = 15)
knn.fit(os_data_X, os_data_y)

from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import KNeighborsClassifier
y_pred = knn.predict(os_data_X)


from sklearn.metrics import classification_report
print(classification_report(os_data_y, y_pred))



              precision    recall  f1-score   support

           0       0.64      0.77      0.70       560
           1       0.72      0.57      0.64       560

    accuracy                           0.67      1120
   macro avg       0.68      0.67      0.67      1120
weighted avg       0.68      0.67      0.67      1120



  knn.fit(os_data_X, os_data_y)


In [7]:
matrix = sklearn.metrics.confusion_matrix(os_data_y, y_pred)
sklearn.metrics.accuracy_score(os_data_y, y_pred)
print(matrix)

[[433 127]
 [239 321]]


In [8]:
#result = os_data_X.append(os_data_y, ignore_index=True, sort=False)
result = pd.concat([os_data_X, os_data_y.reindex(os_data_X.index)], axis=1)
prob  = knn.predict_proba(os_data_X)[:,1]


result['prob'] = prob
print(result)
#df_test = [os_data_X, os_data_y, df_prob]

      stat_check_acc  duration_month  credit_history  purpose  credit_amount  \
0                  2              24               3        1           1246   
1                  1              12               3        1            900   
2                  4               6               3        1            672   
3                  4              10               3        2           2848   
4                  4              48               5       10           7629   
...              ...             ...             ...      ...            ...   
1115               1              24               2        4           3222   
1116               2              33               3        1          12455   
1117               1              20               2        6           3177   
1118               1              12               3        6            684   
1119               1              32               3        2           3454   

      Age Group  savings_bonds  employm

In [9]:
result_sorted = result.sort_values(by=[sen_att, 'prob'], ascending=[True, False], ignore_index=True)
result_sorted["action"] = pd.Series([0 for x in range(len(result_sorted.index))])

result_sorted


Unnamed: 0,stat_check_acc,duration_month,credit_history,purpose,credit_amount,Age Group,savings_bonds,employment_since,installment_in_percent,sex,...,property,other_installment_plans,housing,nr_credits,job,nr_dependants,phone,approval,prob,action
0,2,12,1,9,1410,0,1,3,2,0,...,1,3,2,1,2,1,2,1,1.000000,0
1,2,14,3,10,1410,0,3,5,1,0,...,1,3,2,1,3,1,2,1,1.000000,0
2,1,15,3,1,1403,0,1,3,2,1,...,3,3,1,1,3,1,1,1,1.000000,0
3,4,12,5,10,1412,0,1,3,4,1,...,1,3,2,2,4,1,2,1,1.000000,0
4,4,24,3,4,1413,0,1,3,4,0,...,2,3,2,1,3,1,1,1,1.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1115,2,54,1,10,15779,1,1,2,3,0,...,4,2,1,1,3,1,2,0,0.266667,0
1116,2,57,1,10,15307,1,1,3,3,0,...,4,1,2,1,3,1,2,0,0.266667,0
1117,2,48,4,7,6224,1,1,5,4,0,...,4,3,3,1,3,1,1,0,0.200000,0
1118,2,27,3,7,2723,1,2,4,4,0,...,3,1,1,1,2,1,1,0,0.200000,0


In [10]:

S = sen_att
y = 'approval'
sv = 1
nv=  0
df=  result_sorted 
DP = 0
DN = 0
FP = 0
FN = 0
    
for row in range(len(df)) :
    if df.at[row, S] == 1 and df.at[row, y] == 0:
        DN +=1
    elif df.at[row, S] == 1 and df.at[row, y] == 1: 
        DP += 1 
    elif df.at[row, S] == 0 and df.at[row, y] == 0: 
        FN += 1
    else:
        FP += 1 

print(DP, DN, FP, FN )
    #identify needed group sizes
    
df_size = DN + DP + FN + FP
size_group = (df_size/4)
    
print(DN, size_group)
    

72 32 488 528
32 280.0


In [11]:
##### if DN is smaller than size_group, no changes are made ####

df["action"] = pd.Series([0 for x in range(len(df.index))])
    
sort_DN = df.sort_values(by=[ S, 'prob'], ascending=[False, False], ignore_index=True)
dn_count = DN

if DN > size_group:
    for row in range(len(sort_DN)):
        if dn_count > size_group and sort_DN.at[row, S] == sv and sort_DN.at[row, y] == 0:
            sort_DN.at[row, "action"] == 2
            dn_count -= 1
        else:
            pass
        #return sort_DN
    else: 
        pass
    #print(dn_count, DN, size_group)
print (sort_DN)


      stat_check_acc  duration_month  credit_history  purpose  credit_amount  \
0                  4              13               3        4           1409   
1                  4              12               3        2           1413   
2                  1              12               5        2           1409   
3                  1              30               3        3           3622   
4                  3              12               3        3           1424   
...              ...             ...             ...      ...            ...   
1115               2              36               3        7          12612   
1116               2              19               3        2          12844   
1117               3              24               4        1          12608   
1118               2              23               3        2          12631   
1119               3              19               4        1          12791   

      Age Group  savings_bonds  employm

In [12]:
sort_DN['action'].value_counts()

0    1120
Name: action, dtype: int64

In [13]:

sort_DP = sort_DN.sort_values(by=[S, 'prob'], ascending=[False, True], ignore_index=True)

  
if DP < size_group:
    dp_count = DP
    for row in range(len(sort_DP)):
        if dp_count < size_group and sort_DP.at[row, S] == sv and sort_DP.at[row, y] == 1:
            sort_DP.at[row, "action"] = 1
                #print(sort_DP.at[row, "action"])
            dp_count += 1
        else:
            pass
print(sort_DP)
    
    
 

      stat_check_acc  duration_month  credit_history  purpose  credit_amount  \
0                  4              60               5        1          13756   
1                  2              48               4        7           6224   
2                  2              27               3        7           2723   
3                  1               6               3        1          14896   
4                  4              12               5        1            682   
...              ...             ...             ...      ...            ...   
1115               1              15               3        1           1403   
1116               4              12               5       10           1412   
1117               4              24               3        4           1413   
1118               2               8               3        4           1414   
1119               4              24               5        4           2872   

      Age Group  savings_bonds  employm

In [14]:
sort_DP['action'].value_counts()

0    1048
1      72
Name: action, dtype: int64

In [15]:
sort_FN = sort_DP.sort_values(by=[sen_att, 'prob'], ascending=[True, False], ignore_index=True) 
            
if FN < size_group:
    fn_count = FN
        
    for row in range(len(sort_FN)):
        if dn_count < size_group and sort_FN.at[row, S] == nv and sort_FN.at[row, y] == 0:
            sort_FN.at[row, "action"] = 1
            fn_count += 1
        else: 
            pass
    else: 
        pass



In [16]:

sort_FP = sort_FN.sort_values(by=[sen_att, 'prob'], ascending=[True, True], ignore_index=True)
    
if FP > size_group: 
    fp_count = FP
    for row in range(len(sort_FP)):
        if fp_count > size_group and sort_FP.at[row, S] == 1 and sort_FP.at[row, y] == 1:
            sort_FP.at[row, "action"] = 2
            fp_count -= 1
        else:
            pass 
            

    else:
        pass 
        


    
## 1 is duplicate 2 is delete     

In [17]:
print(sort_FP)
sort_FP['action'].value_counts()

      stat_check_acc  duration_month  credit_history  purpose  credit_amount  \
0                  4              21               5        1          12680   
1                  2              18               3        2          12976   
2                  2              24               3        2          12579   
3                  2              36               3        7          12612   
4                  2              19               3        2          12844   
...              ...             ...             ...      ...            ...   
1115               1              30               3        3           3622   
1116               3              12               3        3           1424   
1117               4              13               3        4           1409   
1118               4              12               3        2           1413   
1119               1              12               5        2           1409   

      Age Group  savings_bonds  employm

0    1048
2      72
Name: action, dtype: int64

In [18]:
df_duplicate = sort_FP
df_duplicate = df_duplicate.drop(df_duplicate[df_duplicate.action == 2].index)
df_duplicate = df_duplicate.drop(df_duplicate[df_duplicate.action == 0].index)
print (df_duplicate)

Empty DataFrame
Columns: [stat_check_acc, duration_month, credit_history, purpose, credit_amount, Age Group, savings_bonds, employment_since, installment_in_percent, sex, debtors_guarant, residence_since, property, other_installment_plans, housing, nr_credits, job, nr_dependants, phone, approval, prob, action]
Index: []

[0 rows x 22 columns]


In [19]:
sort_FP.append(df_duplicate)
final_dataset = sort_FP.drop(df_duplicate[df_duplicate.action == 2].index)
final_dataset = final_dataset.drop(columns=['prob', 'action'])
print(final_dataset)

      stat_check_acc  duration_month  credit_history  purpose  credit_amount  \
0                  4              21               5        1          12680   
1                  2              18               3        2          12976   
2                  2              24               3        2          12579   
3                  2              36               3        7          12612   
4                  2              19               3        2          12844   
...              ...             ...             ...      ...            ...   
1115               1              30               3        3           3622   
1116               3              12               3        3           1424   
1117               4              13               3        4           1409   
1118               4              12               3        2           1413   
1119               1              12               5        2           1409   

      Age Group  savings_bonds  employm

In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_curve
from sklearn.preprocessing import minmax_scale
from sklearn.preprocessing import MaxAbsScaler

X_train = final_dataset.loc[:, final_dataset.columns != 'approval']
y_train = final_dataset.loc[:, final_dataset.columns == 'approval']




logit = LogisticRegression(penalty='l2', solver = 'liblinear', max_iter = 5000)
#logit = LogisticRegression()
logit.fit(X_train, y_train)


y_pred =  logit.predict(X_test)


from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

matrix = sklearn.metrics.confusion_matrix(y_test, y_pred)
accuracy_score = sklearn.metrics.accuracy_score(y_test, y_pred)
print(matrix, accuracy_score)


              precision    recall  f1-score   support

           0       0.58      0.70      0.64        60
           1       0.86      0.79      0.82       140

    accuracy                           0.76       200
   macro avg       0.72      0.74      0.73       200
weighted avg       0.78      0.76      0.77       200

[[ 42  18]
 [ 30 110]] 0.76


  return f(**kwargs)


In [21]:
privileged_groups = [{sen_att: 0}]
unprivileged_groups = [{sen_att: 1}]
cost_constraint = "fnr"
randseed = 12345679 

In [22]:
fair_test_df = pd.concat([X_test, y_test], axis=1, join="inner")


In [23]:
# Metrics function
from collections import OrderedDict
from aif360.datasets import StandardDataset
from aif360.metrics import BinaryLabelDatasetMetric, ClassificationMetric

dataset = StandardDataset(fair_test_df, 
                          label_name='approval', 
                          favorable_classes=[1], 
                          protected_attribute_names=[sen_att], 
                          privileged_classes=[[0]])

def fair_metrics(dataset, y_pred, disp = True):
    dataset_pred =dataset.copy()
    dataset_pred.labels = y_pred
        
    attr = dataset_pred.protected_attribute_names[0]
    
    idx = dataset_pred.protected_attribute_names.index(attr)
    privileged_groups =  [{attr:dataset_pred.privileged_protected_attributes[idx][0]}] 
    unprivileged_groups = [{attr:dataset_pred.unprivileged_protected_attributes[idx][0]}] 

    classified_metric_pred = ClassificationMetric(dataset, dataset_pred, unprivileged_groups=unprivileged_groups, privileged_groups=privileged_groups)

    metric_pred = BinaryLabelDatasetMetric(dataset_pred, unprivileged_groups=unprivileged_groups, privileged_groups=privileged_groups)

    metrics = OrderedDict()
    metrics["Balanced accuracy"] = 0.5*(classified_metric_pred.true_positive_rate()+
                                             classified_metric_pred.true_negative_rate())
    metrics["Statistical parity difference"] = classified_metric_pred.statistical_parity_difference()
    metrics["Disparate impact"] = classified_metric_pred.disparate_impact()
    metrics["Average odds difference"] = classified_metric_pred.average_odds_difference()
    metrics["Equal opportunity difference"] = classified_metric_pred.equal_opportunity_difference()
    metrics["Theil index"] = classified_metric_pred.theil_index()
        
    if disp:
        for k in metrics:
            print("%s = %.4f" % (k, metrics[k]))


fair_metrics(dataset, y_pred)

Balanced accuracy = 0.7429
Statistical parity difference = 0.1723
Disparate impact = 1.2783
Average odds difference = -0.0397
Equal opportunity difference = 0.2479
Theil index = 0.1946
