In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import roc_auc_score
from scipy.spatial.distance import jensenshannon
from sklearn.neighbors import NearestNeighbors
from sklearn.utils import check_X_y

from iFair import iFair

In [42]:
def encoder(X, adult=False):
    X_enc = X.copy()
    if adult:
        X_enc = X_enc.iloc[:, 1:]
    X_enc = X_enc.iloc[:, :-1]
    enc = OrdinalEncoder()
    X_enc = enc.fit_transform(X_enc)
    if adult:
        X_enc = pd.DataFrame(X_enc, columns=X.columns[1:-1])
    else:
        X_enc = pd.DataFrame(X_enc, columns=X.columns[:-1])
    return X_enc.join(X.iloc[:, -1])

def normalize_data(X):
    scaler = MinMaxScaler()
    scaler.fit(X)
    return pd.DataFrame(scaler.transform(X), index=X.index, columns=X.columns)

In [23]:
def consistency(X, y, n_neighbors=5):
    X, y = check_X_y(X, y)
    neighbors = NearestNeighbors(n_neighbors=n_neighbors)
    neighbors.fit(X)
    idx = neighbors.kneighbors(X, return_distance=False)
    return 1 - abs(y - y[idx].mean(axis=1)).mean()

In [24]:
def auc_func(fair_Xs_train, ys_train, Xs_test, ys_test):
    aucs = []
    clfs = []
    consistencies = []
    for i in range(len(fair_Xs_train)):
        X_train, y_train = fair_Xs_train[i], np.array(list(map(float, ys_train[i])))
        X_test, y_test = Xs_test[i], ys_test[i]
        clf = DecisionTreeClassifier()
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        aucs.append(roc_auc_score(y_test, y_pred))
        consistencies.append(consistency(X_test, y_pred))
        clfs.append(clf)
    return aucs, clfs, consistencies

Running COMPAS samples

In [29]:
compas_samples = ['samples/compas_{}'.format(i) for i in range(15)]
fair_Xs_train ,ys_train, Xs_test, ys_test = [], [], [], []

for sample in compas_samples:
    compas = pd.read_csv(sample)

    compas['two_year_recid'].replace({'Yes': 0,
                                      'No': 1}, inplace=True)

    X_compas = compas[compas.columns]
    y_compas = compas[compas.columns[-1]]

    protected_attr = X_compas.pop('race')
    X_compas.insert(len(X_compas.columns)-1, protected_attr.name, protected_attr)

    X_enc = encoder(compas)
    normalize_data(X_enc)
    X_train, X_test, y_train, y_test = train_test_split(X_enc[X_enc.columns[:-1]], X_enc['two_year_recid'], train_size=0.8, stratify=X_enc['two_year_recid'])
    X_train = np.array(X_train)

    fair = iFair()
    fair_Xs_train.append(fair.fit_transform(X_train))
    ys_train.append(np.array(y_train))
    Xs_test.append(X_test)
    ys_test.append(y_test)

Fitting and transforming...
Fitting iFair...
Fitting and transforming...
Fitting iFair...
Fitting and transforming...
Fitting iFair...
Fitting and transforming...
Fitting iFair...


  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanm

Fitting and transforming...
Fitting iFair...
Fitting and transforming...
Fitting iFair...
Fitting and transforming...
Fitting iFair...


  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanm

Fitting and transforming...
Fitting iFair...


  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanm

Fitting and transforming...
Fitting iFair...
Fitting and transforming...
Fitting iFair...
Fitting and transforming...
Fitting iFair...


  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanm

Fitting and transforming...
Fitting iFair...
Fitting and transforming...
Fitting iFair...
Fitting and transforming...
Fitting iFair...
Fitting and transforming...
Fitting iFair...


  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanm

In [30]:
aucs, clfs_compas, consis_compas = auc_func(fair_Xs_train ,ys_train, Xs_test, ys_test)
np.mean(aucs), np.mean(consis_compas)

(0.5116554470662122, 0.9584)

Running German samples

In [31]:
german_samples = ['samples/german_{}'.format(i) for i in range(15)]
german_fair_Xs_train, german_ys_train, german_Xs_test, german_ys_test = [], [], [], []

for sample in german_samples:
    german = pd.read_csv(sample)
    #del german['Unnamed: 0']

    german['classification'].replace({'Bad': 0,
                                      'Good': 1}, inplace=True)

    german['sex'].replace({'Female': 0,
                           'Male': 1}, inplace=True)

    X_german = german[german.columns]
    y_german = german[german.columns[-1]]

    protected_attr = X_german.pop('sex')
    X_german.insert(len(X_german.columns)-1, protected_attr.name, protected_attr)

    oh_german = encoder(X_german)
    oh_german = normalize_data(oh_german)
    X_train, X_test, y_train, y_test = train_test_split(oh_german[oh_german.columns[:-1]], oh_german['classification'], train_size=0.8, stratify=oh_german['classification'])
    X_train = np.array(X_train)

    fair = iFair()
    german_fair_Xs_train.append(fair.fit_transform(X_train))
    german_ys_train.append(np.array(y_train))
    german_Xs_test.append(X_test)
    german_ys_test.append(y_test)

Fitting and transforming...
Fitting iFair...
Fitting and transforming...
Fitting iFair...
Fitting and transforming...
Fitting iFair...
Fitting and transforming...
Fitting iFair...
Fitting and transforming...
Fitting iFair...
Fitting and transforming...
Fitting iFair...
Fitting and transforming...
Fitting iFair...
Fitting and transforming...
Fitting iFair...
Fitting and transforming...
Fitting iFair...
Fitting and transforming...
Fitting iFair...
Fitting and transforming...
Fitting iFair...
Fitting and transforming...
Fitting iFair...
Fitting and transforming...
Fitting iFair...
Fitting and transforming...
Fitting iFair...
Fitting and transforming...
Fitting iFair...


In [32]:
german_aucs, german_clfs, german_consis = auc_func(german_fair_Xs_train, german_ys_train, german_Xs_test, german_ys_test)
np.mean(german_aucs), np.mean(german_consis)

(0.48896537277504964, 0.8250666666666667)

In [44]:
adult_samples = ['samples/adult_{}'.format(i) for i in range(15)]
adult_fair_Xs_train, adult_ys_train, adult_Xs_test, adult_ys_test = [], [], [], []

for sample in adult_samples:
    adult = pd.read_csv(sample)
    #del adult['Unnamed: 0']

    adult['income'].replace({'<=50K': 0,
                             '>50K': 1}, inplace=True)

    adult['gender'].replace({'Female': 0,
                             'Male': 1}, inplace=True)
    
    adult.dropna(inplace=True)
    adult.reset_index(drop=True, inplace=True)

    X_adult = adult[adult.columns]
    y_adult = adult[adult.columns[-1]]

    protected_attr = X_adult.pop('gender')
    X_adult.insert(len(X_adult.columns)-1, protected_attr.name, protected_attr)

    oh_adult = encoder(X_adult, adult=True)
    oh_adult = normalize_data(oh_adult)
    X_train, X_test, y_train, y_test = train_test_split(oh_adult[oh_adult.columns[:-1]], oh_adult['income'], train_size=0.8, stratify=oh_adult['income'])
    X_train = np.array(X_train)

    fair = iFair()
    adult_fair_Xs_train.append(fair.fit_transform(X_train))
    adult_ys_train.append(np.array(y_train))
    adult_Xs_test.append(X_test)
    adult_ys_test.append(y_test)

Fitting and transforming...
Fitting iFair...
Fitting and transforming...
Fitting iFair...
Fitting and transforming...
Fitting iFair...
Fitting and transforming...
Fitting iFair...
Fitting and transforming...
Fitting iFair...
Fitting and transforming...
Fitting iFair...
Fitting and transforming...
Fitting iFair...
Fitting and transforming...
Fitting iFair...
Fitting and transforming...
Fitting iFair...
Fitting and transforming...
Fitting iFair...
Fitting and transforming...
Fitting iFair...
Fitting and transforming...
Fitting iFair...
Fitting and transforming...
Fitting iFair...
Fitting and transforming...
Fitting iFair...
Fitting and transforming...
Fitting iFair...


In [45]:
adult_aucs, adult_clfs, adult_consis = auc_func(adult_fair_Xs_train, adult_ys_train, adult_Xs_test, adult_ys_test)
np.mean(adult_aucs), np.mean(adult_consis)

(0.4845672492000754, 0.8891508113691561)

In [46]:
auc_means = {'German': np.mean(german_aucs),
             'Adult': np.mean(adult_aucs),
             'COMPAS': np.mean(aucs)}

cons_means = {'German': np.mean(german_consis),
              'Adult': np.mean(adult_consis),
              'COMPAS': np.mean(consis_compas)}

In [47]:
auc_means, cons_means

({'German': 0.48896537277504964,
  'Adult': 0.4845672492000754,
  'COMPAS': 0.5116554470662122},
 {'German': 0.8250666666666667, 'Adult': 0.8891508113691561, 'COMPAS': 0.9584})