In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
from scipy.spatial import distance
from scipy.stats import wasserstein_distance
from matplotlib import pyplot as plt
from collections import Counter
from sklearn.neighbors import NearestNeighbors
from sklearn.utils import check_X_y

from iFair import iFair

In [2]:
def one_hot_encoder(X):
    numeric_cols = X[X.columns[:-1]]._get_numeric_data().columns
    categ_cols = list(set(X.columns[:-1]) ^ set(numeric_cols))
    encoded_list = []
    one_hot_attrs = set()

    for cat in categ_cols:
        encoded_column = pd.get_dummies(X[cat])
        encoded_list.append(encoded_column)
        for col in encoded_column.columns:
            one_hot_attrs.add(col)

    df = X[numeric_cols]
    for encoded in encoded_list:
        df = df.join([encoded])

    return df.join(X[X.columns[-1]])

def normalize_data(X):
    scaler = MinMaxScaler()
    scaler.fit(X)
    return pd.DataFrame(scaler.transform(X), index=X.index, columns=X.columns)

Running COMPAS samples

In [3]:
compas_samples = ['samples/compas_{}'.format(i) for i in range(15)]
fair_Xs_train ,ys_train, Xs_test, ys_test = [], [], [], []

for sample in compas_samples:
    compas = pd.read_csv(sample)
    del compas['Unnamed: 0']

    compas['two_year_recid'].replace({'Yes': 0,
                                    'No': 1}, inplace=True)

    X_compas = compas[compas.columns]
    y_compas = compas[compas.columns[-1]]

    protected_attr = X_compas.pop('race')
    X_compas.insert(len(X_compas.columns)-1, protected_attr.name, protected_attr)

    oh_compas = one_hot_encoder(X_compas)
    oh_compas = normalize_data(oh_compas)
    X_train, X_test, y_train, y_test = train_test_split(oh_compas[oh_compas.columns[:-1]], oh_compas['two_year_recid'], train_size=0.7, stratify=oh_compas['two_year_recid'])
    X_train = np.array(X_train)

    fair = iFair() # como definir as colunas que representam os atributos sensiveis?
    fair_Xs_train.append(fair.fit_transform(X_train))
    ys_train.append(np.array(y_train))
    Xs_test.append(X_test)
    ys_test.append(y_test)

Fitting and transforming...
Fitting iFair...


Compilation is falling back to object mode WITH looplifting enabled because Function "iFair" failed type inference due to: [1m[1mUnknown attribute 'iters' of type recursive(type(CPUDispatcher(<function iFair at 0x000001F730464AE8>)))
[1m
File "iFair_impl\lowrank_helpers.py", line 98:[0m
[1mdef iFair(params, X, D_X_f=0, k=10, A_x=1e-4, A_z=1e-4, results=0, recompute_D_X_f = False):
[1m    iFair.iters += 1
[0m    [1m^[0m[0m
[0m
[0m[1m[1] During: typing of get attribute at C:\Users\malu.maia\Desktop\iFair-master\iFair_impl\lowrank_helpers.py (98)[0m
[1m
File "iFair_impl\lowrank_helpers.py", line 98:[0m
[1mdef iFair(params, X, D_X_f=0, k=10, A_x=1e-4, A_z=1e-4, results=0, recompute_D_X_f = False):
[1m    iFair.iters += 1
[0m    [1m^[0m[0m
[0m
  @jit
[1m
File "iFair_impl\lowrank_helpers.py", line 97:[0m
[1m@jit
[1mdef iFair(params, X, D_X_f=0, k=10, A_x=1e-4, A_z=1e-4, results=0, recompute_D_X_f = False):
[0m[1m^[0m[0m
[0m
  state.func_ir.loc))
Fall-back from

Fitting and transforming...
Fitting iFair...
Fitting and transforming...
Fitting iFair...
Fitting and transforming...
Fitting iFair...
Fitting and transforming...
Fitting iFair...
Fitting and transforming...
Fitting iFair...
Fitting and transforming...
Fitting iFair...
Fitting and transforming...
Fitting iFair...
Fitting and transforming...
Fitting iFair...
Fitting and transforming...
Fitting iFair...
Fitting and transforming...
Fitting iFair...
Fitting and transforming...
Fitting iFair...
Fitting and transforming...
Fitting iFair...
Fitting and transforming...
Fitting iFair...
Fitting and transforming...
Fitting iFair...


In [7]:
def consistency(X, y, n_neighbors=5):
    X, y = check_X_y(X, y)
    neighbors = NearestNeighbors(n_neighbors=n_neighbors)
    neighbors.fit(X)
    idx = neighbors.kneighbors(X, return_distance=False)
    return 1 - abs(y - y[idx].mean(axis=1)).mean()

In [8]:
def auc_func(fair_Xs_train, ys_train, Xs_test, ys_test):
    aucs = []
    clfs = []
    consistencies = []
    for i in range(len(fair_Xs_train)):
        X_train, y_train = fair_Xs_train[i], np.array(list(map(float, ys_train[i])))
        X_test, y_test = Xs_test[i], ys_test[i]
        clf = DecisionTreeClassifier()
        clf.fit(X_train, y_train)
        y_hat = clf.predict(X_test)
        aucs.append(roc_auc_score(y_test, y_hat))
        consistencies.append(consistency(X_test, y_hat))
        clfs.append(clf)
    return aucs, clfs, consistencies

In [9]:
aucs, clfs_compas, consis_compas = auc_func(fair_Xs_train ,ys_train, Xs_test, ys_test)
np.mean(aucs), np.mean(consis_compas)

(0.5445581833625311, 0.903111111111111)

Running German samples

In [10]:
german_samples = ['samples/german_{}'.format(i) for i in range(15)]
german_fair_Xs_train, german_ys_train, german_Xs_test, german_ys_test = [], [], [], []

for sample in german_samples:
    german = pd.read_csv(sample)
    del german['Unnamed: 0']

    german['classification'].replace({'Bad': 0,
                                      'Good': 1}, inplace=True)

    german['sex'].replace({'Female': 0,
                           'Male': 1}, inplace=True)

    X_german = german[german.columns]
    y_german = german[german.columns[-1]]

    protected_attr = X_compas.pop('race')
    X_compas.insert(len(X_compas.columns)-1, protected_attr.name, protected_attr)

    oh_german = one_hot_encoder(X_german)
    oh_german = normalize_data(oh_german)
    X_train, X_test, y_train, y_test = train_test_split(oh_german[oh_german.columns[:-1]], oh_german['classification'], train_size=0.7, stratify=oh_german['classification'])
    X_train = np.array(X_train)

    fair = iFair()
    german_fair_Xs_train.append(fair.fit_transform(X_train))
    german_ys_train.append(np.array(y_train))
    german_Xs_test.append(X_test)
    german_ys_test.append(y_test)

Fitting and transforming...
Fitting iFair...
Fitting and transforming...
Fitting iFair...
Fitting and transforming...
Fitting iFair...
Fitting and transforming...
Fitting iFair...
Fitting and transforming...
Fitting iFair...
Fitting and transforming...
Fitting iFair...
Fitting and transforming...
Fitting iFair...
Fitting and transforming...
Fitting iFair...
Fitting and transforming...
Fitting iFair...
Fitting and transforming...
Fitting iFair...
Fitting and transforming...
Fitting iFair...
Fitting and transforming...
Fitting iFair...


  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanmean(X, axis=0)
  col_mean = np.nanm

Fitting and transforming...
Fitting iFair...
Fitting and transforming...
Fitting iFair...
Fitting and transforming...
Fitting iFair...


In [11]:
german_aucs, german_clfs, german_consis = auc_func(german_fair_Xs_train, german_ys_train, german_Xs_test, german_ys_test)
np.mean(german_aucs), np.mean(german_consis)

(0.48435883021123305, 0.8515555555555554)

In [12]:
adult_samples = ['samples/adult_{}'.format(i) for i in range(15)]
adult_fair_Xs_train, adult_ys_train, adult_Xs_test, adult_ys_test = [], [], [], []

for sample in adult_samples:
    adult = pd.read_csv(sample)
    del adult['Unnamed: 0']

    adult['income'].replace({'<=50K': 0,
                             '>50K': 1}, inplace=True)

    adult['gender'].replace({'Female': 0,
                           'Male': 1}, inplace=True)

    X_adult = adult[adult.columns]
    y_adult = adult[adult.columns[-1]]

    protected_attr = X_adult.pop('race')
    X_adult.insert(len(X_adult.columns)-1, protected_attr.name, protected_attr)

    oh_adult = one_hot_encoder(X_adult)
    oh_adult = normalize_data(oh_adult)
    X_train, X_test, y_train, y_test = train_test_split(oh_adult[oh_adult.columns[:-1]], oh_adult['income'], train_size=0.7, stratify=oh_adult['income'])
    X_train = np.array(X_train)

    fair = iFair()
    adult_fair_Xs_train.append(fair.fit_transform(X_train))
    adult_ys_train.append(np.array(y_train))
    adult_Xs_test.append(X_test)
    adult_ys_test.append(y_test)

Fitting and transforming...
Fitting iFair...
Fitting and transforming...
Fitting iFair...
Fitting and transforming...
Fitting iFair...
Fitting and transforming...
Fitting iFair...
Fitting and transforming...
Fitting iFair...
Fitting and transforming...
Fitting iFair...
Fitting and transforming...
Fitting iFair...
Fitting and transforming...
Fitting iFair...
Fitting and transforming...
Fitting iFair...
Fitting and transforming...
Fitting iFair...
Fitting and transforming...
Fitting iFair...
Fitting and transforming...
Fitting iFair...
Fitting and transforming...
Fitting iFair...
Fitting and transforming...
Fitting iFair...
Fitting and transforming...
Fitting iFair...


In [13]:
adult_aucs, adult_clfs, adult_consis = auc_func(adult_fair_Xs_train, adult_ys_train, adult_Xs_test, adult_ys_test)
np.mean(adult_aucs), np.mean(adult_consis)

(0.46017849300457997, 0.9026666666666666)

In [124]:
auc_means = {'German': np.mean(german_aucs),
             'Adult': np.mean(adult_aucs),
             'COMPAS': np.mean(aucs)}

cons_means = {'German': np.mean(german_consis),
              'Adult': np.mean(adult_consis),
              'COMPAS': np.mean(consis_compas)}

In [125]:
auc_means, cons_means

({'German': 0.0, 'Adult': 0.20666666666666664, 'COMPAS': 0.2},
 {'German': 0.48435883021123305,
  'Adult': 0.46017849300457997,
  'COMPAS': 0.5445581833625311},
 {'German': 0.8515555555555554,
  'Adult': 0.9026666666666666,
  'COMPAS': 0.903111111111111})