In [54]:
import openml, fairlib
import fairlib as fl
from fairlib.inprocessing import Fauci
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
import torch
import torch.nn as nn


In [55]:
dataset = openml.datasets.get_dataset(179)
X, y, _, names = dataset.get_data(target=dataset.default_target_attribute)

INFO:openml.datasets.dataset:pickle write adult


In [56]:
imputer = SimpleImputer(strategy='most_frequent')
X_imputed = imputer.fit_transform(X)

In [57]:
X_discretized = X_imputed.copy()
for col in X.columns:
    if X[col].dtype == 'category':
        le = LabelEncoder()
        X_discretized[:, X.columns.get_loc(col)] = le.fit_transform(X_discretized[:, X.columns.get_loc(col)])


In [58]:
X = fairlib.DataFrame(X_discretized, columns=names)
y = y.apply(lambda x: x == ">50K").astype(int)

In [59]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35, random_state=41)

In [60]:
X_train['income'] = y_train

In [61]:
X_train

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native-country,income
7567,0,3,141698.0,0,6,4,2,4,4,1,0,0,2,38,0
3749,4,3,318450.0,12,14,2,0,0,4,1,0,0,4,38,1
7496,0,3,227626.0,11,9,0,13,1,4,1,0,0,2,38,0
36662,0,3,91733.0,15,10,4,7,3,4,0,0,0,1,38,0
7701,4,3,245193.0,12,14,2,3,0,4,1,0,0,3,38,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48597,2,3,36423.0,15,10,2,0,0,4,1,0,0,3,38,0
41281,2,5,186934.0,15,10,2,11,0,4,1,0,0,3,38,0
20450,2,5,217826.0,11,9,2,2,0,2,1,0,0,1,22,0
931,0,3,52114.0,15,10,4,9,3,4,0,0,0,0,38,0


In [62]:
fauci_train_dataset = fl.DataFrame(X_train)
X_train.drop(columns=["income"], inplace=True)
num_features = X_train.shape[1]

In [63]:
fauci_train_dataset.targets = "income"
fauci_train_dataset.sensitive = 'sex' # fauci currently supports only one sensitive attribute

In [64]:
def create_model(input_shape):
    return nn.Sequential(
        nn.Linear(input_shape, 64),
        nn.ReLU(),
        nn.Linear(64, 32),
        nn.ReLU(),
        nn.Linear(32, 16),
        nn.ReLU(),
        nn.Linear(16, 8),
        nn.ReLU(),
        nn.Linear(8, 1),
        nn.Sigmoid()
    )

In [65]:
unprocessed = Fauci(create_model(num_features), loss=nn.BCELoss(), fairness_regularization=None, regularization_weight=0.0)

In [66]:
inprocessing_spd = Fauci(create_model(num_features), loss=nn.BCELoss(), fairness_regularization='spd', regularization_weight=0.5)

In [67]:
inprocessing_di = Fauci(create_model(num_features), loss=nn.BCELoss(), fairness_regularization='di', regularization_weight=0.5)

In [68]:
EPOCHS = 20
BATCH_SIZE = 200
VALIDATION_SPLIT=0.3

In [69]:
unprocessed.fit(fauci_train_dataset, epochs=EPOCHS, batch_size=BATCH_SIZE)

Epoch [1/20], Loss: 1.2590
Epoch [2/20], Loss: 0.6306
Epoch [3/20], Loss: 0.6076
Epoch [4/20], Loss: 0.5906
Epoch [5/20], Loss: 0.5782
Epoch [6/20], Loss: 0.5692
Epoch [7/20], Loss: 0.5631
Epoch [8/20], Loss: 0.5589
Epoch [9/20], Loss: 0.5563
Epoch [10/20], Loss: 0.5546
Epoch [11/20], Loss: 0.5535
Epoch [12/20], Loss: 0.5530
Epoch [13/20], Loss: 0.5526
Epoch [14/20], Loss: 0.5525
Epoch [15/20], Loss: 0.5523
Epoch [16/20], Loss: 0.5523
Epoch [17/20], Loss: 0.5524
Epoch [18/20], Loss: 0.5523
Epoch [19/20], Loss: 0.5524
Epoch [20/20], Loss: 0.5524


<fairlib.inprocessing.fauci.Fauci at 0x16aa74e30>

In [70]:
inprocessing_spd.fit(fauci_train_dataset, epochs=EPOCHS, batch_size=BATCH_SIZE)

Epoch [1/20], Loss: 37.9460
Epoch [2/20], Loss: 37.9488
Epoch [3/20], Loss: 37.9420
Epoch [4/20], Loss: 37.9449
Epoch [5/20], Loss: 37.9454
Epoch [6/20], Loss: 37.9517
Epoch [7/20], Loss: 37.9488
Epoch [8/20], Loss: 37.9488
Epoch [9/20], Loss: 37.9443
Epoch [10/20], Loss: 37.9437
Epoch [11/20], Loss: 37.9522
Epoch [12/20], Loss: 37.9466
Epoch [13/20], Loss: 37.9471
Epoch [14/20], Loss: 37.9466
Epoch [15/20], Loss: 37.9500
Epoch [16/20], Loss: 37.9454
Epoch [17/20], Loss: 37.9477
Epoch [18/20], Loss: 37.9494
Epoch [19/20], Loss: 37.9437
Epoch [20/20], Loss: 37.9398


<fairlib.inprocessing.fauci.Fauci at 0x300ed9910>

In [71]:
inprocessing_di.fit(fauci_train_dataset, epochs=EPOCHS, batch_size=BATCH_SIZE)

Epoch [1/20], Loss: 12.1601
Epoch [2/20], Loss: 12.0459
Epoch [3/20], Loss: 12.0468
Epoch [4/20], Loss: 12.0493
Epoch [5/20], Loss: 12.0431
Epoch [6/20], Loss: 12.0454
Epoch [7/20], Loss: 12.0510
Epoch [8/20], Loss: 12.0459
Epoch [9/20], Loss: 12.0465
Epoch [10/20], Loss: 12.0442
Epoch [11/20], Loss: 12.0505
Epoch [12/20], Loss: 12.0442
Epoch [13/20], Loss: 12.0442
Epoch [14/20], Loss: 12.0437
Epoch [15/20], Loss: 12.0471
Epoch [16/20], Loss: 12.0471
Epoch [17/20], Loss: 12.0425
Epoch [18/20], Loss: 12.0476
Epoch [19/20], Loss: 12.0510
Epoch [20/20], Loss: 12.0493


<fairlib.inprocessing.fauci.Fauci at 0x17d41c5f0>

In [72]:
X_test_tensor = torch.tensor(X_test.to_numpy().astype(float)).float()

y_pred_unprocessed = unprocessed.predict(X_test_tensor).detach().numpy()
y_pred_spd = inprocessing_spd.predict(X_test_tensor).detach().numpy()
y_pred_di = inprocessing_di.predict(X_test_tensor).detach().numpy()

In [73]:
y_pred_unprocessed = (y_pred_unprocessed > 0.5).astype(int)
y_pred_spd = (y_pred_spd > 0.5).astype(int)
y_pred_di = (y_pred_di > 0.5).astype(int)

In [74]:
def evaluate_model(X_test, y_test, y_pred, targets='income', sensitive='sex'):
    print("Accuracy: ", accuracy_score(y_test, y_pred))
    X = X_test.copy()
    X[targets] = y_pred
    dataset = fl.DataFrame(X)
    dataset.targets = targets
    dataset.sensitive = sensitive
    print("SPD: ", dataset.statistical_parity_difference())
    print("DI: ", dataset.disparate_impact())

In [75]:
evaluate_model(X_test, y_test, y_pred_unprocessed)

Accuracy:  0.7640245685873063
SPD:  {(income=0, sex=0): 0.0, (income=0, sex=1): 0.0}
DI:  {(income=0, sex=0): 1.0, (income=0, sex=1): 1.0}


In [76]:
evaluate_model(X_test, y_test, y_pred_spd)

Accuracy:  0.23597543141269378
SPD:  {(income=1, sex=0): 0.0, (income=1, sex=1): 0.0}
DI:  {(income=1, sex=0): 1.0, (income=1, sex=1): 1.0}


In [77]:
evaluate_model(X_test, y_test, y_pred_di)

Accuracy:  0.7640245685873063
SPD:  {(income=0, sex=0): 0.0, (income=0, sex=1): 0.0}
DI:  {(income=0, sex=0): 1.0, (income=0, sex=1): 1.0}
