In [78]:
import openml, fairlib
import fairlib as fl
from fairlib.inprocessing import Fauci
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
import torch
import torch.nn as nn

In [79]:
dataset = openml.datasets.get_dataset(179)
X, y, _, names = dataset.get_data(target=dataset.default_target_attribute)

INFO:openml.datasets.dataset:pickle write adult


In [80]:
imputer = SimpleImputer(strategy='most_frequent')
X_imputed = imputer.fit_transform(X)

In [81]:
X_discretized = X_imputed.copy()
for col in X.columns:
    if X[col].dtype == 'category':
        le = LabelEncoder()
        X_discretized[:, X.columns.get_loc(col)] = le.fit_transform(X_discretized[:, X.columns.get_loc(col)])


In [82]:
X = fairlib.DataFrame(X_discretized, columns=names)
X = X.drop(columns=["fnlwgt"])
y = y.apply(lambda x: x == ">50K").astype(int)

In [83]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35, random_state=41)

In [84]:
X_train['income'] = y_train

In [85]:
X_train

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native-country,income
7567,0,3,0,6,4,2,4,4,1,0,0,2,38,0
3749,4,3,12,14,2,0,0,4,1,0,0,4,38,1
7496,0,3,11,9,0,13,1,4,1,0,0,2,38,0
36662,0,3,15,10,4,7,3,4,0,0,0,1,38,0
7701,4,3,12,14,2,3,0,4,1,0,0,3,38,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48597,2,3,15,10,2,0,0,4,1,0,0,3,38,0
41281,2,5,15,10,2,11,0,4,1,0,0,3,38,0
20450,2,5,11,9,2,2,0,2,1,0,0,1,22,0
931,0,3,15,10,4,9,3,4,0,0,0,0,38,0


In [86]:
fauci_train_dataset = fl.DataFrame(X_train)
X_train.drop(columns=["income"], inplace=True)
num_features = X_train.shape[1]

In [87]:
fauci_train_dataset.targets = "income"
fauci_train_dataset.sensitive = 'sex' # fauci currently supports only one sensitive attribute

In [88]:
def create_model(input_shape):
    return nn.Sequential(
        nn.Linear(input_shape, 64),
        nn.LeakyReLU(),
        nn.Linear(64, 32),
        nn.LeakyReLU(),
        nn.Linear(32, 16),
        nn.LeakyReLU(),
        nn.Linear(16, 8),
        nn.LeakyReLU(),
        nn.Linear(8, 1),
        nn.Sigmoid()
    )

In [117]:
unprocessed = Fauci(create_model(num_features), loss=nn.BCELoss(), fairness_regularization=None, regularization_weight=0.0)

In [118]:
inprocessing_spd = Fauci(create_model(num_features), loss=nn.BCELoss(), fairness_regularization='spd', regularization_weight=0.3)

In [119]:
inprocessing_di = Fauci(create_model(num_features), loss=nn.BCELoss(), fairness_regularization='di', regularization_weight=0.3)

In [120]:
EPOCHS = 25
BATCH_SIZE = 120

In [121]:
unprocessed.fit(fauci_train_dataset, epochs=EPOCHS, batch_size=BATCH_SIZE)

Epoch [1/25], Loss: 0.4529
Epoch [2/25], Loss: 0.3849
Epoch [3/25], Loss: 0.3617
Epoch [4/25], Loss: 0.3498
Epoch [5/25], Loss: 0.3429
Epoch [6/25], Loss: 0.3392
Epoch [7/25], Loss: 0.3397
Epoch [8/25], Loss: 0.3365
Epoch [9/25], Loss: 0.3345
Epoch [10/25], Loss: 0.3378
Epoch [11/25], Loss: 0.3332
Epoch [12/25], Loss: 0.3323
Epoch [13/25], Loss: 0.3330
Epoch [14/25], Loss: 0.3291
Epoch [15/25], Loss: 0.3285
Epoch [16/25], Loss: 0.3269
Epoch [17/25], Loss: 0.3269
Epoch [18/25], Loss: 0.3262
Epoch [19/25], Loss: 0.3257
Epoch [20/25], Loss: 0.3237
Epoch [21/25], Loss: 0.3236
Epoch [22/25], Loss: 0.3226
Epoch [23/25], Loss: 0.3220
Epoch [24/25], Loss: 0.3212
Epoch [25/25], Loss: 0.3223


<fairlib.inprocessing.fauci.Fauci at 0x174a6e180>

In [122]:
inprocessing_spd.fit(fauci_train_dataset, epochs=EPOCHS, batch_size=BATCH_SIZE)

Epoch [1/25], Loss: 0.3329
Epoch [2/25], Loss: 0.2764
Epoch [3/25], Loss: 0.2686
Epoch [4/25], Loss: 0.2576
Epoch [5/25], Loss: 0.2449
Epoch [6/25], Loss: 0.2398
Epoch [7/25], Loss: 0.2394
Epoch [8/25], Loss: 0.2365
Epoch [9/25], Loss: 0.2353
Epoch [10/25], Loss: 0.2356
Epoch [11/25], Loss: 0.2343
Epoch [12/25], Loss: 0.2331
Epoch [13/25], Loss: 0.2320
Epoch [14/25], Loss: 0.2309
Epoch [15/25], Loss: 0.2307
Epoch [16/25], Loss: 0.2310
Epoch [17/25], Loss: 0.2287
Epoch [18/25], Loss: 0.2288
Epoch [19/25], Loss: 0.2282
Epoch [20/25], Loss: 0.2274
Epoch [21/25], Loss: 0.2268
Epoch [22/25], Loss: 0.2265
Epoch [23/25], Loss: 0.2279
Epoch [24/25], Loss: 0.2257
Epoch [25/25], Loss: 0.2250


<fairlib.inprocessing.fauci.Fauci at 0x174acdc40>

In [123]:
inprocessing_di.fit(fauci_train_dataset, epochs=EPOCHS, batch_size=BATCH_SIZE)

Epoch [1/25], Loss: 0.6281
Epoch [2/25], Loss: 0.5765
Epoch [3/25], Loss: 0.5665
Epoch [4/25], Loss: 0.5553
Epoch [5/25], Loss: 0.5456
Epoch [6/25], Loss: 0.5403
Epoch [7/25], Loss: 0.5372
Epoch [8/25], Loss: 0.5378
Epoch [9/25], Loss: 0.5350
Epoch [10/25], Loss: 0.5340
Epoch [11/25], Loss: 0.5311
Epoch [12/25], Loss: 0.5317
Epoch [13/25], Loss: 0.5308
Epoch [14/25], Loss: 0.5294
Epoch [15/25], Loss: 0.5297
Epoch [16/25], Loss: 0.5289
Epoch [17/25], Loss: 0.5281
Epoch [18/25], Loss: 0.5283
Epoch [19/25], Loss: 0.5287
Epoch [20/25], Loss: 0.5264
Epoch [21/25], Loss: 0.5263
Epoch [22/25], Loss: 0.5260
Epoch [23/25], Loss: 0.5254
Epoch [24/25], Loss: 0.5247
Epoch [25/25], Loss: 0.5245


<fairlib.inprocessing.fauci.Fauci at 0x174accb90>

In [124]:
X_test_tensor = torch.tensor(X_test.to_numpy().astype(float)).float()

y_pred_unprocessed = unprocessed.predict(X_test_tensor).detach().numpy()
y_pred_spd = inprocessing_spd.predict(X_test_tensor).detach().numpy()
y_pred_di = inprocessing_di.predict(X_test_tensor).detach().numpy()

In [125]:
y_pred_unprocessed = (y_pred_unprocessed > 0.5).astype(int)
y_pred_spd = (y_pred_spd > 0.5).astype(int)
y_pred_di = (y_pred_di > 0.5).astype(int)

In [130]:
def evaluate_model(X_test, y_test, y_pred, targets='income', sensitive='sex'):
    print("Accuracy: ", accuracy_score(y_test, y_pred), "\n")
    X = X_test.copy()
    X[targets] = y_pred
    dataset = fl.DataFrame(X)
    dataset.targets = targets
    dataset.sensitive = sensitive
    print("SPD: ", dataset.statistical_parity_difference(), "\n")
    print("DI: ", dataset.disparate_impact(), "\n")

In [131]:
evaluate_model(X_test, y_test, y_pred_unprocessed)

Accuracy:  0.8473237788827143 

SPD:  {(income=0, sex=0): 0.1273392199712875, (income=0, sex=1): -0.1273392199712875, (income=1, sex=0): -0.12733921997128747, (income=1, sex=1): 0.12733921997128747} 

DI:  {(income=0, sex=0): 0.863310360394182, (income=0, sex=1): 1.158331981031024, (income=1, sex=0): 2.8615156074153156, (income=1, sex=1): 0.34946515664936634} 



In [132]:
evaluate_model(X_test, y_test, y_pred_spd)

Accuracy:  0.8506580871599883 

SPD:  {(income=0, sex=0): 0.14901444417129373, (income=0, sex=1): -0.14901444417129373, (income=1, sex=0): -0.14901444417129373, (income=1, sex=1): 0.14901444417129373} 

DI:  {(income=0, sex=0): 0.8391608130657321, (income=0, sex=1): 1.191666703723532, (income=1, sex=0): 3.02688231975918, (income=1, sex=1): 0.330372936361649} 



In [133]:
evaluate_model(X_test, y_test, y_pred_di)

Accuracy:  0.8491956712489032 

SPD:  {(income=0, sex=0): 0.13853052948996702, (income=0, sex=1): -0.13853052948996702, (income=1, sex=0): -0.13853052948996697, (income=1, sex=1): 0.13853052948996697} 

DI:  {(income=0, sex=0): 0.8446222734294853, (income=0, sex=1): 1.1839611995307944, (income=1, sex=0): 2.2776344118164107, (income=1, sex=1): 0.4390520246849015} 

