In [18]:
import openml, fairlib
import fairlib as fl
from fairlib.inprocessing import Fauci
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import KBinsDiscretizer
from keras.models import Sequential
from keras.layers import Dense

# `keras` below is whatever fairlib exposes under that name
# (presumably the Keras package itself — TODO confirm).
from fairlib import keras
# Fix the random seed up front, before any model is created.
keras.utils.set_random_seed(423)


In [19]:
# Fetch OpenML dataset 179 (the download log names it "adult") and unpack the
# feature frame, the target column and the attribute names; the third return
# value is not needed here.
ADULT_DATASET_ID = 179
dataset = openml.datasets.get_dataset(ADULT_DATASET_ID)
target_attribute = dataset.default_target_attribute
X, y, _, names = dataset.get_data(target=target_attribute)

INFO:openml.datasets.dataset:pickle write adult


In [20]:
# Replace missing entries with each column's most frequent value.
# Fitting and transforming are written as two explicit steps on the same data.
imputer = SimpleImputer(strategy='most_frequent')
imputer.fit(X)
X_imputed = imputer.transform(X)

In [21]:
# Integer-encode every column that the original frame marks as 'category'.
# The work happens on a copy of the imputed array, addressed by column position.
X_discretized = X_imputed.copy()
categorical_cols = [name for name in X.columns if X[name].dtype == 'category']
for col in categorical_cols:
    col_idx = X.columns.get_loc(col)  # positional index of this column in the array
    le = LabelEncoder()
    X_discretized[:, col_idx] = le.fit_transform(X_discretized[:, col_idx])


In [22]:
# Wrap the encoded array in a fairlib DataFrame so the column names are restored.
# NOTE(review): this rebinds `X` (until now the raw OpenML frame); re-running the
# encoding cell after this one would likely find no 'category' dtypes any more.
X = fairlib.DataFrame(X_discretized, columns=names)
# Binarise the target with a vectorised comparison instead of a per-element
# lambda: 1 where the label is ">50K", 0 otherwise.
y = (y == ">50K").astype(int)

In [23]:
# Hold out 35% of the rows for evaluation; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.35,
    random_state=41,
)

In [24]:
# Attach the target as an 'income' column so that features and label travel
# together into the fairlib dataset that a later cell builds from X_train.
X_train['income'] = y_train

In [25]:
# Display the training frame, which at this point includes the 'income' target column.
X_train

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native-country,income
7567,0,3,141698.0,0,6,4,2,4,4,1,0,0,2,38,0
3749,4,3,318450.0,12,14,2,0,0,4,1,0,0,4,38,1
7496,0,3,227626.0,11,9,0,13,1,4,1,0,0,2,38,0
36662,0,3,91733.0,15,10,4,7,3,4,0,0,0,1,38,0
7701,4,3,245193.0,12,14,2,3,0,4,1,0,0,3,38,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48597,2,3,36423.0,15,10,2,0,0,4,1,0,0,3,38,0
41281,2,5,186934.0,15,10,2,11,0,4,1,0,0,3,38,0
20450,2,5,217826.0,11,9,2,2,0,2,1,0,0,1,22,0
931,0,3,52114.0,15,10,4,9,3,4,0,0,0,0,38,0


In [26]:
# Build the dataset handed to Fauci while X_train still carries the 'income' column ...
fauci_train_dataset = fl.DataFrame(X_train)
# ... then strip the target from X_train again, since X_train is later fed as
# plain features to the unwrapped Keras model.
# NOTE(review): this relies on fauci_train_dataset not being affected by the
# in-place drop on X_train — confirm against fl.DataFrame's copy semantics.
X_train.drop(columns=["income"], inplace=True)

In [27]:
# Declare which column is the prediction target and which one is the sensitive attribute.
fauci_train_dataset.targets = "income"
fauci_train_dataset.sensitive = 'sex' # fauci currently supports only one sensitive attribute

In [28]:
def create_model():
    """Build a fresh, uncompiled feed-forward binary classifier.

    The network stacks three ReLU hidden layers (32, 12 and 6 units) and ends
    in a single sigmoid unit. No input shape is declared here.

    Returns:
        A new ``Sequential`` model instance.
    """
    layers = [
        Dense(32, activation='relu'),
        Dense(12, activation='relu'),
        Dense(6, activation='relu'),
        Dense(1, activation='sigmoid'),
    ]
    return Sequential(layers)

In [29]:
# Baseline: the bare Keras network, compiled directly (no Fauci wrapper).
unwrapped = create_model()
unwrapped.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [30]:
# Fauci wrapper with the fairness term switched off: no regularizer and a zero weight.
unprocessed = Fauci(
    create_model(),
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
    regularizer=None,
    regularization_weight=0.0,
)

In [31]:
# Fauci wrapper using the 'sp' regularizer
# (presumably statistical parity, going by the variable name — TODO confirm).
inprocessing_spd = Fauci(
    create_model(),
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
    regularizer='sp',
)

In [32]:
# Fauci wrapper using the 'di' regularizer
# (presumably disparate impact, going by the variable name — TODO confirm).
inprocessing_di = Fauci(
    create_model(),
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
    regularizer='di',
)

In [33]:
# Training settings shared by every fit() call in the cells that follow.
EPOCHS = 20  # passed as `epochs`
BATCH_SIZE = 20  # passed as `batch_size`
VALIDATION_SPLIT = 0.3  # passed as `validation_split`

In [34]:
# Train the baseline on the feature frame; inputs and labels are cast to float first.
# NOTE(review): the features are fed unscaled (the X_train preview shows fnlwgt in
# the 1e4-1e5 range next to single-digit codes), which is a likely cause of the
# very large losses in the training log — consider standardising them.
unwrapped.fit(
    X_train.astype(float),
    y_train.astype(float),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_split=VALIDATION_SPLIT,
)

Epoch 1/20
[1m1112/1112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 592us/step - accuracy: 0.6279 - loss: 610.7573 - val_accuracy: 0.7633 - val_loss: 11.4900
Epoch 2/20
[1m1112/1112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 505us/step - accuracy: 0.6421 - loss: 81.6073 - val_accuracy: 0.7624 - val_loss: 62.4201
Epoch 3/20
[1m1112/1112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 465us/step - accuracy: 0.6357 - loss: 57.6417 - val_accuracy: 0.2376 - val_loss: 27.3015
Epoch 4/20
[1m1112/1112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 466us/step - accuracy: 0.6328 - loss: 54.4031 - val_accuracy: 0.2377 - val_loss: 19.5554
Epoch 5/20
[1m1112/1112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 468us/step - accuracy: 0.6369 - loss: 54.5804 - val_accuracy: 0.2376 - val_loss: 78.9801
Epoch 6/20
[1m1112/1112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 467us/step - accuracy: 0.6437 - loss: 38.4397 - val_accuracy: 0.7631 - val_loss:

<keras.src.callbacks.history.History at 0x1664e8860>

In [35]:
# Train the Fauci model that has no fairness regularizer on the fairlib dataset.
unprocessed.fit(
    fauci_train_dataset,
    converting_to_type=float,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_split=VALIDATION_SPLIT,
)

Epoch 1/20
[1m1112/1112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 514us/step - accuracy: 0.6371 - loss: 169.4187 - val_accuracy: 0.2384 - val_loss: 15.6597
Epoch 2/20
[1m1112/1112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 461us/step - accuracy: 0.6386 - loss: 49.8933 - val_accuracy: 0.2377 - val_loss: 17.1107
Epoch 3/20
[1m1112/1112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 461us/step - accuracy: 0.6374 - loss: 39.7904 - val_accuracy: 0.2377 - val_loss: 17.4198
Epoch 4/20
[1m1112/1112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 474us/step - accuracy: 0.6359 - loss: 26.7904 - val_accuracy: 0.7624 - val_loss: 30.9669
Epoch 5/20
[1m1112/1112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 510us/step - accuracy: 0.6433 - loss: 27.5988 - val_accuracy: 0.7624 - val_loss: 33.3125
Epoch 6/20
[1m1112/1112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 487us/step - accuracy: 0.6430 - loss: 23.7227 - val_accuracy: 0.7666 - val_loss:

<keras.src.callbacks.history.History at 0x166623380>

In [36]:
# Train the Fauci model configured with the 'sp' regularizer on the fairlib dataset.
inprocessing_spd.fit(
    fauci_train_dataset,
    converting_to_type=float,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_split=VALIDATION_SPLIT,
)

Epoch 1/20
[1m1112/1112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.6383 - loss: 43.8218 - val_accuracy: 0.7625 - val_loss: 6.9632
Epoch 2/20
[1m1112/1112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.6368 - loss: 30.4221 - val_accuracy: 0.2376 - val_loss: 58.6720
Epoch 3/20
[1m1112/1112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.6342 - loss: 25.0636 - val_accuracy: 0.7586 - val_loss: 2.1601
Epoch 4/20
[1m1112/1112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.6368 - loss: 18.0474 - val_accuracy: 0.7646 - val_loss: 2.9687
Epoch 5/20
[1m1112/1112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.6465 - loss: 16.8739 - val_accuracy: 0.7628 - val_loss: 13.2569
Epoch 6/20
[1m1112/1112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.6402 - loss: 14.3061 - val_accuracy: 0.7665 - val_loss: 4.8530
Epoch 7/

<keras.src.callbacks.history.History at 0x167468770>

In [37]:
# Train the Fauci model configured with the 'di' regularizer on the fairlib dataset.
inprocessing_di.fit(
    fauci_train_dataset,
    converting_to_type=float,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_split=VALIDATION_SPLIT,
)

Epoch 1/20
[1m1112/1112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.6149 - loss: 428.6197 - val_accuracy: 0.2393 - val_loss: 4.3902
Epoch 2/20
[1m1112/1112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.6311 - loss: 57.6394 - val_accuracy: 0.2376 - val_loss: 69.8339
Epoch 3/20
[1m1112/1112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.6376 - loss: 57.9088 - val_accuracy: 0.7624 - val_loss: 37.5443
Epoch 4/20
[1m1112/1112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.6373 - loss: 44.4796 - val_accuracy: 0.2376 - val_loss: 119.5534
Epoch 5/20
[1m1112/1112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.6403 - loss: 45.6724 - val_accuracy: 0.7637 - val_loss: 29.9249
Epoch 6/20
[1m1112/1112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.6435 - loss: 31.9301 - val_accuracy: 0.2376 - val_loss: 73.7830
Epo

<keras.src.callbacks.history.History at 0x1674e6000>

In [38]:
# Score the held-out rows with every model; the float cast is done once and reused.
X_test_float = X_test.astype(float)
y_pred_unwrapped = unwrapped.predict(X_test_float)
y_pred_unprocessed = unprocessed.predict(X_test_float)
y_pred_spd = inprocessing_spd.predict(X_test_float)
y_pred_di = inprocessing_di.predict(X_test_float)

[1m535/535[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 283us/step
[1m535/535[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 263us/step
[1m535/535[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 265us/step
[1m535/535[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 260us/step


In [39]:
# Turn the sigmoid outputs into hard 0/1 labels: strictly above 0.5 becomes 1.
y_pred_unwrapped, y_pred_unprocessed, y_pred_spd, y_pred_di = (
    (scores > 0.5).astype(int)
    for scores in (y_pred_unwrapped, y_pred_unprocessed, y_pred_spd, y_pred_di)
)

In [40]:
def evaluate_model(X_test, y_test, y_pred, targets='income', sensitive='sex'):
    """Print accuracy and two group-fairness metrics for one set of predictions.

    Args:
        X_test: Feature frame the predictions were made on; it is copied, not modified.
        y_test: Ground-truth labels, compared against ``y_pred`` for the accuracy.
        y_pred: Predicted labels; stored as the ``targets`` column of the copy.
        targets: Name of the column that receives the predictions.
        sensitive: Name of the column declared as the sensitive attribute.

    Returns:
        None. The three metrics are written to stdout.
    """
    print("Accuracy: ", accuracy_score(y_test, y_pred))

    # The fairness metrics are computed on the *predicted* labels, so attach
    # them to a copy of the features before wrapping it for fairlib.
    scored = X_test.copy()
    scored[targets] = y_pred
    fair_frame = fl.DataFrame(scored)
    fair_frame.targets = targets
    fair_frame.sensitive = sensitive

    spd = fair_frame.statistical_parity_difference()
    print("SPD: ", spd)
    di = fair_frame.disparate_impact()
    print("DI: ", di)

In [41]:
# Baseline: the plain Keras model trained without Fauci.
evaluate_model(X_test=X_test, y_test=y_test, y_pred=y_pred_unwrapped)

Accuracy:  0.7776542848786194
SPD:  {(income=1, sex=1): np.float64(0.03794840772958623), (income=1, sex=0): np.float64(-0.03794840772958623)}
DI:  {(income=1, sex=1): np.float64(0.146683737214442), (income=1, sex=0): np.float64(6.817388341681434)}


In [42]:
# Fauci model built with regularizer=None and a zero regularization weight.
evaluate_model(X_test=X_test, y_test=y_test, y_pred=y_pred_unprocessed)

Accuracy:  0.7640245685873063
SPD:  {(income=0, sex=1): np.float64(0.0), (income=0, sex=0): np.float64(0.0)}
DI:  {(income=0, sex=1): np.float64(nan), (income=0, sex=0): np.float64(nan)}


  di = unprivileged_rate / privileged_rate


In [43]:
# Fauci model built with the 'sp' regularizer.
evaluate_model(X_test=X_test, y_test=y_test, y_pred=y_pred_spd)

Accuracy:  0.7640245685873063
SPD:  {(income=0, sex=1): np.float64(0.0), (income=0, sex=0): np.float64(0.0)}
DI:  {(income=0, sex=1): np.float64(nan), (income=0, sex=0): np.float64(nan)}


  di = unprivileged_rate / privileged_rate


In [44]:
# Fauci model built with the 'di' regularizer.
evaluate_model(X_test=X_test, y_test=y_test, y_pred=y_pred_di)

Accuracy:  0.7656039777712782
SPD:  {(income=1, sex=1): np.float64(0.0022748903030989243), (income=1, sex=0): np.float64(-0.0022748903030989243)}
DI:  {(income=1, sex=1): np.float64(0.0719260024178924), (income=1, sex=0): np.float64(13.903177799177099)}
