In [1]:
import openml, fairlib
import fairlib as fl
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import KBinsDiscretizer
from keras.models import Sequential
from keras.layers import Dense
from fairlib.inprocessing import Fauci

INFO:fairlib:Using Keras backend: TENSORFLOW
INFO:fairlib:fairlib loaded


In [2]:
dataset = openml.datasets.get_dataset(179)
X, y, _, names = dataset.get_data(target=dataset.default_target_attribute)

INFO:openml.datasets.dataset:pickle write adult


In [3]:
imputer = SimpleImputer(strategy='most_frequent')
X_imputed = imputer.fit_transform(X)

In [4]:
X_discretized = X_imputed.copy()
for col in X.columns:
    if X[col].dtype == 'category':
        le = LabelEncoder()
        X_discretized[:, X.columns.get_loc(col)] = le.fit_transform(X_discretized[:, X.columns.get_loc(col)])


In [5]:
X = fairlib.DataFrame(X_discretized, columns=names)
y = y.apply(lambda x: x == ">50K").astype(int)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [7]:
X_train['income'] = y_train

In [8]:
X_train

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native-country,income
5551,3,1,277024.0,11,9,5,10,1,2,1,0,0,2,38,0
36721,4,4,164616.0,10,16,2,9,0,4,1,4,0,2,38,1
2638,3,3,159755.0,8,11,2,9,0,4,1,0,0,3,38,1
36214,3,3,260052.0,9,13,0,3,4,4,1,4,0,2,38,1
27010,2,4,188615.0,9,13,2,11,0,4,1,0,0,3,38,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11284,1,3,200117.0,9,13,2,3,0,1,1,0,2,3,38,1
44732,0,3,90896.0,11,9,4,6,3,4,0,0,0,2,38,0
38158,0,3,370057.0,11,9,0,0,1,4,0,0,0,2,38,0
860,0,3,216284.0,1,7,4,0,3,4,0,0,0,0,38,0


In [9]:
fauci_train_dataset = fl.DataFrame(X_train)

In [10]:
fauci_train_dataset.targets = "income"
fauci_train_dataset.sensitive = 'sex' # fauci currently supports only one sensitive attribute

In [11]:
model = Sequential()
model.add(Dense(32, activation='relu')) 
model.add(Dense(16, activation='relu'))  # First hidden layer
model.add(Dense(8, activation='relu'))  # Second hidden layer
model.add(Dense(1, activation='sigmoid'))  # Output layer for classification

In [12]:
inprocessing_spd = Fauci(model, loss='binary_crossentropy', regularizer='sp', optimizer='adam', metrics=['accuracy'])
inprocessing_di = Fauci(model, loss='binary_crossentropy', regularizer='di', optimizer='adam', metrics=['accuracy'])

In [13]:
inprocessing_spd.fit(fauci_train_dataset, converting_to_type=float, epochs=10, batch_size=32, validation_split=0.3)
inprocessing_di.fit(fauci_train_dataset, converting_to_type=float, epochs=10, batch_size=32, validation_split=0.3)

Epoch 1/10
RICHIAMO LA SPD
RICHIAMO LA SPD
[1m716/716[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6519 - loss: 631.7284RICHIAMO LA SPD
[1m716/716[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.6519 - loss: 631.1598 - val_accuracy: 0.2399 - val_loss: 82.4529
Epoch 2/10
[1m716/716[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.6414 - loss: 95.8172 - val_accuracy: 0.7617 - val_loss: 4.3290
Epoch 3/10
[1m716/716[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.6431 - loss: 53.8232 - val_accuracy: 0.7601 - val_loss: 88.9610
Epoch 4/10
[1m716/716[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.6388 - loss: 45.9729 - val_accuracy: 0.7615 - val_loss: 11.5125
Epoch 5/10
[1m716/716[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.6456 - loss: 53.1653 - val_accuracy: 0.2455 - val_loss: 11.0093
Epoch 6/10
[1m716/716

<keras.src.callbacks.history.History at 0x17fa54320>

In [14]:
y_pred_spd = inprocessing_spd.predict(X_test.astype(float))
y_pred_di = inprocessing_di.predict(X_test.astype(float))

[1m504/504[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 275us/step
[1m504/504[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 247us/step


In [15]:
print(y_pred_spd.sum())
print(y_pred_di.sum())

16086.549
16086.549


In [16]:
y_pred_spd = (y_pred_spd > 0.5).astype(int)
y_pred_di = (y_pred_di > 0.5).astype(int)

In [17]:
from sklearn.metrics import accuracy_score
print("accuracy with spd regularization : ", accuracy_score(y_test, y_pred_spd))
print("accuracy with di regularization : ", accuracy_score(y_test, y_pred_di))

accuracy with spd regularization :  0.24413698970095546
accuracy with di regularization :  0.24413698970095546


In [18]:
X_test['income'] = y_pred_spd

In [19]:
spd_dataset = fl.DataFrame(X_test)
spd_dataset.targets = "income"
spd_dataset.sensitive = 'sex'

spd_dataset.statistical_parity_difference()

(income=1, sex=1) -> 0.00337043980175733
(income=1, sex=0) -> -0.00337043980175733

In [20]:
X_test['income'] = y_pred_di

In [21]:
di_dataset = fl.DataFrame(X_test)
di_dataset.targets = "income"
di_dataset.sensitive = 'sex'

di_dataset.disparate_impact()

this is unprivileged in di:  0.9964439453490548
this is privileged in di:  0.9998143851508121
this is unprivileged in di:  0.9998143851508121
this is privileged in di:  0.9964439453490548


(income=1, sex=1) -> 0.9966289344784243
(income=1, sex=0) -> 1.0033824680429733

## Test With un-fair model

In [22]:
model = Sequential()
model.add(Dense(32, activation='relu'))  # Second hidden layer
model.add(Dense(1, activation='sigmoid'))  # Output layer for classification

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [24]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [25]:
X_train = X_train.astype(float)
y_train = y_train.astype(float)

In [26]:
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.3)

Epoch 1/10
[1m716/716[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 501us/step - accuracy: 0.5903 - loss: 2267.6943 - val_accuracy: 0.7601 - val_loss: 127.9464
Epoch 2/10
[1m716/716[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 425us/step - accuracy: 0.6448 - loss: 61.5559 - val_accuracy: 0.7316 - val_loss: 1.3826
Epoch 3/10
[1m716/716[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 432us/step - accuracy: 0.6338 - loss: 67.6278 - val_accuracy: 0.7601 - val_loss: 100.5190
Epoch 4/10
[1m716/716[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 432us/step - accuracy: 0.6408 - loss: 71.6107 - val_accuracy: 0.7627 - val_loss: 19.8335
Epoch 5/10
[1m716/716[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 433us/step - accuracy: 0.6472 - loss: 58.0876 - val_accuracy: 0.7620 - val_loss: 21.9908
Epoch 6/10
[1m716/716[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 426us/step - accuracy: 0.6440 - loss: 55.6261 - val_accuracy: 0.7615 - val_loss: 54.3506
E

<keras.src.callbacks.history.History at 0x17dc39ca0>

In [27]:
y_pred = model.predict(X_test.astype(float))
y_pred = (y_pred > 0.5).astype(int)

[1m504/504[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 237us/step


In [28]:
accuracy_score(y_test, y_pred)

0.7743516565330686

In [29]:
X_test['income'] = y_pred
spd_dataset = fl.DataFrame(X_test)
spd_dataset.targets = "income"
spd_dataset.sensitive = 'sex'

spd_dataset.statistical_parity_difference()

(income=1, sex=1) -> 0.19568319543796708
(income=1, sex=0) -> -0.19568319543796708

In [30]:
spd_dataset.disparate_impact()

this is unprivileged in di:  0.061950215234886766
this is privileged in di:  0.25763341067285384
this is unprivileged in di:  0.25763341067285384
this is privileged in di:  0.061950215234886766


(income=1, sex=1) -> 0.24045877851437497
(income=1, sex=0) -> 4.158716958383861