#### This notebook demonstrates the use of adversarial debiasing algorithm to learn a fair classifier.
Adversarial debiasing [1] is an in-processing technique that learns a classifier to maximize prediction accuracy and simultaneously reduce an adversary's ability to determine the protected attribute from the predictions. This approach leads to a fair classifier as the predictions cannot carry any group discrimination information that the adversary can exploit. We will see how to use this algorithm for learning models with and without fairness constraints and apply them on the Adult dataset.

In [1]:
%matplotlib inline
# Load all necessary packages
import sys
sys.path.append("../")
from aif360.datasets import BinaryLabelDataset
from aif360.datasets import AdultDataset, GermanDataset, CompasDataset
from aif360.metrics import BinaryLabelDatasetMetric
from aif360.metrics import ClassificationMetric
from aif360.metrics.utils import compute_boolean_conditioning_vector

from aif360.algorithms.preprocessing.optim_preproc_helpers.data_preproc_functions import load_preproc_data_adult, load_preproc_data_compas, load_preproc_data_german

from aif360.algorithms.inprocessing.adversarial_debiasing import AdversarialDebiasing

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, MaxAbsScaler
from sklearn.metrics import accuracy_score

from IPython.display import Markdown, display
import matplotlib.pyplot as plt

import tensorflow.compat.v1 as tf
tf.disable_eager_execution()

pip install 'aif360[LawSchoolGPA]'
pip install 'aif360[Reductions]'
pip install 'aif360[Reductions]'
pip install 'aif360[Reductions]'


In [2]:
import numpy as np

#### Load dataset and set options

In [41]:
# Get the dataset and split into train and test
dataset_orig = load_preproc_data_adult()

privileged_groups = [{'sex': 1}]
unprivileged_groups = [{'sex': 0}]

dataset_orig_train, dataset_orig_test = dataset_orig.split([0.7], shuffle=True)

In [4]:
dataset_orig = GermanDataset()
privileged_groups = [{'sex': 1}]
unprivileged_groups = [{'sex': 0}]

In [33]:
dataset_orig_train, dataset_orig_test = dataset_orig.split([0.7], shuffle=True)

#### Metric for original training data

### Learn plan classifier without debiasing

In [42]:
# Load post-processing algorithm that equalizes the odds
# Learn parameters with debias set to False
sess = tf.Session()
plain_model = AdversarialDebiasing(privileged_groups = privileged_groups,
                          unprivileged_groups = unprivileged_groups,
                          scope_name='plain_classifier',
                          debias=False,
                          sess=sess)

In [43]:
plain_model.fit(dataset_orig_train)

<aif360.algorithms.inprocessing.adversarial_debiasing.AdversarialDebiasing at 0x2980327c608>

In [44]:
# Apply the plain model to test data
dataset_nodebiasing_train = plain_model.predict(dataset_orig_train)
dataset_nodebiasing_test = plain_model.predict(dataset_orig_test)

In [9]:
# Metrics for the dataset from plain model (without debiasing)
display(Markdown("#### Plain model - without debiasing - dataset metrics"))
metric_dataset_nodebiasing_train = BinaryLabelDatasetMetric(dataset_nodebiasing_train, 
                                             unprivileged_groups=unprivileged_groups,
                                             privileged_groups=privileged_groups)

print("Train set: Difference in mean outcomes between unprivileged and privileged groups = %f" % metric_dataset_nodebiasing_train.mean_difference())

metric_dataset_nodebiasing_test = BinaryLabelDatasetMetric(dataset_nodebiasing_test, 
                                             unprivileged_groups=unprivileged_groups,
                                             privileged_groups=privileged_groups)

print("Test set: Difference in mean outcomes between unprivileged and privileged groups = %f" % metric_dataset_nodebiasing_test.mean_difference())

display(Markdown("#### Plain model - without debiasing - classification metrics"))
classified_metric_nodebiasing_test = ClassificationMetric(dataset_orig_test, 
                                                 dataset_nodebiasing_test,
                                                 unprivileged_groups=unprivileged_groups,
                                                 privileged_groups=privileged_groups)
print("Test set: Classification accuracy = %f" % classified_metric_nodebiasing_test.accuracy())
TPR = classified_metric_nodebiasing_test.true_positive_rate()
TNR = classified_metric_nodebiasing_test.true_negative_rate()
bal_acc_nodebiasing_test = 0.5*(TPR+TNR)
print("Test set: Balanced classification accuracy = %f" % bal_acc_nodebiasing_test)
print("Test set: Disparate impact = %f" % classified_metric_nodebiasing_test.disparate_impact())
print("Test set: Equal opportunity difference = %f" % classified_metric_nodebiasing_test.equal_opportunity_difference())
print("Test set: Average odds difference = %f" % classified_metric_nodebiasing_test.average_odds_difference())
print("Test set: Theil_index = %f" % classified_metric_nodebiasing_test.theil_index())

#### Plain model - without debiasing - dataset metrics

Train set: Difference in mean outcomes between unprivileged and privileged groups = -0.206838
Test set: Difference in mean outcomes between unprivileged and privileged groups = -0.205994


#### Plain model - without debiasing - classification metrics

Test set: Classification accuracy = 0.804886
Test set: Balanced classification accuracy = 0.659419
Test set: Disparate impact = 0.000000
Test set: Equal opportunity difference = -0.447686
Test set: Average odds difference = -0.273806
Test set: Theil_index = 0.179642


### Apply in-processing algorithm based on adversarial learning

In [50]:
sess.close()
tf.reset_default_graph()
sess = tf.Session()

In [51]:
# Learn parameters with debias set to True
debiased_model = AdversarialDebiasing(privileged_groups = privileged_groups,
                          unprivileged_groups = unprivileged_groups,
                          scope_name='debiased_classifier',
                          debias=True,
                          sess=sess)

In [52]:
debiased_model.fit(dataset_orig_train)

epoch 0; iter: 0; batch classifier loss: 0.630446; batch adversarial loss: 0.700471
epoch 1; iter: 0; batch classifier loss: 0.586666; batch adversarial loss: 0.663528
epoch 2; iter: 0; batch classifier loss: 0.535873; batch adversarial loss: 0.680876
epoch 3; iter: 0; batch classifier loss: 0.629068; batch adversarial loss: 0.690611
epoch 4; iter: 0; batch classifier loss: 0.604589; batch adversarial loss: 0.641804
epoch 5; iter: 0; batch classifier loss: 0.546016; batch adversarial loss: 0.682053
epoch 6; iter: 0; batch classifier loss: 0.551893; batch adversarial loss: 0.669567
epoch 7; iter: 0; batch classifier loss: 0.515059; batch adversarial loss: 0.647458
epoch 8; iter: 0; batch classifier loss: 0.466134; batch adversarial loss: 0.730296
epoch 9; iter: 0; batch classifier loss: 0.483293; batch adversarial loss: 0.670743
epoch 10; iter: 0; batch classifier loss: 0.487542; batch adversarial loss: 0.709081
epoch 11; iter: 0; batch classifier loss: 0.525901; batch adversarial loss:

<aif360.algorithms.inprocessing.adversarial_debiasing.AdversarialDebiasing at 0x17f4393bbc8>

In [53]:
# Apply the plain model to test data
dataset_debiasing_train = debiased_model.predict(dataset_orig_train)
dataset_debiasing_test = debiased_model.predict(dataset_orig_test)

In [69]:
# Metrics for the dataset from plain model (without debiasing)
display(Markdown("#### Plain model - without debiasing - dataset metrics"))
print("Train set: Difference in mean outcomes between unprivileged and privileged groups = %f" % metric_dataset_nodebiasing_train.mean_difference())
print("Test set: Difference in mean outcomes between unprivileged and privileged groups = %f" % metric_dataset_nodebiasing_test.mean_difference())

# Metrics for the dataset from model with debiasing
display(Markdown("#### Model - with debiasing - dataset metrics"))
metric_dataset_debiasing_train = BinaryLabelDatasetMetric(dataset_debiasing_train, 
                                             unprivileged_groups=unprivileged_groups,
                                             privileged_groups=privileged_groups)

print("Train set: Difference in mean outcomes between unprivileged and privileged groups = %f" % metric_dataset_debiasing_train.mean_difference())

metric_dataset_debiasing_test = BinaryLabelDatasetMetric(dataset_debiasing_test, 
                                             unprivileged_groups=unprivileged_groups,
                                             privileged_groups=privileged_groups)

print("Test set: Difference in mean outcomes between unprivileged and privileged groups = %f" % metric_dataset_debiasing_test.mean_difference())



display(Markdown("#### Plain model - without debiasing - classification metrics"))
print("Test set: Classification accuracy = %f" % classified_metric_nodebiasing_test.accuracy())
TPR = classified_metric_nodebiasing_test.true_positive_rate()
TNR = classified_metric_nodebiasing_test.true_negative_rate()
bal_acc_nodebiasing_test = 0.5*(TPR+TNR)
print("Test set: Balanced classification accuracy = %f" % bal_acc_nodebiasing_test)
print("Test set: Disparate impact = %f" % classified_metric_nodebiasing_test.disparate_impact())
print("Test set: Equal opportunity difference = %f" % classified_metric_nodebiasing_test.equal_opportunity_difference())
print("Test set: Average odds difference = %f" % classified_metric_nodebiasing_test.average_odds_difference())
print("Test set: Theil_index = %f" % classified_metric_nodebiasing_test.theil_index())



display(Markdown("#### Model - with debiasing - classification metrics"))
classified_metric_debiasing_test = ClassificationMetric(dataset_orig_test, 
                                                 dataset_debiasing_test,
                                                 unprivileged_groups=unprivileged_groups,
                                                 privileged_groups=privileged_groups)
print("Test set: Classification accuracy = %f" % classified_metric_debiasing_test.accuracy())
TPR = classified_metric_debiasing_test.true_positive_rate()
TNR = classified_metric_debiasing_test.true_negative_rate()
bal_acc_debiasing_test = 0.5*(TPR+TNR)
print("Test set: Balanced classification accuracy = %f" % bal_acc_debiasing_test)
print("Test set: Disparate impact = %f" % classified_metric_debiasing_test.disparate_impact())
print("Test set: Equal opportunity difference = %f" % classified_metric_debiasing_test.equal_opportunity_difference())
print("Test set: Average odds difference = %f" % classified_metric_debiasing_test.average_odds_difference())
print("Test set: Theil_index = %f" % classified_metric_debiasing_test.theil_index())

#### Plain model - without debiasing - dataset metrics

Train set: Difference in mean outcomes between unprivileged and privileged groups = -0.206838
Test set: Difference in mean outcomes between unprivileged and privileged groups = -0.205994


#### Model - with debiasing - dataset metrics

Train set: Difference in mean outcomes between unprivileged and privileged groups = -0.357143
Test set: Difference in mean outcomes between unprivileged and privileged groups = -0.417006


#### Plain model - without debiasing - classification metrics

Test set: Classification accuracy = 0.804886
Test set: Balanced classification accuracy = 0.659419
Test set: Disparate impact = 0.000000
Test set: Equal opportunity difference = -0.447686
Test set: Average odds difference = -0.273806
Test set: Theil_index = 0.179642


#### Model - with debiasing - classification metrics

Test set: Classification accuracy = 0.785000
Test set: Balanced classification accuracy = 0.665517
Test set: Disparate impact = 0.573444
Test set: Equal opportunity difference = -0.209897
Test set: Average odds difference = -0.471615
Test set: Theil_index = 0.096292


In [7]:
from tqdm import tqdm


    References:
    [1] B. H. Zhang, B. Lemoine, and M. Mitchell, "Mitigating UnwantedBiases with Adversarial Learning," 
    AAAI/ACM Conference on Artificial Intelligence, Ethics, and Society, 2018.

In [33]:
DEOO_fin = []
DPE_fin = []
mis_fin = []
sigma = 1
for step in tqdm(range(100)):
    mu_00 = 3 * np.random.rand(1)
    mu_01 = np.random.rand(1)
    mu_10 = np.random.rand(1)
    mu_11 = np.random.rand(1)
    for i in range(1000):
        if(dataset_orig.labels[i] == 2 and dataset_orig.features[:,7][i] == 0):
            #z = np.random.normal(mu_00, sigma, 57)
            #z = np.random.chisquare(2,57)
            z = np.random.standard_t(3, 57)
            for j in range(7):
                dataset_orig.features[i][j] = z[j]
            for j in range(8, 58):
                dataset_orig.features[i][j] = z[j - 1]
        elif(dataset_orig.labels[i] == 2 and dataset_orig.features[:,7][i] == 1):
            #z = np.random.normal(mu_01, sigma, 57)
            #z = np.random.standard_t(4, 57)
            z = np.random.chisquare(1,57)
            for j in range(7):
                dataset_orig.features[i][j] = z[j]
            for j in range(8, 58):
                dataset_orig.features[i][j] = z[j - 1]
        elif(dataset_orig.labels[i] == 1 and dataset_orig.features[:,7][i] == 0):
            #z = np.random.normal(mu_10, sigma, 57)\
            z = np.random.chisquare(3,57)
            for j in range(7):
                dataset_orig.features[i][j] = z[j]
            for j in range(8, 58):
                dataset_orig.features[i][j] = z[j - 1]
        else:
            z = np.random.normal(mu_11, sigma, 57)
            #z = np.random.chisquare(1,57)
            #z = np.random.laplace(mu_11, sigma, 57)
            for j in range(7):
                dataset_orig.features[i][j] = z[j]
            for j in range(8, 58):
                dataset_orig.features[i][j] = z[j - 1]
    randseed = np.random.randint(10000)
    dataset_orig_train, dataset_orig_vt = dataset_orig.split([0.6], shuffle=True, seed = np.random.randint(0, 10000))
    dataset_orig_valid, dataset_orig_test = dataset_orig_vt.split([0.5], shuffle=True, seed = np.random.randint(0, 10000))
    # Placeholder for predicted and transformed datasets
    dataset_orig_train_pred = dataset_orig_train.copy(deepcopy=True)
    dataset_orig_valid_pred = dataset_orig_valid.copy(deepcopy=True)
    dataset_orig_test_pred = dataset_orig_test.copy(deepcopy=True)

    dataset_new_valid_pred = dataset_orig_valid.copy(deepcopy=True)
    dataset_new_test_pred = dataset_orig_test.copy(deepcopy=True)

    y_test = dataset_orig_test.labels.ravel()

    sess.close()
    tf.reset_default_graph()
    sess = tf.Session()
    # Learn parameters with debias set to True
    debiased_model = AdversarialDebiasing(privileged_groups = privileged_groups,
                          unprivileged_groups = unprivileged_groups,
                          scope_name='debiased_classifier',
                          debias=True,
                          sess=sess)
    debiased_model.fit(dataset_orig_train)
    dataset_debiasing_train = debiased_model.predict(dataset_orig_train)
    dataset_debiasing_test = debiased_model.predict(dataset_orig_test)

    eq = 0
    for i in range(len(y_test)):
        if(dataset_debiasing_test.labels.ravel()[i] == y_test[i]):
            eq += 1
    mis_fin.append(eq / len(y_test))
    n_10 = 0
    n_11 = 0
    c_10 = 0
    c_11 = 0
    for i in range(len(y_test)):
        if(y_test[i] == 1 and dataset_debiasing_test.protected_attributes[:,1][i] == 0):
            n_10 += 1
            if(dataset_debiasing_test.labels.ravel()[i] == 1):
                c_10 += 1
        elif(y_test[i] == 1 and dataset_debiasing_test.protected_attributes[:,1][i] == 1):
            n_11 += 1
            if(dataset_debiasing_test.labels.ravel()[i] == 1):
                c_11 += 1
    DEOO_fin.append(c_10 / n_10 - c_11 / n_11)
    n_00 = 0
    n_01 = 0
    c_00 = 0
    c_01 = 0
    for i in range(len(y_test)):
        if(y_test[i] == 2 and dataset_debiasing_test.protected_attributes[:,1][i] == 0):
            n_00 += 1
            if(dataset_debiasing_test.labels.ravel()[i] == 1):
                c_00 += 1
        elif(y_test[i] == 2 and dataset_debiasing_test.protected_attributes[:,1][i] == 1):
            n_01 += 1
            if(dataset_debiasing_test.labels.ravel()[i] == 1):
                c_01 += 1
    DPE_fin.append(c_00 / n_00 - c_01 / n_01)
DEOO_fin = np.array(DEOO_fin)
DPE_fin = np.array(DPE_fin)
mis_fin = np.array(mis_fin)

100%|██████████| 100/100 [01:31<00:00,  1.09it/s]


In [48]:
DEOO_fin = []
DPE_fin = []
mis_fin = []
sigma = 1
for step in tqdm(range(50)):
    randseed = np.random.randint(10000)
    dataset_orig_train, dataset_orig_vt = dataset_orig.split([0.6], shuffle=True, seed = np.random.randint(0, 10000))
    dataset_orig_valid, dataset_orig_test = dataset_orig_vt.split([0.5], shuffle=True, seed = np.random.randint(0, 10000))
    # Placeholder for predicted and transformed datasets
    dataset_orig_train_pred = dataset_orig_train.copy(deepcopy=True)
    dataset_orig_valid_pred = dataset_orig_valid.copy(deepcopy=True)
    dataset_orig_test_pred = dataset_orig_test.copy(deepcopy=True)

    dataset_new_valid_pred = dataset_orig_valid.copy(deepcopy=True)
    dataset_new_test_pred = dataset_orig_test.copy(deepcopy=True)

    y_test = dataset_orig_test.labels.ravel()

    sess.close()
    tf.reset_default_graph()
    sess = tf.Session()
    # Learn parameters with debias set to True
    debiased_model = AdversarialDebiasing(privileged_groups = privileged_groups,
                          unprivileged_groups = unprivileged_groups,
                          scope_name='debiased_classifier',
                          debias=True,
                          sess=sess)
    debiased_model.fit(dataset_orig_train)
    dataset_debiasing_train = debiased_model.predict(dataset_orig_train)
    dataset_debiasing_test = debiased_model.predict(dataset_orig_test)

    eq = 0
    for i in range(len(y_test)):
        if(dataset_debiasing_test.labels.ravel()[i] == y_test[i]):
            eq += 1
    mis_fin.append(eq / len(y_test))
    n_10 = 0
    n_11 = 0
    c_10 = 0
    c_11 = 0
    for i in range(len(y_test)):
        if(y_test[i] == 1 and dataset_debiasing_test.protected_attributes[:,1][i] == 0):
            n_10 += 1
            if(dataset_debiasing_test.labels.ravel()[i] == 1):
                c_10 += 1
        elif(y_test[i] == 1 and dataset_debiasing_test.protected_attributes[:,1][i] == 1):
            n_11 += 1
            if(dataset_debiasing_test.labels.ravel()[i] == 1):
                c_11 += 1
    DEOO_fin.append(c_10 / n_10 - c_11 / n_11)
    n_00 = 0
    n_01 = 0
    c_00 = 0
    c_01 = 0
    for i in range(len(y_test)):
        if(y_test[i] == 0 and dataset_debiasing_test.protected_attributes[:,1][i] == 0):
            n_00 += 1
            if(dataset_debiasing_test.labels.ravel()[i] == 1):
                c_00 += 1
        elif(y_test[i] == 0 and dataset_debiasing_test.protected_attributes[:,1][i] == 1):
            n_01 += 1
            if(dataset_debiasing_test.labels.ravel()[i] == 1):
                c_01 += 1
    DPE_fin.append(c_00 / n_00 - c_01 / n_01)
DEOO_fin = np.array(DEOO_fin)
DPE_fin = np.array(DPE_fin)
mis_fin = np.array(mis_fin)

100%|██████████| 50/50 [19:55<00:00, 23.91s/it]


In [49]:
print(np.mean(abs(DEOO_fin)))
print(np.mean(abs(DPE_fin)))
print(np.mean(mis_fin))
print(np.percentile(mis_fin, 95))
print(np.percentile(abs(DEOO_fin), 95))
print(np.percentile(abs(DPE_fin), 95))

0.19856798163069647
0.07359700484280035
0.7908649810625447
0.7970774900194493
0.2475582826018601
0.09365073689999337
