In [4]:
import sys
import numpy as np
import tensorflow as tf


class AdversarialDebiasingMulti():

    def __init__(self,
                 protected_attribute_name,
                 num_labels,
                 scope_name,
                 sess,
                 seed=None,
                 adversary_loss_weight=0.1,
                 num_epochs=50,
                 batch_size=128,
                 classifier_num_hidden_units_1=100,
                 classifier_num_hidden_units_2=100,
                 adversary_num_hidden_units=100,
                 debias=True,
                 verbose=True,
                 fairness_def='parity',
                 saved_model=None):

        self.scope_name = scope_name
        self.seed = seed

        self.protected_attribute_name = protected_attribute_name
        self.num_labels = num_labels

        self.sess = sess
        self.adversary_loss_weight = adversary_loss_weight
        self.num_epochs = num_epochs
        self.batch_size = batch_size
        self.classifier_num_hidden_units_1 = classifier_num_hidden_units_1
        self.classifier_num_hidden_units_2 = classifier_num_hidden_units_2
        self.adversary_num_hidden_units = adversary_num_hidden_units
        self.debias = debias
        self.verbose = verbose
        assert fairness_def in ['parity', 'equal_odds'], \
            "fairness_def must be one of: 'parity', 'equal_odds'"
        self.fairness_def = fairness_def

        self.features_dim = None
        self.features_ph = None
        self.protected_attributes_ph = None
        self.true_labels_ph = None
        self.pred_labels = None

        self.label_translate = {}

        self.saved_model = saved_model

    def _classifier_model(self, features, features_dim, keep_prob):
        """Compute the classifier predictions for the outcome variable.
        """

        with tf.compat.v1.variable_scope("classifier_model"):
            W1 = tf.compat.v1.get_variable('W1', [features_dim, self.classifier_num_hidden_units_1],
                                  initializer=tf.keras.initializers.GlorotUniform(seed=self.seed1))
            b1 = tf.Variable(tf.zeros(shape=[self.classifier_num_hidden_units_1]), name='b1')

            h1 = tf.nn.relu(tf.matmul(features, W1) + b1)
            h1 = tf.nn.dropout(h1, rate=1-keep_prob, seed=self.seed2)

            # BEGIN NEW

            W3 = tf.compat.v1.get_variable('W3', [self.classifier_num_hidden_units_1, self.classifier_num_hidden_units_2],
                                  initializer=tf.keras.initializers.GlorotUniform(seed=self.seed5))
            b3 = tf.Variable(tf.zeros(shape=[self.classifier_num_hidden_units_2]), name='b3')

            h2 = tf.nn.relu(tf.matmul(h1, W3) + b3)
            h2 = tf.nn.dropout(h2, rate=1-keep_prob, seed=self.seed6)

            # END NEW

            W2 = tf.compat.v1.get_variable('W2', [self.classifier_num_hidden_units_2, self.num_labels],
                                 initializer=tf.keras.initializers.GlorotUniform(seed=self.seed3))
            b2 = tf.Variable(tf.zeros(shape=[self.num_labels]), name='b2')

            pred_logit = tf.matmul(h2, W2) + b2
            pred_label = tf.nn.softmax(pred_logit)

        return pred_label, pred_logit

    def _adversary_model(self, pred_logits, true_labels, keep_prob):
        """Compute the adversary predictions for the protected attribute.
        """

        with tf.compat.v1.variable_scope("adversary_model"):
            if self.fairness_def == 'parity':
                W2 = tf.compat.v1.get_variable('W2', [self.num_labels, self.adversary_num_hidden_units],
                                     initializer=tf.keras.initializers.GlorotUniform(seed=self.seed4))
            elif self.fairness_def == 'equal_odds':
                W2 = tf.compat.v1.get_variable('W2', [self.num_labels*2, self.adversary_num_hidden_units],
                                     initializer=tf.keras.initializers.GlorotUniform(seed=self.seed4))

            b2 = tf.Variable(tf.zeros(shape=[self.adversary_num_hidden_units]), name='b2')

            if self.fairness_def == 'parity':
                h1 = tf.nn.relu(tf.matmul(pred_logits, W2) + b2)
            elif self.fairness_def == 'equal_odds':
                h1 = tf.nn.relu(tf.matmul(tf.concat([pred_logits, true_labels], axis=1), W2) + b2)
            h1 = tf.nn.dropout(h1, rate=1-keep_prob, seed=self.seed7)

            W3 = tf.compat.v1.get_variable('W3', [self.adversary_num_hidden_units, 1],
                                 initializer=tf.keras.initializers.GlorotUniform(seed=self.seed8))
            b3 = tf.Variable(tf.zeros(shape=[1]), name='b3')


            pred_protected_attribute_logit = tf.matmul(h1, W3) + b3
            pred_protected_attribute_label = tf.sigmoid(pred_protected_attribute_logit)

        return pred_protected_attribute_label, pred_protected_attribute_logit

    def fit(self, features_set, metadata_set):
        """Compute the model parameters of the fair classifier using gradient
        descent.
        """

        if self.seed is not None:
            np.random.seed(self.seed)
        ii32 = np.iinfo(np.int32)
        self.seed1, self.seed2, self.seed3, self.seed4, self.seed5, self.seed6, self.seed7, self.seed8 = np.random.randint(ii32.min, ii32.max, size=8)

        # Map the dataset labels to one-hot
        def one_hot(x):
            return np.eye(self.num_labels)[x]
        temp_labels = metadata_set.copy()
        label_names = sorted(temp_labels.label.unique())
        for label_int in range(len(label_names)):
            label_name = label_names[label_int]
            self.label_translate[label_int] = label_name
            temp_labels.loc[(temp_labels.label == label_name), 'label'] = label_int
        temp_labels = np.array([one_hot(x) for x in temp_labels.label])

        with tf.compat.v1.variable_scope(self.scope_name):
            num_train_samples, self.features_dim = np.shape(features_set)

            # Setup placeholders
            self.features_ph = tf.compat.v1.placeholder(tf.float32, shape=[None, self.features_dim])
            self.protected_attributes_ph = tf.compat.v1.placeholder(tf.float32, shape=[None,1])
            self.true_labels_ph = tf.compat.v1.placeholder(tf.float32, shape=[None,self.num_labels])
            self.keep_prob = tf.compat.v1.placeholder(tf.float32)

            # Obtain classifier predictions and classifier loss
            self.pred_labels, pred_logits = self._classifier_model(self.features_ph, self.features_dim, self.keep_prob)
            pred_labels_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=self.true_labels_ph, logits=pred_logits))

            if self.debias:
                # Obtain adversary predictions and adversary loss
                pred_protected_attributes_labels, pred_protected_attributes_logits = self._adversary_model(pred_logits, self.true_labels_ph, self.keep_prob)
                pred_protected_attributes_loss = tf.reduce_mean(
                    tf.nn.sigmoid_cross_entropy_with_logits(labels=self.protected_attributes_ph, logits=pred_protected_attributes_logits))

            # Setup optimizers with learning rates
            global_step = tf.Variable(0, trainable=False)
            starter_learning_rate = 0.001
            learning_rate = tf.compat.v1.train.exponential_decay(starter_learning_rate, global_step,
                                                                 1000, 0.96, staircase=True)
            classifier_opt = tf.compat.v1.train.AdamOptimizer(learning_rate)
            if self.debias:
                adversary_opt = tf.compat.v1.train.AdamOptimizer(learning_rate)

            classifier_vars = [var for var in tf.compat.v1.trainable_variables() if 'classifier_model' in var.name]
            if self.debias:
                adversary_vars = [var for var in tf.compat.v1.trainable_variables() if 'adversary_model' in var.name]
                # Update classifier parameters
                adversary_grads = {var: grad for (grad, var) in adversary_opt.compute_gradients(pred_protected_attributes_loss,
                                                                                      var_list=classifier_vars)}
            normalize = lambda x: x / (tf.norm(x) + np.finfo(np.float32).tiny)

            classifier_grads = []
            for (grad,var) in classifier_opt.compute_gradients(pred_labels_loss, var_list=classifier_vars):
                if self.debias:
                    unit_adversary_grad = normalize(adversary_grads[var])
                    grad -= tf.reduce_sum(grad * unit_adversary_grad) * unit_adversary_grad
                    grad -= self.adversary_loss_weight * adversary_grads[var]
                classifier_grads.append((grad, var))
            classifier_minimizer = classifier_opt.apply_gradients(classifier_grads, global_step=global_step)

            if self.debias:
                # Update adversary parameters
                with tf.control_dependencies([classifier_minimizer]):
                    adversary_minimizer = adversary_opt.minimize(pred_protected_attributes_loss, var_list=adversary_vars)#, global_step=global_step)

            self.sess.run(tf.compat.v1.global_variables_initializer())
            self.sess.run(tf.compat.v1.local_variables_initializer())

        if self.saved_model:
            if self.verbose:
                print('RETRIEVING SAVED MODEL: {}'.format(self.saved_model), file=sys.stderr)
            try:
                saver = tf.compat.v1.train.import_meta_graph(self.saved_model + '/model.meta')
                saver.restore(self.sess, tf.compat.v1.train.latest_checkpoint('./' + self.saved_model + '/'))
                return self
            except:
                import traceback
                print(sys.exc_info()[0], file=sys.stderr, flush=True)
                print(sys.exc_info()[1], file=sys.stderr, flush=True)
                print(traceback.print_tb(sys.exc_info()[2]), file=sys.stderr, flush=True)
                print('Failed: continuing', file=sys.stderr)

            # Begin training
        with tf.compat.v1.variable_scope(self.scope_name):
            for epoch in range(self.num_epochs):
                shuffled_ids = np.random.choice(num_train_samples, num_train_samples, replace=False)
                for i in range(num_train_samples//self.batch_size):
                    batch_ids = shuffled_ids[self.batch_size*i: self.batch_size*(i+1)]
                    batch_features = features_set.loc[batch_ids]
                    batch_labels = temp_labels[batch_ids]
                    batch_protected_attributes = np.reshape(list(metadata_set[self.protected_attribute_name].loc[batch_ids]), [-1,1])

                    batch_feed_dict = {self.features_ph: batch_features,
                                       self.true_labels_ph: batch_labels,
                                       self.protected_attributes_ph: batch_protected_attributes,
                                       self.keep_prob: 0.8}

                    if self.debias:
                        _, _, pred_labels_loss_value, pred_protected_attributes_loss_vale = self.sess.run([classifier_minimizer,
                                       adversary_minimizer,
                                       pred_labels_loss,
                                       pred_protected_attributes_loss], feed_dict=batch_feed_dict)
                        if i % 200 == 0 and self.verbose:
                            print("epoch %d; iter: %d; batch classifier loss: %f; batch adversarial loss: %f" % (epoch, i, pred_labels_loss_value,
                                                                                     pred_protected_attributes_loss_vale),
                                  file=sys.stderr, flush=True)
                    else:
                        _, pred_labels_loss_value = self.sess.run(
                            [classifier_minimizer,
                             pred_labels_loss], feed_dict=batch_feed_dict)
                        if i % 200 == 0 and self.verbose:
                            print("epoch %d; iter: %d; batch classifier loss: %f" % (
                                  epoch, i, pred_labels_loss_value),
                                  file=sys.stderr, flush=True)

        if self.saved_model:
            model_name = self.saved_model + '/model'
            if self.verbose:
                print('SAVING MODEL: {}'.format(model_name), file=sys.stderr)
            saver = tf.compat.v1.train.Saver()
            saver.save(self.sess, model_name)
            # print(self.__dict__, file=sys.stderr)

        return self

    def predict(self, features_set, metadata_set):
        """Obtain the predictions for the provided dataset using the fair
        classifier learned.
        """

        if self.seed is not None:
            np.random.seed(self.seed)

        def one_hot(x):
            return np.eye(self.num_labels)[x]
        temp_labels = metadata_set.copy()
        for label_int in self.label_translate:
            label_name = self.label_translate[label_int]
            temp_labels.loc[(temp_labels.label == label_name), 'label'] = label_int
        try:
            temp_labels = np.array([one_hot(x) for x in temp_labels.label])
        except IndexError:
            temp_labels = np.array([np.zeros(len(self.label_translate)) for x in temp_labels.label])

        num_test_samples, _ = np.shape(features_set)

        samples_covered = 0
        pred_labels = []
        while samples_covered < num_test_samples:
            start = samples_covered
            end = samples_covered + self.batch_size
            if end > num_test_samples:
                end = num_test_samples
            batch_ids = np.arange(start, end)
            batch_features = features_set.loc[batch_ids]
            batch_labels = temp_labels[batch_ids]
            batch_protected_attributes = np.reshape(list(metadata_set[self.protected_attribute_name].loc[batch_ids]), [-1,1])

            batch_feed_dict = {self.features_ph: batch_features,
                               self.true_labels_ph: batch_labels,
                               self.protected_attributes_ph: batch_protected_attributes,
                               self.keep_prob: 1.0}

            pred_labels += self.sess.run(self.pred_labels, feed_dict=batch_feed_dict).tolist()
            samples_covered += len(batch_features)

        pred_labels = np.array(pred_labels, dtype=np.float64)
        dataset_new = metadata_set.copy()
        for label_num in self.label_translate:
            dataset_new['pred_score_{}'.format(self.label_translate[label_num])] = pred_labels[:,label_num]
        dataset_new['pred_label'] = [self.label_translate[x] for x in (np.argmax(pred_labels, axis=1)).astype(np.int32).tolist()]

        return dataset_new

In [5]:
import sys
#sys.path.append("../")

import numpy as np
import pandas as pd
import tensorflow as tf
from scipy.stats import pearsonr
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

tf.compat.v1.disable_eager_execution()

In [6]:
compas = pd.read_csv('compas.csv', index_col=False)

In [7]:
def prepare_meta_and_features(df, protected_attribute_name):
    meta = df[[protected_attribute_name, 'label']]
    features = df.drop(columns=[protected_attribute_name, 'label'])
    
    for col in features.columns:
        data = features[col]
        if pd.api.types.is_numeric_dtype(data):
            data -= np.min(data,axis=0)
            data /= (np.max(data,axis=0) - np.min(data,axis=0))
            features[col] = data
        else:
            dummies = pd.get_dummies(data, prefix=col)
            features[col] = dummies
            
    meta['label'] = meta.label.astype(int)
    return meta, features

meta, features = prepare_meta_and_features(compas, 'race')

features_train, features_test, meta_train, meta_test = train_test_split(features, meta, test_size=0.2, random_state=42, stratify=meta.label)

meta_train.reset_index(drop=True, inplace=True)
meta_test.reset_index(drop=True, inplace=True)
features_train.reset_index(drop=True, inplace=True)
features_test.reset_index(drop=True, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  meta['label'] = meta.label.astype(int)


In [8]:
def get_predictions(debias=True, fairness_def='parity', adv_loss_weight=2, prot_attr='race'):
    sess = tf.compat.v1.Session()
    model = AdversarialDebiasingMulti(
        protected_attribute_name=prot_attr,
        num_labels=len(meta_train.label.unique()),
        scope_name='biased_classifier',
        debias=debias,
        adversary_loss_weight=adv_loss_weight,
        fairness_def=fairness_def,
        verbose=False,
        num_epochs=64,
        classifier_num_hidden_units_1=60,
        classifier_num_hidden_units_2=20,
        sess=sess
    )
    model.fit(features_train, meta_train)
    predictions = model.predict(features_test, meta_test)
    sess.close()
    tf.compat.v1.reset_default_graph()
    return predictions

***

---


# Binary Protected Attribute (Race)

In [9]:
def print_stats(df):
    print('PERFORMANCE:\n')
    print(classification_report(df.label, df.pred_label))
    print('\nBIAS:')
    rw = len(df.loc[(df.race==1) & (df.pred_label==1)]) / len(df.loc[df.race==1])
    print('\nproportion of White people predicted to reoffend: ' + str(rw))
    rn = len(df.loc[(df.race==0) & (df.pred_label==1)]) / len(df.loc[df.race==0])
    print('proportion of Nonwhite people predicted to reoffend: ' + str(rn))
    print('\tRATE GAP = ' + str(rw - rn))
    tprw = len(df.loc[(df.race==1) & (df.pred_label==1) & (df.label==1)]) / len(df.loc[(df.race==1) & (df.label==1)])
    print('\nTPR for White people: ' + str(tprw))
    tprn = len(df.loc[(df.race==0) & (df.pred_label==1) & (df.label==1)]) / len(df.loc[(df.race==0) & (df.label==1)])
    print('TPR for Nonwhite people: ' + str(tprn))
    print('\tTPR GAP = ' + str(tprw - tprn))
    fprw = len(df.loc[(df.race==1) & (df.pred_label==1) & (df.label==0)]) / len(df.loc[(df.race==1) & (df.label==0)])
    print('\nFPR for White people: ' + str(fprw))
    fprn = len(df.loc[(df.race==0) & (df.pred_label==1) & (df.label==0)]) / len(df.loc[(df.race==0) & (df.label==0)])
    print('FPR for Nonwhite people: ' + str(fprn))
    print('\tFPR GAP = ' + str(fprw - fprn))

## Baseline

In [10]:
predictions = get_predictions(debias=False)
print_stats(predictions)

PERFORMANCE:

              precision    recall  f1-score   support

           0       0.65      0.75      0.70       672
           1       0.63      0.51      0.56       562

    accuracy                           0.64      1234
   macro avg       0.64      0.63      0.63      1234
weighted avg       0.64      0.64      0.64      1234


BIAS:

proportion of White people predicted to reoffend: 0.26277372262773724
proportion of Nonwhite people predicted to reoffend: 0.4155528554070474
	RATE GAP = -0.15277913277931016

TPR for White people: 0.34782608695652173
TPR for Nonwhite people: 0.571072319201995
	TPR GAP = -0.22324623224547324

FPR for White people: 0.208
FPR for Nonwhite people: 0.2677725118483412
	FPR GAP = -0.05977251184834123


## Parity Fairness

In [11]:
predictions = get_predictions(fairness_def='parity', adv_loss_weight=15)
print_stats(predictions)

PERFORMANCE:

              precision    recall  f1-score   support

           0       0.67      0.77      0.71       672
           1       0.66      0.54      0.60       562

    accuracy                           0.67      1234
   macro avg       0.66      0.66      0.66      1234
weighted avg       0.66      0.67      0.66      1234


BIAS:

proportion of White people predicted to reoffend: 0.2846715328467153
proportion of Nonwhite people predicted to reoffend: 0.4155528554070474
	RATE GAP = -0.13088132256033208

TPR for White people: 0.40993788819875776
TPR for Nonwhite people: 0.5935162094763092
	TPR GAP = -0.18357832127755147

FPR for White people: 0.204
FPR for Nonwhite people: 0.24644549763033174
	FPR GAP = -0.04244549763033176


## Equal Odds Fairness

In [12]:
predictions = get_predictions(fairness_def='equal_odds', adv_loss_weight=50)
print_stats(predictions)

PERFORMANCE:

              precision    recall  f1-score   support

           0       0.66      0.69      0.67       672
           1       0.61      0.56      0.58       562

    accuracy                           0.63      1234
   macro avg       0.63      0.63      0.63      1234
weighted avg       0.63      0.63      0.63      1234


BIAS:

proportion of White people predicted to reoffend: 0.35036496350364965
proportion of Nonwhite people predicted to reoffend: 0.46051032806804376
	RATE GAP = -0.11014536456439411

TPR for White people: 0.4906832298136646
TPR for Nonwhite people: 0.5935162094763092
	TPR GAP = -0.10283297966264465

FPR for White people: 0.26
FPR for Nonwhite people: 0.3341232227488152
	FPR GAP = -0.07412322274881517


***
# Continuous Protected Attribute (Age)

In [13]:
meta, features = prepare_meta_and_features(compas, 'age')

features_train, features_test, meta_train, meta_test = train_test_split(features, meta, test_size=0.2, random_state=42, stratify=meta.label)

meta_train.reset_index(drop=True, inplace=True)
meta_test.reset_index(drop=True, inplace=True)
features_train.reset_index(drop=True, inplace=True)
features_test.reset_index(drop=True, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  meta['label'] = meta.label.astype(int)


In [14]:
def print_stats(df):
    print('PERFORMANCE:\n')
    print(classification_report(df.label, df.pred_label))
    print('\nBIAS:')
    corr = pearsonr(df.age, df.pred_label)[0]
    corr_1 = pearsonr(df.loc[df.label==1].age, df.loc[df.label==1].pred_label)[0]
    corr_0 = pearsonr(df.loc[df.label==0].age, df.loc[df.label==0].pred_label)[0]
    print('\nCorrelation between age and predicted label: ' + str(corr))
    print('\nCorrelation between age and predicted label, conditional on true label=1: ' + str(corr_1))
    print('\nCorrelation between age and predicted label, conditional on true label=0: ' + str(corr_0))

## Baseline

In [15]:
predictions = get_predictions(debias=False, prot_attr='age')
print_stats(predictions)

PERFORMANCE:

              precision    recall  f1-score   support

           0       0.68      0.71      0.69       672
           1       0.63      0.59      0.61       562

    accuracy                           0.66      1234
   macro avg       0.65      0.65      0.65      1234
weighted avg       0.65      0.66      0.65      1234


BIAS:

Correlation between age and predicted label: -0.21704837297500712

Correlation between age and predicted label, conditional on true label=1: -0.13651086589086187

Correlation between age and predicted label, conditional on true label=0: -0.20308977275693996


## Parity Fairness

In [16]:
predictions = get_predictions(prot_attr='age', fairness_def='parity', adv_loss_weight=0.001)
print_stats(predictions)

PERFORMANCE:

              precision    recall  f1-score   support

           0       0.70      0.57      0.63       672
           1       0.58      0.71      0.64       562

    accuracy                           0.63      1234
   macro avg       0.64      0.64      0.63      1234
weighted avg       0.64      0.63      0.63      1234


BIAS:

Correlation between age and predicted label: -0.2845084632439279

Correlation between age and predicted label, conditional on true label=1: -0.21988036081725576

Correlation between age and predicted label, conditional on true label=0: -0.26635022690514965


## Equal Odds Fairness

In [17]:
predictions = get_predictions(prot_attr='age', fairness_def='equal_odds', adv_loss_weight=0.001)
print_stats(predictions)

PERFORMANCE:

              precision    recall  f1-score   support

           0       0.75      0.02      0.04       672
           1       0.46      0.99      0.63       562

    accuracy                           0.46      1234
   macro avg       0.60      0.51      0.34      1234
weighted avg       0.62      0.46      0.31      1234


BIAS:

Correlation between age and predicted label: -0.14938091723340735

Correlation between age and predicted label, conditional on true label=1: -0.12330738980768029

Correlation between age and predicted label, conditional on true label=0: -0.1538273315296457
