# FAIR IN-PROCESSING

This notebook implements the Adversarial Debiasing in-processor [(Zhang et al. 2018)](https://dl.acm.org/doi/abs/10.1145/3278721.3278779).

The modeling is performed separately for each combination of training folds. This is controlled with the `use_fold` variable. To fit adversarial debiasing on a different combination of training folds, set `use_fold` to a specific value and restart the kernel.

A further analysis of the processor outputs is performed in `code_05_inprocess3.R`.

The notebook loads the data exported in `code_00_partitioning.ipynb` and applies in-processors. The processor predictions are exported as CSV files.

In [1]:
##### PACKAGES

# working paths
%run code_00_working_paths.py

import pickle
import numpy as np
import time
import pandas as pd

import tensorflow as tf

from aif360.metrics import BinaryLabelDatasetMetric, ClassificationMetric
from aif360.metrics.utils import compute_boolean_conditioning_vector
from aif360.algorithms.inprocessing.adversarial_debiasing import AdversarialDebiasing

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import MaxAbsScaler

import matplotlib.pyplot as plt
import os

import sys
sys.path.append(func_path)

from load_data import load_dataset

pip install 'aif360[Reductions]'
pip install 'aif360[Reductions]'
pip install 'aif360[inFairness]'
pip install 'aif360[Reductions]'


## 1. Parameters and preparations

In [2]:
##### PARAMETERS

# data set to process: any of 'data1', 'data2', ..., 'data50'
data = 'data50'

# partitioning setup
num_folds = 5  # number of training fold combinations
use_fold = 4   # combination fitted in this run: 0, 1, ..., num_folds-1
seed = 1       # random seed

In [3]:
##### IN-PROCESSOR PARAMS

# weight of the adversary loss handed to the adversarial debiasing model
# candidate values: [0.1, 0.01, 0.001]
adversary_loss_weight = 0.1

## 2. Data import

In [4]:
##### RANDOM SEED

# fix the state of numpy's global random generator so that repeated runs are comparable
np.random.seed(seed)

In [5]:
##### LOAD PARTITIONING

# numeric suffix of the data set name (the name is assumed to look like 'dataN')
dataset_number = data[len('data'):]

# folder with the prepared partitions (folders data1, data2, ..., data50 are assumed to exist)
input_dir = os.path.join(data_path, 'prepared', f'data{dataset_number}')

# location of the pickled original test set
file_path = os.path.join(input_dir, f'{data}_orig_test.pkl')

# read the pickled test set
with open(file_path, 'rb') as handle:
    dataset_orig_test = pickle.load(handle)

# convert to a dataframe and report its shape
te = dataset_orig_test.convert_to_dataframe()[0]
print(te.shape)

(240, 6)


In [6]:
##### DATA PREP

# name of the protected attribute; the group definitions below are keyed on it
protected = 'race'

# value 1 marks the privileged group, value 0 the unprivileged group
privileged_groups = [{protected: 1}]
unprivileged_groups = [{protected: 0}]

## 3. Fair processing

In [7]:
# Rebind `tf` to the TensorFlow 1.x compatibility API: the session-based
# workflow below (tf.Session, tf.variable_scope) is only exposed there.
# numpy is already imported in the packages cell, so it is not re-imported here.
import tensorflow.compat.v1 as tf

# Switch all global TF2 behaviours back to their 1.x counterparts. This already
# turns off eager execution, so no separate disable_eager_execution() call is needed.
tf.disable_v2_behavior()

# Create the TensorFlow session that is handed to the adversarial debiasing model
sess = tf.Session()

# Define the normalization function
def safe_normalize(x):
    """Divide tensor `x` by its norm, returning `x` unchanged when the norm is zero.

    Args:
        x: tensor to normalize.

    Returns:
        A tensor equal to `x / tf.norm(x)` if the norm is non-zero, otherwise `x`.
    """
    norm = tf.norm(x)
    # tf.cond selects the branch inside the graph, so the division is only
    # evaluated when the norm is non-zero
    return tf.cond(tf.not_equal(norm, 0), lambda: x / norm, lambda: x)

##### MODELING

# timer
cv_start = time.time()

# input/output folders of the selected data set (data1, data2, ..., data50)
input_dir = os.path.join(data_path, 'prepared', 'data' + dataset_number)
output_dir = os.path.join(res_path, 'inprocess2', 'intermediate', 'data' + dataset_number)

# fail loudly on a misconfigured fold instead of silently skipping every iteration
if use_fold not in range(num_folds):
    raise ValueError('use_fold must be one of 0, ..., {} but is {}'.format(num_folds - 1, use_fold))

# make sure the export folder exists before any CSV is written
os.makedirs(output_dir, exist_ok=True)

# loop through training folds
for fold in range(num_folds):

    ##### LOAD DATA

    # select fold combination
    if fold != use_fold:
        continue

    # feedback
    print('-'*30)
    print('- FOLD ' + str(fold) + '...')
    print('-'*30)

    # import data subsets
    train_path = os.path.join(input_dir, data + '_scaled_' + str(fold) + '_train.pkl')
    valid_path = os.path.join(input_dir, data + '_scaled_' + str(fold) + '_valid.pkl')
    test_path = os.path.join(input_dir, data + '_scaled_' + str(fold) + '_test.pkl')

    # Load the datasets
    # NOTE(review): pickle.load executes arbitrary code; only read files produced by this project
    with open(train_path, 'rb') as file:
        data_train = pickle.load(file)
    with open(valid_path, 'rb') as file:
        data_valid = pickle.load(file)
    with open(test_path, 'rb') as file:
        data_test = pickle.load(file)

    ##### MODELING

    # fit adversarial debiasing
    # the seed is forwarded so that the model does not depend on whatever state
    # numpy's global generator happens to be in when this cell is (re-)run
    with tf.variable_scope('debiased_classifier', reuse=tf.AUTO_REUSE):
        debiased_model = AdversarialDebiasing(privileged_groups     = privileged_groups,
                                              unprivileged_groups   = unprivileged_groups,
                                              debias                = True,
                                              adversary_loss_weight = adversary_loss_weight,
                                              scope_name            = 'debiased_classifier',
                                              sess                  = sess,
                                              seed                  = seed)
        debiased_model.fit(data_train)

    # common prefix of the exported file names
    # NOTE(review): the loss weight and use_fold are concatenated without a separator
    # (e.g. '0.1' + '4' -> '0.14'); kept as is because downstream scripts may rely on
    # these exact file names - confirm before changing
    file_stem = data + '_' + str(fold) + '_AD_' + str(adversary_loss_weight) + str(use_fold)

    # apply the model to valid data and export flattened scores together with the labels
    scores_valid = debiased_model.predict(data_valid).scores
    scores_valid_flat = scores_valid.flatten()
    advdebias_predictions = pd.DataFrame({'scores':  scores_valid_flat,
                                          'targets': data_valid.labels.flatten()})
    advdebias_predictions.to_csv(os.path.join(output_dir, file_stem + '_predictions_valid.csv'),
                                 index=False,
                                 header=True)

    # apply the model to test data and export the flattened scores (no labels are written)
    scores_test = debiased_model.predict(data_test).scores
    scores_test_flat = scores_test.flatten()
    advdebias_predictions = pd.DataFrame({'scores': scores_test_flat})
    advdebias_predictions.to_csv(os.path.join(output_dir, file_stem + '_predictions_test.csv'),
                                 index=False,
                                 header=True)

    # print elapsed time
    print('')
    print('Finished in {:.2f} minutes'.format((time.time() - cv_start) / 60))







Instructions for updating:
non-resource variables are not supported in the long term


Instructions for updating:
non-resource variables are not supported in the long term








------------------------------
- FOLD 4...
------------------------------
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.








epoch 0; iter: 0; batch classifier loss: 0.674891; batch adversarial loss: 0.899301
epoch 1; iter: 0; batch classifier loss: 0.669905; batch adversarial loss: 0.876246
epoch 2; iter: 0; batch classifier loss: 0.655234; batch adversarial loss: 0.880912
epoch 3; iter: 0; batch classifier loss: 0.670982; batch adversarial loss: 0.903287
epoch 4; iter: 0; batch classifier loss: 0.657972; batch adversarial loss: 0.892820
epoch 5; iter: 0; batch classifier loss: 0.635461; batch adversarial loss: 0.892677
epoch 6; iter: 0; batch classifier loss: 0.658133; batch adversarial loss: 0.904168
epoch 7; iter: 0; batch classifier loss: 0.652292; batch adversarial loss: 0.910475
epoch 8; iter: 0; batch classifier loss: 0.626228; batch adversarial loss: 0.904101
epoch 9; iter: 0; batch classifier loss: 0.624006; batch adversarial loss: 0.892282
epoch 10; iter: 0; batch classifier loss: 0.642012; batch adversarial loss: 0.891625
epoch 11; iter: 0; batch classifier loss: 0.713761; batch adversarial loss: