# FAIR POST-PROCESSING

This notebook implements the Reject Option Classification post-processor [(Kamiran et al. 2012)](https://ieeexplore.ieee.org/abstract/document/6413831).

The notebook applies the reject option classification post-processor to classifier predictions. It loads the data exported in `code_00_partitioning.ipynb` and the predictions of the base classifiers produced in `code_08_postprocess1.R`. The post-processed predictions are exported as CSV files. A further analysis of the processor outputs is performed in `code_12_postprocess5.R`.

In [1]:
##### PACKAGES

# working paths
# NOTE(review): this script presumably defines data_path, res_path and
# func_path, which are used further down but never assigned in this
# notebook — confirm in code_00_working_paths.py
%run code_00_working_paths.py

import pickle
import numpy as np
import time

# AIF360 fairness toolkit: the reject option classification post-processor
from aif360.metrics import BinaryLabelDatasetMetric
from aif360.algorithms.postprocessing.reject_option_classification import RejectOptionClassification
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import MaxAbsScaler

import matplotlib.pyplot as plt
import os

# extend the module search path so that the local helper module below
# (load_data) can be imported; must run before that import
import sys
sys.path.append(func_path)

from load_data import load_dataset
import pandas as pd

pip install 'aif360[Reductions]'
pip install 'aif360[Reductions]'
pip install 'aif360[inFairness]'
pip install 'aif360[Reductions]'


## 1. Parameters and preparations

In [2]:
##### PARAMETERS

# name of the data set to process; one of ['data1', ..., 'data50']
data: str = 'data1'

# partitioning: number of folds to loop over and the random seed
num_folds: int = 10
seed: int = 1

In [3]:
##### POST-PROCESSOR PARAMS

# fairness metric passed to the post-processor as metric_name
metric_name = 'Statistical parity difference'

# values passed to the post-processor as num_class_thresh / num_ROC_margin
num_class_thresh = 100
num_ROC_margin = 50

# each bound b is applied symmetrically: metric_ub = b, metric_lb = -b
all_bound = [
    0.1,
    0.2,
    0.3,
]

## 2. Data import

In [4]:
##### RANDOM SEED

# seed NumPy's global random number generator with the configured seed
np.random.seed(seed=seed)

In [5]:
##### LOAD PARTITIONING

# data set names follow the pattern 'dataN'; keep the part after 'data'
dataset_number = data[len('data'):]

# folder with the prepared (pickled) partitions; it is expected to exist
input_dir = os.path.join(data_path, 'prepared')

# pickle file holding the original test partition of the selected data set
file_path = os.path.join(input_dir, f'{data}_orig_test.pkl')

# deserialize the test partition
with open(file_path, mode='rb') as pkl_file:
    dataset_orig_test = pickle.load(pkl_file)

# convert_to_dataframe() returns a sequence whose first item is the data frame;
# print its dimensions as a quick sanity check
converted = dataset_orig_test.convert_to_dataframe()
te = converted[0]
print(te.shape)

(240, 6)


In [6]:
##### DATA PREP

# name of the protected attribute
protected = 'race'

# group definitions handed to the post-processor:
# attribute value 1 marks the privileged group, 0 the unprivileged group
privileged_groups = [{protected: 1}]
unprivileged_groups = [{protected: 0}]

## 3. Fair processing

In [7]:
##### MODELING
#
# For every fold, bound value and base classifier: fit a reject option
# classification post-processor on the validation predictions, apply it to the
# test predictions and export the post-processed scores/labels as one CSV per
# (fold, bound) combination.

# timer
cv_start = time.time()

# prepared (pickled) data partitions
input_dir = os.path.join(data_path, 'prepared')

# base-classifier predictions exported from R
# NOTE(review): predictions are read from 'postprocess1' while results are
# written to 'pre1post4' — confirm that this combination is intended
preds_dir = os.path.join(res_path, 'postprocess1', 'intermediate')

# export folder for the post-processed predictions; create it if missing
output_dir = os.path.join(res_path, 'pre1post4', 'intermediate')
os.makedirs(output_dir, exist_ok=True)

# base models
model_names = ['glm',
               "rf",
               #"xgbTree",
               "nnet"]

# loop through folds
for fold in range(num_folds):

    ##### LOAD DATA

    # feedback
    print('-'*30)
    print('- FOLD ' + str(fold) + '...')
    print('-'*30)

    # import data subsets
    train_path = os.path.join(input_dir, data + '_scaled_' + str(fold) + '_train.pkl')
    valid_path = os.path.join(input_dir, data + '_scaled_' + str(fold) + '_valid.pkl')
    test_path = os.path.join(input_dir, data + '_scaled_' + str(fold) + '_test.pkl')

    # load the partitions (data_train is not used further in this cell)
    with open(train_path, 'rb') as file:
        data_train = pickle.load(file)
    with open(valid_path, 'rb') as file:
        data_valid = pickle.load(file)
    with open(test_path, 'rb') as file:
        data_test = pickle.load(file)


    ##### MODELING

    # import prediction results from R
    # (os.path.join keeps this working whether or not res_path ends with a separator)
    dataset_trainResults_valid = pd.read_csv(os.path.join(preds_dir, data + '_' + str(fold) + '_POST_training_results_dval.csv'))
    dataset_trainResults_test  = pd.read_csv(os.path.join(preds_dir, data + '_' + str(fold) + '_POST_training_results_dtest.csv'))

    # copies of the partitions that will carry the classifier predictions
    dataset_orig_valid_pred = data_valid.copy(deepcopy = True)
    dataset_orig_test_pred  = data_test.copy(deepcopy  = True)


    # loop through bound values
    for i in all_bound:

        # feedback
        print('-- BOUND ' + str(i) + '...')

        # placeholder for the post-processed test predictions of all models
        ROC_test = pd.DataFrame()

        # loop through base classifiers
        for m in model_names:

            # extract validation preds as column vectors
            # NOTE(review): non-'Parole' predictions are encoded as 2.0 — verify
            # that this matches the unfavorable label of the loaded data sets
            scores_valid = np.array(dataset_trainResults_valid[m + '_scores']).reshape(-1, 1)
            labels_valid = np.where(dataset_trainResults_valid[m + '_class'] == 'Parole', 1.0, 2.0).reshape(-1, 1)

            # extract test preds as column vectors
            scores_test = np.array(dataset_trainResults_test[m + '_scores']).reshape(-1, 1)
            labels_test = np.where(dataset_trainResults_test[m + '_class'] == 'Parole', 1.0, 2.0).reshape(-1, 1)

            # write predictions into the dataset copies
            dataset_orig_valid_pred.scores = scores_valid
            dataset_orig_valid_pred.labels = labels_valid
            dataset_orig_test_pred.scores  = scores_test
            dataset_orig_test_pred.labels  = labels_test

            # fit ROC on the validation partition; the bound is applied
            # symmetrically around zero
            ROC = RejectOptionClassification(unprivileged_groups = unprivileged_groups,
                                             privileged_groups   = privileged_groups,
                                             num_class_thresh    = num_class_thresh,
                                             num_ROC_margin      = num_ROC_margin,
                                             metric_name         = metric_name,
                                             metric_ub           = i,
                                             metric_lb           = -i)
            ROC = ROC.fit(data_valid, dataset_orig_valid_pred)

            # predict test scores; flatten to 1-D so each result is a plain column
            dataset_transf_test_pred    = ROC.predict(dataset_orig_test_pred)
            ROC_test[m + "_fairScores"] = dataset_transf_test_pred.scores.flatten()
            label_names                 = np.where(dataset_transf_test_pred.labels == 1, 'Parole', 'Noparole').ravel()
            ROC_test[m + "_fairLabels"] = label_names

        # export CSV (one file per fold and bound, without the row index)
        ROC_test.to_csv(os.path.join(output_dir, data + '_' + str(fold) + '_ROC_' + str(i) + '_predictions_test.csv'), index = False, header = True)

    # feedback
    print('')

# print performance
print('')
print('Finished in {:.2f} minutes'.format((time.time() - cv_start) / 60))

------------------------------
- FOLD 0...
------------------------------
-- BOUND 0.1...
-- BOUND 0.2...
-- BOUND 0.3...

------------------------------
- FOLD 1...
------------------------------
-- BOUND 0.1...
-- BOUND 0.2...
-- BOUND 0.3...

------------------------------
- FOLD 2...
------------------------------
-- BOUND 0.1...
-- BOUND 0.2...
-- BOUND 0.3...

------------------------------
- FOLD 3...
------------------------------
-- BOUND 0.1...
-- BOUND 0.2...
-- BOUND 0.3...

------------------------------
- FOLD 4...
------------------------------
-- BOUND 0.1...
-- BOUND 0.2...
-- BOUND 0.3...

------------------------------
- FOLD 5...
------------------------------
-- BOUND 0.1...
-- BOUND 0.2...
-- BOUND 0.3...

------------------------------
- FOLD 6...
------------------------------
-- BOUND 0.1...
-- BOUND 0.2...
-- BOUND 0.3...

------------------------------
- FOLD 7...
------------------------------
-- BOUND 0.1...
-- BOUND 0.2...
-- BOUND 0.3...

----------------