# FAIR PRE-PROCESSING

This notebook implements the following pre-processors:
- Reweighing [(Calders et al. 2009)](https://ieeexplore.ieee.org/abstract/document/5360534)
- Disparate Impact Remover [(Feldman et al. 2015)](https://dl.acm.org/doi/abs/10.1145/2783258.2783311?casa_token=hPPsvh9w2QEAAAAA:RE90pNifv99Y9yCMgE4O1vOquljiAtjVCQQ3UgFDHIgcn2J21J5ry6HCv2iXXTX2Gw9e1VBbS07j)

A further analysis of the processor outputs is performed in `code_02_preprocess2.R` and `code_03_preprocess3.R`.

The notebook loads the data exported in `code_00_partitinoing.ipynb` and applies the pre-processors to every fold. The transformed training, validation, and test data are exported as CSV files.

In [233]:
##### PACKAGES

# working paths
%run code_00_working_paths.py

import pickle
import numpy as np
import time

from aif360.metrics import BinaryLabelDatasetMetric
from aif360.algorithms.preprocessing.reweighing import Reweighing
from aif360.algorithms.preprocessing.lfr import LFR
from aif360.algorithms.preprocessing import DisparateImpactRemover

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import MaxAbsScaler

import matplotlib.pyplot as plt
import os

import sys
sys.path.append(func_path)

from load_data import load_dataset

## 1. Parameters and preparations

In [234]:
##### PARAMETERS

# data set identifier; it prefixes the names of the pickled partitions that are
# loaded and of the CSV files that are exported
# NOTE(review): an earlier note listed ['data1', 'data2', ..., 'data50'] as the
# valid values, which does not cover the current 'datacorr5' - confirm the naming scheme
data = 'datacorr5'

# partitioning: number of folds to iterate over and seed for NumPy's global RNG
num_folds, seed = 10, 1

In [235]:
##### PRE-PROCESSOR PARAMS

# repair levels handed to the Disparate Impact Remover, one run per value;
# str() of each value also becomes part of the exported CSV file names
all_lambda = [
    0.5,
    0.6,
    0.7,
    0.8,
    0.9,
    1.0,
]

## 2. Data import

In [236]:
##### RANDOM SEED

# seed NumPy's global random number generator with the value from the parameters cell
# NOTE(review): the weighted resampling in the modeling cell passes no random_state,
# so it presumably draws from this global generator - confirm before relying on
# reproducible exports
np.random.seed(seed)

In [237]:
##### LOAD PARTITIONING

# part of the data set name after its first four characters
# (e.g. 'corr5' for 'datacorr5')
# NOTE(review): not referenced by any of the visible cells
dataset_number = data[4:]

# folder that holds the pickled partitions
input_dir = os.path.join(data_path, 'prepared')

# full path of the pickled test set named '<data>_orig_test.pkl'
file_path = os.path.join(input_dir, '{}_orig_test.pkl'.format(data))

# load the dataset
# NOTE(review): unpickling can execute arbitrary code - only open files
# that were produced by this project
with open(file_path, 'rb') as file:
    dataset_orig_test = pickle.load(file)

# keep the dataframe (first element of the returned tuple) and report its shape
te = dataset_orig_test.convert_to_dataframe()[0]
print(te.shape)


(240, 6)


In [238]:
##### DATA PREP

# protected attribute and the group definitions built on it:
# value 1 marks the privileged group, value 0 the unprivileged group
protected           = 'race'
unprivileged_groups = [{protected: 0}]
privileged_groups   = [{protected: 1}]

## 3. Fair processing

In [239]:
##### MODELING
#
# For every pre-processor and every fold: load the pickled train/valid/test
# partitions, transform them, and export the results as CSV files.

# timer
cv_start = time.time()

# pickled partitions are read from input_dir, transformed CSVs are written to output_dir
input_dir  = os.path.join(data_path, 'prepared')
output_dir = os.path.join(res_path, 'preprocess1')

# to_csv does not create missing folders, so make sure the export target exists
os.makedirs(output_dir, exist_ok = True)

# list of processors
methods = ['RW', 'DI']

# processing loop
for m in methods:

    # feedback
    print('-'*30)
    print('- METHOD: ' + m + '...')
    print('-'*30)

    # loop through fold combinations
    for fold in range(num_folds):

        ##### LOAD DATA

        # import data subsets
        in_stem    = data + '_scaled_' + str(fold)
        train_path = os.path.join(input_dir, in_stem + '_train.pkl')
        valid_path = os.path.join(input_dir, in_stem + '_valid.pkl')
        test_path  = os.path.join(input_dir, in_stem + '_test.pkl')

        # load the datasets
        # NOTE(review): unpickling can execute arbitrary code - only open files
        # that were produced by this project
        with open(train_path, 'rb') as file:
            data_train = pickle.load(file)
        with open(valid_path, 'rb') as file:
            data_valid = pickle.load(file)
        with open(test_path, 'rb') as file:
            data_test = pickle.load(file)

        # shared prefix of all CSV files exported for this fold
        out_stem = os.path.join(output_dir, data + '_' + str(fold) + '_pre_')


        ##### MODELING

        # reweighing
        if m == 'RW':

            # fit pre-processor on the training data only
            RW = Reweighing(unprivileged_groups = unprivileged_groups,
                            privileged_groups   = privileged_groups)
            RW.fit(data_train)

            # train processing: the instance weights are turned into data by drawing
            # a weighted sample with replacement that has the size of the original subset
            dataset_transf_train = RW.transform(data_train)
            w_train   = dataset_transf_train.instance_weights.ravel()
            out_train = dataset_transf_train.convert_to_dataframe()[0]
            out_train = out_train.sample(n = out_train.shape[0], replace = True, weights = w_train)

            # valid processing
            dataset_transf_valid = RW.transform(data_valid)
            w_valid   = dataset_transf_valid.instance_weights.ravel()
            out_valid = dataset_transf_valid.convert_to_dataframe()[0]
            out_valid = out_valid.sample(n = out_valid.shape[0], replace = True, weights = w_valid)

            # test processing
            dataset_transf_test = RW.transform(data_test)
            w_test   = dataset_transf_test.instance_weights.ravel()
            out_test = dataset_transf_test.convert_to_dataframe()[0]
            out_test = out_test.sample(n = out_test.shape[0], replace = True, weights = w_test)

            # check transformation: reweighing must keep the total training weight unchanged
            assert np.abs(dataset_transf_train.instance_weights.sum() - data_train.instance_weights.sum()) < 1e-6

            # check results
            metric_transf_train = BinaryLabelDatasetMetric(dataset_transf_train,
                                                           unprivileged_groups = unprivileged_groups,
                                                           privileged_groups   = privileged_groups)
            print('-- achieved a statistical parity difference between unprivileged and privileged groups = %f' % metric_transf_train.mean_difference())

            # export CSV
            out_train.to_csv(out_stem + 'train_' + m + '.csv', index = False, header = True)
            out_valid.to_csv(out_stem + 'valid_' + m + '.csv', index = False, header = True)
            out_test.to_csv(out_stem + 'test_'  + m + '.csv', index = False, header = True)


        # disparate impact remover
        elif m == 'DI':

            # feedback
            print('-- FOLD ' + str(fold) + '...')

            # loop through different repair levels
            for i in all_lambda:

                # set up pre-processor; no separate fit step is needed because
                # only fit_transform is used below
                # NOTE(review): each subset is therefore repaired on its own rather
                # than with a transformation learned on the training data - confirm
                # this is the intended protocol
                di = DisparateImpactRemover(repair_level = i, sensitive_attribute = protected)

                # train processing
                dataset_transf_train = di.fit_transform(data_train)
                out_train            = dataset_transf_train.convert_to_dataframe()[0]

                # valid processing
                dataset_transf_valid = di.fit_transform(data_valid)
                out_valid            = dataset_transf_valid.convert_to_dataframe()[0]

                # test processing
                dataset_transf_test = di.fit_transform(data_test)
                out_test            = dataset_transf_test.convert_to_dataframe()[0]

                # export CSV (the repair level is appended to the file name)
                out_train.to_csv(out_stem + 'train_' + m + '_' + str(i) + '.csv', index = False, header = True)
                out_valid.to_csv(out_stem + 'valid_' + m + '_' + str(i) + '.csv', index = False, header = True)
                out_test.to_csv(out_stem + 'test_'  + m + '_' + str(i) + '.csv', index = False, header = True)

    # feedback
    print('')

# print performance
print('')
print('Finished in {:.2f} minutes'.format((time.time() - cv_start) / 60))

------------------------------
- METHOD: RW...
------------------------------
-- achieved a statistical parity difference between unprivileged and privileged groups = -0.000000
-- achieved a statistical parity difference between unprivileged and privileged groups = -0.000000
-- achieved a statistical parity difference between unprivileged and privileged groups = 0.000000
-- achieved a statistical parity difference between unprivileged and privileged groups = -0.000000
-- achieved a statistical parity difference between unprivileged and privileged groups = 0.000000
-- achieved a statistical parity difference between unprivileged and privileged groups = -0.000000
-- achieved a statistical parity difference between unprivileged and privileged groups = -0.000000
-- achieved a statistical parity difference between unprivileged and privileged groups = 0.000000
-- achieved a statistical parity difference between unprivileged and privileged groups = 0.000000
-- achieved a statistical parity di