# Pre1 + in1
1. Reweighing + Prejudice Remover
2. Disparate Impact Remover + Prejudice Remover

In [11]:
##### PACKAGES

# working paths
%run code_00_working_paths.py

import pickle
import numpy as np
import time
import tensorflow as tf

from aif360.metrics import BinaryLabelDatasetMetric, ClassificationMetric
from aif360.algorithms.preprocessing.reweighing import Reweighing
from aif360.algorithms.preprocessing.lfr import LFR
from aif360.algorithms.preprocessing import DisparateImpactRemover
from aif360.metrics.utils import compute_boolean_conditioning_vector
from aif360.algorithms.inprocessing.adversarial_debiasing import AdversarialDebiasing
from aif360.algorithms.inprocessing import PrejudiceRemover

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import MaxAbsScaler

import matplotlib.pyplot as plt
import os
import pandas as pd

import sys
sys.path.append(func_path)

from load_data import load_dataset

## Parameters and preparations

In [12]:
##### PARAMETERS

# base name of the prepared data files,
# e.g. 'data1', 'data2', ..., 'data50' (this run uses 'datacorr1')
data = 'datacorr1'

# partitioning: number of folds and random seed
num_folds = 10
seed = 1

In [13]:
##### PRE-PROCESSOR PARAMS

# candidate values for the pre-processor's lambda parameter
all_lambda = [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

In [14]:
##### IN-PROCESSOR PARAMS

# candidate eta values for the in-processor; one model is fitted per value
all_eta = [1, 15, 50, 70, 100]

## Data import

In [15]:
##### RANDOM SEED

# seeds NumPy's global random number generator only; other libraries are not seeded here
np.random.seed(seed)

In [16]:
##### LOAD PARTITIONING

# everything after the first four characters of the data set name
# NOTE(review): for 'datacorr1' this is 'corr1', not a number, and the value
# is not used in the cells visible here - confirm it is still needed
dataset_number = data[4:]

# folder that holds the prepared (pickled) data sets
input_dir = os.path.join(data_path, 'prepared')

# full path of the original test set file
file_path = os.path.join(input_dir, f'{data}_orig_test.pkl')

# NOTE: unpickling can execute arbitrary code - only load files written by
# this project's own preparation step
with open(file_path, 'rb') as fh:
    dataset_orig_test = pickle.load(fh)

# dataframe view of the test set; report its dimensions
te = dataset_orig_test.convert_to_dataframe()[0]
print(te.shape)

(240, 6)


In [17]:
##### DATA PREP

# protected attribute; both group definitions are keyed off it so the
# column name is spelled out only once
protected = 'race'
privileged_groups = [{protected: 1}]
unprivileged_groups = [{protected: 0}]

## Fair processing

In [18]:
##### MODELING

# Reweighing + Prejudice Remover.
# For every fold: fit Reweighing on the training split, transform all three
# splits with it, then fit one Prejudice Remover per eta and store its
# validation/test scores as one CSV column per eta.
# NOTE(review): confirm that PrejudiceRemover takes the instance weights
# produced by Reweighing into account; if it ignores them, this combination
# is equivalent to running Prejudice Remover alone.

# timer
cv_start = time.time()

# directory for the pre-/in-processor combination results
output_dir = os.path.join(res_path, 'pre1in1')
os.makedirs(output_dir, exist_ok=True)

# feedback
print('-' * 30)
print('- METHOD: RW...')
print('-' * 30)

# loop through folds
for fold in range(num_folds):

    # feedback
    print('-'*30)
    print('- FOLD ' + str(fold) + '...')
    print('-'*30)

    ##### LOAD DATA

    # scaled train/valid/test subsets of this fold
    train_path = os.path.join(input_dir, f'{data}_scaled_{fold}_train.pkl')
    valid_path = os.path.join(input_dir, f'{data}_scaled_{fold}_valid.pkl')
    test_path = os.path.join(input_dir, f'{data}_scaled_{fold}_test.pkl')

    with open(train_path, 'rb') as file:
        data_train = pickle.load(file)
    with open(valid_path, 'rb') as file:
        data_valid = pickle.load(file)
    with open(test_path, 'rb') as file:
        data_test = pickle.load(file)

    ##### PRE-PROCESSOR: REWEIGHING

    # fitted on the training split only, then applied to every split
    RW = Reweighing(unprivileged_groups=unprivileged_groups, privileged_groups=privileged_groups)
    RW.fit(data_train)

    dataset_transf_train = RW.transform(data_train)
    dataset_transf_valid = RW.transform(data_valid)
    dataset_transf_test = RW.transform(data_test)

    ##### IN-PROCESSOR: PREJUDICE REMOVER

    # placeholders: one column of scores per eta
    pr_predictions_valid = pd.DataFrame()
    pr_predictions_test = pd.DataFrame()

    for eta in all_eta:
        print('--- eta: %.2f' % eta)
        colname = 'eta_' + str(eta)

        # Fit Prejudice Remover
        debiased_model = PrejudiceRemover(eta=eta, sensitive_attr=protected, class_attr='target')
        debiased_model.fit(dataset_transf_train)

        # Predict validation scores.
        # np.ravel flattens the score array in one pass; the previous
        # sum(list_of_lists, []) was quadratic and failed on 1-D scores.
        dataset_debiasing_valid = debiased_model.predict(dataset_transf_valid)
        scores = dataset_debiasing_valid.scores
        pr_predictions_valid[colname] = np.ravel(scores)

        # Predict test scores
        dataset_debiasing_test = debiased_model.predict(dataset_transf_test)
        scores = dataset_debiasing_test.scores
        pr_predictions_test[colname] = np.ravel(scores)

    # Export CSV (no index column, header row with the eta column names)
    pr_predictions_valid.to_csv(
        os.path.join(output_dir, f'{data}_{fold}_PR_predictions_valid_RW.csv'),
        index=False, header=True)
    pr_predictions_test.to_csv(
        os.path.join(output_dir, f'{data}_{fold}_PR_predictions_test_RW.csv'),
        index=False, header=True)
    print('')

##### END LOOP

# feedback
print('')

# print performance
print('')
print('Finished in {:.2f} minutes'.format((time.time() - cv_start) / 60))

------------------------------
- METHOD: RW...
------------------------------
------------------------------
- FOLD 0...
------------------------------
--- eta: 1.00
--- eta: 15.00
--- eta: 50.00
--- eta: 70.00
--- eta: 100.00

------------------------------
- FOLD 1...
------------------------------
--- eta: 1.00
--- eta: 15.00
--- eta: 50.00
--- eta: 70.00
--- eta: 100.00

------------------------------
- FOLD 2...
------------------------------
--- eta: 1.00
--- eta: 15.00
--- eta: 50.00
--- eta: 70.00
--- eta: 100.00

------------------------------
- FOLD 3...
------------------------------
--- eta: 1.00
--- eta: 15.00
--- eta: 50.00
--- eta: 70.00
--- eta: 100.00

------------------------------
- FOLD 4...
------------------------------
--- eta: 1.00
--- eta: 15.00
--- eta: 50.00
--- eta: 70.00
--- eta: 100.00

------------------------------
- FOLD 5...
------------------------------
--- eta: 1.00
--- eta: 15.00
--- eta: 50.00
--- eta: 70.00
--- eta: 100.00

----------------------

In [19]:
##### MODELING

# Disparate Impact Remover + Prejudice Remover.
# For every fold and every repair level in all_lambda: repair the three
# splits, then fit one Prejudice Remover per eta and store its
# validation/test scores as one CSV column per eta (one CSV pair per
# fold / repair level).

# timer
cv_start = time.time()

# directory for the pre-/in-processor combination results
output_dir = os.path.join(res_path, 'pre1in1')
os.makedirs(output_dir, exist_ok=True)

# feedback
print('-' * 30)
print('- METHOD: DI...')
print('-' * 30)

# loop through folds
for fold in range(num_folds):

    # feedback
    print('-'*30)
    print('- FOLD ' + str(fold) + '...')
    print('-'*30)

    ##### LOAD DATA

    # scaled train/valid/test subsets of this fold
    train_path = os.path.join(input_dir, f'{data}_scaled_{fold}_train.pkl')
    valid_path = os.path.join(input_dir, f'{data}_scaled_{fold}_valid.pkl')
    test_path = os.path.join(input_dir, f'{data}_scaled_{fold}_test.pkl')

    with open(train_path, 'rb') as file:
        data_train = pickle.load(file)
    with open(valid_path, 'rb') as file:
        data_valid = pickle.load(file)
    with open(test_path, 'rb') as file:
        data_test = pickle.load(file)

    # i is the repair level (lambda) handed to the pre-processor
    for i in all_lambda:

        ##### PRE-PROCESSOR: DISPARATE IMPACT REMOVER

        di = DisparateImpactRemover(repair_level=i, sensitive_attribute=protected)

        # NOTE(review): fit_transform is called separately on each split, so
        # presumably every split is repaired from its own distribution rather
        # than from the training data - confirm this is intended
        dataset_transf_train = di.fit_transform(data_train)
        dataset_transf_valid = di.fit_transform(data_valid)
        dataset_transf_test = di.fit_transform(data_test)

        ##### IN-PROCESSOR: PREJUDICE REMOVER

        # placeholders: one column of scores per eta
        pr_predictions_valid = pd.DataFrame()
        pr_predictions_test = pd.DataFrame()

        for eta in all_eta:
            print('--- eta: %.2f' % eta)
            colname = 'eta_' + str(eta)

            # Fit Prejudice Remover
            debiased_model = PrejudiceRemover(eta=eta, sensitive_attr=protected, class_attr='target')
            debiased_model.fit(dataset_transf_train)

            # Predict validation scores.
            # np.ravel flattens the score array in one pass; the previous
            # sum(list_of_lists, []) was quadratic and failed on 1-D scores.
            dataset_debiasing_valid = debiased_model.predict(dataset_transf_valid)
            scores = dataset_debiasing_valid.scores
            pr_predictions_valid[colname] = np.ravel(scores)

            # Predict test scores
            dataset_debiasing_test = debiased_model.predict(dataset_transf_test)
            scores = dataset_debiasing_test.scores
            pr_predictions_test[colname] = np.ravel(scores)

        # Export CSV (no index column, header row with the eta column names)
        pr_predictions_valid.to_csv(
            os.path.join(output_dir, f'{data}_{fold}_PR_predictions_valid_DI_{i}.csv'),
            index=False, header=True)
        pr_predictions_test.to_csv(
            os.path.join(output_dir, f'{data}_{fold}_PR_predictions_test_DI_{i}.csv'),
            index=False, header=True)
        print('')

##### END LOOP

# feedback
print('')

# print performance
print('')
print('Finished in {:.2f} minutes'.format((time.time() - cv_start) / 60))

------------------------------
- METHOD: DI...
------------------------------
------------------------------
- FOLD 0...
------------------------------
--- eta: 1.00
--- eta: 15.00
--- eta: 50.00
--- eta: 70.00
--- eta: 100.00

--- eta: 1.00
--- eta: 15.00
--- eta: 50.00
--- eta: 70.00
--- eta: 100.00

--- eta: 1.00
--- eta: 15.00
--- eta: 50.00
--- eta: 70.00
--- eta: 100.00

--- eta: 1.00
--- eta: 15.00
--- eta: 50.00
--- eta: 70.00
--- eta: 100.00

--- eta: 1.00
--- eta: 15.00
--- eta: 50.00
--- eta: 70.00
--- eta: 100.00

--- eta: 1.00
--- eta: 15.00
--- eta: 50.00
--- eta: 70.00
--- eta: 100.00

------------------------------
- FOLD 1...
------------------------------
--- eta: 1.00
--- eta: 15.00
--- eta: 50.00
--- eta: 70.00
--- eta: 100.00

--- eta: 1.00
--- eta: 15.00
--- eta: 50.00
--- eta: 70.00
--- eta: 100.00

--- eta: 1.00
--- eta: 15.00
--- eta: 50.00
--- eta: 70.00
--- eta: 100.00

--- eta: 1.00
--- eta: 15.00
--- eta: 50.00
--- eta: 70.00
--- eta: 100.00

--- eta: 1.00
