# Pre1 + post1 + post4
1. Reweighting + Reject Option Classification
2. Disparate Impact Remover + Reject Option Classification

In [11]:
##### PACKAGES

# working paths
%run code_00_working_paths.py

import pickle
import numpy as np
import time

from aif360.metrics import BinaryLabelDatasetMetric
from aif360.datasets import BinaryLabelDataset
from aif360.algorithms.preprocessing.reweighing import Reweighing
from aif360.algorithms.preprocessing.lfr import LFR
from aif360.algorithms.preprocessing import DisparateImpactRemover
from aif360.algorithms.postprocessing.reject_option_classification import RejectOptionClassification
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import MaxAbsScaler

import matplotlib.pyplot as plt
import os

import sys
sys.path.append(func_path)

from load_data import load_dataset
import pandas as pd

## Parameters and preparations

In [12]:
##### PARAMETERS

# specify data set
data = 'datacorr1'

# partitioning
num_folds = 10
seed      = 1

In [13]:
##### PRE-PROCESSOR PARAMS

all_lambda = [0.5,0.6,0.7,0.8,0.9,1.0]

In [14]:
##### POST-PROCESSOR PARAMS

metric_name      = 'Statistical parity difference'
num_class_thresh = 100
num_ROC_margin   = 50
all_bound        = [0.1, 0.2, 0.3]

## Data import

In [15]:
##### RANDOM SEED

np.random.seed(seed)

In [16]:
##### LOAD PARTITIONING

# Assuming data is in the format 'dataN' where N is the dataset number
dataset_number = data[4:]

# Create the directory path (assuming directories data1, data2, ..., data50 already exist)
input_dir = os.path.join(data_path, 'prepared')

# Construct the full file path
file_path = os.path.join(input_dir, data + '_orig_test.pkl')

# Load the dataset
with open(file_path, 'rb') as file:
    dataset_orig_test = pickle.load(file)
    
# Convert to dataframe and print the shape
te = dataset_orig_test.convert_to_dataframe()[0]
print(te.shape)

(240, 6)


In [17]:
##### DATA PREP
# protected attribute
protected           = 'race'
privileged_groups   = [{'race': 1}] 
unprivileged_groups = [{'race': 0}]

## Fair processing

In [18]:
# RW + ROC
##### MODELING

# timer
cv_start = time.time()

# Create the directory path (assuming directories data1, data2, ..., data50 already exist)
input_dir = os.path.join(data_path, 'prepared')
output_dir = os.path.join(res_path, 'in2post1post4')

# base models
model_names = ['glm', "rf", "nnet"]

# loop through folds
#for fold in range(num_folds):

fold = 0

##### LOAD DATA

# feedback
print('-'*30)
print('- FOLD ' + str(fold) + '...')
print('-'*30)

# import data subsets
train_path = os.path.join(input_dir, data + '_scaled_' + str(fold) + '_train.pkl')
valid_path = os.path.join(input_dir, data + '_scaled_' + str(fold) + '_valid.pkl')
test_path = os.path.join(input_dir, data + '_scaled_' + str(fold) + '_test.pkl')

# Load the dataset
with open(train_path, 'rb') as file:
    data_train = pickle.load(file)
with open(valid_path, 'rb') as file:
    data_valid = pickle.load(file)
with open(test_path, 'rb') as file:
    data_test = pickle.load(file)


##### MODELING

# import prediction results from R
dataset_trainResults_valid = pd.read_csv(res_path + 'in2post1/' + data + '_' + str(fold) + '_AD_POST_training_results_dval.csv')
dataset_trainResults_test  = pd.read_csv(res_path + 'in2post1/' + data + '_' + str(fold) + '_AD_POST_training_results_dtest.csv')

# Convert to DataFrame
data_valid_df = data_valid.convert_to_dataframe()[0]
data_test_df = data_test.convert_to_dataframe()[0]

# loop through bound values
for i in all_bound:

    # feedback
    print('-- BOUND ' + str(i) + '...')

    # placeholder
    ROC_test = pd.DataFrame()

    # loop through base classifiers
    for m in model_names:

        # extract validation preds
        scores_valid = np.array(dataset_trainResults_valid[m + '_scores']).reshape(len(dataset_trainResults_valid.index),1)
        labels_valid = np.where(dataset_trainResults_valid[m + '_class'] == 'Parole', 1.0, 2.0).reshape(len(dataset_trainResults_valid.index), 1)

        # extract test preds
        scores_test = np.array(dataset_trainResults_test[m + '_scores']).reshape(len(dataset_trainResults_test.index),1)
        labels_test = np.where(dataset_trainResults_test[m + '_class'] == 'Parole', 1.0, 2.0).reshape(len(dataset_trainResults_test.index), 1)

        # write predictions to DataFrame
        data_valid_df['scores'] = scores_valid
        data_valid_df['labels'] = labels_valid
        data_test_df['scores'] = scores_test
        data_test_df['labels'] = labels_test

        # replace 'race' with the actual column name for your protected attribute
        protected_attribute_name = 'race'  # replace with your actual column name

        # create BinaryLabelDataset
        dataset_valid_bl = BinaryLabelDataset(df=data_valid_df,
                                              label_names=['labels'],
                                              protected_attribute_names=[protected_attribute_name],  
                                              favorable_label=1.0,
                                              unfavorable_label=2.0)
        dataset_test_bl = BinaryLabelDataset(df=data_test_df,
                                             label_names=['labels'],
                                             protected_attribute_names=[protected_attribute_name],  
                                             favorable_label=1.0,
                                             unfavorable_label=2.0)

        # fit ROC
        ROC = RejectOptionClassification(unprivileged_groups=unprivileged_groups, 
                                         privileged_groups=privileged_groups, 
                                         num_class_thresh=num_class_thresh, 
                                         num_ROC_margin=num_ROC_margin,
                                         metric_name=metric_name,
                                         metric_ub=i, 
                                         metric_lb=-i)
        ROC = ROC.fit(dataset_valid_bl, dataset_valid_bl)

        # predict test scores
        dataset_transf_test_pred = ROC.predict(dataset_test_bl)
        ROC_test[m + "_fairScores"] = dataset_transf_test_pred.scores.flatten()
        label_names = np.where(dataset_transf_test_pred.labels == 1, 'Parole', 'Noparole')
        ROC_test[m + "_fairLabels"] = label_names

    # export CSV
    ROC_test.to_csv(os.path.join(output_dir, data + '_' + str(fold) + '_ROC_' + str(i) + '_AD_predictions_test.csv'),  index=None, header=True)

# feedback
print('')

# print performance
print('')
print('Finished in {:.2f} minutes'.format((time.time() - cv_start) / 60))

------------------------------
- FOLD 0...
------------------------------
-- BOUND 0.1...


  warn("Unable to satisy fairness constraints")
  warn("Unable to satisy fairness constraints")
  warn("Unable to satisy fairness constraints")


-- BOUND 0.2...


  warn("Unable to satisy fairness constraints")
  warn("Unable to satisy fairness constraints")
  warn("Unable to satisy fairness constraints")


-- BOUND 0.3...


  warn("Unable to satisy fairness constraints")




Finished in 0.85 minutes


In [None]:
# DI + ROC
##### MODELING

# timer
cv_start = time.time()

# Create the directory path (assuming directories data1, data2, ..., data50 already exist)
input_dir = os.path.join(data_path, 'prepared/')
output_dir = os.path.join(res_path, 'pre1post1post4')

# base models
model_names = ['glm', 'rf', 'nnet']

# Loop through folds
for fold in range(num_folds):
    
    # Feedback
    print('-'*30)
    print('- FOLD ' + str(fold) + '...')
    print('-'*30)

    # import data subsets
    train_path = os.path.join(input_dir, data + '_scaled_' + str(fold) + '_train.pkl')
    valid_path = os.path.join(input_dir, data + '_scaled_' + str(fold) + '_valid.pkl')
    test_path = os.path.join(input_dir, data + '_scaled_' + str(fold) + '_test.pkl')

    # Load the dataset
    with open(train_path, 'rb') as file:
        data_train = pickle.load(file)
    with open(valid_path, 'rb') as file:
        data_valid = pickle.load(file)
    with open(test_path, 'rb') as file:
        data_test = pickle.load(file)

    # Import prediction results from R
    dataset_trainResults_valid = pd.read_csv(res_path + 'pre1post1/' + data + '_' + str(fold) + '_POST_training_results_dval.csv')
    dataset_trainResults_test = pd.read_csv(res_path + 'pre1post1/' + data + '_' + str(fold) + '_POST_training_results_dtest.csv')

    # Extract underlying DataFrame
    data_valid_df = data_valid.convert_to_dataframe()[0]
    data_test_df = data_test.convert_to_dataframe()[0]

    # Ensure 'protected_attribute' column is present
    if 'protected_attribute' not in data_valid_df.columns:
        data_valid_df['protected_attribute'] = data_valid.protected_attributes[:, 0]
    if 'protected_attribute' not in data_test_df.columns:
        data_test_df['protected_attribute'] = data_test.protected_attributes[:, 0]

    # Ensure 'race' is the actual column name of the protected attribute
    protected_attribute_name = 'race'  # Replace with your actual column name if different

    if protected_attribute_name not in data_valid_df.columns:
        data_valid_df[protected_attribute_name] = data_valid.protected_attributes[:, 0]
    if protected_attribute_name not in data_test_df.columns:
        data_test_df[protected_attribute_name] = data_test.protected_attributes[:, 0]

    # Loop through bound values
    for i in all_bound:

        # Feedback
        print('-- BOUND ' + str(i) + '...')

        # Placeholder
        ROC_test = pd.DataFrame()

        # Loop through base classifiers
        for m in model_names:

            # Extract validation preds
            scores_valid = np.array(dataset_trainResults_valid[m + '_scores']).reshape(len(dataset_trainResults_valid.index), 1)
            labels_valid = np.where(dataset_trainResults_valid[m + '_class'] == 'Parole', 1.0, 0.0).reshape(len(dataset_trainResults_valid.index), 1)

            # Extract test preds
            scores_test = np.array(dataset_trainResults_test[m + '_scores']).reshape(len(dataset_trainResults_test.index), 1)
            labels_test = np.where(dataset_trainResults_test[m + '_class'] == 'Parole', 1.0, 0.0).reshape(len(dataset_trainResults_test.index), 1)

            # Add predictions as columns to DataFrame
            data_valid_df['scores'] = scores_valid
            data_valid_df['labels'] = labels_valid
            data_test_df['scores'] = scores_test
            data_test_df['labels'] = labels_test

            # Create BinaryLabelDataset
            dataset_valid_bl = BinaryLabelDataset(df=data_valid_df,
                                                  label_names=['labels'],
                                                  protected_attribute_names=[protected_attribute_name],
                                                  favorable_label=1.0,
                                                  unfavorable_label=0.0)
            dataset_test_bl = BinaryLabelDataset(df=data_test_df,
                                                 label_names=['labels'],
                                                 protected_attribute_names=[protected_attribute_name],
                                                 favorable_label=1.0,
                                                 unfavorable_label=0.0)

            # Fit ROC with error handling
            try:
                ROC = RejectOptionClassification(unprivileged_groups=unprivileged_groups,
                                                 privileged_groups=privileged_groups,
                                                 num_class_thresh=num_class_thresh,
                                                 num_ROC_margin=num_ROC_margin,
                                                 metric_name=metric_name,
                                                 metric_ub=i,
                                                 metric_lb=-i)
                ROC = ROC.fit(dataset_valid_bl, dataset_valid_bl)
                
                # Predict test scores
                dataset_transf_test_pred = ROC.predict(dataset_test_bl)
                ROC_test[m + "_fairScores"] = dataset_transf_test_pred.scores.flatten()
                label_names = np.where(dataset_transf_test_pred.labels == 1, 'Parole', 'NoParole')
                ROC_test[m + "_fairLabels"] = label_names

            except Exception as e:
                print(f"Error for fold {fold}, bound {i}, model {m}: {e}")
                continue

        # Export CSV
        ROC_test.to_csv(os.path.join(output_dir, data + '_' + str(fold) + '_ROC_' + str(i) + '_predictions_test.csv'), index=None, header=True)

    # Feedback
    print('')

# Print performance
print('')
print('Finished in {:.2f} minutes'.format((time.time() - cv_start) / 60))