In [1]:
import os

# Temporarily switch the working directory to ../scripts, presumably so the local
# helper modules imported below can be found on the import path.
# NOTE(review): together with the chdir('../') further down, the working directory ends
# up one level above where the kernel started, so re-running this cell resolves
# '../scripts' against a different directory. Restart the kernel before re-running.
os.chdir('../scripts') # cd into scripts dir

# NOTE(review): star imports hide where names come from. The helpers called later in
# this notebook (e.g. lr_build_and_predict, check_accuracy, run_ijdi_scan, release_df)
# presumably come from these three modules -- confirm and import them explicitly.
from scan import *
from prep import *
from model import *
import pandas as pd, numpy as np

# All relative data paths below (home_dir + ...) are resolved against this directory.
os.chdir('../') # cd back to parent dir
home_dir = "compas/" # set home dir for reading and storing data

# Read recipe inputs
compas_prep_df = pd.read_csv(home_dir + "datasets/compas_prep.csv")
# "Unnamed: 0" is the row index that was written to the CSV as an unnamed first column.
compas_prep_df = compas_prep_df.drop(columns=["Unnamed: 0"])
compas_prep_df

Unnamed: 0,sex,race,prior_offenses,under_25,charge_degree,outcomes,compas_risk_score,proba_compas,proba_lr
0,Male,Other,,False,F,0,1,0.213889,0.232134
1,Male,African-American,,False,F,1,3,0.376171,0.289774
2,Male,African-American,1 to 5,True,F,1,4,0.434330,0.668642
3,Male,African-American,1 to 5,True,F,0,8,0.683594,0.668642
4,Male,Other,1 to 5,False,F,0,1,0.213889,0.373653
...,...,...,...,...,...,...,...,...,...
7209,Male,African-American,,True,F,0,7,0.591216,0.505581
7210,Male,African-American,,True,F,0,3,0.376171,0.505581
7211,Male,Other,,False,F,0,1,0.213889,0.232134
7212,Female,African-American,1 to 5,False,M,0,2,0.311371,0.335035


### Prepare Base Dataset for Simulations

Select only covariates and outcomes to prepare for simulation.

In [2]:
# Score/probability columns from the prepared dataset that the simulation must not see.
# NOTE(review): 'proba_compas' is also present in compas_prep_df but is not dropped here;
# it is not listed in `features`, so it is presumably unused downstream -- confirm.
prob_cols = ['compas_risk_score', 'proba_lr']
compas_sim_df = compas_prep_df.drop(prob_cols, axis=1)

### Build Logistic Regression Model to Predict Outcomes

In [13]:
# Column roles for the logistic-regression fit.
features = ['sex', 'race', 'under_25', 'charge_degree', 'prior_offenses']
outcomes_col = 'outcomes'
exclude = []  # passed through to lr_build_and_predict; presumably columns to leave out -- confirm

# Build model and predict: the returned values are stored in a new 'proba' column.
compas_sim_df['proba'] = lr_build_and_predict(compas_sim_df[[*features, outcomes_col]], exclude)

In [14]:
# Check accuracy of the fitted model on the simulation frame.
# NOTE(review): check_accuracy is defined in the helper scripts; presumably it compares
# the 'proba' column against 'outcomes' -- confirm which columns and threshold it uses.
check_accuracy(compas_sim_df)

0.6606598281120044

In [19]:
# NOTE: this whole cell is disabled scratch code. It is a single-run version of the
# probability/threshold shift and outcome generation that the simulation loop further
# down performs per iteration. It relies on an 'in_subgroup' column that does not exist
# at this point in the notebook, so it cannot simply be un-commented here.
# THRESHOLD = 0.5
# proba_epsilon = 0.3
# threshold_epsilon = 0.3
# compas_sim_df['proba_shifted'] = compas_sim_df.apply(lambda x : calculate_proba_shift(x['proba'], proba_epsilon) if x['in_subgroup']
#                                                                                                                  else x['proba'], axis=1)
# compas_sim_df['threshold'] = THRESHOLD
# compas_sim_df['threshold_shifted'] = compas_sim_df.apply(lambda x : calculate_proba_shift(x['threshold'], -threshold_epsilon) if x['in_subgroup']
#                                                                                                                               else x['threshold'], axis=1)
# compas_sim_df['outcomes'] = compas_sim_df['proba'].apply(lambda x : generate_outcomes(x))
# compas_sim_df

### Simulations for Negatives

Set the ranges of the shift magnitude (epsilon) and of lambda to iterate over in the simulations. Also define the decision threshold and the number of iterations to run for each epsilon value.

In [16]:
# Simulation grid: every epsilon is simulated n_iters times, and each simulated
# dataset is scanned once per lambda.
n_iters = 10
lambda_vals = [0, 0.3, 1, 3, 10]
epsilon_vals = [0, 0.3, 3]

# specify parameters for generating metrics and IJDI scan
threshold = 0.5
features = ['sex', 'race', 'under_25', 'charge_degree', 'prior_offenses']
outcomes_col = 'outcomes'

# suppress warnings on chained assignment
pd.set_option('mode.chained_assignment', None)

# set random seed
np.random.seed(300)

In [17]:
def _subset_iou(df, expected_subset, detected_subset):
    """Intersection over union of the rows of `df` matched by two subsets.

    Both subsets are dicts mapping a column name to the list of accepted values;
    a row belongs to a subset when every listed column holds an accepted value.
    Returns 0.0 when nothing was detected (falsy `detected_subset`) or when
    neither subset matches any row, instead of dividing by zero.
    """
    if not detected_subset:
        return 0.0
    in_expected = df[list(expected_subset.keys())].isin(expected_subset).all(axis=1)
    in_detected = df[list(detected_subset.keys())].isin(detected_subset).all(axis=1)
    union_size = (in_expected | in_detected).sum()
    if union_size == 0:
        return 0.0
    return (in_expected & in_detected).sum() / union_size


# One row per (epsilon, iteration, lambda):
# [e, lambda, proba_shift_iou, proba_shift_score, threshold_shift_iou, threshold_shift_score]
sim_data = []

for e in epsilon_vals:

    print("Parameter Epsilon =", e)

    for i in range(n_iters): # run n iterations for each epsilon value

        print("Simulation", i+1, "of", n_iters)

        # Work on a fresh copy of the base frame. Assigning the simulated outcomes to
        # compas_sim_df itself would make the next iteration refit the model on
        # simulated instead of observed outcomes, and would make re-running this cell
        # (e.g. after an interrupt) start from an already-modified frame.
        iter_df = compas_sim_df.copy(deep=True)

        # define subgroup for which to simulate altered probabilities and thresholds
        protected = pick_random_protected_group(iter_df, features)
        subgroup = {protected[0]:[protected[1]]}
        print("Protected Group:", subgroup)
        iter_df['in_subgroup'] = iter_df[list(subgroup.keys())].isin(subgroup).all(axis=1)

        exclude = []
        iter_df['proba'] = lr_build_and_predict(iter_df[features+[outcomes_col]], exclude)

        # calculate shifts and generate outcomes: only rows inside the subgroup are
        # shifted (probabilities by +e, thresholds by -e); all other rows are unchanged
        iter_df['proba_shifted'] = iter_df.apply(lambda x : calculate_proba_shift(x['proba'], e) if x['in_subgroup']
                                                                                                 else x['proba'], axis=1)
        iter_df['threshold'] = threshold
        iter_df['threshold_shifted'] = iter_df.apply(lambda x : calculate_proba_shift(x['threshold'], -e) if x['in_subgroup']
                                                                                                          else x['threshold'], axis=1)
        # outcomes are generated from the unshifted probabilities
        iter_df[outcomes_col] = iter_df['proba'].apply(generate_outcomes)

        # filter for only negative outcomes
        negatives_df = iter_df.loc[iter_df[outcomes_col] == 0]

        for lambda_param in lambda_vals: # run IJDI scan for various lambda values

            print("Lambda =", lambda_param)

            negatives_df_copy_1 = negatives_df.copy(deep=True)
            negatives_df_copy_2 = negatives_df.copy(deep=True)

            # Run IJDI Scan. Make sure to pass in copy because data may be modified by the function!
            proba_shift_subset, proba_shift_score = run_ijdi_scan(negatives_df_copy_1, features, 'proba_shifted', 'proba', outcomes_col,
                                                                  'threshold', lambda_param, constant_threshold=False, verbose=False)
            threshold_shift_subset, threshold_shift_score = run_ijdi_scan(negatives_df_copy_2, features, 'proba', 'proba', outcomes_col,
                                                                          'threshold_shifted', lambda_param, constant_threshold=False, verbose=False)

            # save iou and score
            proba_shift_iou = _subset_iou(negatives_df, subgroup, proba_shift_subset)

            print("Detected proba shift subset intersection over union with expected subset:", proba_shift_iou)
            print("Detected proba shift subset score:", proba_shift_score)

            threshold_shift_iou = _subset_iou(negatives_df, subgroup, threshold_shift_subset)

            print("Detected threshold shift subset intersection over union with expected subset:", threshold_shift_iou)
            print("Detected threshold shift subset score:", threshold_shift_score)

            # append data
            sim_row = [e, lambda_param, proba_shift_iou, proba_shift_score, threshold_shift_iou, threshold_shift_score]
            sim_data.append(sim_row)
            print(sim_row)

            # release memory from kernel
            release_df(negatives_df_copy_1)
            release_df(negatives_df_copy_2)

            print("\n----------------------------------------------------\n")

Parameter Epsilon = 0
Simulation 1 of 10
Protected Group: {'race': ['African-American']}
Lambda = 0
Required time =  19.34727454185486 seconds
Found positive subset for ijdi scan:
{'race': ['African-American', 'Caucasian', 'Hispanic', 'Native American', 'Other'], 'prior_offenses': ['Over 5']}
Score: 759.7133026357654
First Iteration
Average p_delta: 0.2788845247692703
Average p_censor: 0.0
p(S): 0.6784379527317607 p(~S): 0.35938323828522684 E[censored] 0.22188980304163433 E[uncensored] 0.22188980304163433
Subset does not violate p_delta or p_censor conditions!
Required time =  18.937634706497192 seconds
Found positive subset for ijdi scan:
{'race': ['African-American', 'Caucasian', 'Hispanic', 'Native American', 'Other'], 'prior_offenses': ['Over 5']}
Score: 759.7133026357654
First Iteration
Average p_delta: 0.2788845247692703
Average p_censor: 0.0
p(S): 0.6784379527317607 p(~S): 0.35938323828522684 E[censored] 0.22188980304163433 E[uncensored] 0.22188980304163433
Subset does not viola

KeyboardInterrupt: 

In [None]:
# Column labels, in the same order as the values appended to sim_data.
columns = [
    'e',
    'lambda',
    'proba_shift_iou',
    'proba_shift_score',
    'threshold_shift_iou',
    'threshold_shift_score',
]
sim_result_df = pd.DataFrame(data=sim_data, columns=columns)
sim_result_df

In [None]:
# Write recipe outputs.
# The row index is written as well (pandas default), so the file gets an unnamed first
# column that shows up as "Unnamed: 0" when the CSV is read back.
sim_result_df.to_csv(path_or_buf=home_dir + "datasets/compas_sim_2_neg.csv")