In [1]:
import os

os.chdir('../scripts') # cd into scripts dir

from scan import *
from prep import *
import pandas as pd, numpy as np
from sklearn.linear_model import LogisticRegression

os.chdir('../') # cd back to parent dir
home_dir = "compas/" # set home dir for reading and storing data

# Read recipe inputs
compas_prep_df = pd.read_csv(home_dir + "datasets/compas_prep.csv")
compas_prep_df

Unnamed: 0.1,Unnamed: 0,sex,race,prior_offenses,under_25,charge_degree,outcomes,compas_risk_score,proba_compas,proba_lr
0,0,Male,Other,,False,F,0,1,0.213889,0.232134
1,1,Male,African-American,,False,F,1,3,0.376171,0.289774
2,2,Male,African-American,1 to 5,True,F,1,4,0.434330,0.668642
3,3,Male,African-American,1 to 5,True,F,0,8,0.683594,0.668642
4,4,Male,Other,1 to 5,False,F,0,1,0.213889,0.373653
...,...,...,...,...,...,...,...,...,...,...
7209,7209,Male,African-American,,True,F,0,7,0.591216,0.505581
7210,7210,Male,African-American,,True,F,0,3,0.376171,0.505581
7211,7211,Male,Other,,False,F,0,1,0.213889,0.232134
7212,7212,Female,African-American,1 to 5,False,M,0,2,0.311371,0.335035


### Prepare Base Dataset for Simulations

Select only covariates and outcomes to prepare for simulation.

In [2]:
# Exclude risk score
prob_cols = ['compas_risk_score', 'proba_lr']
compas_sim_df = compas_prep_df.drop(columns=prob_cols)

### Build Logistic Regression Model to Predict Outcomes

One-Hot encode categorical features.

In [3]:
### Columns to exclude from model
exclude = ['race', 'under_25']

In [4]:
compas_df = compas_sim_df.copy(deep=True)
compas_df = compas_df.drop(columns=exclude)
compas_df

Unnamed: 0.1,Unnamed: 0,sex,prior_offenses,charge_degree,outcomes,proba_compas
0,0,Male,,F,0,0.213889
1,1,Male,,F,1,0.376171
2,2,Male,1 to 5,F,1,0.434330
3,3,Male,1 to 5,F,0,0.683594
4,4,Male,1 to 5,F,0,0.213889
...,...,...,...,...,...,...
7209,7209,Male,,F,0,0.591216
7210,7210,Male,,F,0,0.376171
7211,7211,Male,,F,0,0.213889
7212,7212,Female,1 to 5,M,0,0.311371


In [5]:
for cat_col in compas_df.select_dtypes(include=['object', 'bool']).columns:
    compas_df[cat_col] = compas_df[cat_col].astype('category')

In [6]:
X_num = compas_df.select_dtypes(exclude=['category'])
X_cat = compas_df.select_dtypes(include=['category'])

In [7]:
X_cat

Unnamed: 0,sex,prior_offenses,charge_degree
0,Male,,F
1,Male,,F
2,Male,1 to 5,F
3,Male,1 to 5,F
4,Male,1 to 5,F
...,...,...,...
7209,Male,,F
7210,Male,,F
7211,Male,,F
7212,Female,1 to 5,M


In [8]:
# one-hot encoding of categorical features
X_encoded = pd.get_dummies(X_cat)
frames = [X_encoded, X_num]
compas_df = pd.concat(frames, axis=1)

In [9]:
extra_cols = ['sex_Female', 'prior_offenses_None', 'charge_degree_M']
compas_df = compas_df.drop(columns=extra_cols)
compas_df

Unnamed: 0.1,sex_Male,prior_offenses_1 to 5,prior_offenses_Over 5,charge_degree_F,Unnamed: 0,outcomes,proba_compas
0,1,0,0,1,0,0,0.213889
1,1,0,0,1,1,1,0.376171
2,1,1,0,1,2,1,0.434330
3,1,1,0,1,3,0,0.683594
4,1,1,0,1,4,0,0.213889
...,...,...,...,...,...,...,...
7209,1,0,0,1,7209,0,0.591216
7210,1,0,0,1,7210,0,0.376171
7211,1,0,0,1,7211,0,0.213889
7212,0,1,0,0,7212,0,0.311371


In [10]:
X = compas_df.drop(columns=['outcomes'])
y = compas_df['outcomes']

In [11]:
lr = LogisticRegression(random_state=0)
lr.fit(X, y)
proba = lr.predict_proba(X)[:,1]
compas_sim_df['proba'] = proba
compas_sim_df

Unnamed: 0.1,Unnamed: 0,sex,race,prior_offenses,under_25,charge_degree,outcomes,proba_compas,proba
0,0,Male,Other,,False,F,0,0.213889,0.500000
1,1,Male,African-American,,False,F,1,0.376171,0.499991
2,2,Male,African-American,1 to 5,True,F,1,0.434330,0.499981
3,3,Male,African-American,1 to 5,True,F,0,0.683594,0.499972
4,4,Male,Other,1 to 5,False,F,0,0.213889,0.499963
...,...,...,...,...,...,...,...,...,...
7209,7209,Male,African-American,,True,F,0,0.591216,0.433207
7210,7210,Male,African-American,,True,F,0,0.376171,0.433198
7211,7211,Male,Other,,False,F,0,0.213889,0.433189
7212,7212,Female,African-American,1 to 5,False,M,0,0.311371,0.433180


In [12]:
compas_sim_df.loc[compas_sim_df['prior_offenses'] == 'Over 5']['proba'].mean()

0.46639070699416457

In [13]:
compas_sim_df.loc[compas_sim_df['prior_offenses'] != 'Over 5']['proba'].mean()

0.46651013936038643

In [14]:
# Check accuracy
compas_sim_df['test_outcomes'] = compas_sim_df['proba'].apply(lambda x : 1 if x > 0.5 else 0)
compas_sim_df['check'] = compas_sim_df.apply(lambda x : 1 if x['outcomes'] == x['test_outcomes'] else 0, axis=1)
print(np.sum(compas_sim_df['check']) / len(compas_sim_df))
compas_sim_df = compas_sim_df.drop(columns=['test_outcomes', 'check'])

0.5493484890490713


### Example of one Iteration for Base Case of the Simulation 2

Define subgroup for which to simulate altered probabilities.

In [15]:
pd.options.mode.chained_assignment = None  # suppress warnings on chained assignment
subgroup = {'race':['African-American', 'Asian'], 'under_25':[True]}
compas_sim_df['in_subgroup'] = compas_sim_df[list(subgroup.keys())].isin(subgroup).all(axis=1)
compas_sim_df

Unnamed: 0.1,Unnamed: 0,sex,race,prior_offenses,under_25,charge_degree,outcomes,proba_compas,proba,in_subgroup
0,0,Male,Other,,False,F,0,0.213889,0.500000,False
1,1,Male,African-American,,False,F,1,0.376171,0.499991,False
2,2,Male,African-American,1 to 5,True,F,1,0.434330,0.499981,True
3,3,Male,African-American,1 to 5,True,F,0,0.683594,0.499972,True
4,4,Male,Other,1 to 5,False,F,0,0.213889,0.499963,False
...,...,...,...,...,...,...,...,...,...,...
7209,7209,Male,African-American,,True,F,0,0.591216,0.433207,True
7210,7210,Male,African-American,,True,F,0,0.376171,0.433198,True
7211,7211,Male,Other,,False,F,0,0.213889,0.433189,False
7212,7212,Female,African-American,1 to 5,False,M,0,0.311371,0.433180,False


In [16]:
THRESHOLD = 0.5
proba_epsilon = 0.3
threshold_epsilon = 0.3
compas_sim_df['proba_shifted'] = compas_sim_df.apply(lambda x : calculate_proba_shift(x['proba'], proba_epsilon) if x['in_subgroup']
                                                                                                                 else x['proba'], axis=1)
compas_sim_df['threshold'] = THRESHOLD
compas_sim_df['threshold_shifted'] = compas_sim_df.apply(lambda x : calculate_proba_shift(x['threshold'], -threshold_epsilon) if x['in_subgroup']
                                                                                                                              else x['threshold'], axis=1)
compas_sim_df['outcomes'] = compas_sim_df['proba'].apply(lambda x : generate_outcomes(x))
compas_sim_df

Unnamed: 0.1,Unnamed: 0,sex,race,prior_offenses,under_25,charge_degree,outcomes,proba_compas,proba,in_subgroup,proba_shifted,threshold,threshold_shifted
0,0,Male,Other,,False,F,0,0.213889,0.500000,False,0.500000,0.5,0.500000
1,1,Male,African-American,,False,F,1,0.376171,0.499991,False,0.499991,0.5,0.500000
2,2,Male,African-American,1 to 5,True,F,0,0.434330,0.499981,True,0.574424,0.5,0.425557
3,3,Male,African-American,1 to 5,True,F,1,0.683594,0.499972,True,0.574415,0.5,0.425557
4,4,Male,Other,1 to 5,False,F,0,0.213889,0.499963,False,0.499963,0.5,0.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7209,7209,Male,African-American,,True,F,0,0.591216,0.433207,True,0.507805,0.5,0.425557
7210,7210,Male,African-American,,True,F,1,0.376171,0.433198,True,0.507795,0.5,0.425557
7211,7211,Male,Other,,False,F,0,0.213889,0.433189,False,0.433189,0.5,0.500000
7212,7212,Female,African-American,1 to 5,False,M,0,0.311371,0.433180,False,0.433180,0.5,0.500000


Generate variant metrics for only "truly negative-outcome" people (those who did NOT reoffend). Additionally, generate metrics required for IJDI scan (i.e. $\hat{p}$ calculation).

In [17]:
# specify probability and outcomes columns
OUTCOMES_COL = 'outcomes'
FEATURES = ['sex', 'race', 'under_25', 'charge_degree']
LAMBDA_PARAM = 3
pd.options.mode.chained_assignment = None  # suppress warnings on chained assignment

# filter for only negative outcomes
negatives_df = compas_sim_df.loc[compas_sim_df[OUTCOMES_COL] == 0]

In [18]:
# # define new columns
# metrics_df = generate_metrics(negatives_df, 'proba_shifted', OUTCOMES_COL, THRESHOLD, constant_threshold=False)

# print("Number of people in subgroup:", len(negatives_df[negatives_df['in_subgroup'] == True]))
# print("Number of people not in subgroup:", len(negatives_df[negatives_df['in_subgroup'] == False]))
# print("Total population:", len(negatives_df))

# # display preview of treatments, probabilities, and outcomes
# metrics_df.head(10)

In [19]:
# scan with shifted probabilities
current_subset, current_score = run_ijdi_scan(negatives_df, FEATURES, 'proba', 'proba', OUTCOMES_COL,
                                              'threshold_shifted', LAMBDA_PARAM, constant_threshold=False)
summarize_scan(negatives_df, FEATURES, 'proba', OUTCOMES_COL, current_subset, include='all')

Subset found on iteration 1 of 10 with score 1066.722961750935 :
{'race': ['African-American', 'Asian'], 'under_25': [True]}
Best score is now 1066.722961750935
Subset found on iteration 2 of 10 with score 1066.722961750935 :
{'race': ['African-American', 'Asian'], 'under_25': [True]}
Current score of 1066.722961750935 does not beat best score of 1066.722961750935
Subset found on iteration 3 of 10 with score 1066.722961750935 :
{'race': ['African-American', 'Asian'], 'under_25': [True]}
Current score of 1066.722961750935 does not beat best score of 1066.722961750935
Subset found on iteration 4 of 10 with score 1066.722961750935 :
{'race': ['African-American', 'Asian'], 'under_25': [True]}
Current score of 1066.722961750935 does not beat best score of 1066.722961750935
Subset found on iteration 5 of 10 with score 1066.722961750935 :
{'race': ['African-American', 'Asian'], 'under_25': [True]}
Current score of 1066.722961750935 does not beat best score of 1066.722961750935
Subset found on

In [20]:
# scan with shifted thresholds
current_subset, current_score = run_ijdi_scan(negatives_df, FEATURES, 'proba_shifted', 'proba', OUTCOMES_COL,
                                              'threshold', LAMBDA_PARAM, constant_threshold=False)
summarize_scan(negatives_df, FEATURES, 'proba', OUTCOMES_COL, current_subset, include='all')

Subset found on iteration 1 of 10 with score 1066.722961750935 :
{'race': ['African-American', 'Asian'], 'under_25': [True]}
Best score is now 1066.722961750935
Subset found on iteration 2 of 10 with score 1066.722961750935 :
{'under_25': [True], 'race': ['African-American', 'Asian']}
Current score of 1066.722961750935 does not beat best score of 1066.722961750935
Subset found on iteration 3 of 10 with score 1066.722961750935 :
{'race': ['African-American', 'Asian'], 'under_25': [True]}
Current score of 1066.722961750935 does not beat best score of 1066.722961750935
Subset found on iteration 4 of 10 with score 1066.722961750935 :
{'under_25': [True], 'race': ['African-American', 'Asian']}
Current score of 1066.722961750935 does not beat best score of 1066.722961750935
Subset found on iteration 5 of 10 with score 1066.722961750935 :
{'race': ['African-American', 'Asian'], 'under_25': [True]}
Current score of 1066.722961750935 does not beat best score of 1066.722961750935
Subset found on

### Simulations for Negatives

Set mean values as well as ranges of standard deviation and lambda to iterate over for each simulation. Also define threshold and number of iterations to run for each set of parameters.

In [21]:
epsilon_vals = [0, 0.3, 3]
lambda_vals = [0, 0.3, 1, 3, 10]
n_iters = 1

# specify parameters for generating metrics and IJDI scan
outcomes_col = 'outcomes'
features = ['sex', 'race', 'under_25', 'charge_degree', 'prior_offenses']
threshold = 0.5
pd.options.mode.chained_assignment = None  # suppress warnings on chained assignment

# set random seed
np.random.seed(100)

In [None]:
sim_data = []

for e in epsilon_vals:

    print("Parameter Epsilon =", e)

    for i in range(n_iters): # run n iterations for each k value

        print("Simulation", i+1, "of", n_iters)

        compas_sim_df['proba_shifted'] = compas_sim_df.apply(lambda x : calculate_proba_shift(x['proba'], e) if x['in_subgroup']
                                                                                                             else x['proba'], axis=1)
        compas_sim_df['threshold'] = threshold
        compas_sim_df['threshold_shifted'] = compas_sim_df.apply(lambda x : calculate_proba_shift(x['threshold'], -e) if x['in_subgroup']
                                                                                                                      else x['threshold'], axis=1)
        compas_sim_df['outcomes'] = compas_sim_df['proba'].apply(lambda x : generate_outcomes(x))

        # filter for only negative outcomes
        negatives_df = compas_sim_df.loc[compas_sim_df[outcomes_col] == 0]

        for lambda_param in lambda_vals: # run IJDI scan for various lambda values

            print("Lambda =", lambda_param)

            # Run IJDI Scan. Make sure to pass in copy because data may be modified by the function!
            proba_shift_subset, proba_shift_score = run_ijdi_scan(negatives_df.copy(deep=True), features, 'proba_shifted', 'proba', outcomes_col,
                                                                  'threshold', lambda_param, constant_threshold=False, verbose=True)
            threshold_shift_subset, threshold_shift_score = run_ijdi_scan(negatives_df.copy(deep=True), features, 'proba', 'proba', outcomes_col,
                                                                          'threshold_shifted', lambda_param, constant_threshold=False, verbose=True)

            # save iou and score
            if proba_shift_subset:
                in_subgroup = negatives_df[list(subgroup.keys())].isin(subgroup).all(axis=1)
                in_proba_shift_subset = negatives_df[list(proba_shift_subset.keys())].isin(proba_shift_subset).all(axis=1)
                proba_shift_iou = (in_subgroup & in_proba_shift_subset).sum() / (in_subgroup | in_proba_shift_subset).sum()
            else:
                proba_shift_iou = 0.0

            print("Detected proba shift subset intersection over union with expected subset:", proba_shift_iou)
            print("Detected proba shift subset score:", proba_shift_score)

            if threshold_shift_subset:
                in_subgroup = negatives_df[list(subgroup.keys())].isin(subgroup).all(axis=1)
                in_threshold_shift_subset = negatives_df[list(threshold_shift_subset.keys())].isin(threshold_shift_subset).all(axis=1)
                threshold_shift_iou = (in_subgroup & in_threshold_shift_subset).sum() / (in_subgroup | in_threshold_shift_subset).sum()
            else:
                threshold_shift_iou = 0.0

            print("Detected threshold shift subset intersection over union with expected subset:", threshold_shift_iou)
            print("Detected threshold shift subset score:", threshold_shift_score)

            # append data
            sim_row = [e, lambda_param, proba_shift_iou, proba_shift_score, threshold_shift_iou, threshold_shift_score]
            sim_data.append(sim_row)

            print(sim_row)
            print("\n----------------------------------------------------\n")

Parameter Epsilon = 0
Simulation 1 of 1
Lambda = 0
Subset found on iteration 1 of 10 with score 0.0 :
{}
Best score is now 0.0
Subset found on iteration 2 of 10 with score 0.0 :
{}
Current score of 0.0 does not beat best score of 0.0
Subset found on iteration 3 of 10 with score 0.0 :
{}
Current score of 0.0 does not beat best score of 0.0
Subset found on iteration 4 of 10 with score 0.0 :
{}
Current score of 0.0 does not beat best score of 0.0
Subset found on iteration 5 of 10 with score 0.0 :
{}
Current score of 0.0 does not beat best score of 0.0
Subset found on iteration 6 of 10 with score 0.0 :
{}
Current score of 0.0 does not beat best score of 0.0
Subset found on iteration 7 of 10 with score 0.0 :
{}
Current score of 0.0 does not beat best score of 0.0
Subset found on iteration 8 of 10 with score 0.0 :
{}
Current score of 0.0 does not beat best score of 0.0
Subset found on iteration 9 of 10 with score 0.0 :
{}
Current score of 0.0 does not beat best score of 0.0
Subset found on i

In [None]:
columns = ['e', 'lambda', 'proba_shift_iou', 'proba_shift_score', 'threshold_shift_iou', 'threshold_shift_score']
sim_result_df = pd.DataFrame(sim_data, columns=columns)
sim_result_df

In [None]:
# Write recipe outputs
sim_result_df.to_csv(home_dir + "datasets/compas_sim_2_neg.csv")