In [None]:
import os

# The star-imports below are executed while the working directory is
# ../scripts (presumably where scan.py and prep.py live). Helpers used later
# in this notebook (calculate_proba_shift, generate_outcomes, run_ijdi_scan,
# summarize_scan) are not defined here, so they are expected to come from
# these star-imports.
os.chdir('../scripts') # cd into scripts dir

from scan import *
from prep import *
import pandas as pd, numpy as np
from sklearn.linear_model import LogisticRegression

# NOTE(review): both chdir calls are relative, so re-running this cell without
# restarting the kernel starts from a different directory than the first run.
os.chdir('../') # cd back to parent dir
home_dir = "compas/" # set home dir for reading and storing data

# Read recipe inputs
compas_prep_df = pd.read_csv(home_dir + "datasets/compas_prep.csv")
compas_prep_df

### Prepare Base Dataset for Simulations

Select only covariates and outcomes to prepare for simulation.

In [None]:
# Remove the 'compas_risk_score' and 'proba_lr' columns; the remaining
# columns form the base dataset for the simulation.
prob_cols = ['compas_risk_score', 'proba_lr']
compas_sim_df = compas_prep_df.drop(prob_cols, axis=1)

### Build Logistic Regression Model to Predict Outcomes

One-Hot encode categorical features.

In [None]:
### Columns to exclude from model
# These are the same two attributes that define the simulated subgroup later
# in this notebook, so the fitted model never sees them as predictors.
exclude = ['race', 'under_25']

In [None]:
# Work on an independent deep copy of the simulation frame, minus the
# excluded columns, so compas_sim_df itself is left untouched.
compas_df = compas_sim_df.copy(deep=True).drop(exclude, axis=1)
compas_df

In [None]:
# Convert every object- or bool-typed column to pandas' categorical dtype
# in one astype call; all other columns keep their dtype.
categorical_cols = compas_df.select_dtypes(include=['object', 'bool']).columns
compas_df = compas_df.astype({name: 'category' for name in categorical_cols})

In [None]:
# Partition the columns by dtype: categorical columns on one side,
# everything else on the other.
X_cat = compas_df.select_dtypes(include='category')
X_num = compas_df.select_dtypes(exclude='category')

In [None]:
# Display the categorical columns that are one-hot encoded in the next cell.
X_cat

In [None]:
# Expand each categorical column into indicator columns, then rebuild the
# model frame with the indicator columns first and the remaining columns after.
X_encoded = pd.get_dummies(X_cat)
compas_df = pd.concat([X_encoded, X_num], axis='columns')

In [None]:
# Drop one indicator column for each of sex, prior_offenses and charge_degree
# (presumably so the dropped level acts as the baseline -- confirm).
extra_cols = ['sex_Female', 'prior_offenses_None', 'charge_degree_M']
compas_df = compas_df.drop(extra_cols, axis=1)
compas_df

In [None]:
# Target is the 'outcomes' column; every other column is a predictor.
y = compas_df['outcomes']
X = compas_df.drop('outcomes', axis=1)

In [None]:
# Fit on the full dataset and score the very same rows (no hold-out split).
lr = LogisticRegression(solver='lbfgs', random_state=0).fit(X, y)

# Keep the second column of predict_proba as each row's predicted probability
# and store it on the simulation frame.
proba = lr.predict_proba(X)[:, 1]
compas_sim_df['proba'] = proba
compas_sim_df

In [None]:
# Check accuracy
# Note: model results can vary environment to environment, which can impact IJDI scores.
# The comparison is vectorized: a row is predicted positive when proba > 0.5,
# and accuracy is the share of rows where that prediction equals 'outcomes'.
# No temporary columns are written to compas_sim_df, so the frame is left
# exactly as it was and the cell can be re-run freely.
predicted_outcomes = (compas_sim_df['proba'] > 0.5).astype(int)
is_correct = compas_sim_df['outcomes'] == predicted_outcomes
print(is_correct.sum() / len(compas_sim_df))

### Example of One Iteration for the Base Case of Simulation 2

Define subgroup for which to simulate altered probabilities.

In [None]:
pd.options.mode.chained_assignment = None  # suppress warnings on chained assignment

# Subgroup definition: column name -> accepted values. A row is flagged only
# when every listed column holds one of its accepted values.
subgroup = {'race':['African-American', 'Asian'], 'under_25':[True]}
subgroup_cols = list(subgroup)
compas_sim_df['in_subgroup'] = compas_sim_df[subgroup_cols].isin(subgroup).all(axis='columns')
compas_sim_df

In [None]:
# Decision threshold stored for every row, plus the epsilon values passed to
# calculate_proba_shift for subgroup rows in this single example iteration.
THRESHOLD = 0.5
proba_epsilon = 0.3
threshold_epsilon = 0.3
# Shift the predicted probability for subgroup rows only; all other rows keep 'proba'.
compas_sim_df['proba_shifted'] = compas_sim_df.apply(lambda x : calculate_proba_shift(x['proba'], proba_epsilon) if x['in_subgroup']
                                                                                                                 else x['proba'], axis=1)
compas_sim_df['threshold'] = THRESHOLD
# Shift the threshold for subgroup rows with the same helper but the negated epsilon;
# all other rows keep the constant threshold.
compas_sim_df['threshold_shifted'] = compas_sim_df.apply(lambda x : calculate_proba_shift(x['threshold'], -threshold_epsilon) if x['in_subgroup']
                                                                                                                              else x['threshold'], axis=1)
# Overwrites the 'outcomes' column with values generated from the unshifted 'proba'.
# NOTE(review): np.random.seed is only called in a later cell; if generate_outcomes
# is stochastic, this example is not reproducible across runs -- confirm.
compas_sim_df['outcomes'] = compas_sim_df['proba'].apply(lambda x : generate_outcomes(x))
compas_sim_df

Generate variant metrics for only "truly negative-outcome" people (those who did NOT reoffend). Additionally, generate metrics required for IJDI scan (i.e. $\hat{p}$ calculation).

In [None]:
pd.options.mode.chained_assignment = None  # suppress warnings on chained assignment

# settings for the example scan: outcome column, features to scan over, lambda
LAMBDA_PARAM = 10
FEATURES = ['sex', 'race', 'under_25', 'charge_degree']
OUTCOMES_COL = 'outcomes'

# keep only the rows whose outcome equals 0
is_negative = compas_sim_df[OUTCOMES_COL].eq(0)
negatives_df = compas_sim_df.loc[is_negative]

In [None]:
# NOTE(review): this whole cell is commented out, and generate_metrics is not
# called anywhere else in this notebook. Either restore it or remove the cell.
# # define new columns
# metrics_df = generate_metrics(negatives_df, 'proba_shifted', OUTCOMES_COL, THRESHOLD, constant_threshold=False)

# print("Number of people in subgroup:", len(negatives_df[negatives_df['in_subgroup'] == True]))
# print("Number of people not in subgroup:", len(negatives_df[negatives_df['in_subgroup'] == False]))
# print("Total population:", len(negatives_df))

# # display preview of treatments, probabilities, and outcomes
# metrics_df.head(10)

In [None]:
# scan with shifted thresholds: the unshifted 'proba' column is scanned against
# 'threshold_shifted' (same arguments as the threshold-shift scan in the simulation loop)
# NOTE(review): the simulation loop passes a deep copy because the scan "may modify"
# its input; here negatives_df is passed directly -- confirm that is intended.
current_subset, current_score = run_ijdi_scan(negatives_df, FEATURES, 'proba', 'proba', OUTCOMES_COL,
                                              'threshold_shifted', LAMBDA_PARAM, constant_threshold=False)
summarize_scan(negatives_df, FEATURES, 'proba', OUTCOMES_COL, current_subset, include='all')

In [None]:
# scan with shifted probabilities: the 'proba_shifted' column is scanned against
# the constant 'threshold' column (same arguments as the proba-shift scan in the simulation loop)
# NOTE(review): summarize_scan is given 'proba' rather than 'proba_shifted' -- confirm
# that summarizing on the unshifted probabilities is intended.
current_subset, current_score = run_ijdi_scan(negatives_df, FEATURES, 'proba_shifted', 'proba', OUTCOMES_COL,
                                              'threshold', LAMBDA_PARAM, constant_threshold=False)
summarize_scan(negatives_df, FEATURES, 'proba', OUTCOMES_COL, current_subset, include='all')

### Simulations for Negatives

Set the ranges of epsilon (the size of the shift applied to the subgroup's probabilities or thresholds) and lambda to iterate over for each simulation. Also define the threshold and the number of iterations to run for each set of parameters.

In [None]:
# seed NumPy's global random state before the simulation runs
np.random.seed(100)
pd.options.mode.chained_assignment = None  # suppress warnings on chained assignment

# simulation grid: each epsilon is simulated n_iters times, and every
# simulated dataset is scanned once per lambda value
n_iters = 1
epsilon_vals = [0, 0.3, 3]
lambda_vals = [0, 0.3, 1, 3, 10]

# column names and decision threshold used for generating metrics and the IJDI scan
threshold = 0.5
features = ['sex', 'race', 'under_25', 'charge_degree']
outcomes_col = 'outcomes'

In [None]:
def _subset_iou(df, expected_subset, detected_subset):
    """Return the intersection-over-union of the rows matched by two subsets.

    Each subset is a dict mapping a column name to the list of accepted values;
    a row matches a subset when every listed column holds one of its values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in either subset.
    expected_subset : dict
        The subgroup whose probabilities/thresholds were deliberately shifted.
    detected_subset : dict
        The subset returned by the scan; may be empty or None.

    Returns
    -------
    float
        (rows in both) / (rows in either). Returns 0.0 when nothing was
        detected, or when neither subset matches any row (avoids 0/0 -> NaN).
    """
    if not detected_subset:
        return 0.0

    in_expected = df[list(expected_subset.keys())].isin(expected_subset).all(axis=1)
    in_detected = df[list(detected_subset.keys())].isin(detected_subset).all(axis=1)

    union_size = (in_expected | in_detected).sum()
    if union_size == 0:
        return 0.0
    return (in_expected & in_detected).sum() / union_size


# One row per (epsilon, lambda, iteration):
# [e, lambda, proba_shift_iou, proba_shift_score, threshold_shift_iou, threshold_shift_score]
sim_data = []

for e in epsilon_vals:

    print("Parameter Epsilon =", e)

    for i in range(n_iters): # run n_iters iterations for each epsilon value

        print("Simulation", i+1, "of", n_iters)

        # Shift probabilities (by +e) and thresholds (by -e) for subgroup rows only;
        # rows outside the subgroup keep their original values.
        compas_sim_df['proba_shifted'] = compas_sim_df.apply(
            lambda x : calculate_proba_shift(x['proba'], e) if x['in_subgroup'] else x['proba'],
            axis=1)
        compas_sim_df['threshold'] = threshold
        compas_sim_df['threshold_shifted'] = compas_sim_df.apply(
            lambda x : calculate_proba_shift(x['threshold'], -e) if x['in_subgroup'] else x['threshold'],
            axis=1)
        # Regenerate outcomes from the unshifted probabilities for this iteration.
        compas_sim_df['outcomes'] = compas_sim_df['proba'].apply(lambda x : generate_outcomes(x))

        # filter for only negative outcomes
        negatives_df = compas_sim_df.loc[compas_sim_df[outcomes_col] == 0]

        for lambda_param in lambda_vals: # run IJDI scan for various lambda values

            print("Lambda =", lambda_param)

            # Run IJDI Scan. Make sure to pass in copy because data may be modified by the function!
            proba_shift_subset, proba_shift_score = run_ijdi_scan(
                negatives_df.copy(deep=True), features, 'proba_shifted', 'proba', outcomes_col,
                'threshold', lambda_param, constant_threshold=False, verbose=True)
            threshold_shift_subset, threshold_shift_score = run_ijdi_scan(
                negatives_df.copy(deep=True), features, 'proba', 'proba', outcomes_col,
                'threshold_shifted', lambda_param, constant_threshold=False, verbose=True)

            # save iou and score
            proba_shift_iou = _subset_iou(negatives_df, subgroup, proba_shift_subset)

            print("Detected proba shift subset intersection over union with expected subset:", proba_shift_iou)
            print("Detected proba shift subset score:", proba_shift_score)

            threshold_shift_iou = _subset_iou(negatives_df, subgroup, threshold_shift_subset)

            print("Detected threshold shift subset intersection over union with expected subset:", threshold_shift_iou)
            print("Detected threshold shift subset score:", threshold_shift_score)

            # append data
            sim_row = [e, lambda_param, proba_shift_iou, proba_shift_score, threshold_shift_iou, threshold_shift_score]
            sim_data.append(sim_row)

            print(sim_row)
            print("\n----------------------------------------------------\n")

In [None]:
columns = ['e', 'lambda', 'proba_shift_iou', 'proba_shift_score', 'threshold_shift_iou', 'threshold_shift_score']
sim_result_df = pd.DataFrame(sim_data, columns=columns)
sim_result_df

In [None]:
# Write recipe outputs
sim_result_df.to_csv(home_dir + "datasets/compas_sim_2_neg.csv")