### Import Libraries

In [1]:
import os

import pandas as pd, numpy as np
from sklearn.linear_model import LogisticRegression

# Read recipe inputs.
# "Prior Offenses" uses the literal string "None" as one of its categories.
# pandas >= 2.0 treats "None" as a missing value by default, which would
# silently drop that category, so only empty fields are parsed as NaN here.
compas_df = pd.read_csv("compas.csv", keep_default_na=False, na_values=[""])
compas_df

Unnamed: 0,Sex,Race,Prior Offenses,Under 25,ChargeDegree,ReoffendedWithinTwoYears,COMPASPredictedDecileScore
0,Male,Other,,False,F,0,1
1,Male,African-American,,False,F,1,3
2,Male,African-American,1 to 5,True,F,1,4
3,Male,African-American,1 to 5,True,F,0,8
4,Male,Other,1 to 5,False,F,0,1
...,...,...,...,...,...,...,...
7209,Male,African-American,,True,F,0,7
7210,Male,African-American,,True,F,0,3
7211,Male,Other,,False,F,0,1
7212,Female,African-American,1 to 5,False,M,0,2


### Data Preparation

Summarize all columns and preview the dataset.

In [2]:
# Print the frequency of every distinct value, one column at a time,
# then preview the first ten rows.
print("Summary of all columns of the dataset:")
for _, column_values in compas_df.items():
    print("\n")
    print(column_values.value_counts())
compas_df.head(10)

Summary of all columns of the dataset:


Male      5819
Female    1395
Name: Sex, dtype: int64


African-American    3696
Caucasian           2454
Hispanic             637
Other                377
Asian                 32
Native American       18
Name: Race, dtype: int64


1 to 5    3540
None      2150
Over 5    1524
Name: Prior Offenses, dtype: int64


False    5685
True     1529
Name: Under 25, dtype: int64


F    4666
M    2548
Name: ChargeDegree, dtype: int64


0    3963
1    3251
Name: ReoffendedWithinTwoYears, dtype: int64


1     1440
2      941
4      769
3      747
5      681
6      641
7      592
8      512
9      508
10     383
Name: COMPASPredictedDecileScore, dtype: int64


Unnamed: 0,Sex,Race,Prior Offenses,Under 25,ChargeDegree,ReoffendedWithinTwoYears,COMPASPredictedDecileScore
0,Male,Other,,False,F,0,1
1,Male,African-American,,False,F,1,3
2,Male,African-American,1 to 5,True,F,1,4
3,Male,African-American,1 to 5,True,F,0,8
4,Male,Other,1 to 5,False,F,0,1
5,Male,Other,,False,M,0,1
6,Male,Caucasian,Over 5,False,F,1,6
7,Male,Other,1 to 5,False,F,0,4
8,Female,Caucasian,,False,M,0,1
9,Male,Caucasian,1 to 5,True,F,1,3


In [3]:
# Rename the raw column headers to the snake_case names used by the
# remaining cells of this notebook.
col_map = {
    "Sex": "sex",
    "Race": "race",
    "Prior Offenses": "prior_offenses",
    "Under 25": "under_25",
    "ChargeDegree": "charge_degree",
    "COMPASPredictedDecileScore": "compas_risk_score",
    "ReoffendedWithinTwoYears": "outcomes",
}
compas_prep_df = compas_df.rename(col_map, axis="columns")
compas_prep_df.head(10)

Unnamed: 0,sex,race,prior_offenses,under_25,charge_degree,outcomes,compas_risk_score
0,Male,Other,,False,F,0,1
1,Male,African-American,,False,F,1,3
2,Male,African-American,1 to 5,True,F,1,4
3,Male,African-American,1 to 5,True,F,0,8
4,Male,Other,1 to 5,False,F,0,1
5,Male,Other,,False,M,0,1
6,Male,Caucasian,Over 5,False,F,1,6
7,Male,Other,1 to 5,False,F,0,4
8,Female,Caucasian,,False,M,0,1
9,Male,Caucasian,1 to 5,True,F,1,3


1) Map each COMPAS decile score to a probability of reoffending using the maximum likelihood estimate, i.e. the observed reoffending rate among individuals with that score.

In [4]:
# Probability mapping 1: for each COMPAS decile score, the observed share of
# rows with outcomes == 1 (the MLE of the reoffending probability).
# 'outcomes' is selected *before* aggregating so that only that column is
# averaged; averaging the whole frame would also try to average the string
# columns (sex, race, ...), which raises a TypeError on pandas >= 2.0.
probability_mapping_1 = (
    compas_prep_df.groupby('compas_risk_score')['outcomes'].mean().to_dict()
)
# Look up every row's probability from its decile score.
probs_1_df = compas_prep_df['compas_risk_score'].map(probability_mapping_1)
print("MLE probability estimates for each COMPAS risk score:")
print(probability_mapping_1)
compas_prep_df['proba_compas'] = probs_1_df
compas_prep_df

MLE probability estimates for each COMPAS risk score:
{1: 0.21388888888888888, 2: 0.31137088204038255, 3: 0.37617135207496655, 4: 0.4343302990897269, 5: 0.4787077826725404, 6: 0.5585023400936038, 7: 0.5912162162162162, 8: 0.68359375, 9: 0.6988188976377953, 10: 0.7728459530026109}


Unnamed: 0,sex,race,prior_offenses,under_25,charge_degree,outcomes,compas_risk_score,proba_compas
0,Male,Other,,False,F,0,1,0.213889
1,Male,African-American,,False,F,1,3,0.376171
2,Male,African-American,1 to 5,True,F,1,4,0.434330
3,Male,African-American,1 to 5,True,F,0,8,0.683594
4,Male,Other,1 to 5,False,F,0,1,0.213889
...,...,...,...,...,...,...,...,...
7209,Male,African-American,,True,F,0,7,0.591216
7210,Male,African-American,,True,F,0,3,0.376171
7211,Male,Other,,False,F,0,1,0.213889
7212,Female,African-American,1 to 5,False,M,0,2,0.311371


2) Map decile scores and prior offenses to probability of reoffending using MLE (unused).

In [5]:
# probability mapping 2
# probability_mapping_2 = compas_df.groupby(['COMPASPredictedDecileScore','Prior Offenses']).mean()['ReoffendedWithinTwoYears'].to_dict()
# probs_2_df = pd.Series(list(zip(compas_df['COMPASPredictedDecileScore'], compas_df['Prior Offenses']))).map(probability_mapping_2)
# print("MLE probability estimates for each COMPAS risk score and prior offenses:")
# print(probability_mapping_2)

Build a "True" Model trained on the entire dataset's features and outcomes.

### Build Logistic Regression on Whole Dataset to Model "True Probabilities"

Select only covariates and outcomes to prepare for simulation.

In [6]:
# Leave out the COMPAS decile score and the probability derived from it,
# so the modelling frame holds only the covariates and the outcome.
prob_cols = ['compas_risk_score', 'proba_compas']
compas_model_df = compas_prep_df.drop(prob_cols, axis=1)

One-Hot encode categorical features.

In [7]:
# Convert every string (object) and boolean column to the pandas 'category'
# dtype so the categorical features can be selected by dtype afterwards.
categorical_names = compas_model_df.select_dtypes(include=['object', 'bool']).columns
compas_model_df = compas_model_df.astype(
    {name: 'category' for name in categorical_names}
)

In [8]:
# Split the frame by dtype: categorical columns go to the encoder,
# everything else is carried over unchanged.
X_cat = compas_model_df.select_dtypes(include='category')
X_num = compas_model_df.select_dtypes(exclude='category')

In [9]:
# One-hot encode the categorical features, then rebuild the modelling frame
# with the dummy columns first and the non-categorical columns after them.
X_encoded = pd.get_dummies(X_cat)
compas_model_df = pd.concat([X_encoded, X_num], axis='columns')
compas_model_df

Unnamed: 0,sex_Female,sex_Male,race_African-American,race_Asian,race_Caucasian,race_Hispanic,race_Native American,race_Other,prior_offenses_1 to 5,prior_offenses_None,prior_offenses_Over 5,under_25_False,under_25_True,charge_degree_F,charge_degree_M,outcomes
0,0,1,0,0,0,0,0,1,0,1,0,1,0,1,0,0
1,0,1,1,0,0,0,0,0,0,1,0,1,0,1,0,1
2,0,1,1,0,0,0,0,0,1,0,0,0,1,1,0,1
3,0,1,1,0,0,0,0,0,1,0,0,0,1,1,0,0
4,0,1,0,0,0,0,0,1,1,0,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7209,0,1,1,0,0,0,0,0,0,1,0,0,1,1,0,0
7210,0,1,1,0,0,0,0,0,0,1,0,0,1,1,0,0
7211,0,1,0,0,0,0,0,1,0,1,0,1,0,1,0,0
7212,1,0,1,0,0,0,0,0,1,0,0,1,0,0,1,0


In [10]:
# Drop one dummy per feature for sex, prior offenses, charge degree and age
# group; the dropped level is then represented by all-zeros in the remaining
# dummies of that feature.
# NOTE(review): every race dummy is kept (no race level is dropped) — confirm
# this is intentional.
extra_cols = [
    'sex_Female',
    'prior_offenses_None',
    'charge_degree_M',
    'under_25_False',
]
compas_model_df = compas_model_df.drop(extra_cols, axis=1)
compas_model_df

Unnamed: 0,sex_Male,race_African-American,race_Asian,race_Caucasian,race_Hispanic,race_Native American,race_Other,prior_offenses_1 to 5,prior_offenses_Over 5,under_25_True,charge_degree_F,outcomes
0,1,0,0,0,0,0,1,0,0,0,1,0
1,1,1,0,0,0,0,0,0,0,0,1,1
2,1,1,0,0,0,0,0,1,0,1,1,1
3,1,1,0,0,0,0,0,1,0,1,1,0
4,1,0,0,0,0,0,1,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
7209,1,1,0,0,0,0,0,0,0,1,1,0
7210,1,1,0,0,0,0,0,0,0,1,1,0
7211,1,0,0,0,0,0,1,0,0,0,1,0
7212,0,1,0,0,0,0,0,1,0,0,0,0


In [11]:
# Target: the recorded outcome. Features: every other column, in their
# existing order.
y = compas_model_df['outcomes']
X = compas_model_df.loc[:, compas_model_df.columns != 'outcomes']

In [12]:
# Fit a logistic regression on the full dataset and keep, for every row, the
# predicted probability of the second class returned by predict_proba.
lr = LogisticRegression(random_state=0).fit(X, y)
proba = lr.predict_proba(X)[:, 1]

# Store the same probabilities on both the modelling frame and the prepared
# frame (rows are in the same order in both).
for target_frame in (compas_model_df, compas_prep_df):
    target_frame['proba_lr'] = proba
compas_prep_df

Unnamed: 0,sex,race,prior_offenses,under_25,charge_degree,outcomes,compas_risk_score,proba_compas,proba_lr
0,Male,Other,,False,F,0,1,0.213889,0.232134
1,Male,African-American,,False,F,1,3,0.376171,0.289774
2,Male,African-American,1 to 5,True,F,1,4,0.434330,0.668642
3,Male,African-American,1 to 5,True,F,0,8,0.683594,0.668642
4,Male,Other,1 to 5,False,F,0,1,0.213889,0.373653
...,...,...,...,...,...,...,...,...,...
7209,Male,African-American,,True,F,0,7,0.591216,0.505581
7210,Male,African-American,,True,F,0,3,0.376171,0.505581
7211,Male,Other,,False,F,0,1,0.213889,0.232134
7212,Female,African-American,1 to 5,False,M,0,2,0.311371,0.335035


In [13]:
# Check accuracy of COMPAS predictions.
# Predict 1 when the mapped COMPAS probability exceeds 0.5, else 0, and print
# the fraction of rows where that prediction equals the recorded outcome.
# Computed with vectorized comparisons so no temporary columns have to be
# added to (and then removed from) compas_model_df.
compas_predicted = (compas_prep_df['proba_compas'] > 0.5).astype(int)
compas_correct = compas_predicted == compas_model_df['outcomes']
print(compas_correct.sum() / len(compas_model_df))

0.6577488217355143


In [14]:
# Check accuracy of Logistic Regression ("actual probabilities").
# Predict 1 when the fitted probability exceeds 0.5, else 0, and print the
# fraction of rows where that prediction equals the recorded outcome.
# Computed with vectorized comparisons so no temporary columns have to be
# added to (and then removed from) compas_model_df.
lr_predicted = (compas_prep_df['proba_lr'] > 0.5).astype(int)
lr_correct = lr_predicted == compas_model_df['outcomes']
print(lr_correct.sum() / len(compas_model_df))

0.6607984474632659


In [15]:
# Write recipe outputs.
# makedirs(exist_ok=True) creates the output directory only when it is
# missing, without a separate check-then-create step.
os.makedirs("datasets", exist_ok=True)
# The DataFrame index is written as the first (unnamed) column of the CSV.
compas_prep_df.to_csv("datasets/compas_prep.csv")