### Import Libraries

In [None]:
import os

import pandas as pd, numpy as np
from sklearn.linear_model import LogisticRegression

# Read recipe inputs
# Every later cell refers to this frame as `compas_df`, so it has to be bound
# under that name for the notebook to run top to bottom on a fresh kernel.
compas_df = pd.read_csv("compas.csv")
# Preview only the first rows instead of rendering the whole frame.
compas_df.head()

### Data Preparation

Summarize all columns and preview the dataset.

In [None]:
# Print the value distribution of every column, then preview the first rows.
print("Summary of all columns of the dataset:")
for col in compas_df.columns:
    # "\n" plus the "\n" separator reproduces the two blank lines that
    # precede each column's value counts.
    print("\n", compas_df[col].value_counts(), sep="\n")
compas_df.head(10)

In [None]:
# use treatment columns
# Maps each raw column name (left) to the snake_case name used by the
# following cells (right).
col_map = {
    'Sex': 'sex',
    'Race': 'race',
    'Prior Offenses': 'prior_offenses',
    'Under 25': 'under_25',
    'ChargeDegree': 'charge_degree',
    'COMPASPredictedDecileScore': 'compas_risk_score',
    'ReoffendedWithinTwoYears': 'outcomes',
}
compas_prep_df = compas_df.rename(col_map, axis='columns')
compas_prep_df.head(10)

1) Map COMPAS decile scores to a probability of reoffending using the maximum likelihood estimate (the observed reoffending rate among rows with the same decile score).

In [None]:
# probability mapping 1
# Observed reoffending rate for each COMPAS decile score.
# 'outcomes' is selected *before* aggregating so the mean is computed on that
# column only. Averaging the whole frame first (groupby(...).mean()['outcomes'])
# raises a TypeError on pandas >= 2.0 when the frame still holds non-numeric
# columns (the later one-hot step shows sex / charge_degree are categorical).
probability_mapping_1 = (
    compas_prep_df.groupby('compas_risk_score')['outcomes'].mean().to_dict()
)
# Look up each row's score in the mapping to get its per-row probability.
probs_1_df = compas_prep_df['compas_risk_score'].map(probability_mapping_1)
print("MLE probability estimates for each COMPAS risk score:")
print(probability_mapping_1)
compas_prep_df['proba_compas'] = probs_1_df
compas_prep_df

2) Map decile scores and prior offenses to probability of reoffending using MLE (unused).

In [None]:
# probability mapping 2
# probability_mapping_2 = compas_df.groupby(['COMPASPredictedDecileScore','Prior Offenses']).mean()['ReoffendedWithinTwoYears'].to_dict()
# probs_2_df = pd.Series(list(zip(compas_df['COMPASPredictedDecileScore'], compas_df['Prior Offenses']))).map(probability_mapping_2)
# print("MLE probability estimates for each COMPAS risk score and prior offenses:")
# print(probability_mapping_2)

Build a "True" Model trained on the entire dataset's features and outcomes.

### Build Logistic Regression on Whole Dataset to Model "True Probabilities"

Select only covariates and outcomes to prepare for simulation.

In [None]:
# Exclude risk score
# Both the raw COMPAS score and the probability derived from it are removed,
# leaving the remaining columns of compas_prep_df for modelling.
prob_cols = ['compas_risk_score', 'proba_compas']
compas_model_df = compas_prep_df.drop(prob_cols, axis='columns')

One-Hot encode categorical features.

In [None]:
# Re-type every object- or bool-typed column as pandas 'category'.
object_like = compas_model_df.select_dtypes(include=['object', 'bool'])
for cat_col in object_like.columns:
    compas_model_df[cat_col] = object_like[cat_col].astype('category')

In [None]:
# Partition the columns by dtype: 'category' columns on one side, every
# other column on the other.
X_cat = compas_model_df.select_dtypes(include='category')
X_num = compas_model_df.select_dtypes(exclude='category')

In [None]:
# one-hot encoding of categorical features
X_encoded = pd.get_dummies(X_cat)
# Reassemble the frame: dummy columns first, then the non-categorical ones.
frames = [X_encoded, X_num]
compas_model_df = pd.concat(frames, axis='columns')
compas_model_df

In [None]:
# Remove one dummy column each for sex, prior_offenses, charge_degree and
# under_25.
# NOTE(review): no race dummy is listed here, so any race_* columns are all
# kept — confirm this is intended.
extra_cols = [
    'sex_Female',
    'prior_offenses_None',
    'charge_degree_M',
    'under_25_False',
]
compas_model_df = compas_model_df.drop(extra_cols, axis='columns')
compas_model_df

In [None]:
# Separate the target column from the feature matrix.
y = compas_model_df['outcomes']
X = compas_model_df.drop('outcomes', axis='columns')

In [None]:
# Fit on the full dataset, then take column 1 of predict_proba as the
# per-row probability used as the "true" probability in this notebook.
lr = LogisticRegression(random_state=0).fit(X, y)
proba = lr.predict_proba(X)[:, 1]
# Store the same probabilities on both the encoded and the readable frame.
for target_df in (compas_model_df, compas_prep_df):
    target_df['proba_lr'] = proba
compas_prep_df

In [None]:
# Check accuracy of COMPAS predictions
# Threshold the COMPAS-derived probability at 0.5 and print the share of rows
# whose predicted label equals the observed outcome. The comparison runs on
# whole columns, so no row-wise apply is needed and no temporary columns are
# added to (and later dropped from) compas_model_df.
compas_pred = (compas_prep_df['proba_compas'] > 0.5).astype(int)
print(compas_pred.eq(compas_model_df['outcomes']).mean())

In [None]:
# Check accuracy of Logistic Regression ("actual probabilities")
# Threshold the logistic-regression probability at 0.5 and print the share of
# rows whose predicted label equals the observed outcome. The comparison runs
# on whole columns, so no row-wise apply is needed and no temporary columns
# are added to (and later dropped from) compas_model_df.
lr_pred = (compas_prep_df['proba_lr'] > 0.5).astype(int)
print(lr_pred.eq(compas_model_df['outcomes']).mean())

In [None]:
# Write recipe outputs
# exist_ok=True makes the directory creation safe to re-run.
os.makedirs("datasets", exist_ok=True)
# The prepared frame built in this notebook is `compas_prep_df`; no
# `german_prep_df` is ever defined here.
compas_prep_df.to_csv("datasets/compas_prep.csv")