# Reproducing COMPAS analysis and modelling in Python

## Part 1 - Load libraries

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
from scipy.stats import pearsonr
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, log_loss

def date_from_str(s):
    """Parse a timestamp string formatted as 'YYYY-MM-DD HH:MM:SS' into a datetime.

    Raises ValueError (from datetime.strptime) if the string does not match the format.
    """
    timestamp_format = '%Y-%m-%d %H:%M:%S'
    parsed = datetime.strptime(s, timestamp_format)
    return parsed

## Part 2 - Download data

In [2]:
!wget https://raw.githubusercontent.com/propublica/compas-analysis/master/compas-scores-two-years.csv

--2023-03-31 20:44:07--  https://raw.githubusercontent.com/propublica/compas-analysis/master/compas-scores-two-years.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2546489 (2.4M) [text/plain]
Saving to: ‘compas-scores-two-years.csv’


2023-03-31 20:44:08 (22.1 MB/s) - ‘compas-scores-two-years.csv’ saved [2546489/2546489]



## Part 3 - Import and prepare data
Broward county data filtered to only include those rows representing people who had either recidivated in two years, or had at least two years outside of a correctional facility.

In [3]:
# Read the downloaded CSV file into a DataFrame
csv_path = "compas-scores-two-years.csv"
raw_data = pd.read_csv(csv_path)

In [4]:
# Report how many rows were loaded
print('Num rows: %d' % raw_data.shape[0])

Num rows: 7214


In [5]:
# Keep only the columns that are used by the filtering, correlation and modelling cells
selected_columns = [
    'age', 'c_charge_degree', 'race', 'age_cat',
    'score_text', 'sex', 'priors_count', 'days_b_screening_arrest',
    'decile_score', 'is_recid', 'two_year_recid', 'c_jail_in', 'c_jail_out',
]
df = raw_data[selected_columns]

To clean up data, the following considerations were made:
- Remove rows where the charge date was not within 30 days of when the person was arrested (assumption of data quality issues)
- Remove rows where is_recid is -1, as there is no matching COMPAS case
- Remove ordinary traffic offenses (c_charge_degree of 'O') as those will not result in jail time
- Remove rows where score_text is 'N/A'

In [6]:
# Apply the row filters: screening within +/-30 days of arrest, a valid is_recid flag,
# no ordinary traffic offenses, and a non-'N/A' score_text.
keep_rows = (
    (df['days_b_screening_arrest'] <= 30) &
    (df['days_b_screening_arrest'] >= -30) &
    (df['is_recid'] != -1) &
    (df['c_charge_degree'] != 'O') &
    (df['score_text'] != 'N/A')
)

# .copy() makes the filtered frame independent of its parent, so the column
# assignments in later cells do not trigger SettingWithCopyWarning.
df = df[keep_rows].copy()

print('Num rows filtered: %d' % len(df))

Num rows filtered: 6172


In [7]:
# Calculate length of stay in seconds for correlation.
# pd.to_datetime parses each column in one vectorized call (same format string as
# date_from_str) instead of calling strptime once per row.
jail_timestamp_format = '%Y-%m-%d %H:%M:%S'
jail_in = pd.to_datetime(df['c_jail_in'], format=jail_timestamp_format)
jail_out = pd.to_datetime(df['c_jail_out'], format=jail_timestamp_format)

# assign() returns a new frame rather than writing a column into a filtered slice.
df = df.assign(length_of_stay=(jail_out - jail_in).dt.total_seconds())

# Missing jail timestamps would become NaN here; fail loudly rather than let them
# silently propagate into the correlation.
assert df['length_of_stay'].notna().all(), 'missing c_jail_in / c_jail_out values'

# Calculate correlation (pearsonr returns the coefficient first, then the p-value)
stay_score_corr = pearsonr(df['length_of_stay'], df['decile_score'])[0]

# Print results
print('Correlation btw stay length and COMPAS scores: %.3f' % stay_score_corr)

Correlation btw stay length and COMPAS scores: 0.207


In [8]:
# Relabel 'Caucasian' as '0_Caucasian' so that it is the first race category and is
# therefore the one removed by get_dummies(..., drop_first=True); the LR race
# coefficients are then expressed relative to the Caucasian category.
is_caucasian = df['race'] == 'Caucasian'
df.loc[is_caucasian, 'race'] = '0_Caucasian'

In [9]:
# One-hot encode the categorical predictors. drop_first=True removes the first
# category of each variable so that it serves as the reference level.
df_crime = pd.get_dummies(df['c_charge_degree'], prefix='crime_factor', drop_first=True)
df_age = pd.get_dummies(df['age_cat'], prefix='age_factor', drop_first=True)
df_race = pd.get_dummies(df['race'], prefix='race_factor', drop_first=True)

# For sex, the 'Male' column is dropped by name, making Male the reference level.
gender_dummies = pd.get_dummies(df['sex'], prefix='gender_factor')
df_gender = gender_dummies.drop(columns='gender_factor_Male')

# Outcome column: indicator of whether score_text is anything other than 'Low'
is_not_low_score = df['score_text'] != 'Low'
df_score = pd.get_dummies(is_not_low_score, prefix='score_factor', drop_first=True)

In [10]:
# Combine the dummy-encoded categoricals with the two numeric columns, side by side
numeric_features = df[['priors_count', 'two_year_recid']]
df_lr = pd.concat(
    [df_crime, df_age, df_race, df_gender, numeric_features],
    axis='columns',
)



## Part 4 - Build classifier

In [11]:
# Extract features and target as NumPy arrays.
# The explicit float cast gives a plain numeric matrix even when the dummy columns
# are boolean and the remaining columns are integers (a mix that can otherwise
# produce an object-dtype array).
X_train = df_lr.values.astype(float)
y_train = df_score.values.ravel()

# Initialize an unpenalized LR. penalty=None replaces the string 'none', which
# was deprecated in scikit-learn 1.2 and removed in 1.4.
lr = LogisticRegression(solver='newton-cg', penalty=None)

# Train model
lr.fit(X_train, y_train)

LogisticRegression(penalty='none', solver='newton-cg')

In [12]:
# Tabulate the fitted coefficients next to their feature names, rounded to 5 decimals
results = pd.DataFrame({
    'Coefficients': df_lr.columns.tolist(),
    'Estimate': np.round(lr.coef_[0], 5),
})

print(results)

                    Coefficients  Estimate
0                 crime_factor_M  -0.31124
1     age_factor_Greater than 45  -1.35563
2        age_factor_Less than 25   1.30839
3   race_factor_African-American   0.47721
4              race_factor_Asian  -0.25441
5           race_factor_Hispanic  -0.42839
6    race_factor_Native American   1.39421
7              race_factor_Other  -0.82635
8           gender_factor_Female   0.22127
9                   priors_count   0.26895
10                two_year_recid   0.68586


In [13]:
def residual_deviance(X, y, model):
    """Return twice the summed (non-normalized) log loss of the model on (X, y).

    X     -- feature matrix passed to model.predict_proba
    y     -- true labels for the rows of X
    model -- fitted classifier exposing predict_proba
    """
    class_probabilities = model.predict_proba(X)
    summed_log_loss = log_loss(y, class_probabilities, normalize=False)
    return 2 * summed_log_loss

residual_deviance(X_train, y_train, lr)

6168.401694733571

## Part 5 - Evaluate predictions

In [14]:
# Predict class labels for the same rows the model was fitted on
y_train_pred = lr.predict(X=X_train)

In [15]:
# In-sample accuracy: fraction of training rows whose predicted label matches the true one
accuracy_score(y_true=y_train, y_pred=y_train_pred)

0.7519442644199611

In [16]:
# Baseline ("control") probability of a non-'Low' score: the logistic function of the
# intercept, i.e. every feature in the model set to zero.
intercept_odds = np.exp(lr.intercept_[0])
control = intercept_odds / (1 + intercept_odds)

# Read the coefficient from the fitted model by feature name instead of hard-coding
# a rounded value copied from printed output, so this cell stays in sync with the model.
race_feature_position = list(df_lr.columns).index('race_factor_African-American')
race_odds_ratio = np.exp(lr.coef_[0][race_feature_position])

# Convert the odds ratio into a ratio of probabilities relative to the control group
race_odds_ratio / (1 - control + (control * race_odds_ratio))

1.4528406634846986

## Part 6 - Filter for African American and Caucasian

In [17]:
# NOTE(review): this filter is currently disabled. If it is re-enabled, be aware that
# an earlier cell relabels 'Caucasian' as '0_Caucasian' in df['race'], so the list
# below would match no Caucasian rows as written.
# df = df[df['race'].isin(['African-American', 'Caucasian'])==True]
# df.reset_index(drop=True, inplace=True)