In [520]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [521]:
cp = pd.read_csv('compas-scores-two-years.csv')
cp.shape

(7214, 53)

In [522]:
cp.columns

Index(['id', 'name', 'first', 'last', 'compas_screening_date', 'sex', 'dob',
       'age', 'age_cat', 'race', 'juv_fel_count', 'decile_score',
       'juv_misd_count', 'juv_other_count', 'priors_count',
       'days_b_screening_arrest', 'c_jail_in', 'c_jail_out', 'c_case_number',
       'c_offense_date', 'c_arrest_date', 'c_days_from_compas',
       'c_charge_degree', 'c_charge_desc', 'is_recid', 'r_case_number',
       'r_charge_degree', 'r_days_from_arrest', 'r_offense_date',
       'r_charge_desc', 'r_jail_in', 'r_jail_out', 'violent_recid',
       'is_violent_recid', 'vr_case_number', 'vr_charge_degree',
       'vr_offense_date', 'vr_charge_desc', 'type_of_assessment',
       'decile_score.1', 'score_text', 'screening_date',
       'v_type_of_assessment', 'v_decile_score', 'v_score_text',
       'v_screening_date', 'in_custody', 'out_custody', 'priors_count.1',
       'start', 'end', 'event', 'two_year_recid'],
      dtype='object')

In [523]:
cp['two_year_recid'].value_counts()

0    3963
1    3251
Name: two_year_recid, dtype: int64

In [524]:
# Boolean mask: True where the COMPAS decile score is strictly above 4.
# The Series is named so that crosstab output is labelled 'HighRisk'.
high_risk = (cp['decile_score'] > 4).rename('HighRisk')
high_risk.value_counts()

False    3897
True     3317
Name: HighRisk, dtype: int64

In [525]:
# Boolean mask: True where `two_year_recid` equals 1.
# The Series is named so that crosstab output is labelled 'NewCharge2'.
new_charge_2 = (cp['two_year_recid'] == 1).rename('NewCharge2')
new_charge_2.value_counts()

False    3963
True     3251
Name: NewCharge2, dtype: int64

In [526]:
cp['race'].value_counts()

African-American    3696
Caucasian           2454
Hispanic             637
Other                377
Asian                 32
Native American       18
Name: race, dtype: int64

In [527]:
pd.crosstab(new_charge_2, high_risk)

HighRisk,False,True
NewCharge2,Unnamed: 1_level_1,Unnamed: 2_level_1
False,2681,1282
True,1216,2035


In [528]:
# Row selector for defendants whose `race` value is 'Caucasian'.
white = (cp['race'] == 'Caucasian').rename('white')
white.value_counts()

False    4760
True     2454
Name: white, dtype: int64

In [529]:
pd.crosstab(new_charge_2[white], high_risk[white])

HighRisk,False,True
NewCharge2,Unnamed: 1_level_1,Unnamed: 2_level_1
False,1139,349
True,461,505


In [530]:
# Row selector for defendants whose `race` value is 'African-American'.
black = (cp['race'] == 'African-American').rename('black')
black.value_counts()

True     3696
False    3518
Name: black, dtype: int64

In [531]:
pd.crosstab(new_charge_2[black], high_risk[black])

HighRisk,False,True
NewCharge2,Unnamed: 1_level_1,Unnamed: 2_level_1
False,990,805
True,532,1369


In [532]:
def confusion_matrix(cp, threshold=4):
    """Cross-tabulate predicted risk against observed two-year recidivism.

    Parameters
    ----------
    cp : pandas.DataFrame
        Must contain the columns ``decile_score`` and ``two_year_recid``.
    threshold : int, optional
        A ``decile_score`` strictly greater than this value counts as a
        positive prediction.  Defaults to 4.

    Returns
    -------
    pandas.DataFrame
        Always a 2x2 table of counts.  Rows (axis name ``Predicted``) are
        ordered ``Positive``, ``Negative``; columns (axis name ``Actual``)
        are ordered ``Condition``, ``No Condition``.  Read row by row the
        cells are therefore TP, FP, FN, TN.
    """
    # np.where yields plain arrays, so both Series below receive the same
    # fresh default index and line up inside crosstab even when `cp` is a
    # filtered subset whose own index has gaps.
    labels = np.where(cp['decile_score'] > threshold, 'Positive', 'Negative')
    predicted = pd.Series(labels, name='Predicted')

    labels = np.where(cp['two_year_recid'] == 1, 'Condition', 'No Condition')
    actual = pd.Series(labels, name='Actual')

    matrix = pd.crosstab(predicted, actual)

    # crosstab omits any label that never occurs in the data.  Reindexing
    # pins the TP/FP/FN/TN layout explicitly (instead of relying on sort
    # order) and fills absent cells with 0, so callers that unpack four
    # values keep working for subgroups with a missing category.
    matrix = matrix.reindex(index=['Positive', 'Negative'],
                            columns=['Condition', 'No Condition'],
                            fill_value=0)

    return matrix.rename_axis(index='Predicted', columns='Actual')

In [533]:
matrix_white = confusion_matrix(cp[white])
matrix_white

Actual,Condition,No Condition
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
Positive,505,349
Negative,461,1139


In [534]:
matrix_black = confusion_matrix(cp[black])
matrix_black

Actual,Condition,No Condition
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
Positive,1369,805
Negative,532,990


In [535]:
matrix_all = confusion_matrix(cp)
matrix_all

Actual,Condition,No Condition
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
Positive,2035,1282
Negative,1216,2681


In [536]:
# Unpack the 2x2 table row by row: first row is (TP, FP), second is (FN, TN).
(tp, fp), (fn, tn) = matrix_all.to_numpy()

In [537]:
# Share of all defendants whose prediction matched the outcome, in percent.
accuracy = 100 * (tp + tn) / (tp + tn + fp + fn)
accuracy

65.37288605489326

In [538]:
def predictive_value(m):
    """Return ``(PPV, NPV)`` in percent for a 2x2 confusion matrix.

    `m` is read row by row as TP, FP / FN, TN.  PPV is TP as a share of
    all positive predictions; NPV is TN as a share of all negative
    predictions.
    """
    (true_pos, false_pos), (false_neg, true_neg) = m.to_numpy()
    predicted_pos = true_pos + false_pos
    predicted_neg = true_neg + false_neg
    return 100 * true_pos / predicted_pos, 100 * true_neg / predicted_neg

In [539]:
ppv, npv = predictive_value(matrix_all)
ppv, npv

(61.35061802833886, 68.79651013600206)

In [540]:
def sensitivty_specificity(m):
    """Return ``(sensitivity, specificity)`` in percent for a 2x2 matrix.

    `m` is read row by row as TP, FP / FN, TN.  Sensitivity is TP as a
    share of all actual-condition cases; specificity is TN as a share of
    all no-condition cases.

    NOTE: the function name is misspelled ("sensitivty"); it is kept
    unchanged because existing cells call it by this name.
    """
    (true_pos, false_pos), (false_neg, true_neg) = m.to_numpy()
    with_condition = true_pos + false_neg
    without_condition = true_neg + false_pos
    return 100 * true_pos / with_condition, 100 * true_neg / without_condition

In [541]:
sens, spec = sensitivty_specificity(matrix_all)
sens, spec

(62.59612426945555, 67.65076961897552)

In [542]:
def error_rates(m):
    """Return ``(FPR, FNR)`` in percent for a 2x2 confusion matrix.

    `m` is read row by row as TP, FP / FN, TN.  The false positive rate
    is FP as a share of all no-condition cases; the false negative rate
    is FN as a share of all actual-condition cases.
    """
    (true_pos, false_pos), (false_neg, true_neg) = m.to_numpy()
    without_condition = false_pos + true_neg
    with_condition = false_neg + true_pos
    return 100 * false_pos / without_condition, 100 * false_neg / with_condition

In [543]:
fpr, fnr = error_rates(matrix_all)
fpr, fnr

(32.349230381024476, 37.40387573054445)

In [544]:
fpr + spec

100.0

In [545]:
fnr + sens

100.0

In [546]:
def prevalence(df):
    """Return the share of actual-condition cases, in percent.

    `df` is a 2x2 confusion matrix read row by row as TP, FP / FN, TN;
    the result is (TP + FN) divided by the total of all four cells.
    """
    (true_pos, false_pos), (false_neg, true_neg) = df.to_numpy()
    with_condition = true_pos + false_neg
    without_condition = true_neg + false_pos
    return 100 * with_condition / (with_condition + without_condition)

In [547]:
prev = prevalence(matrix_all)
prev

45.06515109509287

In [548]:
def compute_metrics(m, name=''):
    """Summarise a 2x2 confusion matrix as a one-column table of percentages.

    Parameters
    ----------
    m : pandas.DataFrame
        Confusion matrix, passed unchanged to `error_rates`,
        `predictive_value` and `prevalence`.
    name : str, optional
        Used as the name of the returned frame's index (shown as the
        table's header label).

    Returns
    -------
    pandas.DataFrame
        A single ``Percent`` column with rows ``FP rate``, ``FN rate``,
        ``PPV``, ``NPV`` and ``Prevalence``, in that order.
    """
    fp_rate, fn_rate = error_rates(m)
    pos_pv, neg_pv = predictive_value(m)
    base_rate = prevalence(m)

    row_labels = pd.Index(['FP rate', 'FN rate', 'PPV', 'NPV', 'Prevalence'],
                          name=name)
    values = [fp_rate, fn_rate, pos_pv, neg_pv, base_rate]
    return pd.DataFrame({'Percent': values}, index=row_labels)

In [549]:
compute_metrics(matrix_all, 'All defendants')

Unnamed: 0_level_0,Percent
All defendants,Unnamed: 1_level_1
FP rate,32.34923
FN rate,37.403876
PPV,61.350618
NPV,68.79651
Prevalence,45.065151


In [550]:
compute_metrics(matrix_black, 'Black defendants')

Unnamed: 0_level_0,Percent
Black defendants,Unnamed: 1_level_1
FP rate,44.846797
FN rate,27.985271
PPV,62.971481
NPV,65.045992
Prevalence,51.433983


In [551]:
compute_metrics(matrix_white, 'White defendants')

Unnamed: 0_level_0,Percent
White defendants,Unnamed: 1_level_1
FP rate,23.454301
FN rate,47.722567
PPV,59.133489
NPV,71.1875
Prevalence,39.364303


In [552]:
error_rates(matrix_black)

(44.84679665738162, 27.985270910047344)

In [553]:
error_rates(matrix_white)

(23.454301075268816, 47.72256728778468)