In [217]:
%reset


Once deleted, variables cannot be recovered. Proceed (y/[n])? y


In [218]:
# Life essentials
import pandas as pd
import numpy as np
import pickle

# Plot
import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline
matplotlib.style.use('ggplot')

# General Machine Learning
from sklearn.model_selection import train_test_split, KFold, cross_val_score, cross_validate, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# Binary Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, GradientBoostingClassifier
from  sklearn.neural_network import MLPClassifier
# Real Value Model
from sklearn.linear_model import LinearRegression
    

In [219]:
random_state = 17
held_out_size = 0.2
n_splits = 10

In [326]:
raw_df = pd.read_csv('compas-scores-two-years.csv')
raw_df = raw_df.replace('African-American', 'African_American')

In [487]:
features = ['sex', 'age', 'juv_fel_count', 'juv_other_count', 
            'juv_misd_count', 'priors_count', 
            'race', 'r_charge_degree']

# 'age_cat' 'days_b_screening_arrest', 

# full of NA, unbalanced for non-recid 'c_days_from_compas',

target = ['two_year_recid']

not_used = ['id', 'name', 'first', 'last', 'compas_screening_date', 'dob',
'decile_score', 
'c_jail_in', 'c_jail_out', 'c_case_number',
'c_offense_date', 'c_arrest_date', 
'c_charge_degree', 'c_charge_desc', 'is_recid', 'r_case_number',
'r_charge_degree', 'r_days_from_arrest', 'r_offense_date',
'r_charge_desc', 'r_jail_in', 'r_jail_out', 'violent_recid',
'vr_case_number', 'vr_charge_degree',
'vr_offense_date', 'vr_charge_desc', 'screening_date',
'v_screening_date', 'in_custody', 'out_custody',
'start', 'end', 'event', 
            'priors_count.1', 'is_recid', 
'type_of_assessment', 'decile_score.1', 'score_text'
'v_type_of_assessment', 'v_decile_score', 'v_score_text']



In [488]:
df = raw_df[features + target]

# convert categorical variables into one-hot
df = pd.get_dummies(df, columns=['race', 'r_charge_degree'])
df['is_male'] = df.sex == 'Male'
df = df.drop(['sex'], axis=1)

df.columns

Index(['age', 'juv_fel_count', 'juv_other_count', 'juv_misd_count',
       'priors_count', 'is_violent_recid', 'two_year_recid',
       'race_African_American', 'race_Asian', 'race_Caucasian',
       'race_Hispanic', 'race_Native American', 'race_Other',
       'r_charge_degree_(CO3)', 'r_charge_degree_(F1)', 'r_charge_degree_(F2)',
       'r_charge_degree_(F3)', 'r_charge_degree_(F5)', 'r_charge_degree_(F6)',
       'r_charge_degree_(F7)', 'r_charge_degree_(M1)', 'r_charge_degree_(M2)',
       'r_charge_degree_(MO3)', 'is_male'],
      dtype='object')

In [489]:
df.two_year_recid.value_counts()

0    3963
1    3251
Name: two_year_recid, dtype: int64

In [490]:
features = [f for f in df.columns if f not in target]

X = StandardScaler().fit_transform(df[features])

X_train, X_held_out, y_train, y_held_out, indices_train, indices_test = train_test_split(
                                                            X,
                                                            df[target], 
                                                            df.index,
                                                            test_size = held_out_size, 
                                                            random_state = random_state)

# To avoid DataConversionWarning: A column-vector y was passed when a 1d array was expected.
y_train = np.ravel(y_train)

kfold = KFold(n_splits = n_splits,
              random_state = random_state)


In [491]:
#model = LogisticRegression()
model = DecisionTreeClassifier(max_depth=5)

model.fit(X_train, y_train)


result = {'Train_Score' : model.score(X_train, y_train),
          'Held_Out_Score' : model.score(X_held_out, y_held_out)}

print(result)

{'Train_Score': 0.96378443943857217, 'Held_Out_Score': 0.96049896049896055}


In [492]:
#pd.DataFrame(list(zip(features, np.round(model.coef_[0],3))))


In [493]:
print('Results, white population')

indices_white = df[df.race_Caucasian == True].index
indices_white_held_out = indices_white.intersection(indices_test)

X = StandardScaler().fit_transform(df[features].iloc[indices_white_held_out])

results_white = pd.DataFrame({'prediction' : model.predict(X), 
                              'reality' : df.iloc[indices_white_held_out].two_year_recid})


FP = results_white[(results_white.prediction == 1) & (results_white.reality == 0)].shape[0]
print('False positive rate: ', round(100 * FP / results_white.shape[0], 3), '%')
FN = results_white[(results_white.prediction == 0) & (results_white.reality == 1)].shape[0]
print('False negative rate: ', round(100 * FN / results_white.shape[0], 3), '%')
TP = results_white[(results_white.prediction == 1) & (results_white.reality == 1)].shape[0]
print('True positive rate: ', round(100 * TP / results_white.shape[0], 3), '%')
TN = results_white[(results_white.prediction == 0) & (results_white.reality == 0)].shape[0]
print('True negative rate: ', round(100 * TN / results_white.shape[0], 3), '%')


Results, white population
False positive rate:  1.8 %
False negative rate:  0.6 %
True positive rate:  39.8 %
True negative rate:  57.8 %


In [494]:
print('Results, black population')

indices_black = df[df.race_African_American == True].index
indices_black_held_out = indices_black.intersection(indices_test)

X = StandardScaler().fit_transform(df[features].iloc[indices_black_held_out])

results_black = pd.DataFrame({'prediction' : model.predict(X), 
                              'reality' : df.iloc[indices_black_held_out].two_year_recid})


FP = results_black[(results_black.prediction == 1) & (results_black.reality == 0)].shape[0]
print('False positive rate: ', round(100 * FP / results_black.shape[0], 3), '%')
FN = results_black[(results_black.prediction == 0) & (results_black.reality == 1)].shape[0]
print('False negative rate: ', round(100 * FN / results_black.shape[0], 3), '%')
TP = results_black[(results_black.prediction == 1) & (results_black.reality == 1)].shape[0]
print('True positive rate: ', round(100 * TP / results_black.shape[0], 3), '%')
TN = results_black[(results_black.prediction == 0) & (results_black.reality == 0)].shape[0]
print('True negative rate: ', round(100 * TN / results_black.shape[0], 3), '%')


Results, black population
False positive rate:  3.93 %
False negative rate:  0.813 %
True positive rate:  49.322 %
True negative rate:  45.935 %


In [452]:
from sklearn import tree
model
tree.export_graphviz(model, out_file=None)   

'digraph Tree {\nnode [shape=box] ;\n0 [label="X[19] <= 0.895\\ngini = 0.495\\nsamples = 5771\\nvalue = [3165, 2606]"] ;\n1 [label="gini = 0.458\\nsamples = 4808\\nvalue = [3102, 1706]"] ;\n0 -> 1 [labeldistance=2.5, labelangle=45, headlabel="True"] ;\n2 [label="gini = 0.122\\nsamples = 963\\nvalue = [63, 900]"] ;\n0 -> 2 [labeldistance=2.5, labelangle=-45, headlabel="False"] ;\n}'

In [478]:
print('AA Low', len(raw_df[(raw_df.race == 'African_American') & (raw_df.score_text == 'Low')]))
print('TN:', len(raw_df[(raw_df.race == 'African_American') & (raw_df.score_text == 'Low') & (raw_df.two_year_recid == False)]))
print('FN:', len(raw_df[(raw_df.race == 'African_American') & (raw_df.score_text == 'Low') & (raw_df.two_year_recid == True)]))
print('AA High', len(raw_df[(raw_df.race == 'African_American') & (raw_df.score_text != 'Low')]))
print('TP:', len(raw_df[(raw_df.race == 'African_American') & (raw_df.score_text != 'Low') & (raw_df.two_year_recid == True)]))
print('FP:', len(raw_df[(raw_df.race == 'African_American') & (raw_df.score_text != 'Low') & (raw_df.two_year_recid == False)]))
print()
print('C Low', len(raw_df[(raw_df.race == 'Caucasian') & (raw_df.score_text == 'Low')]))
print('TN:', len(raw_df[(raw_df.race == 'Caucasian') & (raw_df.score_text == 'Low') & (raw_df.two_year_recid == False)]))
print('FN:', len(raw_df[(raw_df.race == 'Caucasian') & (raw_df.score_text == 'Low') & (raw_df.two_year_recid == True)]))
print('C High', len(raw_df[(raw_df.race == 'Caucasian') & (raw_df.score_text != 'Low')]))
print('TP:', len(raw_df[(raw_df.race == 'Caucasian') & (raw_df.score_text != 'Low') & (raw_df.two_year_recid == True)]))
print('FP:', len(raw_df[(raw_df.race == 'Caucasian') & (raw_df.score_text != 'Low') & (raw_df.two_year_recid == False)]))


AA Low 1522
TN: 990
FN: 532
AA High 2174
TP: 1369
FP: 805

C Low 1600
TN: 1139
FN: 461
C High 854
TP: 505
FP: 349
