In [217]:
%reset


Once deleted, variables cannot be recovered. Proceed (y/[n])? y


In [517]:
# Life essentials
import pandas as pd
import numpy as np
import pickle

# Plot
import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline
matplotlib.style.use('ggplot')

# General Machine Learning
from sklearn.model_selection import train_test_split, KFold, cross_val_score, cross_validate, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# Binary Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, GradientBoostingClassifier
from  sklearn.neural_network import MLPClassifier
# Real Value Model
from sklearn.linear_model import LinearRegression
    

In [219]:
random_state = 17
held_out_size = 0.2
n_splits = 10

In [549]:
raw_df = pd.read_csv('compas-scores-two-years.csv')
raw_df = raw_df.replace('African-American', 'African_American')
raw_df.columns

Index(['id', 'name', 'first', 'last', 'compas_screening_date', 'sex', 'dob',
       'age', 'age_cat', 'race', 'juv_fel_count', 'decile_score',
       'juv_misd_count', 'juv_other_count', 'priors_count',
       'days_b_screening_arrest', 'c_jail_in', 'c_jail_out', 'c_case_number',
       'c_offense_date', 'c_arrest_date', 'c_days_from_compas',
       'c_charge_degree', 'c_charge_desc', 'is_recid', 'r_case_number',
       'r_charge_degree', 'r_days_from_arrest', 'r_offense_date',
       'r_charge_desc', 'r_jail_in', 'r_jail_out', 'violent_recid',
       'is_violent_recid', 'vr_case_number', 'vr_charge_degree',
       'vr_offense_date', 'vr_charge_desc', 'type_of_assessment',
       'decile_score.1', 'score_text', 'screening_date',
       'v_type_of_assessment', 'v_decile_score', 'v_score_text',
       'v_screening_date', 'in_custody', 'out_custody', 'priors_count.1',
       'start', 'end', 'event', 'two_year_recid'],
      dtype='object')

In [630]:
features = ['sex', 'age', 'juv_fel_count', 'juv_other_count', 
            'juv_misd_count', 'priors_count', 
            'race', 
            'c_charge_degree', 'c_charge_desc']

target = ['two_year_recid']
#target = ['decile_score']

In [631]:
df = raw_df[features + target]

# convert categorical variables into one-hot
df = pd.get_dummies(df, columns=['race', 'c_charge_degree', 'c_charge_degree', 'c_charge_desc'])
df['is_male'] = df.sex == 'Male'
df = df.drop(['sex'], axis=1)

#df.columns

In [632]:
features = [f for f in df.columns if f not in target]

X = StandardScaler().fit_transform(df[features])

X_train, X_held_out, y_train, y_held_out, indices_train, indices_test = train_test_split(
                                                            X,
                                                            df[target], 
                                                            df.index,
                                                            test_size = held_out_size, 
                                                            random_state = random_state)

# To avoid DataConversionWarning: A column-vector y was passed when a 1d array was expected.
y_train = np.ravel(y_train)

kfold = KFold(n_splits = n_splits,
              random_state = random_state)


In [633]:
model = LogisticRegression()
#model = DecisionTreeClassifier(max_depth=5)

model.fit(X_train, y_train)


result = {'Train_Score' : model.score(X_train, y_train),
          'Held_Out_Score' : model.score(X_held_out, y_held_out)}

print(result)

{'Train_Score': 0.70525038988043665, 'Held_Out_Score': 0.65835065835065831}


In [634]:
#pd.DataFrame(list(zip(features, np.round(model.coef_[0],3))))

def score_race(model, df, indices_test, race = 'African_American'):
    if race == 'African_American':
        indices_race = df[df.race_African_American == True].index
        indices_race_held_out = indices_race.intersection(indices_test)
        print('Test results, black population - ', len(indices_race_held_out))

    if race == 'Caucasian':
        indices_race = df[df.race_Caucasian == True].index
        indices_race_held_out = indices_white.intersection(indices_test)
        print('Test results, white population', len(indices_race_held_out))

    X = StandardScaler().fit_transform(df[features].iloc[indices_race_held_out])

    results_race = pd.DataFrame({'prediction' : model.predict(X), 
                                 'reality' : df.iloc[indices_race_held_out].two_year_recid})

    FP = results_race[(results_race.prediction == 1) & (results_race.reality == 0)].shape[0]
    print('False positive rate: ', round(100 * FP / results_race.shape[0], 3), '%', ' - ', FP)
    FN = results_race[(results_race.prediction == 0) & (results_race.reality == 1)].shape[0]
    print('False negative rate: ', round(100 * FN / results_race.shape[0], 3), '%', ' - ', FN)
    TP = results_race[(results_race.prediction == 1) & (results_race.reality == 1)].shape[0]
    print('True positive rate: ', round(100 * TP / results_race.shape[0], 3), '%', ' - ', TP)
    TN = results_race[(results_race.prediction == 0) & (results_race.reality == 0)].shape[0]
    print('True negative rate: ', round(100 * TN / results_race.shape[0], 3), '%', ' - ', TN)
    print()

In [635]:
score_race(model, df, indices_test)

score_race(model, df, indices_test, 'Caucasian')

Test results, black population -  738
False positive rate:  10.027 %  -  74
False negative rate:  22.493 %  -  166
True positive rate:  27.642 %  -  204
True negative rate:  39.837 %  -  294

Test results, white population 500
False positive rate:  13.8 %  -  69
False negative rate:  21.2 %  -  106
True positive rate:  19.2 %  -  96
True negative rate:  45.8 %  -  229

