In [10]:
%reset


Once deleted, variables cannot be recovered. Proceed (y/[n])? 
Nothing done.


In [11]:
# Life essentials
import pandas as pd
import numpy as np
import pickle

# Plot
import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline
#matplotlib.style.use('ggplot')

# General Machine Learning
from sklearn.model_selection import train_test_split, KFold, cross_val_score, cross_validate, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# Binary Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, GradientBoostingClassifier
from  sklearn.neural_network import MLPClassifier
# Real Value Model
from sklearn.linear_model import LinearRegression
    

In [12]:
random_state = 17
held_out_size = 0.2
n_splits = 10

In [13]:
raw_df = pd.read_csv('compas-data/compas-scores-two-years.csv')
raw_df = raw_df.replace('African-American', 'African_American')
raw_df.columns

Index([u'id', u'name', u'first', u'last', u'compas_screening_date', u'sex',
       u'dob', u'age', u'age_cat', u'race', u'juv_fel_count', u'decile_score',
       u'juv_misd_count', u'juv_other_count', u'priors_count',
       u'days_b_screening_arrest', u'c_jail_in', u'c_jail_out',
       u'c_case_number', u'c_offense_date', u'c_arrest_date',
       u'c_days_from_compas', u'c_charge_degree', u'c_charge_desc',
       u'is_recid', u'r_case_number', u'r_charge_degree',
       u'r_days_from_arrest', u'r_offense_date', u'r_charge_desc',
       u'r_jail_in', u'r_jail_out', u'violent_recid', u'is_violent_recid',
       u'vr_case_number', u'vr_charge_degree', u'vr_offense_date',
       u'vr_charge_desc', u'type_of_assessment', u'decile_score.1',
       u'score_text', u'screening_date', u'v_type_of_assessment',
       u'v_decile_score', u'v_score_text', u'v_screening_date', u'in_custody',
       u'out_custody', u'priors_count.1', u'start', u'end', u'event',
       u'two_year_recid'],
      dtype

In [17]:
features = ['sex', 'age', 'juv_fel_count', 'juv_other_count', 
            'juv_misd_count', 'priors_count', 
            'race', 'c_charge_degree']

target = ['two_year_recid']

df = raw_df[features + target]


In [18]:

# convert categorical variables into one-hot
df = pd.get_dummies(df, columns=['race', 'c_charge_degree'])
df['is_male'] = df.sex == 'Male'
df = df.drop(['sex'], axis=1)

#df.columns

In [19]:
df.columns

Index([u'age', u'juv_fel_count', u'juv_other_count', u'juv_misd_count',
       u'priors_count', u'two_year_recid', u'race_African_American',
       u'race_Asian', u'race_Caucasian', u'race_Hispanic',
       u'race_Native American', u'race_Other', u'c_charge_degree_F',
       u'c_charge_degree_M', u'is_male'],
      dtype='object')

In [20]:
features = [f for f in df.columns if f not in target]

X = StandardScaler().fit_transform(df[features])

X_train, X_held_out, y_train, y_held_out, indices_train, indices_test = train_test_split(
                                                            X,
                                                            df[target], 
                                                            df.index,
                                                            test_size = held_out_size, 
                                                            random_state = random_state)

# To avoid DataConversionWarning: A column-vector y was passed when a 1d array was expected.
y_train = np.ravel(y_train)

kfold = KFold(n_splits = n_splits,
              random_state = random_state)


In [21]:
model = LogisticRegression()
#model = DecisionTreeClassifier(max_depth=5)

model.fit(X_train, y_train)


result = {'Train_Score' : model.score(X_train, y_train),
          'Held_Out_Score' : model.score(X_held_out, y_held_out)}

print(result)
model.coef_

{'Train_Score': 0.6787385201871426, 'Held_Out_Score': 0.6645876645876646}


array([[-0.51712346,  0.07715247,  0.15115428,  0.02718   ,  0.68269981,
         0.03973855, -0.02120995,  0.00324777, -0.04588637,  0.02189169,
        -0.03623983,  0.0441063 , -0.0441063 ,  0.109681  ]])

In [24]:
#pd.DataFrame(list(zip(features, np.round(model.coef_[0],3))))

def score_race(model, df, indices_test, race = 'African_American'):
    if race == 'African_American':
        indices_race = df[df.race_African_American == True].index
        indices_race_held_out = indices_race.intersection(indices_test)
        print('Test results, black population - ', len(indices_race_held_out))

    if race == 'Caucasian':
        indices_race = df[df.race_Caucasian == True].index
        indices_race_held_out = indices_race.intersection(indices_test)
        print('Test results, white population', len(indices_race_held_out))

    X = StandardScaler().fit_transform(df[features].iloc[indices_race_held_out])

    results_race = pd.DataFrame({'prediction' : model.predict(X), 
                                 'reality' : df.iloc[indices_race_held_out].two_year_recid})

    FP = results_race[(results_race.prediction == 1) & (results_race.reality == 0)].shape[0]
    print('False positive rate: ', round(100 * FP / results_race.shape[0], 3), '%', ' - ', FP)
    FN = results_race[(results_race.prediction == 0) & (results_race.reality == 1)].shape[0]
    print('False negative rate: ', round(100 * FN / results_race.shape[0], 3), '%', ' - ', FN)
    TP = results_race[(results_race.prediction == 1) & (results_race.reality == 1)].shape[0]
    print('True positive rate: ', round(100 * TP / results_race.shape[0], 3), '%', ' - ', TP)
    TN = results_race[(results_race.prediction == 0) & (results_race.reality == 0)].shape[0]
    print('True negative rate: ', round(100 * TN / results_race.shape[0], 3), '%', ' - ', TN)
    print()

In [25]:
score_race(model, df, indices_test)

score_race(model, df, indices_test, 'Caucasian')

('Test results, black population - ', 738)
('False positive rate: ', 8.0, '%', ' - ', 64)
('False negative rate: ', 24.0, '%', ' - ', 181)
('True positive rate: ', 25.0, '%', ' - ', 189)
('True negative rate: ', 41.0, '%', ' - ', 304)
()
('Test results, white population', 500)
('False positive rate: ', 13.0, '%', ' - ', 69)
('False negative rate: ', 20.0, '%', ' - ', 102)
('True positive rate: ', 20.0, '%', ' - ', 100)
('True negative rate: ', 45.0, '%', ' - ', 229)
()


In [26]:
parameters = [1] * X_train.shape[1] 

np.dot(parameters, X_train[0])

-0.35544836268953717

## Attempt  to cuztomize sklearn loss function

In [29]:
from scipy.optimize import minimize

parameters = [0] * X_train.shape[1] 

def lik(parameters, X = X_train, Y = y_train):
    L = 0
    for i in range(0, X.shape[0]):
        if Y[i] == 1:
            p_sample = 1 / (1 + np.exp(np.dot(parameters, X[i])))
        if Y[i] == 0:
            p_sample = 1 - 1 / (1 + np.exp(np.dot(parameters, X[i])))
        L = L - p_sample
    return L

lik_model = minimize(lik, parameters, method='L-BFGS-B')
lik_model

      fun: -3988.994754291477
 hess_inv: <14x14 LbfgsInvHessProduct with dtype=float64>
      jac: array([ 2.27373675e-04, -4.54747351e-05,  4.54747351e-05,  4.54747351e-05,
        1.81898940e-04,  0.00000000e+00,  4.54747351e-05, -4.54747351e-05,
       -3.18323146e-04,  9.09494702e-05,  0.00000000e+00, -2.27373675e-04,
        1.81898940e-04,  3.18323146e-04])
  message: 'CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH'
     nfev: 660
      nit: 36
   status: 0
  success: True
        x: array([ 2235.76830682, -1089.97155367,  -621.2973802 ,  -136.9442596 ,
       -3822.47471889,  -117.89962559,    47.64752582,   191.91179462,
          40.18875822,   -24.01086583,  -204.09391604,    26.61786147,
         -26.65768631,  -153.24618458])