In [1]:
import math
import random
from operator import add

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import norm
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score,roc_curve, auc
from sklearn.model_selection import train_test_split

#ipynb.fs.full.AnalysisPartFunctions_GDTHETA
from ipynb.fs.full.GenerateData import generatedata
from ipynb.fs.full.AnalysisPartFunctions_Sigmoid import initialize_parameters,initialize_dicts,calc_prior_mean,calc_post_mean,calc_post_sd,calc_y,calc_L,calc_component_derivs,calc_gradients,update_parameters, predict_choice

**(a) Load data**

In [2]:
# Read the people table (path is relative to the notebook's working directory).
data = pd.read_csv('data/people.csv')

In [3]:
# Preview the first rows to sanity-check the loaded columns.
data.head()

Unnamed: 0,index,id,name,first,last,sex,race,dob,age,age_cat,...,r_offense_date,r_charge_desc,r_jail_in,r_jail_out,is_violent_recid,num_vr_cases,vr_case_number,vr_charge_degree,vr_offense_date,vr_charge_desc
0,0,1,miguel hernandez,miguel,hernandez,Male,Other,1947-04-18 00:00:00.000000,69,Greater than 45,...,,,,,0,,,,,
1,1,2,michael ryan,michael,ryan,Male,Caucasian,1985-02-06 00:00:00.000000,31,25 - 45,...,,,,,0,,,,,
2,2,3,kevon dixon,kevon,dixon,Male,African-American,1982-01-22 00:00:00.000000,34,25 - 45,...,2013-07-05 00:00:00.000000,Felony Battery (Dom Strang),,,1,,13009779CF10A,(F3),2013-07-05 00:00:00.000000,Felony Battery (Dom Strang)
3,3,4,ed philo,ed,philo,Male,African-American,1991-05-14 00:00:00.000000,24,Less than 25,...,2013-06-16 00:00:00.000000,Driving Under The Influence,2013-06-16 09:05:47.000000,2013-06-16 07:18:55.000000,0,,,,,
4,4,5,marcu brown,marcu,brown,Male,African-American,1993-01-21 00:00:00.000000,23,Less than 25,...,,,,,0,,,,,


**(b) Create dummy variables for categorical variables**

In [4]:
# One-hot encode the three categorical columns.
sex_dummies = pd.get_dummies(data['sex'])
race_dummies = pd.get_dummies(data['race'])
age_dummies = pd.get_dummies(data['age_cat'])

# (new column in `data`, indicator frame it comes from, category label),
# listed in the order the columns are appended.
indicator_specs = [
    ('sex_1_male', sex_dummies, 'Male'),
    ('African_American', race_dummies, 'African-American'),
    ('Asian', race_dummies, 'Asian'),
    ('Caucasian', race_dummies, 'Caucasian'),
    ('Hispanic', race_dummies, 'Hispanic'),
    ('Native_American', race_dummies, 'Native American'),
    ('race_Other', race_dummies, 'Other'),
    ('Greater_45', age_dummies, 'Greater than 45'),
    ('25_45', age_dummies, '25 - 45'),
    ('Less_25', age_dummies, 'Less than 25'),
]
for new_column, indicator_frame, category in indicator_specs:
    data[new_column] = indicator_frame[category]

In [5]:
# List every column name, to confirm the indicator columns are present.
data.columns

Index(['index', 'id', 'name', 'first', 'last', 'sex', 'race', 'dob', 'age',
       'age_cat', 'juv_fel_count', 'juv_misd_count', 'juv_other_count',
       'compas_screening_date', 'decile_score', 'score_text', 'violent_recid',
       'priors_count', 'days_b_screening_arrest', 'c_jail_in', 'c_jail_out',
       'c_case_number', 'c_days_from_compas', 'c_arrest_date',
       'c_offense_date', 'c_charge_degree', 'c_charge_desc', 'is_recid',
       'num_r_cases', 'r_case_number', 'r_charge_degree', 'r_days_from_arrest',
       'r_offense_date', 'r_charge_desc', 'r_jail_in', 'r_jail_out',
       'is_violent_recid', 'num_vr_cases', 'vr_case_number',
       'vr_charge_degree', 'vr_offense_date', 'vr_charge_desc', 'sex_1_male',
       'African_American', 'Asian', 'Caucasian', 'Hispanic', 'Native_American',
       'race_Other', 'Greater_45', '25_45', 'Less_25'],
      dtype='object')

**(c) Split data into training & test**

In [6]:
# Select the model inputs and synthesise the 'release' labels (i.e. 'y').

# Feature columns and the risk-assessment score for every row.
features = [
    'Less_25', '25_45', 'Greater_45',
    'sex_1_male', 'African_American', 'Caucasian',
    'is_violent_recid', 'juv_fel_count',
]
X = data[features]
mu_ra = data['decile_score'].tolist()

# Fixed parameter set used only to generate the labels; 'B' is a column
# vector with one weight per entry of `features`.
synth_weights = (
    0.57140259,
    0.42888905,
    0.5780913,
    0.20609823,
    0.81332125,
    0.82358887,
    0.65347253,
    0.16022956,
)
synth_params = dict(
    B=np.array([[weight] for weight in synth_weights]),
    b=5.206693596399246,
    q=0.32777281162209315,
    sd_prior=2.4999667668640035,
    tau=6,
    Theta=1,
    scale=25,
)

# Run the model forward with the synthetic parameters.
synth_mu_prior = calc_prior_mean(X, synth_params)
synth_mu_post = calc_post_mean(synth_mu_prior, mu_ra, synth_params)
synth_sd_post = calc_post_sd(synth_mu_prior, mu_ra, synth_params)
synth_y_pred = calc_y(synth_mu_post, synth_sd_post, synth_params)

# Threshold the model output at 0.5 to obtain binary labels.
y = []
for synth_prob in synth_y_pred:
    y.append(1 if synth_prob > 0.5 else 0)

# Report the class balance of the synthetic labels.
values, counts = np.unique(y, return_counts=True)
print(values,counts)

[0 1] [ 1697 10060]


Randomly sample 70% of the row indices as the training set; the remaining 30% form the test set.

In [7]:
# Draw a 70/30 split of row positions.
# A dedicated, seeded generator makes the split repeatable on every
# "Restart & Run All" without touching the global `random` state.
split_rng = random.Random(0)
train_index = split_rng.sample(range(len(X)), math.floor(len(X) * 0.7))

# Membership is tested against a set: checking each row against the list
# itself is quadratic in the number of rows.
train_index_set = set(train_index)
test_index = [i for i in range(len(X)) if i not in train_index_set]
print(len(test_index),len(train_index))

3528 8229


In [8]:
def _take_rows(rows):
    """Return (feature frame, decile scores, labels) for the given row labels."""
    return (
        data.loc[rows, features],
        data.loc[rows, 'decile_score'].tolist(),
        [y[i] for i in rows],
    )


# Materialise both subsets through the same helper so they stay aligned.
X_train, mu_ra_train, y_train = _take_rows(train_index)
X_test, mu_ra_test, y_test = _take_rows(test_index)

Define variables for analysis without penalties

In [9]:
# Build the starting parameter set for training.
# NOTE(review): the positional arguments 1 and 10 are interpreted inside
# AnalysisPartFunctions_Sigmoid.initialize_parameters; the displayed result
# has 'Theta': 10, so the last argument is presumably Theta - confirm there.
initial_parameters = initialize_parameters(X_train,1,10)
# Containers that calc_component_derivs / calc_gradients receive and return
# on every training step.
derivatives,grads = initialize_dicts()

In [10]:
# Show the starting parameter values.
initial_parameters

{'B': array([[0.95820938],
        [0.14036859],
        [0.02361615],
        [0.99863065],
        [0.18425365],
        [0.12059206],
        [0.65142124],
        [0.34564484]]),
 'b': 8.89550939795803,
 'q': 0.23174148998607602,
 'sd_prior': 9.594850618366994,
 'tau': 6,
 'Theta': 10,
 'scale': 25}

In [11]:
# Working parameter set that the training loop updates.
# 'B' is a NumPy array, so it is copied: sharing the same array object would
# let an in-place update during training silently change
# `initial_parameters`, which is used later as the untrained baseline.
# The remaining entries are plain numbers and are safe to share.
parameters = {
    "B": np.copy(initial_parameters['B']),
    "b": initial_parameters['b'],
    "q" : initial_parameters['q'],
    "sd_prior" : initial_parameters['sd_prior'],
    "tau" : initial_parameters['tau'],
    "Theta" : initial_parameters['Theta'],
    "scale" : initial_parameters['scale']
}

In [None]:
# Gradient-descent training with an adaptive learning rate.
num_int = 10            # number of gradient steps

prev_L = 0              # loss of the previous step (not consulted on step 0)
loss = []               # loss recorded at every step

init_lr = 0.001
learning_rate = []      # learning rate used at every step

lr = init_lr

for j in range(num_int):

    # Forward pass on the training set with the current parameters.
    mu_prior = calc_prior_mean(X_train, parameters)
    mu_post = calc_post_mean(mu_prior, mu_ra_train, parameters)
    sd_post = calc_post_sd(mu_prior, mu_ra_train, parameters)
    y_pred = calc_y(mu_post, sd_post, parameters)
    L = calc_L(y_pred, y_train)
    loss.append(L)

    # Adapt the learning rate from the second step on: double it (capped at 1)
    # when the loss decreased, halve it when the loss increased, and leave it
    # unchanged when the loss is exactly the same.
    if j > 0:
        if prev_L > L:
            lr = min(lr * 2, 1)
        elif prev_L < L:
            lr = lr / 2

    learning_rate.append(lr)

    # Backward pass and parameter update.
    derivatives = calc_component_derivs(X_train,parameters, derivatives, mu_prior, mu_ra_train, mu_post, sd_post, y_pred, y_train)
    grads = calc_gradients(X_train,grads,derivatives)
    parameters = update_parameters(parameters, grads, lr)
    prev_L = L

    # print loss so we can track the progress of the model
    print("loss: ", L)
    

In [None]:
# Parameter values after the training loop.
parameters

In [None]:
# Starting parameter values, for comparison with the trained ones.
initial_parameters

In [None]:
# Parameter values that were used to synthesise the labels.
synth_params

In [None]:
# Learning rate used at each training step.
# The line carries a label so that plt.legend() has an entry to show; with no
# labelled artist Matplotlib warns and draws an empty legend.
plt.plot(learning_rate, label='Learning rate')
plt.xlabel('Iteration')
plt.ylabel('Learning Rate')
plt.legend(loc="lower left")

In [None]:
# Training loss recorded at each step.
# The line carries a label so that plt.legend() has an entry to show; with no
# labelled artist Matplotlib warns and draws an empty legend.
plt.plot(loss, label='Loss')
plt.xlabel('Iteration')
plt.ylabel('Loss')
plt.legend(loc="upper right")

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the trained model on the held-out test set.
# The predictions are computed for X_test so that they line up row-for-row
# with y_test; the `y_pred` left over from the training loop covers the
# training rows and must not be tabulated against test labels.
mu_prior_test = calc_prior_mean(X_test, parameters)
mu_post_test = calc_post_mean(mu_prior_test, mu_ra_test, parameters)
sd_post_test = calc_post_sd(mu_prior_test, mu_ra_test, parameters)
y_pred_test = calc_y(mu_post_test, sd_post_test, parameters)

# Threshold at 0.5, the same cut-off used to build the labels.
y_pred_convert = pd.Series([1 if p>0.5 else 0 for p in y_pred_test],name="Predicted")
y_actual = pd.Series(y_test,name="Actual")
confusion = pd.crosstab(y_actual, y_pred_convert,margins=True)
confusion

In [None]:
# ROC curve of the trained parameters, scored on the test set.
y_pred = predict_choice(X_test,mu_ra_test,parameters)
fpr, tpr, _ = roc_curve(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % (roc_auc))
# Diagonal reference line from (0, 0) to (1, 1).
ax.plot([0, 1], [0, 1], color='navy', linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC')
ax.legend(loc="upper left")

In [None]:
# Overlay the prior means (magenta) and posterior means (green) left over
# from the last pass of the training loop.
fig, ax = plt.subplots()
ax.hist(mu_prior, bins=20, alpha=0.4, color='m')
ax.hist(mu_post, bins=20, alpha=0.5, color='g')

In [None]:
# Prior vs. posterior density for a single training observation.
x = np.linspace(0, 10, 100)
idx = 1000  # position within the training arrays (mu_prior, mu_post, sd_post)

y_prior = norm.pdf(x,mu_prior[idx],parameters['sd_prior'])
y_post = norm.pdf(x,mu_post[idx],sd_post[idx])

plt.plot(x,y_prior, color='m')
plt.axvline(x=mu_prior[idx], ymin=0, ymax=1,linewidth=1,linestyle='--',color='m')
plt.plot(x,y_post,color='g')
plt.axvline(x=mu_post[idx], ymin=0, ymax=1,linewidth=1,linestyle='--',color='g')
# mu_prior / mu_post / sd_post were computed on the training rows, so the
# matching risk score is mu_ra_train[idx]; mu_ra[idx] indexes the full data
# set and belongs to a different observation.
plt.axvline(x=mu_ra_train[idx], ymin=0, ymax=1,color='r')
plt.axvline(x=parameters['tau'], ymin=0, ymax=1,color='black')

print('Theta: ', parameters['Theta'])
print('SD prior: ', parameters['sd_prior'])
print('SD posterior: ', sd_post[idx])

In [None]:
# Logistic-regression baseline on the same features and labels.
# `logreg` is fitted here because no other cell defines it; without this the
# cell raises NameError on a fresh kernel.
# NOTE(review): default hyper-parameters are used - adjust if the original
# baseline was configured differently.
logreg = LogisticRegression().fit(X_train, y_train)

y_pred_logit = pd.Series(logreg.predict(X_test),name="Predicted")
y_actual = pd.Series(y_test,name="Actual")
confusion_logit = pd.crosstab(y_actual, y_pred_logit,margins=True)
confusion_logit

In [None]:
# ROC curve of the logistic-regression baseline on the test set.
# NOTE(review): y_pred_logit holds the output of logreg.predict(), i.e.
# predicted classes rather than scores - consider predict_proba for a full curve.
fpr, tpr, _ = roc_curve(y_test, y_pred_logit)
roc_auc = roc_auc_score(y_test, y_pred_logit)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % (roc_auc))
# Diagonal reference line from (0, 0) to (1, 1).
ax.plot([0, 1], [0, 1], color='navy', linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC')
ax.legend(loc="upper left")

In [None]:
# Confusion matrix of the untrained (initial) parameters on the test set.
# Every step uses `initial_parameters` and the test inputs, so the
# predictions describe the untrained model and line up row-for-row with
# y_test.
mu_prior_init = calc_prior_mean(X_test, initial_parameters)
mu_post_init = calc_post_mean(mu_prior_init, mu_ra_test, initial_parameters)
sd_post_init = calc_post_sd(mu_prior_init, mu_ra_test, initial_parameters)
y_pred_init = calc_y(mu_post_init, sd_post_init, initial_parameters)

# Threshold at 0.5, the same cut-off used to build the labels.
y_pred_init_convert = pd.Series([1 if p>0.5 else 0 for p in y_pred_init],name="Predicted")
y_actual = pd.Series(y_test,name="Actual")
confusion = pd.crosstab(y_actual, y_pred_init_convert,margins=True)
confusion

In [None]:
# ROC curve of the untrained (initial) parameters, scored on the test set.
y_pred = predict_choice(X_test,mu_ra_test,initial_parameters)
fpr, tpr, _ = roc_curve(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % (roc_auc))
# Diagonal reference line from (0, 0) to (1, 1).
ax.plot([0, 1], [0, 1], color='navy', linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC')
ax.legend(loc="upper left")