In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import norm
import math
from operator import add
import random
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.metrics import roc_auc_score,roc_curve, auc

from ipynb.fs.full.GenerateData import generatedata
from ipynb.fs.full.AnalysisPartFunctions import initialize_parameters,initialize_dicts,calc_prior_mean,calc_post_mean,calc_post_sd,calc_Phi,calc_L,calc_component_derivs,calc_gradients,update_parameters, predict_choice

In [2]:
# Seed every global RNG the notebook may draw from so that Restart & Run All
# reproduces the same numbers. The stdlib seed is the original behaviour; the
# NumPy seed covers helpers that sample through numpy instead.
# NOTE(review): the imported helpers are not visible from this notebook —
# confirm which RNG generatedata / initialize_parameters actually use.
random.seed(110)
np.random.seed(110)

**(a) Generate data**

In [3]:
# Wrap whatever generatedata() (imported from the GenerateData notebook) returns
# in a DataFrame; every later cell works on this frame.
data = pd.DataFrame(generatedata())

In [4]:
data.head()

Unnamed: 0,index,id,name,first,last,sex,race,dob,age,age_cat,...,r_charge_desc,r_jail_in,r_jail_out,is_violent_recid,num_vr_cases,vr_case_number,vr_charge_degree,vr_offense_date,vr_charge_desc,release
0,0,1,miguel hernandez,miguel,hernandez,Male,Other,1947-04-18 00:00:00.000000,69,Greater than 45,...,,,,0,,,,,,1
1,1,2,michael ryan,michael,ryan,Male,Caucasian,1985-02-06 00:00:00.000000,31,25 - 45,...,,,,0,,,,,,0
2,2,3,kevon dixon,kevon,dixon,Male,African-American,1982-01-22 00:00:00.000000,34,25 - 45,...,Felony Battery (Dom Strang),,,1,,13009779CF10A,(F3),2013-07-05 00:00:00.000000,Felony Battery (Dom Strang),1
3,3,4,ed philo,ed,philo,Male,African-American,1991-05-14 00:00:00.000000,24,Less than 25,...,Driving Under The Influence,2013-06-16 09:05:47.000000,2013-06-16 07:18:55.000000,0,,,,,,0
4,4,5,marcu brown,marcu,brown,Male,African-American,1993-01-21 00:00:00.000000,23,Less than 25,...,,,,0,,,,,,0


Check how many defendants were released versus remanded:

In [5]:
data.release.value_counts()

1    6800
0    4942
Name: release, dtype: int64

**(b) Create dummy variables for categorical variables**

In [6]:
# One-hot encode the three categorical columns.
sex_dummies = pd.get_dummies(data['sex'])
race_dummies = pd.get_dummies(data['race'])
age_dummies = pd.get_dummies(data['age_cat'])

# (new column in `data`, indicator frame, category label), listed in the order
# in which the columns are appended to `data`.
dummy_columns = [
    ('sex_1_male', sex_dummies, 'Male'),
    ('African_American', race_dummies, 'African-American'),
    ('Asian', race_dummies, 'Asian'),
    ('Caucasian', race_dummies, 'Caucasian'),
    ('Hispanic', race_dummies, 'Hispanic'),
    ('Native_American', race_dummies, 'Native American'),
    ('race_Other', race_dummies, 'Other'),
    ('Greater_45', age_dummies, 'Greater than 45'),
    ('25_45', age_dummies, '25 - 45'),
    ('Less_25', age_dummies, 'Less than 25'),
]

# Copy each indicator into `data` under an identifier-friendly name.
for new_name, indicator_frame, category in dummy_columns:
    data[new_name] = indicator_frame[category]

In [7]:
data.columns

Index(['index', 'id', 'name', 'first', 'last', 'sex', 'race', 'dob', 'age',
       'age_cat', 'juv_fel_count', 'juv_misd_count', 'juv_other_count',
       'compas_screening_date', 'decile_score', 'score_text', 'violent_recid',
       'priors_count', 'days_b_screening_arrest', 'c_jail_in', 'c_jail_out',
       'c_case_number', 'c_days_from_compas', 'c_arrest_date',
       'c_offense_date', 'c_charge_degree', 'c_charge_desc', 'is_recid',
       'num_r_cases', 'r_case_number', 'r_charge_degree', 'r_days_from_arrest',
       'r_offense_date', 'r_charge_desc', 'r_jail_in', 'r_jail_out',
       'is_violent_recid', 'num_vr_cases', 'vr_case_number',
       'vr_charge_degree', 'vr_offense_date', 'vr_charge_desc', 'release',
       'sex_1_male', 'African_American', 'Asian', 'Caucasian', 'Hispanic',
       'Native_American', 'race_Other', 'Greater_45', '25_45', 'Less_25'],
      dtype='object')

**(c) Split data into training & test**

In [8]:
# Columns handed to the split. 'decile_score' rides along only so that it is
# split with the same rows; it is separated from the features again below.
features = [
    'decile_score',
    'Less_25',
    '25_45',
    'Greater_45',
    'sex_1_male',
    'African_American',
    'Caucasian',
    'is_violent_recid',
    'juv_fel_count',
]
X = data[features]
y = data['release']

# Fixed random_state keeps the partition identical between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=120)

# Pull the risk-assessment score out of both partitions. The training score
# stays a Series (later cells call .tolist() on it); the test score and the
# test labels are converted to plain lists here.
risk_assess_train = X_train['decile_score']
risk_assess_test = X_test['decile_score'].tolist()

X_train = X_train.drop(['decile_score'], axis='columns')
X_test = X_test.drop(['decile_score'], axis='columns')
y_test = y_test.tolist()

Define variables for analysis without penalties

In [9]:
# Starting parameter values for the fit without penalties.
# NOTE(review): the meaning of the positional arguments 1 and 10 is defined in
# AnalysisPartFunctions and is not visible from this notebook — confirm there.
initial_parameters = initialize_parameters(X_train,1,10)
# Containers that the training loop hands to calc_component_derivs and
# calc_gradients and receives back from them on every iteration.
derivatives,grads = initialize_dicts()

In [10]:
initial_parameters

{'B': array([[0.09348775],
        [0.00869119],
        [0.08813534],
        [0.07249234],
        [0.08345957],
        [0.06469335],
        [0.02777332],
        [0.07446234]]),
 'b': 5.530548327250209,
 'q': 0.13408894502472324,
 'sd_prior': 9.887922138029468,
 'tau': 8}

In [11]:
# Working copy of the starting point. `B` is an ndarray, so it is copied rather
# than shared: if a later update modifies the array in place, the values kept
# in `initial_parameters` stay untouched. The remaining entries are plain
# numbers and need no copy.
parameters = {"B": initial_parameters['B'].copy(),
              "b": initial_parameters['b'],
              "q" : initial_parameters['q'],
              "sd_prior" : initial_parameters['sd_prior'],
              "tau" : initial_parameters['tau']}
parameters

{'B': array([[0.09348775],
        [0.00869119],
        [0.08813534],
        [0.07249234],
        [0.08345957],
        [0.06469335],
        [0.02777332],
        [0.07446234]]),
 'b': 5.530548327250209,
 'q': 0.13408894502472324,
 'sd_prior': 9.887922138029468,
 'tau': 8}

In [12]:
# Training inputs and optimiser settings for the fit without penalties.
X = X_train
Y = y_train.tolist()
mu_ra = risk_assess_train.tolist()
num_int = 10          # gradient steps taken for each theta
learning_rate = 0.2
Theta = [1,9]#,3,4,5,6,7,8,9,10]
log_every = 1         # print the loss every `log_every` iterations

# One results record per theta: the test-set AUC and the parameters reached
# after training (zero-filled placeholders until that theta has been fitted).
param_info = {
    th: {
        'auc': [],
        'best_params': {'B': [0]*X.shape[1], 'b': 0, 'q': 0, 'sd_prior': 0, 'tau': 0},
    }
    for th in Theta
}

# learn model
for theta in Theta:

    # Restart every theta from the same initial values. `B` is copied so that
    # an in-place update of the array can never leak from one theta's run into
    # `initial_parameters` (and from there into the next theta's start).
    parameters = {"B": initial_parameters['B'].copy(),
                  "b": initial_parameters['b'],
                  "q" : initial_parameters['q'],
                  "sd_prior" : initial_parameters['sd_prior'],
                  "tau" : initial_parameters['tau']}

    loss = []

    print("params: ",parameters)

    for j in range(num_int):

        # forward pass: prior -> posterior -> choice probability -> loss
        mu_prior = calc_prior_mean(X, parameters)
        mu_post = calc_post_mean(mu_prior, mu_ra, parameters, theta)
        sd_post = calc_post_sd(mu_prior, mu_ra, parameters, theta)
        Phi = calc_Phi(mu_post, sd_post, parameters)
        L = calc_L(Phi, Y)
        loss.append(L)

        # backward pass and parameter update
        derivatives = calc_component_derivs(X,parameters, derivatives, theta, mu_prior,mu_ra,mu_post,sd_post,Phi,Y)
        grads = calc_gradients(X,grads,derivatives)
        parameters = update_parameters(parameters, grads, learning_rate)
        print("intermediate params: ",parameters)

        if j % log_every == 0:
            print("loss: ",L)

    print("optim params: ", parameters)

    # score the held-out data with the parameters reached for this theta
    Phi_test = predict_choice(X_test,risk_assess_test,parameters,theta)

    # Stored under `test_auc` so that the `auc` function imported from
    # sklearn.metrics is not overwritten by a float.
    # NOTE(review): the AUC recorded in the saved output is below 0.5 — confirm
    # that Phi_test is oriented the same way as the 0/1 coding of y_test.
    test_auc = roc_auc_score(y_test,Phi_test)
    param_info[theta]['auc'] = test_auc
    print("auc: ",test_auc)

    # Record the final parameters for this theta (always, not only on an
    # improvement); np.copy gives the record its own `B` array.
    param_info[theta]['best_params'] = {'B': np.copy(parameters['B']),
                                        'b': parameters['b'],
                                        'q': parameters['q'],
                                        'sd_prior': parameters['sd_prior'],
                                        'tau': parameters['tau']}

# The cells below read `best_theta` and `best_params`: pick the theta with the
# highest test AUC (the first one wins a tie).
best_theta = max(Theta, key=lambda th: param_info[th]['auc'])
best_params = param_info[best_theta]['best_params']
        

params:  {'B': array([[0.09348775],
       [0.00869119],
       [0.08813534],
       [0.07249234],
       [0.08345957],
       [0.06469335],
       [0.02777332],
       [0.07446234]]), 'b': 5.530548327250209, 'q': 0.13408894502472324, 'sd_prior': 9.887922138029468, 'tau': 8}
intermediate params:  {'B': array([[0.09356759],
       [0.0088424 ],
       [0.08816986],
       [0.07269148],
       [0.08360894],
       [0.06477751],
       [0.02779451],
       [0.07448078]]), 'b': 5.530813903675095, 'q': 0.17038507489171062, 'sd_prior': 9.889038889512102, 'tau': 7.997752241554821}
loss:  0.9113872050153108
intermediate params:  {'B': array([[0.09364744],
       [0.00899361],
       [0.08820438],
       [0.07289062],
       [0.08375832],
       [0.06486168],
       [0.02781571],
       [0.07449922]]), 'b': 5.531079480099981, 'q': 0.206681204758698, 'sd_prior': 9.890155640994736, 'tau': 7.995504483109642}
loss:  0.8323030398218957
intermediate params:  {'B': array([[0.09372729],
       [0.00914

In [15]:
param_info[1]

{'auc': 0.449609132547013,
 'best_params': {'B': array([[0.09428622],
         [0.01020329],
         [0.08848053],
         [0.07448377],
         [0.08495331],
         [0.06553497],
         [0.02798528],
         [0.07464674]]),
  'b': 5.533204091499071,
  'q': 0.49705024369459705,
  'sd_prior': 9.899089652855809,
  'tau': 7.977522415548208}}

In [None]:
"""
ini_B = initial_parameters['B']
ini_b = initial_parameters['b']
ini_q = initial_parameters['q']
ini_sd = initial_parameters['sd_prior']
ini_tau = initial_parameters['tau']

X = X_train
Y = y_train.tolist()
mu_ra = risk_assess_train.tolist()
num_int = 10
learning_rate = 0.2

Theta = [1,9]#,3,4,5,6,7,8,9,10]

best_auc = 0
best_theta = 0
best_params = {"B": [0]*X.shape[1],
               "b": 0,
               "q" : 0,
               "sd_prior" : 0,
               "tau" : 0}

for theta in Theta: 
    
    # reset the 'parameters' dict to the initial parameters values
    parameters['B'] = ini_B
    parameters['b'] = ini_b
    parameters['q'] = ini_q
    parameters['sd_prior'] = ini_sd
    parameters['tau'] = ini_tau
    
    loss = []
    j = 0

    print("params: ",parameters)
    
    while j < num_int:
        
        mu_prior = calc_prior_mean(X, parameters)
        mu_post = calc_post_mean(mu_prior, mu_ra, parameters, theta)
        sd_post =calc_post_sd(mu_prior, mu_ra, parameters, theta)
        Phi = calc_Phi(mu_post, sd_post, parameters)
        L = calc_L(Phi, Y)
        loss.append(L)

        derivatives = calc_component_derivs(X,parameters, derivatives, theta, mu_prior,mu_ra,mu_post,sd_post,Phi,Y)
        grads = calc_gradients(X,grads,derivatives)
        parameters = update_parameters(parameters, grads, learning_rate)
        print("intermediate params: ",parameters)

        if j%1==0:
            print("loss: ",L)
        j+=1
    
    print("optim params: ", parameters)
    # test these parameters to calculate auc and compare to others
    mu_prior_test = calc_prior_mean(X_test, parameters)
    mu_post_test = calc_post_mean(mu_prior_test, risk_assess_test, parameters, theta)
    sd_post_test = calc_post_sd(mu_prior_test, risk_assess_test, parameters, theta)
    Phi_test = calc_Phi(mu_post_test, sd_post_test, parameters)
    
    # calculate the auc with the converged weights
    auc = roc_auc_score(y_test,Phi_test)
    print("auc: ",auc)
    
    # if there is an improvement in the AUC, update the 'best_params' with those weights
    if auc > best_auc:
        best_params['B'] = parameters['B']
        best_params['b'] = parameters['b']
        best_params['q'] = parameters['q']
        best_params['sd_prior'] = parameters['sd_prior']
        best_params['tau'] = parameters['tau']
        best_auc = auc
        best_theta = theta
"""

In [None]:
theta

In [None]:
best_params

In [None]:
# Score the test set with the fitted parameters of the best-performing theta.
# Both names are derived here from `param_info`, so the cell does not depend on
# any other cell having defined them, and the ROC below describes the trained
# model rather than the untrained `initial_parameters`.
best_theta = max(param_info, key=lambda th: param_info[th]['auc'])
best_params = param_info[best_theta]['best_params']

mu_prior_test = calc_prior_mean(X_test, best_params)
mu_post_test = calc_post_mean(mu_prior_test, risk_assess_test, best_params, best_theta)
sd_post_test = calc_post_sd(mu_prior_test, risk_assess_test, best_params, best_theta)
y_pred = Phi_test = calc_Phi(mu_post_test, sd_post_test, best_params)

In [None]:
# ROC curve and AUC for the test-set scores held in `y_pred`.
# The thresholds get a real name: binding the throwaway `_` in a notebook stops
# IPython from storing the most recent cell output in `_`.
fpr, tpr, roc_thresholds = roc_curve(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)
roc_auc

In [None]:
# Draw the ROC curve against the chance diagonal on an explicit figure/axes pair.
lw = 2
fig, ax = plt.subplots()

ax.plot(fpr, tpr, color='darkorange', lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
ax.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')  # chance line

ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver operating characteristic example')
ax.legend(loc="upper left")

plt.show()

In [None]:
plt.hist(mu_post)

In [None]:
"""
parameters = {'B': array([[0.07177631],
        [0.07214579],
        [0.02697002],
        [0.03161949],
        [0.02467784],
        [0.03341922],
        [0.07176589],
        [0.08871475]]),
 'b': 7.037519546355625,
 'q': 0.30325946457835395,
 'sd_prior': 3.313766807141957,
 'tau': 9}
"""

Define variables for analysis with penalties

In [None]:
# Fresh starting values and gradient containers for the fit with penalties.
# Initialised from the training design matrix — the data the model is fitted
# on, and the same input the section without penalties uses — not the test set.
parameters = initialize_parameters(X_train,1,10)
derivatives,grads = initialize_dicts()

# Training inputs as arrays, plus the optimiser settings for this section.
X = X_train
y = np.array(y_train)
mu_ra = np.array(risk_assess_train)
theta = 5
num_int = 100
learning_rate = 0.001
k = 1  # handed to calc_L / calc_component_derivs below; presumably the penalty weight — TODO confirm

In [None]:
parameters

In [None]:
# Gradient-descent loop for the fit with penalties.
# NOTE(review): this cell appears to target a different version of the helper
# API than the section without penalties: it reads parameters['var_prior'],
# calls calc_post_var (not among the names imported at the top) and passes extra
# arguments to calc_L / calc_gradients. Confirm against AnalysisPartFunctions.
loss = []

for j in range(num_int):

    # current scalar parameters
    var_prior = parameters['var_prior'][0]
    q = parameters['q'][0]
    tau = parameters['tau']

    # forward pass
    mu_prior = calc_prior_mean(X, parameters)[0]
    mu_post = calc_post_mean(mu_prior, mu_ra, q, theta)
    var_post = calc_post_var(mu_prior, mu_ra, var_prior, q, theta)
    Phi = calc_Phi(mu_post, var_post, tau)
    L = calc_L(Phi, y, mu_prior, k)
    loss.append(np.sum(L))

    # Print the un-summed L every fifth iteration. The modulo operator is a
    # single `%`; a doubled `%%` is a syntax error in Python.
    if j % 5 == 0:
        print(L)

    # backward pass and parameter update
    derivatives = calc_component_derivs(X, y, parameters, derivatives, theta, mu_prior, mu_ra, mu_post, var_post, Phi, k)
    grads = calc_gradients(grads, derivatives, mu_prior, X)
    parameters = update_parameters(parameters, grads, learning_rate)
        


In [None]:
loss

In [None]:
# One manual optimisation step: a forward pass, the summed loss, then a single
# parameter update — handy for inspecting the intermediate values.
loss = []

# current scalar parameters
var_prior, q, tau = parameters['var_prior'][0], parameters['q'][0], parameters['tau']

# forward pass
mu_prior = calc_prior_mean(X, parameters)[0]
mu_post = calc_post_mean(mu_prior, mu_ra, q, theta)
var_post = calc_post_var(mu_prior, mu_ra, var_prior, q, theta)
Phi = calc_Phi(mu_post, var_post, tau)
L = calc_L(Phi, y, mu_prior, k)
loss += [np.sum(L)]

# backward pass and update
derivatives = calc_component_derivs(
    X, y, parameters, derivatives, theta, mu_prior, mu_ra, mu_post, var_post, Phi, k
)
grads = calc_gradients(grads, derivatives, mu_prior, X)
parameters = update_parameters(parameters, grads, learning_rate)

In [None]:
# Dump the current parameter values, the loss history and the value ranges of
# the main intermediate quantities.
for label, key in (("B: ", 'B'),
                   ("b: ", 'b'),
                   ("q: ", 'q'),
                   ("var_prior: ", 'var_prior'),
                   ("tau: ", 'tau')):
    print(label, parameters[key])

print("loss: ", loss)

for label, values in (("mu_prior: ", mu_prior),
                      ("mu_post: ", mu_post),
                      ("Phi: ", Phi)):
    print(label, np.min(values), "to", np.max(values))

In [None]:
parameters

In [None]:
# Pull the current scalar parameters out of the dict for the forward pass below.
var_prior, q, tau = (
    parameters['var_prior'][0],
    parameters['q'][0],
    parameters['tau'],
)

In [None]:
# Forward pass with the current parameters; each result feeds the next call.
# NOTE(review): calc_post_var is not among the names imported at the top of the
# notebook — confirm it exists in AnalysisPartFunctions.
mu_prior = calc_prior_mean(X, parameters)[0]
mu_post = calc_post_mean(mu_prior, mu_ra, q, theta)
var_post = calc_post_var(mu_prior, mu_ra, var_prior, q, theta)
Phi = calc_Phi(mu_post, var_post, tau)
L = calc_L(Phi, y, mu_prior, k)

In [None]:
mu_prior

In [None]:
# Scratch check that rebuilds a loss-like quantity by hand from mu_prior and Phi.
# NOTE(review): test1 uses a literal factor of 10 while k is set to 1 elsewhere,
# and test2 mixes `y_train` with the array `y` — confirm both are intended.
test1 = 10*((1-mu_prior)**3)
test2 = y_train*np.log(1-Phi)+(1-y)*np.log(Phi)
test = test1 + test2
test

In [None]:
# Record the summed loss of the forward pass, then take one update step.
loss.append(np.sum(L))
        
# derivatives and grads are passed in and reassigned from the helpers' return values.
derivatives = calc_component_derivs(X, y, parameters, derivatives, theta, mu_prior, mu_ra, mu_post, var_post, Phi, k)
grads = calc_gradients(grads, derivatives, mu_prior, X)
parameters = update_parameters(parameters, grads, learning_rate)

In [None]:
parameters

In [None]:
derivatives

In [None]:
"""
# Predict test/train set examples 
#y_pred_train = predict(parameters,X_train,y_train,risk_assess_train,theta=5)
#y_pred_test = predict(parameters,X_test,y_test,risk_assess_test,theta=5)

# Print train/test Errors
# general accuracy
print("train accuracy: {} %".format(100 - np.mean(np.abs(y_pred_train - y_train)) * 100))
print("test accuracy: {} %".format(100 - np.mean(np.abs(y_pred_test - y_test)) * 100))

# confusion matrix
confmat = []

for true,pred in zip(y_train,y_pred_train):
    if(true==1 and pred==1):
        confmat.append("tp")
    elif(true==1 and pred==0):
        confmat.append("fn")
    elif(true==0 and pred==0):
        confmat.append("tn")
    else:
        confmat.append(fp)

print("true positives: ",confmat.count('tp'))
print("false positives: ",confmat.count('fp'))
print("true negatives: ",confmat.count('tn'))
print("false negatives: ",confmat.count('fn'))

if (confmat.count('tp')+confmat.count('fn'))>0:
    print("sensitivity/recall: ",confmat.count('tp')/(confmat.count('tp')+confmat.count('fn')))
else:
    print("sensitivity/recall: N/A")
    
if (confmat.count('tn')+confmat.count('fp'))>0:
    print("specificity: ",confmat.count('tn')/(confmat.count('tn')+confmat.count('fp')))
else:
    print("specificity: N/A")
    
if (confmat.count('tp')+confmat.count('fp'))>0:
    print("precision: ",confmat.count('tp')/(confmat.count('tp')+confmat.count('fp')))
else:
    print("precision: N/A")
    
model_dict = {"nLL": L,
              "y_pred_test": y_pred_test,
              "y_pred_train" : y_pred_train,
              "parameters" : parameters
              "learning_rate" : learning_rate,
              "num_iterations": num_iterations}

#return model_dict
"""

In [None]:
def training_model(X,y,parameters,mu_ra,theta,num_int,learning_rate,k):
    """Run `num_int` gradient-descent iterations and return the training trace.

    Parameters
    ----------
    X : feature data passed to calc_prior_mean and calc_gradients.
    y : observed outcomes; converted to a NumPy array.
    parameters : dict read for 'var_prior', 'q' and 'tau' and handed to the
        calc_* helpers; replaced by the result of update_parameters each step.
    mu_ra : risk-assessment values; converted to a NumPy array.
    theta : passed through to the calc_* helpers.
    num_int : number of iterations to run.
    learning_rate : step size handed to update_parameters.
    k : passed through to calc_L and calc_component_derivs.

    Returns
    -------
    (loss, parameters, derivatives, grads)
        `loss` holds np.sum(L) for every iteration; `derivatives` and `grads`
        come from the final iteration and are None when no iteration ran.

    NOTE(review): calc_post_var is not among the names imported at the top of
    the notebook, and calc_component_derivs / calc_gradients are called here
    with different argument lists than in the cell-level loops — confirm
    against AnalysisPartFunctions.
    """
    mu_ra = np.array(mu_ra)
    y = np.array(y)
    loss = []

    # Bound before the loop so that num_int == 0 returns cleanly instead of
    # raising UnboundLocalError at the return statement.
    derivatives = None
    grads = None

    j = 0
    while j < num_int:

        # current scalar parameters
        var_prior = parameters['var_prior'][0]
        q = parameters['q'][0]
        tau = parameters['tau']

        # forward pass
        mu_prior = calc_prior_mean(X, parameters)[0]
        mu_post = calc_post_mean(mu_prior, mu_ra, q, theta)
        var_post = calc_post_var(mu_prior, mu_ra, var_prior, q, theta)
        Phi = calc_Phi(mu_post, var_post, tau)
        L = calc_L(Phi, y, mu_prior, k)
        loss.append(np.sum(L))

        # backward pass and parameter update
        derivatives = calc_component_derivs(X, parameters, theta, mu_prior, mu_ra, mu_post, var_post, Phi, k)
        grads = calc_gradients(derivatives, mu_prior, X)
        parameters = update_parameters(parameters, grads, learning_rate)
        j += 1

    return loss, parameters, derivatives, grads

In [None]:
def predict(parameters,X,y,risk_assess,theta):
    """Return hard 0/1 predictions by thresholding Phi at 0.5.

    The prior mean is computed from `X`, combined with the risk-assessment
    values into a posterior mean and variance, and turned into Phi by calc_Phi.
    An entry of Phi strictly greater than 0.5 maps to 1, anything else to 0.

    `y` is accepted but not used by this function.
    """
    tau = parameters['tau']
    q = parameters['q']
    mu_ra = np.array(risk_assess)

    mu_prior = calc_prior_mean(X,parameters)[0]
    var_prior = parameters['var_prior']

    posterior_mean = calc_post_mean(mu_prior, mu_ra, q, theta)
    posterior_var = calc_post_var(mu_prior, mu_ra, var_prior, q, theta)
    Phi = calc_Phi(posterior_mean,posterior_var,tau)

    # Threshold one entry at a time, indexing Phi by position 0..len(Phi)-1.
    y_pred = []
    for i in range(len(Phi)):
        if Phi[i]>0.5:
            y_pred.append(1)
        else:
            y_pred.append(0)

    return y_pred