In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.linear_model import LogisticRegression
import random
from itertools import chain
from collections import Counter
import numpy as np
import scipy

from ipynb.fs.full.GenerateData import generatedata


**0) Define variables**


 $i \in I$ Individual defendants
 
 $x_{1,i}...x_{n,i}$  Features of individual defendants
 
 $j \in J$  Individual judges
 
 $P_{prior,j,i}$  Judge $j$'s prediction probability prior on defendant $i$
 
 $P_{algo,i}$  Risk assessment algorithm's prediction probability on defendant $i$
 
 $P_{post,j,i}$  Judge $j$'s prediction posterior probability on defendant $i$
 
 $w_{j}$  Anchoring effect scaling for a judge $j$
 
 $\theta_{j,i}$  Judge $j$'s threshold for releasing defendant $i$
 
 $y_{j,i}$  Known decision of judge $j$ to release defendant $i$ on bail
 
 $J(\beta_{0}, \beta_{1}, \beta_{2}, w)$  Cost function
 
 $P_{priorodds,j,i}$  Judge $j$'s posterior log odds for defendant $i$
 
 $\delta$  Threshold between $P_{prior}$ and $P_{algo}$



**1) Prepare the data for analysis**

**(a) Load data**

In [2]:
data = generatedata()

In [3]:
data.head()

Unnamed: 0,index,id,name,first,last,sex,race,dob,age,age_cat,...,r_charge_desc,r_jail_in,r_jail_out,is_violent_recid,num_vr_cases,vr_case_number,vr_charge_degree,vr_offense_date,vr_charge_desc,release
0,0,1,miguel hernandez,miguel,hernandez,Male,Other,1947-04-18 00:00:00.000000,69,Greater than 45,...,,,,0,,,,,,1
1,1,2,michael ryan,michael,ryan,Male,Caucasian,1985-02-06 00:00:00.000000,31,25 - 45,...,,,,0,,,,,,0
2,2,3,kevon dixon,kevon,dixon,Male,African-American,1982-01-22 00:00:00.000000,34,25 - 45,...,Felony Battery (Dom Strang),,,1,,13009779CF10A,(F3),2013-07-05 00:00:00.000000,Felony Battery (Dom Strang),1
3,3,4,ed philo,ed,philo,Male,African-American,1991-05-14 00:00:00.000000,24,Less than 25,...,Driving Under The Influence,2013-06-16 09:05:47.000000,2013-06-16 07:18:55.000000,0,,,,,,1
4,4,5,marcu brown,marcu,brown,Male,African-American,1993-01-21 00:00:00.000000,23,Less than 25,...,,,,0,,,,,,0


In [4]:
# check how many released/remanded
data.release.value_counts()

1    6742
0    5000
Name: release, dtype: int64

In [5]:
# check how many released/remanded by decile_score
pd.crosstab(data.decile_score, data.release)

release,0,1
decile_score,Unnamed: 1_level_1,Unnamed: 2_level_1
1,231,2346
2,151,1421
3,125,1134
4,627,572
5,539,495
6,515,478
7,821,79
8,727,69
9,717,85
10,547,63


**(b) Create dummy variables for categorical variables**

In [6]:
# create dummy variables
sex_dummies = pd.get_dummies(data['sex'])
race_dummies = pd.get_dummies(data['race'])

data['sex_1_male'] = sex_dummies['Male']
data['African_American']=race_dummies['African-American']
data['Asian']=race_dummies['Asian']
data['Caucasian']=race_dummies['Caucasian']
data['Hispanic'] = race_dummies['Hispanic']
data['Native_American']=race_dummies['Native American']
data['Other']=race_dummies['Other']

data.head()

Unnamed: 0,index,id,name,first,last,sex,race,dob,age,age_cat,...,vr_offense_date,vr_charge_desc,release,sex_1_male,African_American,Asian,Caucasian,Hispanic,Native_American,Other
0,0,1,miguel hernandez,miguel,hernandez,Male,Other,1947-04-18 00:00:00.000000,69,Greater than 45,...,,,1,1,0,0,0,0,0,1
1,1,2,michael ryan,michael,ryan,Male,Caucasian,1985-02-06 00:00:00.000000,31,25 - 45,...,,,0,1,0,0,1,0,0,0
2,2,3,kevon dixon,kevon,dixon,Male,African-American,1982-01-22 00:00:00.000000,34,25 - 45,...,2013-07-05 00:00:00.000000,Felony Battery (Dom Strang),1,1,1,0,0,0,0,0
3,3,4,ed philo,ed,philo,Male,African-American,1991-05-14 00:00:00.000000,24,Less than 25,...,,,1,1,1,0,0,0,0,0
4,4,5,marcu brown,marcu,brown,Male,African-American,1993-01-21 00:00:00.000000,23,Less than 25,...,,,0,1,1,0,0,0,0,0


**(c) Split data into training & test**

In [7]:
# select out relevant data & generate the train and test data

data = data[['release','decile_score','age','is_violent_recid']] # NOTE::: This is just a subset of features we would examine in reality

# train / test
X = data.drop(['release'],axis=1)
y = data['release']
class_names = data.release.unique()

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=212)

P_alg_train = X_train['decile_score']/10
P_alg_test = X_test['decile_score']/10

X_train = X_train.drop(['decile_score'],axis=1)
X_test = X_test.drop(['decile_score'],axis=1)

**2) Helper functions**

**(a) Initialize parameters**

$\beta_{1}$, $\beta_{2}$, and $w$ are initialized as random numbers;
$b$ is initialized as zeros

In [8]:
def initialize_parameters(num_params):
    """
    Argument:
    num_params -- the number of parameters in the training data
    
    Returns:
    params -- python dictionary containing model parameters:
    B -- coefficient matrix
    b -- constant
    w -- constant
    """
    
    np.random.seed(212) 
    
    B = np.random.randn(num_params,1)*0.01 # randomly initialize the B_0, B_1 coefficients
    b = 0 # intialize  constants w,b @ 0
    w = 0

    
    parameters = {"B": B,
                  "b": b,
                  "w" : w}
    
    return parameters

In [9]:
parameters = initialize_parameters(2)
parameters

{'B': array([[ 0.00229347],
        [-0.00148797]]), 'b': 0, 'w': 0}

**(b) Convert prior log odds to probabilities**

Judges prior beliefs about defendant, $P_{priorodds,j,i}$, are converted to probabilities usings sigmoid function.

\begin{equation}
P_{priorodds,j,i} = \beta_{0,j}+\beta_{1,j}x_{1,i}+\beta_{2,j}x_{2,i}
\end{equation}

\begin{equation}
P_{prior,j,i} = \sigma (P_{priorodds,j,i})
\end{equation}

In [10]:
def logit(z):
    """
    Compute the sigmoid of z

    Arguments:
    z -- A scalar or numpy array 

    Return:
    s -- sigmoid(z)
    """

    s = np.exp(-z)/(1+np.exp(-z))
    
    return s

**3) Calculate loss**

Given a set {$\beta_0, \beta_1, \beta_2, w$}, want to calculate loss, $\mathcal{L}(P_{post,j,i},y_i)$ using the following steps:

\begin{equation}
P_{priorodds,j,i} = \beta_{0,j}+\beta_{1,j}x_{1,i}+\beta_{2,j}x_{2,j}
\end{equation}

\begin{equation}
P_{prior,j,i} = \sigma(P_{priorodds,j,i})
\end{equation}

\begin{equation}
P_{post,j,i} = \left \{
	\begin{array}{ll}
	wP_{algo,i}+(1-w)P_{prior,j,i} & |P_{prior,j,i}-P_{algo,i}| \leq \delta \\
	P_{prior,j,i} & otherwise
        \end{array}
    \right.
\end{equation}

\begin{equation}
\mathcal{L}(P_{post,j,i}, y_i) = - (y_i \log P_{post,j,i} - (1-y_i) \log (1-P_{post,j,i}))
\end{equation}

\begin{equation}
J_{\{\beta_0,\beta_1,\beta_2,w\}} = \frac{1}{||I_j||}\sum \mathcal{L}(P_{post,j,i},y_i)
\end{equation}

In [78]:
def forward(X, parameters, P_algo, delta):
    """
    Argument:
    X -- input data of size (# obs, # params)
    parameters -- python dictionary containing your parameters (output of initialization function)
    delta -- threshold for difference betwen P_prior & P_algo
    
    Returns:
    P_post -- The sigmoid output 
    cache -- a dictionary containing P_priorodds,P_prior,P_post,P_alg_train
    """
    # Retrieve each parameter from the dictionary "parameters"
    B = parameters['B']
    b = parameters['b']
    w = parameters['w']
    
    # Calculate P_post (probabilities) using forward propagation
    P_priorodds = np.dot(B.T,X.T)+b
    P_prior = logit(P_priorodds) # transform the log odds to probabilities
    P_post = [(w*P_algo.iloc[i]) + ((1-w)*P_prior[0][i])
              if abs(P_prior[0][i]-P_algo.iloc[i]) <= delta
              else P_prior[0][i]
              for i in range(P_prior.shape[1])] # Caluclate P_post as a scaled P_prior

    for i in range(P_prior.shape[1]):
        if P_prior[0][i] < 0:
            P_prior[0][i] = 0

    for i in range(len(P_post)):
        if P_post[i] < 0:
            P_post[i] = 0
    
    cache = {"P_priorodds": P_priorodds,
             "P_prior": P_prior,
             "P_post": P_post,
             "P_algo": P_algo}
    
    return P_post, cache

In [79]:
P_post, cache = forward(X_train, parameters, P_alg_train, delta=2)
P_post

[1.1548043975285174,
 1.1548043975285174,
 1.1354538478374527,
 1.1354538478374527,
 1.0387010993821237,
 1.0,
 1.174154947219582,
 1.174154947219582,
 1.1548043975285174,
 1.1548043975285174,
 1.0774021987642575,
 1.0774021987642588,
 1.0774021987642588,
 1.0387010993821293,
 1.1548043975285172,
 1.1354538478374527,
 1.1354538478374527,
 1.1354538478374527,
 1.0,
 1.174154947219582,
 1.0967527484553234,
 1.0193505496910646,
 1.0774021987642532,
 1.1548043975285174,
 1.174154947219582,
 1.1354538478374527,
 1.0774021987642588,
 1.174154947219582,
 1.174154947219582,
 1.1354538478374527,
 1.0193505496910646,
 1.0387010993821046,
 1.1354538478374527,
 1.0387010993821293,
 1.116103298146388,
 1.0967527484552126,
 1.1548043975285174,
 1.174154947219582,
 1.174154947219582,
 1.1548043975285174,
 1.038701099382129,
 1.174154947219582,
 1.0580516490731928,
 1.0967527484553234,
 1.1354538478374527,
 1.116103298146388,
 1.0967527484553234,
 1.1354538478374527,
 1.174154947219582,
 1.09675274845

In [83]:
max(P_post)

1.174154947219582

In [80]:
def compute_cost(P_post, Y, parameters):
    """   
    Arguments:
    P_post -- The scaled sigmoid output (probability)
    Y -- "true" labels 
    parameters -- python dictionary containing parameters B, b, w
    
    Returns:
    cost -- cross-entropy cost
    
    """

    # Compute the cross-entropy cost
    
    #### NEED TO CONSTRAIN TO [0,1] SOMEHOW - THROUGH NORMALIZATION? 
    """
    y_train*np.log(P_post)
    
    """
    part1 = [a*b for a,b in zip(np.log(P_post),pd.Series.tolist(Y))]
    part2 = [a*b for a,b in zip(np.log(np.subtract(1,P_post)),1-Y)]
    logprobs = -np.subtract(part1,part2)
    cost = np.sum(logprobs)/Y.shape[0] # sum and divide by size of sample to produce cost
    
    cost = float(np.squeeze(cost))  
    assert(isinstance(cost, float))

    return cost

In [84]:
y_train*np.log(P_post)

1434     1.439310e-01
4425     1.439310e-01
37       1.270324e-01
19       1.270324e-01
929      0.000000e+00
9364     0.000000e+00
191      0.000000e+00
2123     1.605487e-01
10643    1.439310e-01
1755     0.000000e+00
4588     7.455277e-02
6520     7.455277e-02
1074     0.000000e+00
4794     3.797099e-02
1706     1.439310e-01
1610     1.270324e-01
5937     1.270324e-01
1177     0.000000e+00
6895     0.000000e+00
8568     1.605487e-01
8635     9.235377e-02
1717     0.000000e+00
4582     0.000000e+00
2469     1.439310e-01
1578     1.605487e-01
11408    1.270324e-01
1433     7.455277e-02
883      1.605487e-01
2686     1.605487e-01
4704     1.270324e-01
             ...     
4194     7.455277e-02
7982     0.000000e+00
6623     0.000000e+00
8908     1.605487e-01
11597    0.000000e+00
6104     1.439310e-01
5101     0.000000e+00
10974    1.439310e-01
1428     0.000000e+00
6920     1.605487e-01
4956     1.605487e-01
4523     1.098434e-01
2212     1.605487e-01
3922     1.605487e-01
6585     9

In [81]:
cost = compute_cost(P_post,y_train,parameters)
cost



nan

**4) Calculate parameter gradients using gradient descent**

Given $\mathcal{L}(P_{post,j,i},y_i)$ calculate the gradients $\frac{d\mathcal{L}}{d\beta_0}$, $\frac{d\mathcal{L}}{d\beta}$, $\frac{d\mathcal{L}}{dw}$ using the following:

\begin{equation}
\frac{d\mathcal{L}}{d\beta} = \frac{d\mathcal{L}}{dP_{post}}\frac{dP_{post}}{dP_{prior}}\frac{dP_{prior}}{dP_{priorodds}}\frac{dP_{priorodds}}{d\beta}
\end{equation}

\begin{equation}
\frac{d\mathcal{L}}{dw} = \frac{d\mathcal{L}}{dP_{post}}\frac{dP_{post}}{dw}
\end{equation}

\begin{equation}
\frac{d\mathcal{L}}{d\beta_0} = \frac{d\mathcal{L}}{dP_{post}}\frac{dP_{post}}{dP_{prior}}\frac{dP_{prior}}{dP_{priorodds}}\frac{dP_{priorodds}}{d\beta_0}
\end{equation}


Where:

\begin{equation}
\frac{d\mathcal{L}}{dP_{post}} = -\log(1-y)-\log(y)
\end{equation}

\begin{equation}
\frac{dP_{post}}{dP_{prior}} = \left \{
	\begin{array}{ll}
	1-w & |P_{prior,j,i}-P_{algo,i}| \leq \delta \\
	1 & otherwise
        \end{array}
    \right.
\end{equation}

\begin{equation}
\frac{dP_{post}}{dw} = \left \{
	\begin{array}{ll}
	P_{algo,i} - P_{post,j,i} & |P_{prior,j,i}-P_{algo,i}| \leq \delta \\
	0 & otherwise
        \end{array}
    \right.
\end{equation}

\begin{equation}
\frac{dP_{prior}}{dP_{priorodds}} = P_{prior}(1-P_{prior})
\end{equation}

\begin{equation}
\frac{dP_{priorodds}}{d\beta_0} = 1
\end{equation}

\begin{equation}
\frac{dP_{priorodds}}{d\beta_1} = x_1
\end{equation}

\begin{equation}
\frac{dP_{priorodds}}{d\beta_2} = x_2
\end{equation}

In [73]:
def backward(parameters, cache, X, Y, delta):
    """
    Arguments:
    parameters -- dictionary containing parameters 
    cache -- dictionary containing P_priorodds, P_prior, P_post, and P_algo
    X -- input data 
    Y -- labels 
    
    Returns:
    grads -- python dictionary containing gradients with respect to various parameters
    """
    # Retrieve parameters from dictionary "parameters".
    w = parameters['w']
        
    # set the number of records
    m=X_train.shape[0]
    
    # Retrieve from dictionary "cache".
    P_post = pd.Series(cache['P_post'])
    P_priorodds = cache['P_priorodds']
    P_prior = cache["P_prior"]
    P_algo = cache["P_algo"]
                
    # intermediate derivatives
    
    # dL / dPost
    dJdPost = [(P_post[i]-y_train.iloc[i])/(P_post[i]*(1-P_post[i])) for i in range(len(P_post))]
    
    # dPost / dw
    dPostdw = [P_algo.iloc[i] - P_post[i] if abs(P_prior[0][i]-P_algo.iloc[i])<=delta else float(0) for i in range(P_prior.shape[1])]
    
    # dPost / dPrior
    dPostdPrior = [1-w if abs(P_prior[0][i]-P_algo.iloc[i])<=delta else float(1) for i in range(P_prior.shape[1])]
        
    # dPrior / dPriorOdds
    dPriordPriorOdds = P_prior*np.subtract(1,P_prior)
    
    # Calculate dLdb, dLdB, dLdw 
    dJdw = np.sum(np.multiply(dJdPost,dPostdw))/m
    
    chain_rule = np.multiply(np.multiply(dJdPost,dPostdPrior),dPriordPriorOdds) # other 
    dJdB = np.dot(X.T,(1/m)*chain_rule.T)
    dJdb = (1/m)*np.sum(chain_rule)
    
    # store gradients
    grads = {"dB": dJdB,
             "dw": dJdw,
             "db": dJdb,}
    
    return grads
    

In [74]:
grads = backward(parameters, cache, X_train, y_train, delta=2)
grads

{'dB': array([[-2.99371761],
        [ 0.01025872]]), 'dw': 0.3870109938212932, 'db': -0.05144248139938843}

**5) Update parameters with gradients**

$\beta = \beta - \alpha \frac{d\mathcal{J}}{d\beta}$

$w = w - \alpha \frac{d\mathcal{J}}{dw}$

$\beta_0 = \beta_0 - \alpha \frac{d\mathcal{J}}{d\beta_0}$

In [75]:
def update_parameters(parameters, grads, learning_rate):
    """
    Arguments:
    parameters -- dictionary containing parameters 
    grads -- dictionary containing gradients 
    learning rate
    
    Returns:
    parameters -- dictionary containing updated parameters 
    """
    # Retrieve parameters from dict "parameters"
    B = parameters['B']
    b = parameters['b']
    w = parameters['w']
    
    
    # Retrieve each gradient from the dictionary "grads"
    dB = grads['dB']
    db = grads['db']
    dw = grads['dw']
    
    # Update rule for each parameter
    b = b - (learning_rate*db)
    B = B - (learning_rate*dB)
    w = w - (learning_rate*dw)
    
    parameters = {"B": B,
                  "b": b,
                  "w": w}
    
    return parameters

In [76]:
parameters = update_parameters(parameters,grads,learning_rate = 0.5)

In [77]:
parameters

{'B': array([[ 1.49915228],
        [-0.00661733]]), 'b': 0.025721240699694215, 'w': -0.1935054969106466}

In [None]:
def model(X_train, Y_train, X_test, Y_test, P_algo, delta, num_iterations = 2000, learning_rate = 0.5, print_cost = True):
    """
    Arguments:
    X_train -- training set data
    Y_train -- training labels
    X_test -- test set 
    Y_test -- test labels 
    delta -- anchoring effect threshold
    num_iterations -- hyperparameter representing the number of iterations to optimize the parameters
    learning_rate -- hyperparameter representing the learning rate used in the update rule of optimize()
    print_cost -- 
    
    Returns:
    d -- dictionary containing information about model
    """
    
    cost = []
    
    # initialize parameters
    parameters = initialize_parameters(X_train.shape[1])
        
    for i in range(num_iterations):

        # Gradient descent 
        P_post, cache = forward(X_train, parameters, P_algo, delta)
        cost = compute_cost(P_post, Y_train, parameters)
        grads = backward(parameters, cache, X_train, Y_train, delta)
        parameters = update_parameters(parameters, grads, learning_rate)

        # Retrieve parameters from dictionary "parameters"
        B = parameters["B"]
        b = parameters["b"]
        w = parameters["w"]
        
        if i % 10 == 0:
            print ("Cost after iteration %i: %f" %(i, cost))
            
    return parameters

"""
    # Predict test/train set examples 
    Y_prediction_test = predict(w,b,X_test)
    Y_prediction_train = predict(w,b,X_train)

    # Print train/test Errors
    print("train accuracy: {} %".format(100 - np.mean(np.abs(Y_prediction_train - Y_train)) * 100))
    print("test accuracy: {} %".format(100 - np.mean(np.abs(Y_prediction_test - Y_test)) * 100))

    
    model_dict = {"costs": costs,
         "Y_prediction_test": Y_prediction_test, 
         "Y_prediction_train" : Y_prediction_train, 
         "B1" : B1,
         "B2" : B2,         
         "b" : b, 
         "w" : w,
         "learning_rate" : learning_rate,
         "num_iterations": num_iterations}
    
    return model_dict
"""   

In [None]:
parameters = model(X_train, y_train, X_test, y_test, P_alg_train, delta=2, num_iterations = 2000, learning_rate = 0.5)

In [None]:
parameters