In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.linear_model import LogisticRegression
import random
from itertools import chain
from collections import Counter
import numpy as np
import scipy

from ipynb.fs.full.GenerateData import generatedata


**0) Define variables**


$i$  Individual defendants
 
$x_{1,i}...x_{n,i} \in X_{i}$  Features of individual defendants
 
$j$  Individual judges
 
$\Theta$  Space of possible predictions made by judge on defendant
 
$\mu$  Mean prior predictoin on defendant by judge; most probable prediction
 
$\sigma$  Standard deviation of prior predictions on a defenant by a judge; represents a judge's. uncertainty about prediction
 
$\alpha$  Algorithmic risk assessment prediction; mean of normappy-distributed perceived anchor information

$\lambda$  Perceived anchor influence; Corresponds to S.D. of risk assessment (when perceived as containing a lot of information S.D. will be lower, and vice versa)

$w, c$  Parameters of $\lambda$ when mapping onto $z_{\alpha}$
  
$z_{\alpha}$  $z$-score to measure distance between $\mu$ and $\alpha$ 

$z$  Threshold distance between $\mu$ and $\alpha$ after which $\alpha$ has no or minimal effect on $\mu$

$\tau$  Decision-making threshold

$y_1$ An estimate of a judge's posterior belief on a defendant where $y_1$ is drawn from $Post(\mu, \sigma, \alpha, \lambda)$

$\hat{y}$  Probability of drawing a posterior greater than threshold $\tau$

**1) Prepare the data for analysis**

**(a) Load data**

In [2]:
data = generatedata()

In [3]:
data.head()

Unnamed: 0,index,id,name,first,last,sex,race,dob,age,age_cat,...,r_charge_desc,r_jail_in,r_jail_out,is_violent_recid,num_vr_cases,vr_case_number,vr_charge_degree,vr_offense_date,vr_charge_desc,release
0,0,1,miguel hernandez,miguel,hernandez,Male,Other,1947-04-18 00:00:00.000000,69,Greater than 45,...,,,,0,,,,,,1
1,1,2,michael ryan,michael,ryan,Male,Caucasian,1985-02-06 00:00:00.000000,31,25 - 45,...,,,,0,,,,,,0
2,2,3,kevon dixon,kevon,dixon,Male,African-American,1982-01-22 00:00:00.000000,34,25 - 45,...,Felony Battery (Dom Strang),,,1,,13009779CF10A,(F3),2013-07-05 00:00:00.000000,Felony Battery (Dom Strang),1
3,3,4,ed philo,ed,philo,Male,African-American,1991-05-14 00:00:00.000000,24,Less than 25,...,Driving Under The Influence,2013-06-16 09:05:47.000000,2013-06-16 07:18:55.000000,0,,,,,,0
4,4,5,marcu brown,marcu,brown,Male,African-American,1993-01-21 00:00:00.000000,23,Less than 25,...,,,,0,,,,,,0


In [4]:
data.columns

Index(['index', 'id', 'name', 'first', 'last', 'sex', 'race', 'dob', 'age',
       'age_cat', 'juv_fel_count', 'juv_misd_count', 'juv_other_count',
       'compas_screening_date', 'decile_score', 'score_text', 'violent_recid',
       'priors_count', 'days_b_screening_arrest', 'c_jail_in', 'c_jail_out',
       'c_case_number', 'c_days_from_compas', 'c_arrest_date',
       'c_offense_date', 'c_charge_degree', 'c_charge_desc', 'is_recid',
       'num_r_cases', 'r_case_number', 'r_charge_degree', 'r_days_from_arrest',
       'r_offense_date', 'r_charge_desc', 'r_jail_in', 'r_jail_out',
       'is_violent_recid', 'num_vr_cases', 'vr_case_number',
       'vr_charge_degree', 'vr_offense_date', 'vr_charge_desc', 'release'],
      dtype='object')

In [12]:
#casenum = data.c_case_number.unique()
#f = open("unique_casenumbers.csv", "w")
#f.write(casenum)

In [8]:
# check how many released/remanded
data.release.value_counts()

1    6760
0    4982
Name: release, dtype: int64

In [9]:
# check how many released/remanded by decile_score
pd.crosstab(data.decile_score, data.release)

release,0,1
decile_score,Unnamed: 1_level_1,Unnamed: 2_level_1
1,266,2311
2,160,1412
3,127,1132
4,581,618
5,522,512
6,524,469
7,826,74
8,714,82
9,708,94
10,554,56


**(b) Create dummy variables for categorical variables**

In [10]:
# create dummy variables
sex_dummies = pd.get_dummies(data['sex'])
race_dummies = pd.get_dummies(data['race'])

data['sex_1_male'] = sex_dummies['Male']
data['African_American']=race_dummies['African-American']
data['Asian']=race_dummies['Asian']
data['Caucasian']=race_dummies['Caucasian']
data['Hispanic'] = race_dummies['Hispanic']
data['Native_American']=race_dummies['Native American']
data['Other']=race_dummies['Other']

data.head()

Unnamed: 0,index,id,name,first,last,sex,race,dob,age,age_cat,...,vr_offense_date,vr_charge_desc,release,sex_1_male,African_American,Asian,Caucasian,Hispanic,Native_American,Other
0,0,1,miguel hernandez,miguel,hernandez,Male,Other,1947-04-18 00:00:00.000000,69,Greater than 45,...,,,1,1,0,0,0,0,0,1
1,1,2,michael ryan,michael,ryan,Male,Caucasian,1985-02-06 00:00:00.000000,31,25 - 45,...,,,0,1,0,0,1,0,0,0
2,2,3,kevon dixon,kevon,dixon,Male,African-American,1982-01-22 00:00:00.000000,34,25 - 45,...,2013-07-05 00:00:00.000000,Felony Battery (Dom Strang),1,1,1,0,0,0,0,0
3,3,4,ed philo,ed,philo,Male,African-American,1991-05-14 00:00:00.000000,24,Less than 25,...,,,0,1,1,0,0,0,0,0
4,4,5,marcu brown,marcu,brown,Male,African-American,1993-01-21 00:00:00.000000,23,Less than 25,...,,,0,1,1,0,0,0,0,0


**(c) Split data into training & test**

In [11]:
# select out relevant data & generate the train and test data

data = data[['release','decile_score','age','is_violent_recid']] # NOTE::: This is just a subset of features we would examine in reality

# train / test
X = data.drop(['release'],axis=1)
y = data['release']
class_names = data.release.unique()

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=212)

R_train = X_train['decile_score']/10
R_test = X_test['decile_score']/10

X_train = X_train.drop(['decile_score'],axis=1)
X_test = X_test.drop(['decile_score'],axis=1)

**2) Helper functions**

**(a) Initialize parameters**

$\beta_{1}$, $\beta_{2}$, and $w$ are randomly initialized (>0);
$b$ and $c$ are initialized as zero

In [22]:
def initialize_parameters(num_params):
    """
    Argument:
    num_params -- the number of parameters in the training data
    
    Returns:
    params -- python dictionary containing model parameters:
    B -- coefficient matrix
    b -- constant
    w -- constant
    c -- constant
    """
    
    np.random.seed(212) 
    
    B = np.random.randn(num_params,1)*0.1 # randomly initialize the B_0, B_1 coefficients
    w = random.random() # randomly initialize scaling factor w > 0
    b = 0 # intialize  constants b,c @ 0
    c = 0

    
    parameters = {"B": B,
                  "b": b,
                  "w" : w,
                  "c" : c}
    
    return parameters

In [23]:
parameters = initialize_parameters(2)
parameters

{'B': array([[ 0.02293469],
        [-0.0148797 ]]), 'b': 0, 'w': 0.018942174213667284, 'c': 0}

**3) Calculate loss**

Given a set {$\beta_0, \beta_1, \beta_2, w$}, want to calculate loss, $\mathcal{L}(P_{post,j,i},y_i)$ using the following steps:

\begin{equation}
P_{priorodds,j,i} = \beta_{0,j}+\beta_{1,j}x_{1,i}+\beta_{2,j}x_{2,j}
\end{equation}

\begin{equation}
P_{prior,j,i} = logit(P_{priorodds,j,i})
\end{equation}

\begin{equation}
P_{algodds,i} = A_{0,j}+A_{1,j}R_{i}
\end{equation}

\begin{equation}
P_{algo,i} = logit(P_{algodds,i})
\end{equation}

\begin{equation}
P_{post,j,i} = \left \{
	\begin{array}{ll}
	wP_{algo,i}+(1-w)P_{prior,j,i} & |P_{prior,j,i}-P_{algo,i}| \leq \delta \\
	P_{prior,j,i} & otherwise
        \end{array}
    \right.
\end{equation}

\begin{equation}
\mathcal{L}(P_{post,j,i}, y_i) = - y_i \log P_{post,j,i} + (1-y_i) \log (1-P_{post,j,i})
\end{equation}

\begin{equation}
J_{\{\beta_0,\beta_1,\beta_2,w\}} = \frac{1}{I_j}\sum \mathcal{L}(P_{post,j,i},y_i)
\end{equation}

In [66]:
def calculate_predictions(X, parameters, P_algo, delta):
    """
    Argument:
    X -- input data of size (# obs, # params)
    parameters -- python dictionary containing your parameters (output of initialization function)
    delta -- threshold for difference betwen P_prior & P_algo
    
    Returns:
    P_post -- The sigmoid output 
    cache -- a dictionary containing P_priorodds,P_prior,P_post,P_alg_train
    """
    # Retrieve each parameter from the dictionary "parameters"
    A = parameters['A']
    a = parameters['a']
    B = parameters['B']
    b = parameters['b']
    w = parameters['w']
    
    X = X.to_numpy()
    
    # Calculate judge's prior odds  from defendant features
    P_priorodds = np.dot(B.T,X.T)+b
    
    # Transform the log odds to probabilities using logit
    P_prior = logit(P_priorodds) 

    # Scale the judge's prior wrt algorithmic risk assessment 
    P_post = [(w*P_algo.iloc[i]) + ((1-w)*P_prior[0][i])
              if abs(P_prior[0][i]-P_algo.iloc[i]) <= delta
              else P_prior[0][i]
              for i in range(P_prior.shape[1])]
  
    cache = {"P_priorodds": P_priorodds,
             "P_prior": P_prior,
             "P_post": P_post,
             "P_algo": P_algo}
    
    return P_post, cache

In [112]:
def compute_loss(P_post, Y):
    """   
    Arguments:
    P_post -- The scaled sigmoid output (probability)
    Y -- Decision labels
    
    Returns:
    loss
    
    """
    # If P_post < 0 or P_post > 1 then we will assert a heavy penalty, otherwise nothing

    loss=[(-y*np.log(p))+(1-y)*np.log(1-p)+9999 
          if p<0 or p>1 
          else (-y*np.log(p))+(1-Y)*np.log(1-p) 
          for p,y in zip(P_post,Y)]
    
    
    return loss

In [113]:
loss = compute_loss(P_post,y_train)
loss

[1434     0.674721
 4425     0.674721
 37       0.674721
 19       0.674721
 929     -0.037199
 9364    -0.037199
 191      0.674721
 2123     0.674721
 10643    0.674721
 1755     0.674721
 4588     0.674721
 6520     0.674721
 1074    -0.037199
 4794    -0.037199
 1706     0.674721
 1610     0.674721
 5937     0.674721
 1177     0.674721
 6895    -0.037199
 8568    -0.037199
 8635     0.674721
 1717    -0.037199
 4582    -0.037199
 2469     0.674721
 1578     0.674721
 11408    0.674721
 1433     0.674721
 883      0.674721
 2686    -0.037199
 4704     0.674721
            ...   
 4194     0.674721
 7982    -0.037199
 6623     0.674721
 8908     0.674721
 11597   -0.037199
 6104     0.674721
 5101    -0.037199
 10974    0.674721
 1428    -0.037199
 6920     0.674721
 4956     0.674721
 4523     0.674721
 2212     0.674721
 3922     0.674721
 6585    -0.037199
 11368   -0.037199
 7679     0.674721
 7443    -0.037199
 2292     0.674721
 8407     0.674721
 1299    -0.037199
 8958    -0.

**4) Calculate parameter gradients using gradient descent**

Given $\mathcal{L}(P_{post,j,i},y_i)$ calculate the gradients $\frac{d\mathcal{L}}{d\beta_0}$, $\frac{d\mathcal{L}}{d\beta}$, $\frac{d\mathcal{L}}{dw}$ using the following:

\begin{equation}
\frac{d\mathcal{L}}{d\beta} = \frac{d\mathcal{L}}{dP_{post}}\frac{dP_{post}}{dP_{prior}}\frac{dP_{prior}}{dP_{priorodds}}\frac{dP_{priorodds}}{d\beta}
\end{equation}

\begin{equation}
\frac{d\mathcal{L}}{dw} = \frac{d\mathcal{L}}{dP_{post}}\frac{dP_{post}}{dw}
\end{equation}

\begin{equation}
\frac{d\mathcal{L}}{d\beta_0} = \frac{d\mathcal{L}}{dP_{post}}\frac{dP_{post}}{dP_{prior}}\frac{dP_{prior}}{dP_{priorodds}}\frac{dP_{priorodds}}{d\beta_0}
\end{equation}


Where:

\begin{equation}
\frac{d\mathcal{L}}{dP_{post}} = -\log(1-y)-\log(y)
\end{equation}

\begin{equation}
\frac{dP_{post}}{dP_{prior}} = \left \{
	\begin{array}{ll}
	1-w & |P_{prior,j,i}-P_{algo,i}| \leq \delta \\
	1 & otherwise
        \end{array}
    \right.
\end{equation}

\begin{equation}
\frac{dP_{post}}{dw} = \left \{
	\begin{array}{ll}
	P_{algo,i} - P_{post,j,i} & |P_{prior,j,i}-P_{algo,i}| \leq \delta \\
	0 & otherwise
        \end{array}
    \right.
\end{equation}

\begin{equation}
\frac{dP_{prior}}{dP_{priorodds}} = P_{prior}(1-P_{prior})
\end{equation}

\begin{equation}
\frac{dP_{priorodds}}{d\beta_0} = 1
\end{equation}

\begin{equation}
\frac{dP_{priorodds}}{d\beta_1} = x_1
\end{equation}

\begin{equation}
\frac{dP_{priorodds}}{d\beta_2} = x_2
\end{equation}

In [70]:
def calculate_coefficients(parameters, cache, X, Y, delta):
    """
    Arguments:
    parameters -- dictionary containing parameters 
    cache -- dictionary containing P_priorodds, P_prior, P_post, and P_algo
    X -- input data 
    Y -- labels 
    
    Returns:
    grads -- python dictionary containing gradients with respect to various parameters
    """
    # Retrieve parameters from dictionary "parameters".
    w = parameters['w']
        
    # set the number of records
    m=X_train.shape[0]
    
    # Retrieve from dictionary "cache".
    P_post = pd.Series(cache['P_post'])
    P_priorodds = cache['P_priorodds']
    P_prior = cache["P_prior"]
    P_algo = cache["P_algo"]
                
    # intermediate derivatives
    
    # dL / dPost
    dLdPost = [(P_post[i]-y_train.iloc[i])/(P_post[i]*(1-P_post[i])) for i in range(len(P_post))]
    
    # dPost / dw
    dPostdw = [P_algo.iloc[i] - P_post[i] if abs(P_prior[0][i]-P_algo.iloc[i])<=delta else float(0) for i in range(P_prior.shape[1])]
    
    # dPost / dPrior
    dPostdPrior = [1-w if abs(P_prior[0][i]-P_algo.iloc[i])<=delta else float(1) for i in range(P_prior.shape[1])]
        
    # dPrior / dPriorOdds
    dPriordPriorOdds = P_prior*np.subtract(1,P_prior)
    
    # Calculate dLdb, dLdB, dLdw
    chain_rule = np.multiply(np.multiply(dLdPost,dPostdPrior),dPriordPriorOdds) # other 
    dLdB = np.dot(X.T,chain_rule.T)
    dLdb = np.sum(chain_rule)
    dLdw = np.sum(np.multiply(dLdPost,dPostdw))
    
    # store gradients
    grads = {"dB": dLdB,
             "dw": dLdw,
             "db": dLdb,}
    
    return grads
    

In [71]:
grads = calculate_coefficients(parameters, cache, X_train, y_train, delta=2)
grads

{'dB': array([[-31179.01404185],
        [    71.22798982]]),
 'dw': 3373.1677113064743,
 'db': -586.0719605391747}

**5) Update parameters with gradients**

$\beta = \beta - \alpha \frac{d\mathcal{J}}{d\beta}$

$w = w - \alpha \frac{d\mathcal{J}}{dw}$

$\beta_0 = \beta_0 - \alpha \frac{d\mathcal{J}}{d\beta_0}$

In [77]:
def update_parameters(parameters, cache, grads, learning_rate):
    """
    Arguments:
    parameters -- dictionary containing parameters 
    grads -- dictionary containing gradients 
    learning rate
    
    Returns:
    parameters -- dictionary containing updated parameters 
    """
    # Retrieve parameters from dict "parameters"
    B = parameters['B']
    b = parameters['b']
    w = parameters['w']
    
    
    # Retrieve each gradient from the dictionary "grads"
    dB = grads['dB']
    db = grads['db']
    dw = grads['dw']
    
    # Update rule for each parameter
    b = b - (learning_rate*db)
    B = B - (learning_rate*dB)
    w = w - (learning_rate*dw)
    
    return parameters

parameters

In [79]:
test = update_parameters(parameters, cache, grads, learning_rate=0.1)

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [None]:
parameters

In [None]:
def model(X_train, Y_train, X_test, Y_test, P_algo, delta, num_iterations = 2000, learning_rate = 0.5, print_cost = True):
    """
    Arguments:
    X_train -- training set data
    Y_train -- training labels
    X_test -- test set 
    Y_test -- test labels 
    delta -- anchoring effect threshold
    num_iterations -- hyperparameter representing the number of iterations to optimize the parameters
    learning_rate -- hyperparameter representing the learning rate used in the update rule of optimize()
    print_cost -- 
    
    Returns:
    d -- dictionary containing information about model
    """
    
    cost = []
    
    # initialize parameters
    parameters = initialize_parameters(X_train.shape[1])
        
    for i in range(num_iterations):

        # Gradient descent 
        P_post, cache = forward(X_train, parameters, P_algo, delta)
        cost = compute_cost(P_post, Y_train, parameters)
        grads = backward(parameters, cache, X_train, Y_train, delta)
        parameters = update_parameters(parameters, grads, learning_rate)

        # Retrieve parameters from dictionary "parameters"
        B = parameters["B"]
        b = parameters["b"]
        w = parameters["w"]
        
        if i % 10 == 0:
            print ("Cost after iteration %i: %f" %(i, cost))
            
    return parameters

"""
    # Predict test/train set examples 
    Y_prediction_test = predict(w,b,X_test)
    Y_prediction_train = predict(w,b,X_train)

    # Print train/test Errors
    print("train accuracy: {} %".format(100 - np.mean(np.abs(Y_prediction_train - Y_train)) * 100))
    print("test accuracy: {} %".format(100 - np.mean(np.abs(Y_prediction_test - Y_test)) * 100))

    
    model_dict = {"costs": costs,
         "Y_prediction_test": Y_prediction_test, 
         "Y_prediction_train" : Y_prediction_train, 
         "B1" : B1,
         "B2" : B2,         
         "b" : b, 
         "w" : w,
         "learning_rate" : learning_rate,
         "num_iterations": num_iterations}
    
    return model_dict
"""   

In [None]:
parameters = model(X_train, y_train, X_test, y_test, P_alg_train, delta=2, num_iterations = 2000, learning_rate = 0.5)

In [None]:
parameters