In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.linear_model import LogisticRegression
import random
from itertools import chain
from collections import Counter
import numpy as np
import scipy
import math

from ipynb.fs.full.GenerateData import generatedata


**0) Define variables**

$ \{x_{1}, x_{2},...x_{k}\} \in X$, Observed features of defendant
 
$\mu_{prior}$, Judge's mean prior belief about a defendant; the *most probable* risk prediction

$\sigma_{prior}$, Standard deviation of judge's prior belief on a defendant; corresponds to a judge's $\textit{uncertainty}$ about prediction

$\mu_{ra}$, Observed algorithmic risk assessment prediction; mean of normally-distributed perceived anchor information

$\sigma_{ra}$, Perceived anchor $\textit{influence}$; Corresponds to S.D. of risk assessment (when perceived as containing a lot of information S.D. will be lower and vice versa).

$w,c$, Parameters for mapping $\sigma_{prior}$ onto $\sigma_{ra}$

$z_{\alpha}$, $z$-score distance between $\mu_{prior}$ and $\mu_{ra}$

$z$, Threshold distance between $\mu_{prior}$ and $\mu_{ra}$ after which $\mu_{ra}$ has no (or minimal) effect on $\mu_{prior}$

$\mu_{post}$, Judge's mean posterior belief

$\sigma_{post}$, Standard deviation of judge's posterior belief 

$\tau$, Decision making threshold

$\phi_{prior}(\mu_{prior},\sigma_{prior})$, Gaussian distribution representing judge's prior belief

$\phi_{ra}(\mu_{ra},\sigma_{ra})$, Gaussian distribution representing the influence of algorithmic risk assessment predictions

$\phi_{post}(\mu_{post},\sigma_{post})$, Gaussian distribution representing the judge's posterior belief

$\Phi_{post}$, Probability of drawing a belief from the posterior greater than threshold $\tau$

$y$, Judge's observed decision on defendant
 
$\alpha$, Learning rate


**1) Initialization functions**

**(a) Parameters**

$\beta$ ($n \times 1$), $q$ ($1 \times 1$), and $var_{prior} = \sigma_{prior}^2$ ($1 \times 1$) are randomly initialized (non-negative);
$b$ ($1 \times 1$) is initialized to zero, and $\tau$ is drawn as a random integer between the lower and upper bounds of the scale

In [3]:
def initialize_parameters(X,lower,upper):
    """
    Build the starting parameter dictionary for the model.

    Arguments:
    X -- training data; only X.shape[1] (the number of columns) is read
    lower -- lower bound (inclusive) for the random integer threshold tau
    upper -- upper bound (inclusive) for the random integer threshold tau

    Returns:
    parameters -- python dictionary containing model parameters:
        B -- (n,1) coefficient array drawn uniformly from [0, 0.1)
        b -- constant, starts at 0
        q -- length-1 array drawn uniformly from [0, 1)
        var_prior -- length-1 array drawn uniformly from [0, 10); variance of prior
        tau -- integer drawn with random.randint(lower, upper)
    """
    # No seed is set here; seed numpy.random / random beforehand for
    # reproducible draws.
    n_features = X.shape[1]

    # NumPy draws happen in the order B, q, var_prior so that a seeded
    # generator always yields the same starting point.
    coefficients = 0.1 * np.random.rand(n_features, 1)
    intercept = 0
    anchor_scale = np.random.rand(1)
    prior_variance = 10 * np.random.rand(1)
    threshold = random.randint(lower, upper)

    return {
        "B": coefficients,
        "b": intercept,
        "q": anchor_scale,
        "var_prior": prior_variance,
        "tau": threshold,
    }

In [42]:
# Build the empty containers that hold intermediate derivatives and gradients
def initialize_dicts():
    """
    Create the two bookkeeping dictionaries used during gradient descent.

    Returns:
    derivatives -- dictionary mapping each component-derivative name to an empty list
    grads -- dictionary mapping each parameter-gradient name to an empty list
    """
    derivative_names = (
        "dL",
        "dL_dmuprior",
        "dphipost_dmupost",
        "dphipost_dvarpost",
        "dphipost_dtau",
        "dmupost_dmuprior",
        "dvarpost_dq",
        "dmupost_dq",
        "dvarpost_dvarprior",
    )
    gradient_names = ("dB", "db", "dvar_dprior", "dq", "dtau")

    # Each key gets its own fresh list (no shared list objects).
    derivatives = {name: [] for name in derivative_names}
    grads = {name: [] for name in gradient_names}

    return derivatives,grads

**2) Forward**

**(a) Estimate mean of prior belief distribution**

Given {$\beta, b, \sigma, \theta, \tau, w$}, want to calculate loss, $\mathcal{L}(\hat{y},y_i)$ using the following steps:

1. Calculate prior mean, $\mu_{prior}$
\begin{equation}
\mu_{prior} = \beta X + b
\end{equation}

Initialize the first estimated prior mean and store in the cache. 
Future calculations will use the function calc_prior_mean

In [4]:
def calc_prior_mean(X,parameters):
    """
    Linear model for the mean of the judge's prior belief.

    Arguments:
    X -- input data on characteristics of defendant (size # obs, # features)
    parameters -- python dictionary; the entries 'B' (coefficients) and
                  'b' (constant) are read

    Returns:
    mu_prior -- np.dot(B.T, X.T) + b; for a 2-D X of shape (m, n) and B of
                shape (n, 1) this is a (1, m) array
    """
    weights, intercept = parameters['B'], parameters['b']

    # One weighted sum of features per observation, shifted by the constant.
    linear_term = np.dot(weights.T, X.T)

    return linear_term + intercept

**(b) Estimate the standard deviation of the risk assessment score (i.e. perceived confidence in the anchor info):**

**We're not currently including this in further calculations**

\begin{equation}
\sigma_{ra}^2 = var_{ra} = \left \{
	\begin{array}{ll}
        q \cdot var_{prior} & if |\mu_{ra} - \mu_{prior}|<\Theta \\
        \infty & otherwise
    \end{array}
    \right.
\end{equation}

In [5]:
def calc_var_ra(var_prior,mu_prior,mu_ra,q,theta): 
    """
    Perceived variance of the risk assessment (anchor) for each observation.

    The anchor is informative (variance q*var_prior) only when it lies within
    theta of the prior mean; otherwise its variance is infinite, i.e. it
    carries no information.

    Arguments:
    var_prior -- variance of the prior belief
    mu_prior -- sequence of prior means, one per observation
    mu_ra -- sequence of risk assessment predictions, one per observation
    q -- scale factor mapping var_prior onto the anchor variance
    theta -- distance threshold between mu_ra and mu_prior

    Returns:
    var_ra -- list with one entry per observation: q*var_prior when
              |mu_ra - mu_prior| < theta, float("inf") otherwise
    """
    # Index positionally (calc_component_derivs converts mu_ra the same way),
    # so a label-indexed input such as a shuffled pandas Series is read row by row.
    mu_ra = np.asarray(mu_ra)

    # The comparison must be "< theta": the previous "> theta" gave the anchor
    # a finite variance exactly when it was too far away to matter, and
    # disagreed with the branch condition used in calc_component_derivs.
    var_ra = [q*var_prior if np.abs(mu_ra[i]-mu_prior[i])<theta else float("inf") for i in range(len(mu_prior))]

    return var_ra 


**(c) Calculate the posterior distribution of a judge's beliefs on defendant, $\phi_{post}(\mu_{post},\sigma_{post}^2)$, and probability of making decision, $\hat{y}$:**



\begin{equation}
\phi_{prior}(\mu_{prior},\sigma_{prior}^2) = \mathcal{N}(\mu_{prior},\sigma_{prior}^2)
\end{equation}


\begin{equation}
    \phi_{ra}(\mu_{ra},\sigma_{ra}^2) = \mathcal{N}(\mu_{ra},\sigma_{ra}^2)
\end{equation}

\begin{equation}
\phi_{post}(\mu_{post},\sigma_{post}^2) =  \mathcal{N}(\mu_{post},\sigma_{post}^2)
\end{equation}

\begin{equation}
    = \frac{\phi_{prior}(\mu_{prior},\sigma_{prior}^2)\phi_{ra}(\mu_{ra},\sigma_{ra}^2)}{\int \phi_{prior}(\mu_{prior},\sigma_{prior}^2)\phi_{ra}(\mu_{ra},\sigma_{ra}^2)}
\end{equation}

where:

\begin{equation}
\mu_{post} = \left \{
	\begin{array}{ll}
        \mu_{prior} \cdot \frac{q}{q+1} + \mu_{ra} \cdot \frac{1}{q+1} & if |\mu_{ra} - \mu_{prior}|<\Theta \\
        \mu_{prior} & otherwise
    \end{array}
    \right.
\end{equation}

\begin{equation}
\sigma_{post}^2 = var_{post} = \left \{
	\begin{array}{ll}
        var_{prior} \cdot \frac{q}{q+1} & if |\mu_{ra} - \mu_{prior}|<\Theta \\
        var_{prior} & otherwise
    \end{array}
    \right.
\end{equation}

In [6]:
def calc_post_mean(mu_prior, mu_ra, q, theta):
    """
    Mean of the posterior belief for each observation.

    When the risk assessment lies within theta of the prior mean, the
    posterior mean is the weighted average
        mu_prior * q/(q+1) + mu_ra * 1/(q+1);
    otherwise the anchor is ignored and the posterior mean equals the prior mean.

    Arguments:
    mu_prior -- sequence of prior means, one per observation
    mu_ra -- sequence of risk assessment predictions, one per observation
    q -- scale factor relating the anchor variance to the prior variance
    theta -- distance threshold between mu_ra and mu_prior

    Returns:
    mu_post -- list of posterior belief distribution means, one per observation
    """
    # Index positionally (calc_component_derivs converts mu_ra the same way),
    # so a label-indexed input such as a shuffled pandas Series is read row by row.
    mu_ra = np.asarray(mu_ra)

    mu_post = []
    for i in range(len(mu_prior)):
        # "< theta" selects the anchored case. The previous "> theta" applied
        # the anchor only when it was far from the prior, the opposite of the
        # branch condition used for the derivatives in calc_component_derivs.
        if np.abs(mu_ra[i]-mu_prior[i]) < theta:
            mu_post.append((mu_prior[i]*(q/(q+1)))+(mu_ra[i]/(q+1)))
        else:
            mu_post.append(mu_prior[i])

    return mu_post

In [7]:
def calc_post_var(mu_prior, mu_ra, var_prior, q, theta):
    """
    Variance of the posterior belief for each observation.

    When the risk assessment lies within theta of the prior mean, the
    posterior variance shrinks to var_prior * q/(q+1); otherwise the anchor
    is ignored and the posterior variance equals the prior variance.

    Arguments:
    mu_prior -- sequence of prior means, one per observation
    mu_ra -- sequence of risk assessment predictions, one per observation
    var_prior -- variance of the prior belief
    q -- scale factor relating the anchor variance to the prior variance
    theta -- distance threshold between mu_ra and mu_prior

    Returns:
    var_post -- list of posterior belief distribution variances, one per observation
    """
    # Index positionally (calc_component_derivs converts mu_ra the same way),
    # so a label-indexed input such as a shuffled pandas Series is read row by row.
    mu_ra = np.asarray(mu_ra)

    var_post = []
    for i in range(len(mu_prior)):
        # "< theta" selects the anchored case. The previous "> theta" shrank
        # the variance only when the anchor was far from the prior, the
        # opposite of the branch condition used in calc_component_derivs.
        if np.abs(mu_ra[i]-mu_prior[i]) < theta:
            var_post.append(var_prior*(q/(q+1)))
        else:
            var_post.append(var_prior)

    return var_post

**(d) Calculate the probability of detaining defendant as area under the posterior belief curve $\geq$ decision threshold, $\tau$**

\begin{equation}
\Phi(\tau;\mu_{post},\sigma_{post}) = \int_\tau^{\infty} \phi_{post}(\tau; \mu_{post},\sigma_{post}^2)
\end{equation}


In [41]:
def calc_Phi(mu_post,var_post,tau):    
    """
    Upper-tail probability of each posterior belief distribution above tau.

    Arguments:
    mu_post -- sequence of posterior means, one per observation
    var_post -- sequence of posterior variances, one per observation
    tau -- threshold at which the normal CDF is evaluated

    Returns:
    Phi -- numpy array of 1 - CDF(tau) under N(mu_post[i], var_post[i]),
           with exact 0 replaced by 1e-16 and exact 1 replaced by 1-1e-16
    """

    def _keep_off_boundary(prob):
        # Probabilities that round to exactly 0 or 1 would give log(0) in the
        # loss, so they are nudged just inside the open interval.
        if prob == 0:
            return 1e-16
        if prob == 1:
            return 1-1e-16
        return prob

    upper_tail = []
    for idx in range(len(mu_post)):
        belief = scipy.stats.norm(loc=mu_post[idx], scale=np.sqrt(var_post[idx]))
        upper_tail.append(1 - belief.cdf(tau))

    return np.array([_keep_off_boundary(prob) for prob in upper_tail])

**(e) Compute negative log likelihood**

We penalize $\mu_{prior}$ when it falls below 1 (penalty $g$) or rises above 10 (penalty $h$).

\begin{equation}
g(1-\mu_{prior}) = \left \{
	\begin{array}{ll}
        0 & if 1 - \mu_{prior}<0 \\
        k(1-\mu_{prior})^3 & if 1-\mu_{prior}\geq0
    \end{array}
    \right.
\end{equation}

\begin{equation}
h(\mu_{prior}-10) = \left \{
	\begin{array}{ll}
        0 & if \mu_{prior}-10 < 0 \\
        k(\mu_{prior}-10)^3 & if \mu_{prior}-10\geq0
    \end{array}
    \right.
\end{equation}

\begin{equation}
\mathcal{L}(\Phi(\tau;\mu_{post},\sigma_{post}^2), y_i) = -\left[ y_i \log (1-\Phi(\tau;\mu_{post},\sigma_{post}^2)) + (1-y_i) \log \Phi(\tau;\mu_{post},\sigma_{post}^2) \right] + g(1-\mu_{prior}) + h(\mu_{prior}-10)
\end{equation}


In [43]:
def calc_L(Phi,y,mu_prior,k):
    """
    Per-observation penalized negative log likelihood.

    Arguments:
    Phi -- predicted probability of drawing a belief from the posterior distribution > tau
    y -- known decision by judge on defendant (converted with np.array)
    mu_prior -- sequence of prior means, one per observation
    k -- multiplier applied to the cubic out-of-range penalties

    Returns:
    L -- list with, per observation,
         -(y*log(1-Phi) + (1-y)*log(Phi)) + g + h
         where g = k*(1-mu_prior)**3 unless 1-mu_prior < 0 (then 0)
         and   h = k*(mu_prior-10)**3 unless mu_prior-10 < 0 (then 0)
    """
    y = np.array(y)

    def _penalized_loss(i):
        mu = mu_prior[i]

        # Lower-bound penalty: vanishes once mu_prior is above 1.
        below_one = 0 if 1 - mu < 0 else k*((1-mu)**3)

        # Upper-bound penalty: vanishes while mu_prior is below 10.
        above_ten = 0 if mu - 10 < 0 else k*((mu-10)**3)

        neg_log_lik = -(y[i]*np.log(1-Phi[i])+(1-y[i])*np.log(Phi[i]))

        return neg_log_lik+below_one+above_ten

    return [_penalized_loss(i) for i in range(len(mu_prior))]

**4) Calculate derivatives for gradient descent**

Given $\mathcal{L}(\Phi(\tau;\mu_{post},var_{post}),y_i)$, calculate the gradients $\frac{d\mathcal{L}}{d\beta}$, $\frac{d\mathcal{L}}{db}$, $\frac{d\mathcal{L}}{dvar_{prior}}$, $\frac{d\mathcal{L}}{dq}$, $\frac{d\mathcal{L}}{d\tau}$ using the following:

\begin{equation}
    d\mathcal{L} = \phi_{post}\left((1-y)\cdot\frac{\phi_{post}}{\Phi} - y\cdot\frac{\phi_{post}}{1-\Phi} \right)
\end{equation}

\begin{equation}
    \frac{d\phi_{post}}{d\mu_{post}} = \frac{\tau-\mu_{post}}{var_{post}^{3/2}\sqrt{2\pi}}\cdot exp \left( -\frac{(\tau-\mu_{post})^2}{2\cdot var_{post}} \right)
\end{equation}

\begin{equation}
    \frac{d\phi_{post}}{dvar_{post}} = \left(\frac{(\tau-\mu_{post})^2}{2 \cdot var_{post}^{5/2} \cdot \sqrt{2\pi}} - \frac{1}{2 \cdot var_{post}^{3/2} \cdot \sqrt{2\pi}} \right) \cdot exp \left[- \frac{(\tau-\mu_{post})^2}{2\cdot var_{post}} \right]
\end{equation}

\begin{equation}
    \frac{d\phi_{post}}{d\tau}= -\frac{\tau-\mu_{post}}{var_{post}^{3/2} \sqrt{2\pi}}exp\left[-\frac{(\tau-\mu_{post})^2}{2var_{post}} \right]
\end{equation}

\begin{equation}
    \frac{d\mu_{post}}{d\mu_{prior}} = 
    \left \{
	\begin{array}{ll}
        \frac{q}{q+1} & if |\mu_{ra} - \mu_{prior}|<\Theta \\
        1 & otherwise
    \end{array}
    \right.
\end{equation}

\begin{equation}
    \frac{d\mu_{post}}{dq} = 
        \left \{
	\begin{array}{ll}
        \frac{\mu_{prior}}{1+q} - \frac{\mu_{ra}}{(1+q)^2}-\frac{q\cdot\mu_{prior}}{(1+q)^2} & if |\mu_{ra} - \mu_{prior}|<\Theta \\
        0 & otherwise
    \end{array}
    \right.
\end{equation}        

\begin{equation}
    \frac{dvar_{post}}{dq} = 
    \left \{
	\begin{array}{ll}
        \frac{var_{prior}}{q+1} - \frac{q \cdot var_{prior}}{(q+1)^2} & if |\mu_{ra} - \mu_{prior}|<\Theta \\
        0 & otherwise
    \end{array}
    \right.
\end{equation}

\begin{equation}
    \frac{dvar_{post}}{dvar_{prior}} = 
        \left \{
	\begin{array}{ll}
        \frac{q}{q+1} & if |\mu_{ra} - \mu_{prior}|<\Theta \\
        1 & otherwise
    \end{array}
    \right.
\end{equation}

\begin{equation}
    if 1-\mu_{prior} \geq 0: \frac{d\mathcal{L}}{d\mu_{prior}} = -3k(1-\mu_{prior})^2
\end{equation}

\begin{equation}
    if \mu_{prior}-10 \geq 0: \frac{d\mathcal{L}}{d\mu_{prior}} = 3k(\mu_{prior}-10)^2
\end{equation}

\begin{equation}
    else: \frac{d\mathcal{L}}{d\mu_{prior}} = 0
\end{equation}

In [37]:
def calc_component_derivs(X, y, parameters, derivatives, theta, mu_prior,mu_ra,mu_post,var_post,Phi,k): 
    """
    Compute the component (partial) derivatives that calc_gradients chains together.

    Arguments:
    X -- not used here; kept so existing calls keep working
    y -- observed decisions, one per observation
    parameters -- python dictionary; 'q', 'var_prior' (both indexed with [0],
                  so length-1 arrays are expected) and 'tau' are read
    derivatives -- dictionary that receives the results (updated in place)
    theta -- distance threshold between mu_ra and mu_prior
    mu_prior -- sequence of prior means, one per observation
    mu_ra -- sequence of risk assessment predictions, one per observation
    mu_post -- sequence of posterior means, one per observation
    var_post -- sequence of posterior variances, one per observation
    Phi -- array of predicted probabilities, one per observation
    k -- multiplier of the cubic penalty on mu_prior outside [1, 10]

    Returns:
    derivatives -- the same dictionary with the keys 'dL', 'dphipost_dmupost',
                   'dphipost_dvarpost', 'dphipost_dtau', 'dL_dmuprior',
                   'dmupost_dmuprior', 'dvarpost_dq', 'dmupost_dq' and
                   'dvarpost_dvarprior' filled in
    """
    
    # retrieve variables (as arrays so list / Series inputs are indexed by position)
    mu_ra = np.array(mu_ra)
    y = np.array(y)
    q = parameters['q']
    var_prior = parameters['var_prior']
    tau = parameters['tau']

    # Pieces shared by the three derivatives of the posterior density at tau
    sqrt_2pi = math.sqrt(2*math.pi)
    tau_minus_mu = np.subtract(tau,mu_post)
    var_post_3_2 = [vp**(3/2) for vp in var_post]
    var_post_5_2 = [vp**(5/2) for vp in var_post]
    exponent = -(tau_minus_mu**2 / np.multiply(2,var_post))
    gauss_kernel = np.exp(exponent.astype(float))

    # dL
    phi_post = scipy.stats.norm(mu_post,np.sqrt(var_post)).pdf(tau)
    derivatives['dL'] = phi_post*(((1-y)*(phi_post/Phi))-(y*(phi_post/(1-Phi))))

    # dphi_post / dmu_post
    derivatives['dphipost_dmupost'] = (tau_minus_mu/np.multiply(var_post_3_2,sqrt_2pi))*gauss_kernel
    
    # dphi_post / dvar_post
    quad_term = (tau_minus_mu**2)/np.multiply(np.multiply(2,var_post_5_2),sqrt_2pi)
    norm_term = 1/(np.multiply(2,var_post_3_2)*sqrt_2pi)
    derivatives['dphipost_dvarpost'] = (quad_term-norm_term)*gauss_kernel
    
    # dphi_post / dtau
    # sqrt(2*pi) belongs in the denominator; the earlier expression divided by
    # var_post**(3/2) and then multiplied by sqrt(2*pi), inflating the result by 2*pi.
    derivatives['dphipost_dtau'] = -(tau_minus_mu/np.multiply(var_post_3_2,sqrt_2pi))*gauss_kernel
           
    # dL_dmuprior: derivative of the cubic penalties, zero for 1 < mu_prior < 10
    dL_dmuprior = []
    for i in range(len(mu_prior)):
        if 1-mu_prior[i]>=0:
            dL_dmuprior.append(-3*k*((1-mu_prior[i])**2))
        elif mu_prior[i]-10>=0:
            dL_dmuprior.append(3*k*((mu_prior[i]-10)**2))
        else:
            dL_dmuprior.append(0)
    derivatives['dL_dmuprior'] = np.array(dL_dmuprior)
    
    dmupost_dmuprior = []
    dvarpost_dq = []
    dmupost_dq = []
    dvarpost_dvarprior = []
    
    for i in range(len(mu_prior)):
        if np.abs(mu_prior[i] - mu_ra[i]) < theta:
            # Anchor within threshold: posterior mixes prior and risk assessment
            prior_weight = (q/(q+1))[0]

            # dmu_post/dmu_prior
            dmupost_dmuprior.append(prior_weight)

            # dvar_post/dq
            dvarpost_dq.append(((var_prior/(q+1))-((q*var_prior)/((q+1)**2)))[0])

            # dmu_post/dq
            dmupost_dq.append(((mu_prior[i]/(q+1)) - (mu_ra[i]/((q+1)**2)) - ((mu_prior[i]*q)/((q+1)**2)))[0])

            # dvar_post/dvar_prior: var_post = var_prior*q/(q+1) in this branch,
            # so the derivative is q/(q+1) (it was previously recorded as 0).
            dvarpost_dvarprior.append(prior_weight)

        else:
            # Anchor ignored: posterior equals prior, independent of q
            dmupost_dmuprior.append(1)
            dvarpost_dq.append(0)
            dmupost_dq.append(0)
            dvarpost_dvarprior.append(1)

    # Stored once after the loop so the keys are also set for empty input
    derivatives['dmupost_dmuprior'] = np.array(dmupost_dmuprior)
    derivatives['dvarpost_dq'] = np.array(dvarpost_dq)
    derivatives['dmupost_dq'] = np.array(dmupost_dq)
    derivatives['dvarpost_dvarprior'] = np.array(dvarpost_dvarprior)
    
    return derivatives

**5) Use chain rule to calculate gradients**

\begin{equation}
    \frac{d\mathcal{L}}{d\beta} = d\mathcal{L}\cdot\frac{\partial\phi_{post}}{\partial\mu_{post}}\cdot \frac{\partial\mu_{post}}{\partial\mu_{prior}}\cdot \frac{d\mu_{prior}}{d\beta} + \frac{d\mathcal{L}}{d\mu_{prior}}\cdot \frac{d\mu_{prior}}{d\beta}
\end{equation}

\begin{equation}
    \frac{d\mathcal{L}}{db} = d\mathcal{L}\cdot\frac{\partial\phi_{post}}{\partial\mu_{post}}\cdot \frac{\partial\mu_{post}}{\partial\mu_{prior}}\cdot \frac{d\mu_{prior}}{db} + \frac{d\mathcal{L}}{d\mu_{prior}}\cdot \frac{d\mu_{prior}}{db}
\end{equation}

\begin{equation}
    \frac{d\phi_{post}}{dvar_{prior}} = \frac{d\phi_{post}}{dvar_{post}} \cdot \frac{dvar_{post}}{dvar_{prior}} 
\end{equation}

\begin{equation}
    \frac{d\phi_{post}}{dq} = \frac{d\phi_{post}}{dvar_{post}} \cdot \frac{dvar_{post}}{dq} + \frac{d\phi_{post}}{d\mu_{post}} \cdot \frac{d\mu_{post}}{dq} 
\end{equation}

\begin{equation}
    \frac{d\phi_{post}}{d\tau}
\end{equation}

In [39]:
def calc_gradients(grads, derivatives, mu_prior, X): 
    """
    Chain the component derivatives into one gradient per model parameter.

    Arguments:
    grads -- dictionary that receives the gradients (updated in place)
    derivatives -- dictionary of component derivatives; the keys read are
                   listed in component_keys below
    mu_prior -- sequence of prior means; only its length is used
    X -- input data, combined with the per-observation term via np.dot

    Returns:
    grads -- the same dictionary with 'dB', 'db', 'dvar_dprior', 'dq' and
             'dtau' filled in
    """
    component_keys = (
        'dL',
        'dL_dmuprior',
        'dphipost_dmupost',
        'dphipost_dvarpost',
        'dphipost_dtau',
        'dmupost_dmuprior',
        'dvarpost_dq',
        'dmupost_dq',
        'dvarpost_dvarprior',
    )
    (dL, dL_dmuprior, dphipost_dmupost, dphipost_dvarpost, dphipost_dtau,
     dmupost_dmuprior, dvarpost_dq, dmupost_dq, dvarpost_dvarprior) = [
        derivatives[key] for key in component_keys
    ]

    # Per-observation dL/dmu_prior (likelihood path plus penalty path); the
    # coefficient and constant gradients are both built from this one list.
    per_obs = []
    for i in range(len(mu_prior)):
        per_obs.append(dL[i]*dphipost_dmupost[i]*dmupost_dmuprior[i]+dL_dmuprior[i])

    # dL_dB: weight each observation's term by its feature row
    grads['dB'] = np.dot(per_obs,X)

    # dL_db: d(mu_prior)/db is 1, so the terms are simply summed
    grads['db'] = np.sum(per_obs)

    # dL_dvarprior
    grads['dvar_dprior'] = np.sum(dL*dphipost_dvarpost*dvarpost_dvarprior)

    # dL_dq: q acts through both the posterior variance and the posterior mean
    through_var = dphipost_dvarpost*dvarpost_dq
    through_mean = dphipost_dmupost*dmupost_dq
    grads['dq'] = np.sum(dL*((through_var)+(through_mean)))

    # dL_dtau
    grads['dtau'] = np.sum(dL*dphipost_dtau)

    return grads

**8) Update parameters with gradients**

\begin{equation}
\beta' = \beta - \alpha \frac{d\mathcal{L}}{d\beta} \\
b' = b - \alpha \frac{d\mathcal{L}}{db} \\
q' = q - \alpha \frac{d\mathcal{L}}{dq} \\
var_{prior}' = var_{prior}-\alpha\frac{d\mathcal{L}}{dvar_{prior}} \\
\tau' = \tau - \alpha \frac{dL}{d\tau}
\end{equation}

In [12]:
def update_parameters(parameters, grads, learning_rate):
    """
    Take one gradient-descent step on every model parameter.

    Arguments:
    parameters -- dictionary containing parameters 'B', 'b', 'q', 'var_prior', 'tau'
    grads -- dictionary containing gradients 'dB', 'db', 'dq', 'dvar_dprior', 'dtau'
    learning_rate -- step size multiplying each gradient

    Returns:
    parameters -- the same dictionary, with each entry replaced by
                  (old value - learning_rate * gradient)
    """
    # Read everything first so a missing key fails before anything is changed.
    coeffs, const, anchor_scale, prior_var, threshold = (
        parameters[name] for name in ('B', 'b', 'q', 'var_prior', 'tau')
    )
    coeff_grad = np.array(list(grads['dB']))
    const_grad, anchor_grad, prior_var_grad, threshold_grad = (
        grads[name] for name in ('db', 'dq', 'dvar_dprior', 'dtau')
    )

    # Coefficients are stepped row by row, pairing B[row] with dB[row].
    stepped_rows = []
    for row in range(coeffs.shape[0]):
        stepped_rows.append(coeffs[row]-(coeff_grad[row]*learning_rate))

    parameters['B'] = np.array(stepped_rows)
    parameters['b'] = const-(learning_rate*const_grad)
    parameters['q'] = anchor_scale-(learning_rate*anchor_grad)
    parameters['var_prior'] = prior_var-(learning_rate*prior_var_grad)
    parameters['tau'] = threshold-(learning_rate*threshold_grad)

    return parameters