## Summary
- This notebook is ran as a PoC from our causal inference model for the county and circuit level but extended to the Judge level for a try out
- Please refer the causal_finalize notebook to find description about the function created

- the process follows the below general flow:

>identify cases regarding a particular crime type within a reigon for a given set of Judges

>perform clustering on top of the cases based on gender, age and totpts features (standardized)

>perform stratified sampling on those clusters to formulate treatment vs control groups

>apply statistical testing, averaging and multiple comparison adjustment to generate results regarding if the judge shows evidence of racial bias based on their cases of certain crime type and geo-locations

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, train_test_split
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import classification_report,confusion_matrix, plot_confusion_matrix
from statsmodels.stats.proportion import proportions_ztest,confint_proportions_2indep

import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import euclidean_distances
import scipy
import statsmodels

from sklearn.cluster import DBSCAN
from sklearn.metrics.pairwise import cosine_similarity
import gower

from sklearn.cluster import OPTICS, cluster_optics_dbscan
from sklearn.preprocessing import normalize, StandardScaler

from matplotlib import gridspec
from kneed import KneeLocator
from sklearn.cluster import KMeans
from sklearn import metrics
from scipy.spatial.distance import cdist

In [2]:
df_j = pd.read_csv('circuit6_top3judges_cleaned.csv')

In [3]:
df_j.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Column2,fstnm,lstnm,age,dob,race,gender,county,...,primp,lsviol,rpviol,faviol,fips,state,pop_2010,sp_cj_total_days,totpts_cut_range,totpts_cut_group
0,3,7,7,DOMINIQUE,GAMBLE,19,1988-04-01,BLACK,MALE,pinellas,...,False,4,0,0,12103,FL,916542.0,1825,"(64.0, 74.0]",4
1,13,22,22,NICHOLAS,THOMPSON,28,1977-04-10,BLACK,MALE,pinellas,...,False,0,0,0,12103,FL,916542.0,2555,"(134.0, 144.0]",11
2,16,26,26,DANIEL,VANTRESCA,27,1977-05-19,WHITE,MALE,pasco,...,False,0,0,0,12101,FL,464697.0,2190,"(124.0, 134.0]",10
3,35,53,53,JAMES,PANTLE,30,1977-11-18,WHITE,MALE,pinellas,...,False,0,0,0,12103,FL,916542.0,1095,"(104.0, 114.0]",8
4,52,78,78,CHRISTOPHER,DOYLE,20,1984-02-04,WHITE,MALE,pasco,...,True,0,0,0,12101,FL,464697.0,0,"(22.0, 44.0]",1


In [3]:
df_j.columns

Index(['Unnamed: 0', 'Unnamed: 0.1', 'Column2', 'fstnm', 'lstnm', 'age', 'dob',
       'race', 'gender', 'county', 'totpts', 'judge', 'clfely', 'statut',
       'offlvl', 'offdte', 'sentdte', 'sp_total_days', 'spyrs', 'spmths',
       'spdays', 'cj_total_days', 'cc_total_days', 'pr_total_days', 'enhanc',
       'haboff', 'habvio', 'life', 'vcc', 'prrpact', 'dornum', 'circuit',
       'dispos', 'docket', 'prioff', 'ccimp', 'cjimp', 'spimp', 'primp',
       'lsviol', 'rpviol', 'faviol', 'fips', 'state', 'pop_2010',
       'sp_cj_total_days', 'totpts_cut_range', 'totpts_cut_group'],
      dtype='object')

In [4]:
df_j.shape

(154509, 48)

In [5]:
df_j.judge.value_counts()

PETERS,R        12445
QUESADA,F       12403
ANDREWS,M        8860
LEY              8054
FEDERICO         6904
                ...  
WEBBS, W.           1
FEDERICO, D.        1
LUTEN, C.           1
DOWNEU              1
BULANE              1
Name: judge, Length: 812, dtype: int64

In [6]:
df_j.circuit.value_counts()

CIRCUIT 06 - CLEARWATER    154509
Name: circuit, dtype: int64

In [7]:
df_j.statut.value_counts()

893.13(6)(A)      36831
812.014(2)(C)1    10141
322.34(5)          8713
893.13(1)(A)1      8213
810.02(4)          7342
                  ...  
787.06(3)(H)          1
812.014.2B1           1
790.17(2)(A)          1
817.02                1
828.12 2              1
Name: statut, Length: 1201, dtype: int64

In [11]:
df_predict = pd.read_csv('circuit6-3judges-893-predictions.csv')

In [12]:
df_predict.head()

Unnamed: 0,sp_cj_total_days,gender,age,race,circuit,totpts_cut_group,clfely,totpts_cut_range,judge,predicted_sentence_days,error_margin_days,crime
0,0,MALE,20,BLACK,CIRCUIT 06 - CLEARWATER,0,1ST DEGREE,"(0.0, 22.0]","ANDREWS,M",451.71692,177.6765,trafficking/893
1,0,MALE,20,BLACK,CIRCUIT 06 - CLEARWATER,0,1ST/LIFE,"(0.0, 22.0]","ANDREWS,M",451.71692,177.6765,trafficking/893
2,0,MALE,20,BLACK,CIRCUIT 06 - CLEARWATER,0,2ND DEGREE,"(0.0, 22.0]","ANDREWS,M",-4.377075,177.6765,trafficking/893
3,0,MALE,20,BLACK,CIRCUIT 06 - CLEARWATER,0,3RD DEGREE,"(0.0, 22.0]","ANDREWS,M",17.550232,177.6765,trafficking/893
4,0,MALE,20,BLACK,CIRCUIT 06 - CLEARWATER,0,LIFE,"(0.0, 22.0]","ANDREWS,M",451.71692,177.6765,trafficking/893


In [13]:
df_predict.crime.value_counts()

trafficking/893    71280
Name: crime, dtype: int64

In [5]:
def drop_error_value(df):
    df_temp = df.copy()
    df_temp = df_temp[~((df_temp['spyrs']>=99)|(df_temp['life']=='Y'))]
    df_temp['sp_cj_total_days'] = (df_temp['sp_total_days'])+(df_temp['cj_total_days'])
    df_clean = df_temp[~((df_temp['sp_cj_total_days']<=7) & (df_temp['totpts']>44))]
    
    return df_clean

In [11]:
def sp_circuit_crime_type(circuit, crime_type, df):
    match_ind = ''
    if crime_type == 'drug':     ### Expendable
        match_ind = '893.*'
    elif crime_type =='robbery':
        match_ind = '812.*'
    elif crime_type == 'burglary':
        match_ind =='810.*'
    elif crime_type == 'driving':
        match_ind == '322.*'
    df = df[df['statut'].str.match(match_ind)== True]
    
    df =df[df['circuit']== circuit]
    
    return df

In [12]:
def cluster_prep(df):
    df_temp = df.copy()
    df_temp = df_temp[(df_temp['race']=='WHITE')|(df_temp['race']=='BLACK')]
    df_temp['gender_ind']=[1 if x=='MALE' else 0 for x in df_temp['gender']]
    df_temp = df_temp.loc[:,['gender_ind','gender','age','totpts','race','sp_total_days']] 
    
    X_continous = df_temp.loc[:,['totpts','age']]
    X_cont_stand =StandardScaler().fit_transform(X_continous)
    X_normalized = normalize(X_cont_stand)
    
    df_temp['totpts_stand'] = X_normalized[:,0]
    df_temp['age_stand'] = X_normalized[:,1]
    
    return df_temp

In [13]:
import numpy as np
from scipy.stats import ttest_ind
from scipy.stats import t
import pandas as pd

def welch_ttest(x1, x2,alternative,area, crime, judge):
    
    n1 = x1.size
    n2 = x2.size
    m1 = np.mean(x1)
    m2 = np.mean(x2)
    
    v1 = np.var(x1, ddof=1)
    v2 = np.var(x2, ddof=1)
    
    pooled_se = np.sqrt(v1 / n1 + v2 / n2)
    delta = m1-m2
    
    tstat = delta /  pooled_se
    df = (v1 / n1 + v2 / n2)**2 / (v1**2 / (n1**2 * (n1 - 1)) + v2**2 / (n2**2 * (n2 - 1)))
    
    # two side t-test
    p = 2 * t.cdf(-abs(tstat), df)
    
    # upper and lower bounds
    lb = delta - t.ppf(0.975,df)*pooled_se 
    ub = delta + t.ppf(0.975,df)*pooled_se
    
    df_result = pd.DataFrame(np.array([tstat,df,p,delta,lb,ub]).reshape(1,-1),
                         columns=['T statistic','df','pvalue 2 sided','Difference in mean','lb','ub'])
    
    
    df_result['area']=area
    df_result['crime'] = crime
    df_result['judge'] = judge
    
    
    return df_result

In [14]:
def find_k(df, increment=0, decrement=0):
    """Find the optimum k clusters"""
    sse = {}
    
    for k in range(2, 21):
        kmeans = KMeans(n_clusters=k, random_state=1)
        kmeans.fit(df)
        sse[k] = kmeans.inertia_
    
    kn = KneeLocator(x=list(sse.keys()), 
                 y=list(sse.values()), 
                 curve='convex', 
                 direction='decreasing')
    k = kn.knee + increment - decrement
    return k

In [15]:
def assign_label(df, k = -1):
    df_temp = df
    X_normalized =df_temp.loc[:,['totpts_stand','age_stand']]
    
    if k == -1:
        k_temp = find_k(X_normalized)
    else:
        k_temp = k
    
    #print("cluster number: ",k_temp)
    kmeanModel = KMeans(n_clusters=k_temp).fit(X_normalized)
    
    df_temp['Y']=kmeanModel.labels_
    
    return df_temp

In [16]:
def cluster_sampling(df, sample_rate = 0.8):
    
    clusters = dict(df['Y'].value_counts()).keys()
    
    b = []
    w = []
    
    l_sample_size = []
    new_clusters = []
    
    for i in clusters:
        d_temp = dict(df[df['Y']==i]['race'].value_counts())
        sam_size = round(min(d_temp.values())*sample_rate)
        
        if sam_size <30:
            continue
        
        new_clusters.append(i)
        l_sample_size.append(sam_size)
    
    if len(l_sample_size)==0:
        #print('not enough sample to make inference')
        
        b_df = pd.DataFrame()
        w_df = pd.DataFrame()
        return b_df,w_df
    
    min_sam_size = round(min(l_sample_size))
    
    for i in new_clusters:
        d_temp = dict(df[df['Y']==i]['race'].value_counts())
        #print(d_temp.items())
        #print('cluster',d_temp)
        
        w_temp = df[(df['Y']==i)&(df['race']=='WHITE')].sample(min_sam_size)
        b_temp = df[(df['Y']==i)&(df['race']=='BLACK')].sample(min_sam_size)
        
        b.append(b_temp)
        w.append(w_temp)
        
    
    b_df = pd.concat(b)
    w_df = pd.concat(w)
    
    return b_df,w_df

In [17]:
def cluster_sampling_unequal_strat(df, sample_rate = 0.8):
    
    clusters = dict(df['Y'].value_counts()).keys()
    
    b = []
    w = []
    
    l_sample_size = []
    new_clusters = []
    
    for i in clusters:
        d_temp = dict(df[df['Y']==i]['race'].value_counts())
        sam_size = round(min(d_temp.values())*sample_rate) 
        ### making sure gender wise, the number of samples are the same
        
        if sam_size <30:
            continue
        
        new_clusters.append(i)
        l_sample_size.append(sam_size)
    
    if len(l_sample_size)==0:
        #print('not enough sample to make inference')
        
        b_df = pd.DataFrame()
        w_df = pd.DataFrame()
        return b_df,w_df
    
    #min_sam_size = round(min(l_sample_size)) 
    ### ensuring sample extracted are the same from different clusters
    
    for i in range(len(new_clusters)):
        d_temp = dict(df[df['Y']==new_clusters[i]]['race'].value_counts())
        #print(d_temp.items())
        #print('cluster',d_temp)
        
        w_temp = df[(df['Y']==new_clusters[i])&(df['race']=='WHITE')].sample(round(l_sample_size[i]))
        b_temp = df[(df['Y']==new_clusters[i])&(df['race']=='BLACK')].sample(round(l_sample_size[i]))
        
        b.append(b_temp)
        w.append(w_temp)
        
    
    b_df = pd.concat(b)
    w_df = pd.concat(w)
    
    return b_df,w_df

# Causal results

In [18]:
df_j = pd.read_csv('circuit6_top3judges_cleaned.csv')

In [19]:
df_j = sp_circuit_crime_type('CIRCUIT 06 - CLEARWATER','drug',df_j)

In [17]:
df_j.shape

(55611, 48)

In [18]:
df_j.judge.value_counts()

FARNELL        4795
QUESADA,F      4073
PETERS,R       4044
ANDREWS,M      2915
LEY            2692
               ... 
TEPPPER, L.       1
CONVERT, T.       1
GARDNER. S        1
COVERT, B.        1
ROSARIO           1
Name: judge, Length: 468, dtype: int64

In [17]:
#df_predict.judge.value_counts()

In [20]:
result_list = []
judge_list = ['QUESADA,F','PETERS,R','ANDREWS,M']

In [21]:
for k in judge_list:
    print(k)
    j = 'CIRCUIT 06 - CLEARWATER'
    i = 'drug'
    df_data = df_j[df_j['judge']==k]
    
    print(df_data.shape[0])
    df_model = cluster_prep(df_data)

    gen_male = df_model.loc[df_model['gender_ind']==1,:].copy()
    gen_female = df_model.loc[df_model['gender_ind']==0,:].copy()

    m = assign_label(gen_male)
    f = assign_label(gen_female)
    
    for l in range(1,101):
    
        b_m,w_m = cluster_sampling(m)

        ind = 0

        if b_m.shape[0]==0 and w_m.shape[0] == 0:
            #print("not enough male samples")
            ind +=1


        b_f,w_f = cluster_sampling(f)

        if b_f.shape[0]==0 and w_f.shape[0] == 0:
            #print("not enough female samples")
            ind +=1

        if ind == 2:
            #print('not enough samples to make inference')
            continue


        treat = pd.concat([b_m,b_f])
        cont = pd.concat([w_m,w_f])

        temp_result = welch_ttest(treat.sp_total_days, cont.sp_total_days,'unequal',area = j,crime = i, judge = k)

        #print(temp_result)

        result_list.append(temp_result)

QUESADA,F
4073
PETERS,R
4044
ANDREWS,M
2915


In [22]:
result_df = pd.concat(result_list)
result_df

Unnamed: 0,T statistic,df,pvalue 2 sided,Difference in mean,lb,ub,area,crime,judge
0,2.564690,1144.269714,0.010453,34.258944,8.050161,60.467726,CIRCUIT 06 - CLEARWATER,drug,"QUESADA,F"
0,0.221080,1163.461771,0.825069,3.761499,-29.620486,37.143484,CIRCUIT 06 - CLEARWATER,drug,"QUESADA,F"
0,0.531127,1161.719124,0.595432,8.604770,-23.181641,40.391181,CIRCUIT 06 - CLEARWATER,drug,"QUESADA,F"
0,-0.098912,1171.422250,0.921225,-1.485520,-30.951850,27.980811,CIRCUIT 06 - CLEARWATER,drug,"QUESADA,F"
0,2.417571,1113.035684,0.015784,37.289608,7.025409,67.553808,CIRCUIT 06 - CLEARWATER,drug,"QUESADA,F"
...,...,...,...,...,...,...,...,...,...
0,0.254318,333.857406,0.799407,15.583333,-104.950446,136.117112,CIRCUIT 06 - CLEARWATER,drug,"ANDREWS,M"
0,-0.234314,273.728209,0.814916,-18.851190,-177.235671,139.533290,CIRCUIT 06 - CLEARWATER,drug,"ANDREWS,M"
0,0.980845,292.768976,0.327479,74.130952,-74.615432,222.877337,CIRCUIT 06 - CLEARWATER,drug,"ANDREWS,M"
0,-0.171693,333.643279,0.863783,-10.375000,-129.242288,108.492288,CIRCUIT 06 - CLEARWATER,drug,"ANDREWS,M"


In [23]:
from statsmodels.stats.multitest import multipletests

In [24]:
result_df_agg = pd.DataFrame(result_df.groupby(['area','crime','judge']).mean()).reset_index()

result_df_agg['if_reject'] = multipletests(result_df_agg['pvalue 2 sided'],method = 'bonferroni')[0]
result_df_agg['p_adjust'] = multipletests(result_df_agg['pvalue 2 sided'],method = 'bonferroni')[1]

In [25]:
result_df_agg

Unnamed: 0,area,crime,judge,T statistic,df,pvalue 2 sided,Difference in mean,lb,ub,if_reject,p_adjust
0,CIRCUIT 06 - CLEARWATER,drug,"ANDREWS,M",0.19324,312.382526,0.632066,10.43881,-123.378517,144.256136,False,1.0
1,CIRCUIT 06 - CLEARWATER,drug,"PETERS,R",1.343173,779.647664,0.250049,42.937543,-19.362878,105.237963,False,0.750148
2,CIRCUIT 06 - CLEARWATER,drug,"QUESADA,F",0.776935,1148.436747,0.476127,12.09138,-19.636917,43.819677,False,1.0


In [26]:
result_df_agg.to_csv('judge_level_causal_eql.csv')

In [27]:
for k in judge_list:
    print(k)
    j = 'CIRCUIT 06 - CLEARWATER'
    i = 'drug'
    df_data = df_j[df_j['judge']==k]
    
    print(df_data.shape[0])
    df_model = cluster_prep(df_data)

    gen_male = df_model.loc[df_model['gender_ind']==1,:].copy()
    gen_female = df_model.loc[df_model['gender_ind']==0,:].copy()

    m = assign_label(gen_male)
    f = assign_label(gen_female)
    
    for l in range(1,101):
    
        b_m,w_m = cluster_sampling_unequal_strat(m)

        ind = 0

        if b_m.shape[0]==0 and w_m.shape[0] == 0:
            #print("not enough male samples")
            ind +=1


        b_f,w_f = cluster_sampling_unequal_strat(f)

        if b_f.shape[0]==0 and w_f.shape[0] == 0:
            #print("not enough female samples")
            ind +=1

        if ind == 2:
            #print('not enough samples to make inference')
            continue


        treat = pd.concat([b_m,b_f])
        cont = pd.concat([w_m,w_f])

        temp_result = welch_ttest(treat.sp_total_days, cont.sp_total_days,'unequal',area = j,crime = i, judge = k)

        #print(temp_result)

        result_list.append(temp_result)

QUESADA,F
4073
PETERS,R
4044
ANDREWS,M
2915


In [28]:
result_df = pd.concat(result_list)
result_df_agg = pd.DataFrame(result_df.groupby(['area','crime','judge']).mean()).reset_index()

result_df_agg['if_reject'] = multipletests(result_df_agg['pvalue 2 sided'],method = 'bonferroni')[0]
result_df_agg['p_adjust'] = multipletests(result_df_agg['pvalue 2 sided'],method = 'bonferroni')[1]

In [29]:
result_df_agg

Unnamed: 0,area,crime,judge,T statistic,df,pvalue 2 sided,Difference in mean,lb,ub,if_reject,p_adjust
0,CIRCUIT 06 - CLEARWATER,drug,"ANDREWS,M",0.190718,422.694784,0.652019,9.929619,-113.910553,133.769791,False,1.0
1,CIRCUIT 06 - CLEARWATER,drug,"PETERS,R",1.829738,1333.602715,0.141611,45.985275,-5.833966,97.804517,False,0.424832
2,CIRCUIT 06 - CLEARWATER,drug,"QUESADA,F",0.989538,1550.926579,0.375668,13.89781,-14.850152,42.645771,False,1.0


In [30]:
result_df

Unnamed: 0,T statistic,df,pvalue 2 sided,Difference in mean,lb,ub,area,crime,judge
0,2.564690,1144.269714,0.010453,34.258944,8.050161,60.467726,CIRCUIT 06 - CLEARWATER,drug,"QUESADA,F"
0,0.221080,1163.461771,0.825069,3.761499,-29.620486,37.143484,CIRCUIT 06 - CLEARWATER,drug,"QUESADA,F"
0,0.531127,1161.719124,0.595432,8.604770,-23.181641,40.391181,CIRCUIT 06 - CLEARWATER,drug,"QUESADA,F"
0,-0.098912,1171.422250,0.921225,-1.485520,-30.951850,27.980811,CIRCUIT 06 - CLEARWATER,drug,"QUESADA,F"
0,2.417571,1113.035684,0.015784,37.289608,7.025409,67.553808,CIRCUIT 06 - CLEARWATER,drug,"QUESADA,F"
...,...,...,...,...,...,...,...,...,...
0,-0.663805,499.592172,0.507121,-42.060714,-166.551534,82.430105,CIRCUIT 06 - CLEARWATER,drug,"ANDREWS,M"
0,0.554627,557.210169,0.579373,27.932143,-70.990563,126.854848,CIRCUIT 06 - CLEARWATER,drug,"ANDREWS,M"
0,0.219407,541.435371,0.826416,11.760714,-93.533152,117.054581,CIRCUIT 06 - CLEARWATER,drug,"ANDREWS,M"
0,0.453979,518.954856,0.650034,29.450000,-97.991721,156.891721,CIRCUIT 06 - CLEARWATER,drug,"ANDREWS,M"


In [31]:
result_df_agg.to_csv('judge_level_causal_uneql.csv')