In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, train_test_split
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import classification_report,confusion_matrix, plot_confusion_matrix
from statsmodels.stats.proportion import proportions_ztest,confint_proportions_2indep

import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import euclidean_distances
import scipy
import statsmodels

from sklearn.cluster import DBSCAN
from sklearn.metrics.pairwise import cosine_similarity
import gower

from sklearn.cluster import OPTICS, cluster_optics_dbscan
from sklearn.preprocessing import normalize, StandardScaler

from matplotlib import gridspec
from kneed import KneeLocator
from sklearn.cluster import KMeans
from sklearn import metrics
from scipy.spatial.distance import cdist

In [2]:
def drop_error_value(df):
    """Return the rows of ``df`` that survive two sanity filters.

    1. Rows with ``spyrs >= 99`` or ``life == 'Y'`` are removed.
    2. A ``sp_cj_total_days`` column (``sp_total_days + cj_total_days``) is
       added, and rows where that total is ``<= 7`` while ``totpts > 44``
       are removed.

    The input frame is left untouched; the result carries the extra
    ``sp_cj_total_days`` column.
    """
    excluded = (df['spyrs'] >= 99) | (df['life'] == 'Y')
    kept = df[~excluded].copy()

    kept['sp_cj_total_days'] = kept['sp_total_days'] + kept['cj_total_days']

    # High point total combined with (almost) no sentence days.
    implausible = (kept['totpts'] > 44) & (kept['sp_cj_total_days'] <= 7)
    return kept[~implausible]

In [3]:
def sp_county_crime_type(county, crime_type, df):
    """Select the rows of ``df`` for one county and one crime type.

    Parameters
    ----------
    county : value compared for equality against the ``county`` column.
    crime_type : ``'drug'`` (statute pattern ``893.*``) or ``'robbery'``
        (statute pattern ``812.*``).  Any other value falls back to the empty
        pattern, which matches every non-missing statute string.
    df : DataFrame with ``statut`` (string) and ``county`` columns.

    Returns
    -------
    DataFrame containing only the matching rows, in their original order.
    """
    # Statute pattern per supported crime type; extend this table to add more.
    statute_patterns = {'drug': '893.*', 'robbery': '812.*'}
    pattern = statute_patterns.get(crime_type, '')

    # str.match anchors at the start of the string; missing statutes yield NaN,
    # which the comparison with True turns into False.
    statute_hit = df['statut'].str.match(pattern).eq(True)
    in_county = df['county'].eq(county)

    return df[statute_hit & in_county]

In [84]:
def cluster_prep(df):
    """Build the modelling frame used for clustering.

    Steps:
      * keep only rows whose ``race`` is ``'WHITE'`` or ``'BLACK'``;
      * add ``gender_ind`` (1 when ``gender == 'MALE'``, otherwise 0);
      * keep the columns ``gender_ind, gender, age, totpts, race,
        sp_total_days``;
      * add ``totpts_stand`` / ``age_stand``: ``totpts`` and ``age`` passed
        through ``StandardScaler`` and then through ``normalize`` with its
        default settings.

    The input frame is not modified.
    """
    keep_cols = ['gender_ind', 'gender', 'age', 'totpts', 'race', 'sp_total_days']

    subset = df[df['race'].isin(['WHITE', 'BLACK'])].copy()
    subset['gender_ind'] = subset['gender'].map(lambda g: 1 if g == 'MALE' else 0)
    subset = subset.loc[:, keep_cols]

    # Column order here fixes which output column is which below.
    scaled = StandardScaler().fit_transform(subset.loc[:, ['totpts', 'age']])
    unit_rows = normalize(scaled)

    subset['totpts_stand'] = unit_rows[:, 0]
    subset['age_stand'] = unit_rows[:, 1]

    return subset

In [5]:
import numpy as np
from scipy.stats import ttest_ind
from scipy.stats import t
import pandas as pd

def welch_ttest(x1, x2, alternative='two-sided'):
    """Welch's unequal-variance t-test for the difference in means ``x1 - x2``.

    Parameters
    ----------
    x1, x2 : array-like of numbers (list, ndarray, pandas Series).
        Missing values (NaN) are dropped from each sample before anything is
        computed, so the sample size, mean and variance always refer to the
        same observations.
    alternative : accepted for backward compatibility and ignored; the test
        is always two-sided.

    Returns
    -------
    One-row DataFrame with columns ``'T statistic'``, ``'df'``
    (Welch-Satterthwaite degrees of freedom), ``'pvalue 2 sided'``,
    ``'Difference in mean'`` and the 95% confidence bounds ``'lb'`` / ``'ub'``
    of the difference.
    """
    def _observed(values):
        # Flatten to a float array and discard NaN entries.
        arr = np.asarray(values, dtype=float).ravel()
        return arr[~np.isnan(arr)]

    a = _observed(x1)
    b = _observed(x2)

    n1, n2 = a.size, b.size
    m1, m2 = a.mean(), b.mean()

    # Sample variances (ddof=1).
    v1 = a.var(ddof=1)
    v2 = b.var(ddof=1)

    # Standard error of the difference; the variances are NOT pooled.
    se = np.sqrt(v1 / n1 + v2 / n2)
    delta = m1 - m2

    tstat = delta / se
    df = (v1 / n1 + v2 / n2)**2 / (v1**2 / (n1**2 * (n1 - 1)) + v2**2 / (n2**2 * (n2 - 1)))

    # two-sided p-value
    p = 2 * t.cdf(-abs(tstat), df)

    # 95% confidence interval for the difference in means
    half_width = t.ppf(0.975, df) * se
    lb = delta - half_width
    ub = delta + half_width

    return pd.DataFrame(np.array([tstat,df,p,delta,lb,ub]).reshape(1,-1),
                         columns=['T statistic','df','pvalue 2 sided','Difference in mean','lb','ub'])

In [6]:
def find_k(df, increment=0, decrement=0, max_k=20):
    """Pick the number of KMeans clusters with the elbow (knee) method.

    KMeans (``random_state=1``) is fitted for ``k = 1 .. max_k`` and the
    inertia of each fit is recorded; ``KneeLocator`` then locates the knee of
    that convex, decreasing curve.

    Parameters
    ----------
    df : feature matrix accepted by ``KMeans.fit``.
    increment, decrement : added to / subtracted from the detected knee.
    max_k : largest cluster count to try (default 20).  It is capped at the
        number of rows in ``df`` because KMeans cannot fit more clusters than
        samples.

    Returns
    -------
    ``knee + increment - decrement``.

    Raises
    ------
    ValueError : if ``df`` has no rows or no knee is found in the curve.
    """
    upper = min(max_k, len(df))
    if upper < 1:
        raise ValueError("find_k needs at least one sample")

    sse = {}
    for k in range(1, upper + 1):
        kmeans = KMeans(n_clusters=k, random_state=1)
        kmeans.fit(df)
        sse[k] = kmeans.inertia_

    kn = KneeLocator(x=list(sse.keys()),
                     y=list(sse.values()),
                     curve='convex',
                     direction='decreasing')

    # KneeLocator reports None when it cannot detect a knee; fail with a clear
    # message instead of a TypeError from the arithmetic below.
    if kn.knee is None:
        raise ValueError(
            "no knee found in the inertia curve for k = 1..%d" % upper
        )

    return kn.knee + increment - decrement

In [57]:
def assign_label(df, k = -1, random_state=1):
    """Cluster ``df`` with KMeans and return a copy with the label column ``Y``.

    Parameters
    ----------
    df : DataFrame containing ``totpts_stand`` and ``age_stand``; these two
        columns are the clustering features.
    k : number of clusters.  The default ``-1`` means "choose it with
        ``find_k``".
    random_state : seed for the KMeans initialisation.  Defaults to 1, the
        same seed ``find_k`` uses, so repeated calls give the same labels.

    Returns
    -------
    Copy of ``df`` with an added ``Y`` column holding the cluster label of
    each row.  The input frame is not modified.
    """
    df_temp = df.copy()
    features = df_temp.loc[:, ['totpts_stand', 'age_stand']]

    k_temp = find_k(features) if k == -1 else k

    kmeanModel = KMeans(n_clusters=k_temp, random_state=random_state).fit(features)
    df_temp['Y'] = kmeanModel.labels_

    return df_temp

In [76]:
def cluster_sampling(df, sample_rate = 0.8, random_state=None):
    """Draw equally sized BLACK and WHITE samples from every cluster.

    For each cluster label in ``df['Y']`` (most frequent label first) the
    sample size is ``round(min(n_white, n_black) * sample_rate)`` and that
    many rows are drawn without replacement from each race.  A cluster in
    which one race is absent (or the size rounds to 0) contributes no rows
    instead of raising.

    Parameters
    ----------
    df : DataFrame with a cluster-label column ``Y`` and a ``race`` column.
    sample_rate : fraction of the smaller race group to draw per cluster.
    random_state : optional seed for reproducible sampling; ``None`` (the
        default) draws a fresh random sample on every call.

    Returns
    -------
    ``(b_df, w_df)`` : the sampled BLACK rows and the sampled WHITE rows.
    """
    # One generator shared by all draws so a single seed fixes the whole result.
    rng = np.random.RandomState(random_state)

    b = []
    w = []

    for label in df['Y'].value_counts().index:
        in_cluster = df[df['Y'] == label]
        white = in_cluster[in_cluster['race'] == 'WHITE']
        black = in_cluster[in_cluster['race'] == 'BLACK']

        # Count both races explicitly: a race missing from the cluster must
        # count as 0, not be ignored.
        sam_size = int(round(min(len(white), len(black)) * sample_rate))
        if sam_size == 0:
            continue

        w.append(white.sample(sam_size, random_state=rng))
        b.append(black.sample(sam_size, random_state=rng))

    if not b:
        # Nothing could be sampled; return empty frames with df's columns.
        return df.iloc[0:0].copy(), df.iloc[0:0].copy()

    b_df = pd.concat(b)
    w_df = pd.concat(w)

    return b_df,w_df

In [7]:
# Load the raw CSV; the path is relative to the notebook's working directory.
raw = pd.read_csv("sentencing_fips_pop2010 2.csv")

In [8]:
# Apply the row filters in drop_error_value; this also adds `sp_cj_total_days`.
raw_clean = drop_error_value(raw)

In [9]:
# Keep rows whose statute matches the 'drug' pattern ('893.*') and whose county is 'pinellas'.
df_drug_pinellas =sp_county_crime_type(county='pinellas',crime_type='drug',df= raw_clean)

In [10]:
# Build the modelling frame: WHITE/BLACK rows only, with `gender_ind` and the scaled totpts/age columns.
df_drug_model = cluster_prep(df_drug_pinellas)

In [59]:
# Split by gender so clustering and sampling run separately for each group.
# gender_ind is 1 for gender == 'MALE' and 0 for every other value.
gen_male = df_drug_model.loc[df_drug_model['gender_ind']==1,:].copy()
gen_female = df_drug_model.loc[df_drug_model['gender_ind']==0,:].copy()

# Sensitivity Check - Varying number of clusters

In [74]:
# Repeat the matched BLACK-vs-WHITE comparison of sp_total_days for a fixed
# number of clusters per gender, i = 1..7.
diff_var = []

for i in range(1,8):
    m = assign_label(gen_male, k = i)
    b_m,w_m = cluster_sampling(m)

    f = assign_label(gen_female, k = i)
    b_f,w_f = cluster_sampling(f)

    # treat = sampled BLACK rows, cont = sampled WHITE rows (both genders).
    treat = pd.concat([b_m,b_f])
    cont = pd.concat([w_m,w_f])

    temp_result = welch_ttest(treat.sp_total_days, cont.sp_total_days,'unequal')
    # Label the row with its cluster count; otherwise every row has index 0
    # and the stacked table cannot be traced back to k.
    temp_result.index = pd.Index([i], name='k')

    diff_var.append(temp_result)

In [75]:
# Stack the per-iteration results into one table, one row per loop iteration in loop order.
# NOTE: this rebinds `diff_var` from a list to a DataFrame, so the cell cannot be
# re-run without first re-running the loop that builds the list.
diff_var = pd.concat(diff_var)
diff_var

Unnamed: 0,T statistic,df,pvalue 2 sided,Difference in mean,lb,ub
0,20.574739,19017.288214,4.761835e-93,184.31456,166.755508,201.873611
0,21.488448,17814.586643,3.831086e-101,191.395494,173.937102,208.853887
0,6.853052,20522.083501,7.434081e-12,61.427695,43.858417,78.996973
0,7.620467,20109.362136,2.639382e-14,72.178744,53.613443,90.744044
0,7.265838,20292.673333,3.841556e-13,68.086173,49.718792,86.453554
0,7.734032,19718.424673,1.091823e-14,72.550103,54.163274,90.936932
0,6.345045,20508.332753,2.270112e-10,57.062108,39.434767,74.689448


# Sensitivity Check - Varying sampling ratio

In [79]:
# Repeat the matched BLACK-vs-WHITE comparison of sp_total_days while varying
# only the per-cluster sampling rate.
diff_var = []
rate = [0.4,0.6,0.7,0.8,0.9]

# The cluster assignment does not depend on the sampling rate, so it is done
# once here: each call runs a full find_k sweep, and a fixed clustering keeps
# the sampling rate as the only thing that changes between rows.
m = assign_label(gen_male)
f = assign_label(gen_female)

for i in rate:
    b_m,w_m = cluster_sampling(m,sample_rate = i)
    b_f,w_f = cluster_sampling(f, sample_rate = i)

    # treat = sampled BLACK rows, cont = sampled WHITE rows (both genders).
    treat = pd.concat([b_m,b_f])
    cont = pd.concat([w_m,w_f])

    temp_result = welch_ttest(treat.sp_total_days, cont.sp_total_days,'unequal')
    # Label the row with its sampling rate; otherwise every row has index 0.
    temp_result.index = pd.Index([i], name='sample_rate')

    diff_var.append(temp_result)

In [80]:
# Stack the per-rate results into one table, one row per entry of `rate` in loop order.
# NOTE: this rebinds `diff_var` from a list to a DataFrame, so the cell cannot be
# re-run without first re-running the loop that builds the list.
diff_var = pd.concat(diff_var)
diff_var

Unnamed: 0,T statistic,df,pvalue 2 sided,Difference in mean,lb,ub
0,5.941032,9325.662667,2.933747e-09,78.823374,52.815935,104.830812
0,5.601322,15482.616039,2.163423e-08,57.105562,37.122139,77.088985
0,7.300685,18011.205078,2.982347e-13,70.863582,51.838053,89.889112
0,7.222618,20198.331168,5.280766e-13,68.211391,49.700113,86.722669
0,7.39443,22839.682116,1.469065e-13,64.133675,47.133534,81.133816


In [81]:
# Preview the modelling frame.
# NOTE(review): the recorded output below has no `gender` column although
# cluster_prep now selects one; it appears to predate the current
# cluster_prep definition. Re-run from a fresh kernel to refresh it.
df_drug_model.head()

Unnamed: 0,gender_ind,age,totpts,race,sp_total_days,totpts_stand,age_stand
7,1,19,65.4,BLACK,1825,0.718276,-0.695758
53,1,30,111.8,WHITE,1095,0.995718,-0.092443
93,1,41,113.6,WHITE,900,0.981257,0.192701
127,1,20,71.5,BLACK,0,0.798938,-0.601414
154,1,34,124.0,BLACK,5475,0.999927,0.012086


In [85]:
# Rebuild the modelling frame with the current cluster_prep (the same call as earlier in the notebook).
df_drug_model = cluster_prep(df_drug_pinellas)

In [86]:
# Feature frame for the distance computation: the two scaled numeric columns plus the string-valued `gender` column.
X = df_drug_model.loc[:,['totpts_stand','age_stand','gender']]

In [87]:
# `gower` is already imported in the first cell; this repeat is redundant.
import gower
# Pairwise Gower distances between the rows of X.
# NOTE(review): presumably an n x n dense matrix, so memory grows quadratically with the row count; confirm it fits.
distance_matrix = gower.gower_matrix(X)

In [None]:
# DBSCAN is already imported in the first cell; this repeat is redundant.
from sklearn.cluster import DBSCAN


# metric="precomputed" makes DBSCAN treat its input as a distance matrix
# instead of a feature matrix.
# NOTE(review): eps=0.3 and min_samples=2 are unexplained constants, and this
# cell has no execution count, so it was apparently never run; confirm them.
dbscan_cluster = DBSCAN(eps=0.3, 
                        min_samples=2, 
                        metric="precomputed")


# Fit on the Gower distance matrix computed above.
dbscan_cluster.fit(distance_matrix)