In [50]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, train_test_split
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import classification_report,confusion_matrix, plot_confusion_matrix
from statsmodels.stats.proportion import proportions_ztest,confint_proportions_2indep

In [17]:
# Load the raw CSV; the relative path is resolved against the kernel's working directory.
raw = pd.read_csv("sentencing_fips_pop2010 2.csv")

In [15]:
#crime_typed = raw[raw['statut'].str.match('893.*')== True]

# Preprocessing

In [13]:
def drop_error_value(df):
    """Drop rows whose sentence fields are flagged as unusable and add a combined-days column.

    Two filters are applied in order:

    1. Rows with ``spyrs >= 99`` or ``life == 'Y'`` are removed.
    2. ``sp_cj_total_days`` is added as ``sp_total_days + cj_total_days`` and rows
       with ``sp_cj_total_days <= 7`` while ``totpts > 44`` are removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``spyrs``, ``life``, ``sp_total_days``,
        ``cj_total_days`` and ``totpts``.

    Returns
    -------
    pandas.DataFrame
        A new, independent frame (the input is not modified) holding the
        surviving rows with their original index labels plus the
        ``sp_cj_total_days`` column.
    """
    # NOTE(review): 99 / 7 / 44 are hard-coded cut-offs; their domain meaning is
    # not visible in this notebook - confirm against the data dictionary.
    extreme_sentence = (df['spyrs'] >= 99) | (df['life'] == 'Y')

    # Copy AFTER filtering so the column assignment below writes to an owned
    # frame instead of a slice (avoids pandas' SettingWithCopyWarning).
    kept = df.loc[~extreme_sentence].copy()
    kept['sp_cj_total_days'] = kept['sp_total_days'] + kept['cj_total_days']

    implausible = (kept['sp_cj_total_days'] <= 7) & (kept['totpts'] > 44)
    return kept.loc[~implausible].copy()
    

In [18]:
# Apply the row filters defined in drop_error_value to the raw frame.
raw_clean = drop_error_value(raw)

# Slicing

In [19]:
def sp_county_crime_type(county, crime_type, df):
    """Return the rows of ``df`` for one county and one crime type.

    Parameters
    ----------
    county : str
        Value compared for equality against the ``county`` column.
    crime_type : str
        Only ``'drug'`` is recognised; it selects statutes matching the regex
        ``'893.*'`` from the start of the string. Any other value uses an
        empty pattern, which matches every non-null statute, so no crime-type
        filtering happens in that case.
    df : pandas.DataFrame
        Must contain the ``statut`` and ``county`` columns.

    Returns
    -------
    pandas.DataFrame
        The matching rows, with their original index labels.
    """
    # Extend this expression when more crime types are needed.
    # The '.' in the pattern is a regex wildcard, not a literal dot.
    statute_regex = '893.*' if crime_type == 'drug' else ''

    # `== True` turns missing match results (null statutes) into False.
    is_crime_type = df['statut'].str.match(statute_regex) == True
    by_crime = df.loc[is_crime_type]

    in_county = by_crime['county'] == county
    return by_crime.loc[in_county]
    
    

In [20]:
# Keep only rows whose statute matches the 'drug' pattern and whose county value is 'pinellas'.
df_drug =sp_county_crime_type(county='pinellas',crime_type='drug',df= raw_clean)

# Processing Features

In [22]:
def feat_prep(df):
    """Build the model feature columns on a copy of ``df``.

    Added or rewritten columns:

    * ``gender_ind`` - 1 where ``gender == 'MALE'``, otherwise 0.
    * ``clfely_<value>`` - one dummy column per distinct ``clfely`` value.
    * ``sent_year`` - the first four characters of ``sentdte`` (kept as a string).
    * ``lsviol``, ``rpviol``, ``faviol`` - overwritten with 0 where the value
      equals 0 and 1 otherwise (missing values therefore become 1).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``gender``, ``clfely``, ``sentdte``, ``lsviol``,
        ``rpviol`` and ``faviol``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is left unchanged.
    """
    prepared = df.copy()

    prepared['gender_ind'] = (prepared['gender'] == 'MALE').astype('int64')

    felony_dummies = pd.get_dummies(prepared['clfely'], prefix='clfely')
    prepared = pd.concat([prepared, felony_dummies], axis=1)

    prepared['sent_year'] = prepared['sentdte'].str.slice(stop=4)

    # Collapse each violation column to a binary "anything other than 0" flag.
    for flag in ('lsviol', 'rpviol', 'faviol'):
        prepared[flag] = (prepared[flag] != 0).astype('int64')

    return prepared

In [23]:
# Add the model features (gender indicator, clfely dummies, sentencing year, binary violation flags).
df_drug_model = feat_prep(df_drug)

In [24]:
# Rows x columns after feature preparation.
df_drug_model.shape

(42267, 51)

# Outcome Variable bucket

In [27]:
def outcome_bucket(thresh,df):
    """Add the binary outcome column ``Y`` based on ``sp_total_days``.

    Parameters
    ----------
    thresh : number
        Cut-off in the units of ``sp_total_days``. Rows strictly above it get
        ``Y = 1``; rows at or below it, and rows with a missing value, get
        ``Y = 0``.
    df : pandas.DataFrame
        Must contain the ``sp_total_days`` column.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the integer ``Y`` column added (or replaced).
        The input frame is not modified, matching the other preparation
        helpers in this notebook, which also work on a copy.
    """
    labelled = df.copy()
    # NaN > thresh evaluates to False, so missing sentences fall into bucket 0.
    labelled['Y'] = (labelled['sp_total_days'] > thresh).astype('int64')
    return labelled
    

In [28]:
# Label rows with sp_total_days > 365 as Y = 1 and all other rows as Y = 0.
df_drug_model = outcome_bucket(365,df_drug_model)

In [30]:
# Inspect the column names now present on the modelling frame.
df_drug_model.columns

Index(['Unnamed: 0', 'fstnm', 'lstnm', 'age', 'dob', 'race', 'gender',
       'county', 'totpts', 'judge', 'clfely', 'statut', 'offlvl', 'offdte',
       'sentdte', 'sp_total_days', 'spyrs', 'spmths', 'spdays',
       'cj_total_days', 'cc_total_days', 'pr_total_days', 'enhanc', 'haboff',
       'habvio', 'life', 'vcc', 'prrpact', 'dornum', 'circuit', 'dispos',
       'docket', 'prioff', 'ccimp', 'cjimp', 'spimp', 'primp', 'lsviol',
       'rpviol', 'faviol', 'fips', 'state', 'pop_2010', 'sp_cj_total_days',
       'gender_ind', 'clfely_1ST DEGREE', 'clfely_1ST/LIFE',
       'clfely_2ND DEGREE', 'clfely_3RD DEGREE', 'clfely_LIFE', 'sent_year',
       'Y'],
      dtype='object')

# Model Prep

In [39]:
def model_gen(df,feat_col, mod = 'rf'):
    """Fit a classifier for ``Y`` and attach its predicted probability to every row.

    The data is split 80/20 (``random_state=1``); the model is fitted on the
    training part and accuracy, precision and recall on the held-out part are
    printed. The fitted model then scores ALL rows - including the rows it was
    trained on - and the probability of ``Y == 1`` is stored as ``propensity``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every column in ``feat_col`` plus the outcome column ``Y``.
    feat_col : list of str
        Names of the feature columns.
    mod : str, default 'rf'
        Model type. Only ``'rf'`` (random forest with 500 trees,
        ``random_state=7``) is implemented.

    Returns
    -------
    model : RandomForestClassifier
        The fitted classifier.
    scored : pandas.DataFrame
        A copy of ``df`` with the ``propensity`` column added. The input frame
        is not modified.

    Raises
    ------
    ValueError
        If ``mod`` is not a supported model type, or if the fitted model never
        saw the positive class ``1`` (no probability column exists for it).
    """
    # Fail fast: previously an unknown `mod` ran the split and then crashed
    # with an UnboundLocalError on `model`.
    if mod != 'rf':
        raise ValueError("Unsupported model type %r; only 'rf' is implemented" % (mod,))

    scored = df.copy()
    X = scored.loc[:, feat_col]
    y = scored['Y']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1)

    model = RandomForestClassifier(n_estimators=500, random_state=7)
    model.fit(X_train, y_train)
    dev_pred = model.predict(X_test)
    print('Random Forest Accuracy %.3f' %metrics.accuracy_score(y_test, dev_pred))
    print('Random Forest Precision %.3f' %metrics.precision_score(y_test, dev_pred))
    print('Random Forest Recall %.3f' %metrics.recall_score(y_test, dev_pred))

    # Look the positive class up by label instead of assuming it is column 1;
    # with a single-class training set that column would not exist.
    class_labels = list(model.classes_)
    if 1 not in class_labels:
        raise ValueError("Training data contains no rows with Y == 1; cannot compute propensity")
    scored['propensity'] = model.predict_proba(X)[:, class_labels.index(1)]

    return model, scored

In [40]:
# Feature set for the first model.
# NOTE(review): 'clfely_2ND DEGREE' is not listed - presumably left out as the reference category; confirm.
feat_col = ['gender_ind','clfely_1ST DEGREE','clfely_3RD DEGREE'
                               ,'clfely_LIFE','clfely_1ST/LIFE','sent_year','totpts','lsviol','rpviol','faviol']


# Fit the random forest and keep both the model and the frame carrying the 'propensity' column.
model_rfc,df_drug_output = model_gen(df_drug_model,feat_col, mod = 'rf')

Random Forest Accuracy 0.926
Random Forest Precision 0.827
Random Forest Recall 0.759


In [41]:
## extra try without the clfely column
# NOTE: this call rebinds model_rfc and df_drug_output, so the matching cells below
# use the propensity from this second model, not from the first one.
feat_col = ['gender_ind','sent_year','totpts','lsviol','rpviol','faviol']


model_rfc,df_drug_output = model_gen(df_drug_model,feat_col, mod = 'rf')

Random Forest Accuracy 0.925
Random Forest Precision 0.830
Random Forest Recall 0.753


# Propensity Matching

In [52]:
def propensity_matching(df, model, sample_size=2000, thresh=0.03, random_state=None):
    """Pair sampled ``race == 'BLACK'`` rows with ``race == 'WHITE'`` rows of similar propensity.

    ``sample_size`` treated rows are drawn without replacement. For each one,
    a control row is drawn uniformly at random from the WHITE rows whose
    ``propensity`` lies within ``thresh / 2`` of the treated row's value
    (bounds inclusive). Controls are drawn independently for every treated
    row, so the same control row can be used more than once.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the ``race`` and ``propensity`` columns.
    model : object
        Not used by this function; kept so existing calls keep working.
    sample_size : int, default 2000
        Number of treated rows to sample.
    thresh : float, default 0.03
        Full width of the propensity window around each treated row.
    random_state : int, numpy.random.RandomState or None, default None
        Seed (or generator) for both the treated sample and the control
        draws. ``None`` keeps the previous unseeded behaviour; pass an int to
        make the matched sample reproducible.

    Returns
    -------
    pandas.DataFrame
        One row per treated unit: the treated row's columns followed by the
        matched control's columns with the suffix ``_control``.

    Raises
    ------
    ValueError
        If there are fewer than ``sample_size`` treated rows, or if a sampled
        treated row has no control inside its propensity window.
    """
    if isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    treated_pool = df[df["race"] == "BLACK"]
    if sample_size > len(treated_pool):
        raise ValueError(
            "sample_size=%d exceeds the %d available treated rows"
            % (sample_size, len(treated_pool))
        )
    treated = treated_pool.sample(sample_size, random_state=rng).reset_index(drop=True)

    untreated = df[df["race"] == "WHITE"]
    control_scores = untreated["propensity"].to_numpy()
    half_width = thresh / 2

    # Positional indices into `untreated`, one per treated row.
    picks = []
    for score in treated["propensity"].to_numpy():
        in_window = np.flatnonzero(
            (control_scores >= score - half_width) & (control_scores <= score + half_width)
        )
        if in_window.size == 0:
            raise ValueError(
                "No control row with propensity within %g of %g; widen thresh"
                % (half_width, score)
            )
        picks.append(rng.choice(in_window))

    # Selecting by position keeps the original column dtypes, unlike rebuilding
    # a frame from individual row Series.
    matched_control_df = untreated.iloc[picks].reset_index(drop=True)

    paired_sample = treated.join(matched_control_df, rsuffix="_control")

    return paired_sample


In [53]:
# Build the matched sample with the function's default sample_size and thresh.
# The draw is random and no seed is passed here, so re-running this cell gives a different sample.
matched_df = propensity_matching(df_drug_output,model_rfc)

In [54]:
# Preview the first matched pairs.
# NOTE(review): the rendered output shows names and dates of birth - consider clearing it before sharing.
matched_df.head()

Unnamed: 0.1,Unnamed: 0,fstnm,lstnm,age,dob,race,gender,county,totpts,judge,...,sp_cj_total_days_control,gender_ind_control,clfely_1ST DEGREE_control,clfely_1ST/LIFE_control,clfely_2ND DEGREE_control,clfely_3RD DEGREE_control,clfely_LIFE_control,sent_year_control,Y_control,propensity_control
0,632171,MICHAEL,CRUMITY,46,1969-06-21,BLACK,MALE,pinellas,28.0,ANDREWS,...,0,1,0,0,0,1,0,2010,0,0.447557
1,375642,SHANEVA,BOYD,27,1981-12-29,BLACK,FEMALE,pinellas,20.2,NEWTON,...,0,0,0,0,1,0,0,2008,0,0.0
2,1222661,KEONTAE,PEDIGO,19,1990-08-24,BLACK,MALE,pinellas,32.0,QUESADA,...,0,0,0,0,0,1,0,2013,0,0.006
3,578703,ROSA,TAYLOR,41,1964-12-31,BLACK,FEMALE,pinellas,39.8,BAIRD,...,366,1,0,0,0,1,0,2008,1,0.540895
4,464148,KENDRA,DAVIS,19,1987-01-08,BLACK,FEMALE,pinellas,38.4,"BAIRD, W.",...,0,1,0,0,0,1,0,2007,0,0.0


In [55]:
# Columns of the paired frame: treated columns first, then the same columns with the '_control' suffix.
matched_df.columns

Index(['Unnamed: 0', 'fstnm', 'lstnm', 'age', 'dob', 'race', 'gender',
       'county', 'totpts', 'judge',
       ...
       'sp_cj_total_days_control', 'gender_ind_control',
       'clfely_1ST DEGREE_control', 'clfely_1ST/LIFE_control',
       'clfely_2ND DEGREE_control', 'clfely_3RD DEGREE_control',
       'clfely_LIFE_control', 'sent_year_control', 'Y_control',
       'propensity_control'],
      dtype='object', length=106)

# Statistical evaluation

In [56]:
def get_sigfig(df,sample_size=None):
    """Compare the share of ``Y == 1`` between the treated and matched-control columns.

    Runs a two-sided two-proportion z-test and computes a 95% confidence
    interval for the difference in proportions (treated minus control), prints
    both, and returns them.

    Parameters
    ----------
    df : pandas.DataFrame
        Paired sample with the columns ``Y`` (treated outcome) and
        ``Y_control`` (matched control outcome), both coded 0/1.
    sample_size : int or None, default None
        Number of observations per group. ``None`` uses ``len(df)``, so the
        denominator always matches the frame that supplied the counts. An
        explicit value is still honoured for backward compatibility.

    Returns
    -------
    dict
        ``{'pvalue': ..., 'lower': ..., 'upper': ...}``.

    Raises
    ------
    ValueError
        If the number of observations per group is not positive.
    """
    # NOTE(review): both procedures treat the two groups as independent samples
    # even though the rows are matched pairs - confirm this is intended.
    n_per_group = len(df) if sample_size is None else sample_size
    if n_per_group <= 0:
        raise ValueError("get_sigfig needs at least one observation per group")

    treat_pos = df['Y'].sum()
    control_pos = df['Y_control'].sum()

    count = np.array([treat_pos, control_pos])
    nobs = np.array([n_per_group, n_per_group])
    stat, pval = proportions_ztest(count, nobs,alternative='two-sided')
    print('p-value: {0:0.3f}'.format(pval))

    # method=None lets statsmodels pick its default interval method for compare='diff'.
    lower, upper = confint_proportions_2indep(treat_pos, n_per_group, control_pos, n_per_group, method=None, compare='diff', alpha=0.05, correction=True)
    print('confidence interval',(lower, upper))

    sigfig = {'pvalue':pval,'lower':lower,'upper':upper}

    return sigfig

In [57]:
# Test whether the share of Y == 1 differs between the treated rows and their matched controls.
result = get_sigfig(matched_df)
result

p-value: 0.749
confidence interval (-0.023048062740814564, 0.03203908680329136)


{'pvalue': 0.7489189615465779,
 'lower': -0.023048062740814564,
 'upper': 0.03203908680329136}