# Identifying the Factors that Influence Pre-Trial Release of Felony Defendants

In [111]:
%matplotlib inline

In [112]:
import matplotlib.pyplot as plt
import seaborn as sns

import numpy as np
import pandas as pd

from patsy import dmatrices
from sklearn.linear_model import LogisticRegression

try:
    # train_test_split lives in model_selection on current scikit-learn
    from sklearn.model_selection import train_test_split
except ImportError:
    # very old scikit-learn releases only ship the legacy module
    from sklearn.cross_validation import train_test_split

### Gather Harris County felony defendant records

In [113]:
# Read the felony offense records straight from the project's GitHub repository.
DATA_URL = 'https://raw.github.com/natethedrummer/pretrial-release/master/felony_offenses.csv'
df = pd.read_csv(DATA_URL)

In [114]:
# Preview only the first rows instead of dumping the entire frame.
df.head(10)

Unnamed: 0,ref,RECIDIVISM Y/N (POST DISP. OUTCOME),PRETRIAL MISCONDUCT Y/N,SPN 2,DISPOSITION SEVERITY SCORE,DATE FILED,DATE DISPOSED,SPN,HCJ DATE BOOKED,DATE INITIAL RELEASED,...,V,priors,LABEL DETAINED DAYS,LABEL 2 DETAINED DAYS,HCJ Booked,access,f_priors,m_priors,hired_attorney,poc
0,1349,N,N,10267,5,2/19/12,4/3/12,10267,2/19/12,4/3/12,...,44,1,31-60 DAYS,3,Booked,0,1,1,0,1
1,2616,N,N,40834,5,4/6/12,6/12/12,40834,4/11/12,6/12/12,...,62,1,61-90 DAYS,4,Booked,0,1,1,0,1
2,2673,N,N,43872,6,4/9/12,5/1/13,43872,6/26/12,5/1/13,...,309,1,121-365 DAYS,6,Booked,0,1,1,0,1
3,362,N,N,48182,6,1/15/12,2/16/12,48182,1/15/12,2/16/12,...,32,1,31-60 DAYS,3,Booked,0,1,1,1,1
4,531,N,N,48245,2,1/21/12,5/10/12,48245,4/20/12,5/10/12,...,20,1,11-30 DAYS,2,Booked,0,1,1,0,0
5,1647,N,N,50295,4,3/1/12,3/5/12,50295,3/2/12,3/5/12,...,3,1,0-10 DAYS,1,Booked,0,1,1,0,1
6,2621,N,N,50302,6,4/7/12,6/7/12,50302,4/7/12,6/7/12,...,61,1,61-90 DAYS,4,Booked,0,1,1,0,0
7,511,N,N,55175,6,1/21/12,8/16/13,55175,1/21/12,8/16/13,...,573,1,>1YRS<2YRS,0,Booked,0,1,1,1,1
8,1550,N,N,75989,4,2/27/12,4/23/12,75989,2/28/12,4/23/12,...,55,1,31-60 DAYS,3,Booked,0,1,1,1,0
9,1233,N,N,89246,6,2/14/12,4/20/12,89246,2/15/12,4/20/12,...,65,1,61-90 DAYS,4,Booked,0,1,1,1,1


### Include disposed cases only

In [115]:
# Keep only rows whose case status is exactly 'DISPOSED'.
is_disposed = df['CASE DISPOSED STATUS'] == 'DISPOSED'
df = df[is_disposed]

In [116]:
# Columns carried forward into feature engineering.
model_columns = ['access',
                 'BOND $',
                 'felony priors',
                 'Misd priors',
                 'OffenseDescription',
                 'Offense',
                 'OffenseClass',
                 'hired_attorney',
                 'gender',
                 'race',
                 'age']

# Take an explicit copy: the selection would otherwise be a slice of the
# filtered frame, and adding or modifying columns on such a slice triggers
# SettingWithCopyWarning.
df = df[model_columns].copy()

In [117]:
# Work on an independent copy so the column assignments below never write
# into a slice of another frame (avoids SettingWithCopyWarning).
df = df.copy()


def _indicator(values, match):
    """Return 1 where `values` equals `match` and 0 everywhere else (including missing)."""
    # Assign the filled result instead of calling fillna(inplace=True) on a
    # column selection, which is not guaranteed to write back to the frame.
    return values.map({match: 1}).fillna(0)


# Collapse the raw offense codes into broader offense categories;
# codes that are not listed here map to NaN.
series_offense = pd.Series({'ARSON': 'ARSON',
                          'SALE DRUG': 'DRUG',
                          'POSS DRUG': 'DRUG',
                          'FEL DWI': 'DWI',
                          'KIDNAPPING': 'KIDNAPPING',
                          'CAP MURDER': 'MURDER',
                          'CAPITAL MURDER': 'MURDER',
                          'ASLT-MURDR': 'MURDER',
                          'MURD/MANSL': 'MURDER',
                          'MURDER': 'MURDER',
                          'ROBBERY': 'ROBBERY',
                          'THEFT': 'ROBBERY',
                          'BURGLARY': 'ROBBERY',
                          'burglary': 'ROBBERY',
                          'AUTO THEFT': 'ROBBERY',
                          'RAPE': 'SEX ABUSE',
                          'SEX ABUSE': 'SEX ABUSE',
                          'OTHER FEL': 'OTHER',
                          'OTHERMISD': 'OTHER'})
df['offense_bin'] = df['Offense'].map(series_offense)

# One 0/1 indicator column per felony class, then drop the source column.
for felony_class in ['FC', 'F1', 'F2', 'F3', 'FS']:
    df[felony_class] = np.where(df['OffenseClass'] == felony_class, 1, 0)
df = df.drop('OffenseClass', axis=1)

# Give the carried-over columns their presentation names.
df = df.rename(columns={'age': 'Age',
                        'access': 'Made Bail',
                        'Misd priors': 'Prior Misdemeanor Count',
                        'felony priors': 'Prior Felony Count',
                        'hired_attorney': 'Private Attorney'})

# Yes/no versions of the prior-charge counts.
df['Prior Misdemeanor'] = np.where(df['Prior Misdemeanor Count'] >= 1, 1, 0)
df['Prior Felony'] = np.where(df['Prior Felony Count'] >= 1, 1, 0)

# 'NO BOND' rows have no dollar figure; index alignment leaves them as NaN.
has_bond = df['BOND $'] != 'NO BOND'
df['Bond Amount'] = df.loc[has_bond, 'BOND $'].astype(float)

df['DWI'] = _indicator(df['offense_bin'], 'DWI')

# Flag offense descriptions mentioning family, children or kids (case-insensitive);
# missing descriptions count as 0.
df['Offense Against Family'] = (
    df['OffenseDescription']
    .str.contains('fam|chil|kid', case=False, na=False)
    .astype(int)
)

df['Male'] = _indicator(df['gender'], 'M')
df['Black'] = _indicator(df['race'], 'BLACK')
df['Hispanic'] = _indicator(df['race'], 'HISPANIC')

In [118]:
# Build a per-variable summary table (one row per numeric column of `df`).
# Rows are selected with .loc: the .ix indexer is deprecated and has been
# removed from later pandas releases.
df_desc = (
    df.describe()
    .loc[['count', 'mean']]
    .transpose()
    .rename(columns={'count': 'Sample Size', 'mean': 'Mean Value'})
    .sort_index()
)
df_desc.index.name = 'Variable'

# Present counts with thousands separators and means as whole percentages
# (the percentage is truncated, not rounded, by the int cast).
df_desc['Sample Size'] = df_desc['Sample Size'].astype(int).map('{:,}'.format)
df_desc['Mean Value'] = (df_desc['Mean Value'] * 100).astype(int).astype(str) + '%'

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  


# Describe Defendants

# Describe Defendants: Made Bail

Only 40% of defendants made bail and were released before trial.

In [119]:
# Summary row for the 'Made Bail' indicator.
df_desc.loc[['Made Bail'], :]

Unnamed: 0_level_0,Sample Size,Mean Value
Variable,Unnamed: 1_level_1,Unnamed: 2_level_1
Made Bail,3189,40%


# Describe Defendants: Legal Representation

Only 35% of defendants hired a private attorney.

In [120]:
# Summary row for the 'Private Attorney' indicator.
df_desc.loc[['Private Attorney'], :]

Unnamed: 0_level_0,Sample Size,Mean Value
Variable,Unnamed: 1_level_1,Unnamed: 2_level_1
Private Attorney,3189,35%


# Describe Defendants: Demographics

The majority of defendants (72%) were people of color.  
48% of defendants were Black and 24% were Hispanic.

In [121]:
# Summary rows for the 'Black' and 'Hispanic' indicators.
df_desc.loc[['Black', 'Hispanic'], :]

Unnamed: 0_level_0,Sample Size,Mean Value
Variable,Unnamed: 1_level_1,Unnamed: 2_level_1
Black,3189,48%
Hispanic,3189,24%


83% of defendants were Male.

In [122]:
# Summary row for the 'Male' indicator.
df_desc.loc[['Male'], :]

Unnamed: 0_level_0,Sample Size,Mean Value
Variable,Unnamed: 1_level_1,Unnamed: 2_level_1
Male,3189,83%


# Describe Defendants: Felony Class

The most frequent (45%) felony charge was a State Jail Felony (FS).  
Less than 1% of defendants were charged with a Capital Felony (FC).

In [123]:
# Summary rows for the five felony-class indicators.
df_desc.loc[['F1', 'F2', 'F3', 'FC', 'FS'], :]

Unnamed: 0_level_0,Sample Size,Mean Value
Variable,Unnamed: 1_level_1,Unnamed: 2_level_1
F1,3189,10%
F2,3189,18%
F3,3189,24%
FC,3189,0%
FS,3189,45%


# Describe Defendants: Prior Charges

53% of defendants had a prior felony charge and 65% had a prior misdemeanor charge.

In [124]:
# Summary rows for the yes/no prior-charge indicators.
df_desc.loc[['Prior Felony', 'Prior Misdemeanor'], :]

Unnamed: 0_level_0,Sample Size,Mean Value
Variable,Unnamed: 1_level_1,Unnamed: 2_level_1
Prior Felony,3189,53%
Prior Misdemeanor,3189,65%


In [125]:
# Turn the 'NN%' strings back into plain numbers (e.g. '190%' -> 1.9) so the
# count variables below can be shown as averages. Expects the column to still
# hold percent-formatted strings.
df_desc['Mean Value'] = df_desc['Mean Value'].str.strip('%').astype(float) / 100

On average, a defendant had previously been charged with:   
* 1.9 felonies  
* 2.3 misdemeanors

In [126]:
# Summary rows for the prior-charge counts.
df_desc.loc[['Prior Felony Count', 'Prior Misdemeanor Count'], :]

Unnamed: 0_level_0,Sample Size,Mean Value
Variable,Unnamed: 1_level_1,Unnamed: 2_level_1
Prior Felony Count,3189,1.9
Prior Misdemeanor Count,3189,2.28


In [127]:
# Re-format the numeric means as whole-dollar strings with thousands separators.
# Expects the column to hold numbers at this point.
df_desc['Mean Value'] = df_desc['Mean Value'].map('${:,.0f}'.format)

# Describe Defendants: Bond Amount

On average, a defendant's Bond Amount was set at $22,369.

In [128]:
# Summary row for 'Bond Amount'.
df_desc.loc[['Bond Amount'], :]

Unnamed: 0_level_0,Sample Size,Mean Value
Variable,Unnamed: 1_level_1,Unnamed: 2_level_1
Bond Amount,2729,"$22,369"


# Identify Factors that Influence Bond Amount

In [130]:
# Summarise bail amounts by age group x sex, by sex, and by race.
# The work happens on a separate frame (`df_bail`) so the engineered `df`
# is left intact and this cell can be re-run.

# select features; rename gender to sex
df_bail = df[['BOND $', 'gender', 'race', 'Age']].rename(columns={'gender': 'sex'})

# bail amount: 'NO BOND' rows have no dollar figure and stay NaN
has_bond = df_bail['BOND $'] != 'NO BOND'
df_bail['Bail Amount'] = df_bail.loc[has_bond, 'BOND $'].astype(float)
df_bail = df_bail.drop('BOND $', axis=1)

# drop rows whose age is the literal '#VALUE!' marker, then make age numeric
df_bail = df_bail[df_bail['Age'] != '#VALUE!'].copy()
df_bail['Age'] = df_bail['Age'].astype(float)

# age groups: each bin includes its upper edge (pd.cut default)
bins = [0, 20, 30, 45, 100]
group_names = ['Under 20',
               'Twenties',
               '30 to 45',
               'Over 45']
df_bail['age category'] = pd.cut(df_bail['Age'], bins, labels=group_names)

# bail amount statistics by age category and sex
frames = [
    df_bail['Bail Amount']
    .groupby([df_bail['age category'], df_bail['sex']])
    .describe()
]

# bail amount statistics by each single demographic;
# `df_frame` ends up holding the last one ('race')
demographics = ['sex', 'race']
for d in demographics:
    df_frame = df_bail['Bail Amount'].groupby(df_bail[d]).describe()
    frames.append(df_frame)

In [131]:
# First five rows of the most recently computed summary frame.
df_frame.head(5)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
race,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
BLACK,1218.0,20556.239737,38435.08114,0.0,5000.0,15000.0,20000.0,500000.0
HISPANIC,606.0,31929.042904,97177.837735,2000.0,5000.0,15000.0,35000.0,2000000.0
OTHER,27.0,18333.333333,22927.98089,2000.0,3000.0,10000.0,30000.0,100000.0
WHITE,640.0,21005.0,47062.76773,1000.0,5000.0,10000.0,20000.0,800000.0


In [None]:
# Write one CSV per summary frame: bail_by_demographics_0.csv, _1.csv, ...
for i, frame in enumerate(frames):

    # include count and mean only, under shorter column names;
    # rename() returns a new frame, so the entries in `frames` are untouched
    summary = frame[['count', 'mean']].rename(columns={'count': 'N',
                                                       'mean': 'Mean'})

    # output to CSV
    summary.to_csv('bail_by_demographics_' + str(i) + '.csv')

### Outline
* OLS results for the natural log of bond amount
* Model 1: felony class, family, DWI, priors (yes/no)
* Model 2: felony class, family, DWI, priors (yes/no), private attorney, Black, Hispanic, female, age
* Coefficient, standard error, significance at 0.10, 0.05, or 0.01
* Count
* F-statistic with p-value
* R-squared and adjusted R-squared
* Estimated coefficients and odds ratios of the logit equation: probability of bail
* Estimated probability of bail for selected defendant types

In [None]:
# Columns used by the pre-trial release model.
# NOTE(review): `df_offenses` is not defined anywhere in the visible notebook;
# confirm where it should come from before running this cell on a fresh kernel.
release_columns = ['SPN',
                   'access',
                   'priors',
                   'f_priors',
                   'm_priors',
                   'hired_attorney',
                   'poc',
                   'gender',
                   'offense_bin']
df_release = df_offenses[release_columns]

In [None]:
# check for multicollinearity
# A pair of distinct predictors fails the test when the absolute value of
# their (rounded) correlation exceeds this threshold.
CORR_THRESHOLD = 0.5

corr_inputs = df_release[['priors', 'hired_attorney', 'poc', 'gender', 'offense_bin']]

# Correlation is only computed for numeric columns. Select them explicitly:
# older pandas dropped non-numeric columns silently, recent pandas raises.
df_corr = corr_inputs.select_dtypes(include=[np.number]).corr().round(2)

# Every variable correlates 1.0 with itself, so blank out the diagonal before
# testing; otherwise each column's maximum is always 1 and nothing is flagged.
off_diagonal = df_corr.mask(np.eye(len(df_corr), dtype=bool))

# Use absolute values so strong negative correlations are caught as well.
if (off_diagonal.abs() > CORR_THRESHOLD).any().any():
    print("Multicollinearity Test: Fail")
    print(df_corr)

plot_corr = sns.heatmap(df_corr,
            xticklabels=df_corr.columns.values,
            yticklabels=df_corr.columns.values)

fig_corr = plot_corr.get_figure()

fig_corr.savefig("plot_corr.png")

# the figure is saved to disk; close it so it is not also rendered inline
plt.close()

In [None]:
# Regression formula: release outcome explained by priors, attorney type,
# person-of-color flag, gender and offense category.
release_formula = 'access ~ priors + hired_attorney + poc + gender + offense_bin'
y, X = dmatrices(release_formula, df_release, return_type="dataframe")

# scikit-learn expects the target as a flat 1-D array
y_ravel = y.values.ravel()

# hold out 30% of the rows for validation; fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X, y_ravel, test_size=0.3, random_state=0)

# estimate coefficients
model = LogisticRegression(multi_class='multinomial', solver='newton-cg')

model.fit(X_train, y_train)

In [None]:
# Report feature importance: plot it, save the plot, and export the table.
# NOTE(review): `coef` is not defined in the visible notebook; it presumably
# returns a frame with 'feature' and 'probability' columns. Confirm.
df_coef = coef(model, X, X_train, y_train)

plot_coef = sns.barplot(data=df_coef, x="feature", y="probability")
fig_coef = plot_coef.get_figure()
fig_coef.savefig("plot_coef.png")

# figure already saved to disk; close it
plt.close()

df_coef.to_csv('coef.csv')

In [None]:
# Evaluate the fitted model on the held-out split and export the result.
# NOTE(review): `accuracy` is not defined in the visible notebook; confirm its source.
df_accuracy = accuracy(model, X_test, y_test)
df_accuracy.to_csv('accuracy.csv')

In [None]:
# Generate predictions with the fitted model and export them.
# NOTE(review): neither `pred` nor `df_offenses` is defined in the visible
# notebook; confirm where they come from.
df_pred = pred(model, X, y, df_offenses)
df_pred.to_csv('pred.csv')