# <center>Identifying Factors that Influence Bond Amount, Pre-Trial Status at Disposition, and Disposition Outcome</center>  
## <center>A cross-sectional statistical analysis of Harris County felony defendant records</center>

In [1]:
# import packages
import numpy as np
import pandas as pd

from scipy import stats
stats.chisqprob = lambda chisq, df: stats.chi2.sf(chisq, df)
import statsmodels.formula.api as smf

%matplotlib inline

# read felony records dataset
# The CSV is fetched directly from the URL below on every run.
FELONY_RECORDS_URL = 'https://raw.github.com/natethedrummer/pretrial-release/master/felony_offenses.csv'
df = pd.read_csv(FELONY_RECORDS_URL)

# SPN is stored as text rather than as a number
df['SPN'] = df['SPN'].astype(str)

# bin offense
# Each coarse bin is listed next to the raw 'Offense' labels it absorbs.
# Labels that are not listed here map to NaN in 'offense_bin'.
offense_bins = [
    ('ARSON', ['ARSON']),
    ('DRUG', ['SALE DRUG', 'POSS DRUG']),
    ('DWI', ['FEL DWI']),
    ('KIDNAPPING', ['KIDNAPPING']),
    ('MURDER', ['CAP MURDER', 'CAPITAL MURDER', 'ASLT-MURDR', 'MURD/MANSL', 'MURDER']),
    ('ROBBERY', ['ROBBERY', 'THEFT', 'BURGLARY', 'burglary', 'AUTO THEFT']),
    ('SEX ABUSE', ['RAPE', 'SEX ABUSE']),
    ('OTHER', ['OTHER FEL', 'OTHERMISD']),
]
# Invert the grouping into a raw-label -> bin lookup.
series_offense = pd.Series({raw_label: bin_name
                            for bin_name, raw_labels in offense_bins
                            for raw_label in raw_labels})
df['offense_bin'] = df['Offense'].map(series_offense)

# binary offense variables
# One indicator column per offense bin, named after the bin: 1.0 where the row
# falls in that bin, 0.0 otherwise (rows without a bin get 0.0 everywhere).
# NaN is dropped from the bin list so that rows whose 'Offense' label was not
# binned do not produce an indicator column named NaN.
offense_list = df['offense_bin'].dropna().unique().tolist()
for offense in offense_list:
    # Assign the filled Series back to the frame. Calling
    # df[offense].fillna(..., inplace=True) operates on a temporary Series and
    # leaves the NaNs in df when pandas Copy-on-Write is active.
    df[offense] = df['offense_bin'].map({offense: 1}).fillna(0)

# Felony Class Offense
# One 0/1 indicator column per offense class code, named after the code.
for class_code in ('FC', 'F1', 'F2', 'F3', 'FS'):
    df[class_code] = np.where(df['OffenseClass'] == class_code, 1, 0)

# priors
# Give the raw count columns readable names, then flag defendants that have at
# least one prior of each kind (1) versus none (0).
df = df.rename(columns={'Misd priors': 'Prior Misdemeanor Count',
                        'felony priors': 'Prior Felony Count'})
has_prior_misdemeanor = df['Prior Misdemeanor Count'] >= 1
has_prior_felony = df['Prior Felony Count'] >= 1
df['Prior Misdemeanor'] = np.where(has_prior_misdemeanor, 1, 0)
df['Prior Felony'] = np.where(has_prior_felony, 1, 0)

# dwi
# 1.0 for rows binned as DWI, 0.0 for every other row (including rows with no
# offense bin). The filled Series is assigned back to the frame because
# df['DWI'].fillna(..., inplace=True) only touches a temporary Series when
# pandas Copy-on-Write is active, leaving NaNs in df.
df['DWI'] = df['offense_bin'].map({'DWI': 1}).fillna(0)

# family offense
# Flag offense descriptions containing 'fam', 'chil' or 'kid' (case-insensitive);
# a missing description counts as no match.
# NOTE(review): 'kid' also matches descriptions containing "kidnapping" - confirm this is intended.
family_pattern = 'fam|chil|kid'
is_family_offense = df['OffenseDescription'].str.contains(family_pattern, case=False, na=False)
df['Offense Against Family'] = is_family_offense.astype(int)

# race
# One indicator column per race label: 1.0 where 'race' equals the label,
# 0.0 otherwise (a missing race value yields 0.0 in every column).
# The filled Series is assigned back to the frame; calling
# df[col].fillna(..., inplace=True) acts on a temporary Series and does not
# update df when pandas Copy-on-Write is active.
for column, label in (('Black', 'BLACK'), ('Hispanic', 'HISPANIC'), ('White', 'WHITE')):
    df[column] = df['race'].map({label: 1}).fillna(0)

# sex
# One indicator column per gender code: 1.0 where 'gender' equals the code,
# 0.0 otherwise (a missing gender value yields 0.0 in both columns).
# The filled Series is assigned back to the frame; calling
# df[col].fillna(..., inplace=True) acts on a temporary Series and does not
# update df when pandas Copy-on-Write is active.
for column, code in (('Female', 'F'), ('Male', 'M')):
    df[column] = df['gender'].map({code: 1}).fillna(0)

# pretrial status at disposition
# Raw status text -> display label; statuses not listed become NaN.
sbc = pd.Series(dict([
    ('DETAINED', 'Detained'),
    ('ON BOND', 'On Bond'),
    ('NON -ARREST - NOT BOOKED - DEFERRED', 'Deferred'),
    ('NON-ARREST - NOT BOOKED - DISMISSED', 'Dismissed'),
    ('NON ARREST - NOT BOOKED - NO BILL', 'No Bill'),
]))
pretrial_status = df['PRETRIAL STATUS AT DISPOSITION'].map(sbc)
df['PRETRIAL STATUS AT DISPOSITION'] = pretrial_status

# on bond at disposition (1 = on bond, 0 = anything else)
df['On_Bond'] = np.where(pretrial_status == 'On Bond', 1, 0)

# detained at disposition (1 = detained, 0 = anything else)
df['Detained'] = np.where(pretrial_status == 'Detained', 1, 0)
 
# bond category
# (raw value, display label) pairs, listed in the order the categories should
# appear in tables. Raw values not listed here become NaN.
bond_labels = [
    ('2000 or less', '$2,000 or less'),
    ('2001-5000', '$2,001-$5,000'),
    ('5001-10000', '$5,001-$10,000'),
    ('10001-20000', '$10,001-$20,000'),
    ('> 20000', 'Greater than $20,000'),
    ('NO BOND', 'No Bond'),
]
sps = pd.Series(dict(bond_labels))
bond_category_order = [label for _, label in bond_labels]
df['BOND CAT'] = pd.Categorical(df['BOND CAT'].map(sps), bond_category_order)

# counsel type
# (indicator column, counsel_type value) pairs; each column is 1 where
# counsel_type equals the value and 0 otherwise.
counsel_indicators = (
    ('Hired Attorney', 'Hired Attorney'),
    ('Appointed Attorney', 'Appointed Attorney'),
    ('Public Defender', 'Public Defender'),
    ('Unknown Counsel', 'Other/Unknown'),
)
for column, counsel_label in counsel_indicators:
    df[column] = np.where(df['counsel_type'] == counsel_label, 1, 0)

# bond type
df = df.rename(columns={'bail type made simple': 'Bail Type'})
# (indicator column, 'Bail Type' code) pairs; each column is 1 where the bail
# type equals the code and 0 otherwise.
bail_indicators = (
    ('PTR', 'PTR'),
    ('Surety', 'SURETY'),
    ('Cash', 'CASH'),
    ('None', 'NONE'),
)
for column, bail_code in bail_indicators:
    df[column] = np.where(df['Bail Type'] == bail_code, 1, 0)

# age
# '#VALUE!' entries are first turned into '0' so the column can be cast to
# float; every 0 is then blanked out to NaN.
age_as_float = df['age'].replace(to_replace='#VALUE!', value='0').astype(float)
df['Age'] = age_as_float.replace(to_replace=0, value=np.nan)

# judgement
# Each disposition outcome is listed next to the raw 'JUDGEMENT' strings that
# count towards it. Raw strings not listed here become NaN.
judgement_groups = [
    ('Acquitted', ['ACQUITTAL insane',
                   'JURY ACQUITTAL',
                   'AQUITTAL',
                   'NOT GUILTY BY JURY VERDICT']),
    ('Convicted', ['CONV BY JURY VERDICT',
                   'DEFENDANT CONVICTED ON ANOTHER CHARGE',
                   'GUILTY  JURY verdict',
                   'GUILTY JURY VERDICT',
                   'GUILTY PLEA - NO JURY',
                   'plead guilty',
                   'JURY VERDCT',
                   'JURY VERDICT',
                   'JURY verdict',
                   'jury verdict',
                   'nolo contendre',
                   'nolo contendre no jury']),
    ('Deferred', ['DEFERRED ADJUDICATION',
                  'DEFERRED ADJUDICATION ',
                  'pretrial diversion']),
    ('Dismissed', ['DISMISSED',
                   'SURETYPTRc ']),
    ('No Bill', ['NO BILL']),
]
# Invert the grouping into a raw-string -> outcome lookup.
sjc = pd.Series({raw_text: outcome
                 for outcome, raw_texts in judgement_groups
                 for raw_text in raw_texts})
judgement = df['JUDGEMENT'].map(sjc)
df['JUDGEMENT'] = judgement

# convicted (1 = convicted, 0 = any other or missing outcome)
df['Convicted'] = np.where(judgement == 'Convicted', 1, 0)

# Data Source for Statistical Analysis 
The records of 3,189 Harris County felony defendants were made available.  Some of these 3,189 defendants were excluded from analysis.  
* 244 defendants were excluded from analysis because they were not booked.  
* An additional 455 defendants were excluded from analysis because they were denied bail.  
* The remaining 2,490 defendants were included in analysis.  

In [2]:
# Defendant counts by booking status and bond category, with each count also
# shown as a rounded percentage of all counted defendants.
dft = (
    df.groupby(['HCJ Booked', 'BOND CAT'])['SPN']
    .count()
    .reset_index()
    .rename(columns={'SPN': 'N Defendants',
                     'HCJ Booked': 'Booked Status',
                     'BOND CAT': 'Bond Category'})
)
pct_of_total = dft['N Defendants'] / dft['N Defendants'].sum() * 100
dft['% Defendants'] = pct_of_total.round().astype(int).astype(str) + '%'
# Format the counts last: after this line the column holds strings.
dft['N Defendants'] = dft['N Defendants'].map('{:,.0f}'.format)
dft

Unnamed: 0,Booked Status,Bond Category,N Defendants,% Defendants
0,Booked,"$2,000 or less",342,11%
1,Booked,"$2,001-$5,000",409,13%
2,Booked,"$5,001-$10,000",409,13%
3,Booked,"$10,001-$20,000",704,22%
4,Booked,"Greater than $20,000",626,20%
5,Booked,No Bond,455,14%
6,Not Booked,"$2,000 or less",87,3%
7,Not Booked,"$2,001-$5,000",61,2%
8,Not Booked,"$5,001-$10,000",37,1%
9,Not Booked,"$10,001-$20,000",28,1%


In [3]:
# Keep only defendants that were booked and whose bond category is not 'No Bond'.
was_booked = df['HCJ Booked'] == 'Booked'
has_bond = df['BOND CAT'] != 'No Bond'
df = df.loc[was_booked & has_bond]

# Descriptive Statistics of Defendants

## Bond Amount  
The median bond amount was $15,000 and the average was $23,424.  14% of defendants had a bond amount of $2,000 or less.  

In [4]:
# Summary statistics of the bond amount, formatted as whole dollars.
bond_amount = df['BOND $'].astype(float).rename('Bond Amount')
dft = bond_amount.describe().reset_index().rename(columns={'index': 'Statistic'})
dft['Bond Amount'] = dft['Bond Amount'].map('${:,.0f}'.format)
# The row count is not a dollar figure, so it is left out of the table.
dft = dft[dft['Statistic'].ne('count')]
dft

Unnamed: 0,Statistic,Bond Amount
1,mean,"$23,424"
2,std,"$60,125"
3,min,"$1,000"
4,25%,"$5,000"
5,50%,"$15,000"
6,75%,"$24,250"
7,max,"$2,000,000"


In [5]:
# Defendant counts per bond category, with each count also shown as a rounded
# percentage of the total.
dft = (
    df.groupby(['BOND CAT'])['SPN']
    .count()
    .reset_index()
    .rename(columns={'SPN': 'N Defendants',
                     'BOND CAT': 'Bond Category'})
)
pct_of_total = dft['N Defendants'] / dft['N Defendants'].sum() * 100
dft['% Defendants'] = pct_of_total.round().astype(int).astype(str) + '%'
# Format the counts last: after this line the column holds strings.
dft['N Defendants'] = dft['N Defendants'].map('{:,.0f}'.format)
dft

Unnamed: 0,Bond Category,N Defendants,% Defendants
0,"$2,000 or less",342,14%
1,"$2,001-$5,000",409,16%
2,"$5,001-$10,000",409,16%
3,"$10,001-$20,000",704,28%
4,"Greater than $20,000",626,25%
5,No Bond,0,0%


## Pre-Trial Status at Disposition  
Of the 2,490 defendants, only 898 (36%) were on bond at disposition. 

In [6]:
# Defendant counts per pre-trial status at disposition, with each count also
# shown as a rounded percentage of the total.
dft = (
    df.groupby(['PRETRIAL STATUS AT DISPOSITION'])['SPN']
    .count()
    .reset_index()
    .rename(columns={'SPN': 'N Defendants',
                     'PRETRIAL STATUS AT DISPOSITION': 'Pre-Trial Status at Disposition'})
)
pct_of_total = dft['N Defendants'] / dft['N Defendants'].sum() * 100
dft['% Defendants'] = pct_of_total.round().astype(int).astype(str) + '%'
# Format the counts last: after this line the column holds strings.
dft['N Defendants'] = dft['N Defendants'].map('{:,.0f}'.format)
dft

Unnamed: 0,Pre-Trial Status at Disposition,N Defendants,% Defendants
0,Detained,1592,64%
1,On Bond,898,36%


## Disposition Outcome  
2 out of 2,490 defendants had no judgment information.  Out of 2,488 defendants, 1,069 (43%) were not convicted. 

In [7]:
# Defendant counts per disposition outcome, with each count also shown as a
# rounded percentage of the total. Rows with a missing JUDGEMENT are not
# counted because groupby skips NaN keys.
dft = (
    df.groupby(['JUDGEMENT'])['SPN']
    .count()
    .reset_index()
    .rename(columns={'SPN': 'N Defendants',
                     'JUDGEMENT': 'Judgement'})
)
pct_of_total = dft['N Defendants'] / dft['N Defendants'].sum() * 100
dft['% Defendants'] = pct_of_total.round().astype(int).astype(str) + '%'
# Format the counts last: after this line the column holds strings.
dft['N Defendants'] = dft['N Defendants'].map('{:,.0f}'.format)
dft

Unnamed: 0,Judgement,N Defendants,% Defendants
0,Acquitted,4,0%
1,Convicted,1419,57%
2,Deferred,610,25%
3,Dismissed,382,15%
4,No Bill,73,3%


## Bail Type  
Of the 2,490 defendants, 1,427 (57%) never posted bail.  

In [8]:
# Defendant counts per bail type, with each count also shown as a rounded
# percentage of the total.
dft = (
    df.groupby(['Bail Type'])['SPN']
    .count()
    .reset_index()
    .rename(columns={'SPN': 'N Defendants'})
)
pct_of_total = dft['N Defendants'] / dft['N Defendants'].sum() * 100
dft['% Defendants'] = pct_of_total.round().astype(int).astype(str) + '%'
# Format the counts last: after this line the column holds strings.
dft['N Defendants'] = dft['N Defendants'].map('{:,.0f}'.format)
dft

Unnamed: 0,Bail Type,N Defendants,% Defendants
0,CASH,4,0%
1,NONE,1427,57%
2,PTR,44,2%
3,SURETY,1015,41%


## Counsel Type  
Of the 2,490 defendants, only 848 (34%) hired an attorney.  

In [9]:
# Defendant counts per counsel type, with each count also shown as a rounded
# percentage of the total.
dft = (
    df.groupby(['counsel_type'])['SPN']
    .count()
    .reset_index()
    .rename(columns={'SPN': 'N Defendants',
                     'counsel_type': 'Counsel Type'})
)
pct_of_total = dft['N Defendants'] / dft['N Defendants'].sum() * 100
dft['% Defendants'] = pct_of_total.round().astype(int).astype(str) + '%'
# Format the counts last: after this line the column holds strings.
dft['N Defendants'] = dft['N Defendants'].map('{:,.0f}'.format)
dft

Unnamed: 0,Counsel Type,N Defendants,% Defendants
0,Appointed Attorney,1445,58%
1,Hired Attorney,848,34%
2,Other/Unknown,58,2%
3,Public Defender,139,6%


## Race  
Of the 2,490 defendants, 1,850 (74%) were not White.  

In [10]:
# Defendant counts per race, with each count also shown as a rounded
# percentage of the total.
dft = (
    df.groupby(['race'])['SPN']
    .count()
    .reset_index()
    .rename(columns={'SPN': 'N Defendants',
                     'race': 'Race'})
)
pct_of_total = dft['N Defendants'] / dft['N Defendants'].sum() * 100
dft['% Defendants'] = pct_of_total.round().astype(int).astype(str) + '%'
# Format the counts last: after this line the column holds strings.
dft['N Defendants'] = dft['N Defendants'].map('{:,.0f}'.format)
dft

Unnamed: 0,Race,N Defendants,% Defendants
0,BLACK,1217,49%
1,HISPANIC,606,24%
2,OTHER,27,1%
3,WHITE,640,26%


## Gender  
Of the 2,490 defendants, 2,053 (82%) were Male.  

In [11]:
# Defendant counts per gender, with each count also shown as a rounded
# percentage of the total.
dft = (
    df.groupby(['gender'])['SPN']
    .count()
    .reset_index()
    .rename(columns={'SPN': 'N Defendants',
                     'gender': 'Gender'})
)
pct_of_total = dft['N Defendants'] / dft['N Defendants'].sum() * 100
dft['% Defendants'] = pct_of_total.round().astype(int).astype(str) + '%'
# Format the counts last: after this line the column holds strings.
dft['N Defendants'] = dft['N Defendants'].map('{:,.0f}'.format)
dft

Unnamed: 0,Gender,N Defendants,% Defendants
0,F,437,18%
1,M,2053,82%


## Age  
Of the 2,490 defendants, the median age was 30 and the average was 32.

In [12]:
# Summary statistics of defendant age, displayed as whole numbers.
# NOTE(review): astype(int) truncates the fractional part instead of rounding,
# so the displayed mean and std may sit just below the true values.
dft = df['Age'].describe().reset_index().rename(columns={'index': 'Statistic'})
dft['Age'] = dft['Age'].astype(int)
# The row count is not an age, so it is left out of the table.
dft = dft[dft['Statistic'].ne('count')]
dft

Unnamed: 0,Statistic,Age
1,mean,32
2,std,11
3,min,17
4,25%,24
5,50%,30
6,75%,40
7,max,82


## Offense Class  
6 of the 2,490 defendants had no offense class information.  1 defendant was charged with capital murder and was not denied bail.  Out of the remaining 2,484 defendants, 1,234 (50%) were charged with a state jail felony (offense class FS).  
Going forward, the analysis will exclude the 6 defendants with no offense class information, the 15 defendants with offense class F, and the single defendant who was charged with capital murder yet was not denied bail.  The resulting sample size is 2,468.

In [13]:
# Defendant counts per offense class, with each count also shown as a rounded
# percentage of the total. Rows with a missing OffenseClass are not counted
# because groupby skips NaN keys.
dft = (
    df.groupby(['OffenseClass'])['SPN']
    .count()
    .reset_index()
    .rename(columns={'SPN': 'N Defendants',
                     'OffenseClass': 'Offense Class'})
)
pct_of_total = dft['N Defendants'] / dft['N Defendants'].sum() * 100
dft['% Defendants'] = pct_of_total.round().astype(int).astype(str) + '%'
# Format the counts last: after this line the column holds strings.
dft['N Defendants'] = dft['N Defendants'].map('{:,.0f}'.format)
dft

Unnamed: 0,Offense Class,N Defendants,% Defendants
0,F,15,1%
1,F1,239,10%
2,F2,446,18%
3,F3,549,22%
4,FC,1,0%
5,FS,1234,50%


In [14]:
# Drop defendants whose offense class is 'F' or 'FC', and those with no
# offense class recorded.
excluded_classes = ['F', 'FC']
df = df.loc[~df['OffenseClass'].isin(excluded_classes)]
df = df.dropna(subset=['OffenseClass'])

## Prior Charges 
Out of the 2,468 defendants, 666 (27%) had no prior felony or misdemeanor charges.  

In [15]:
# Defendant counts for each combination of the prior-felony and
# prior-misdemeanor flags, with each count also shown as a rounded percentage
# of the total.
dft = (
    df.groupby(['Prior Felony', 'Prior Misdemeanor'])['SPN']
    .count()
    .reset_index()
    .rename(columns={'SPN': 'N Defendants'})
)
pct_of_total = dft['N Defendants'] / dft['N Defendants'].sum() * 100
dft['% Defendants'] = pct_of_total.round().astype(int).astype(str) + '%'
# Format the counts last: after this line the column holds strings.
dft['N Defendants'] = dft['N Defendants'].map('{:,.0f}'.format)
dft

Unnamed: 0,Prior Felony,Prior Misdemeanor,N Defendants,% Defendants
0,0,0,666,27%
1,0,1,497,20%
2,1,0,183,7%
3,1,1,1122,45%


# Demographics, Offense Class, and Prior Charges Influence Bond Amount  
A linear regression model was used to identify factors that have a statistically significant relationship with bond amount.  Bond amount was transformed using the natural logarithm function.  
Hispanic defendants, men, and older defendants tend to be assigned higher bond amounts, even after controlling for offense class and prior felony charges.  
Hiring a private attorney does not significantly influence bond amount (p = 0.61).

In [16]:
# Modelling frame for bond amount: the formula below cannot reference column
# names containing spaces, so those columns get underscore names first.
df_bond = df.rename(columns={'Prior Felony': 'Prior_Felony',
                             'Hired Attorney': 'Hired_Attorney',
                             'Appointed Attorney': 'Appointed_Attorney'})
df_bond['Bond Amount'] = df_bond['BOND $'].astype(float)
# The response is the natural log of the bond amount.
df_bond['Bond_Amount_ln'] = np.log(df_bond['Bond Amount'])

bond_formula = 'Bond_Amount_ln ~ F1 + F2 + F3 + Prior_Felony + Hispanic + Male + Age + Hired_Attorney'
results = smf.ols(bond_formula, data=df_bond).fit()
results.summary2()

0,1,2,3
Model:,OLS,Adj. R-squared:,0.394
Dependent Variable:,Bond_Amount_ln,AIC:,6142.6314
Date:,2018-04-22 23:25,BIC:,6194.9318
No. Observations:,2468,Log-Likelihood:,-3062.3
Df Model:,8,F-statistic:,201.2
Df Residuals:,2459,Prob (F-statistic):,2.4499999999999999e-262
R-squared:,0.396,Scale:,0.70285

0,1,2,3,4,5,6
,Coef.,Std.Err.,t,P>|t|,[0.025,0.975]
Intercept,7.9732,0.0688,115.8603,0.0000,7.8383,8.1081
F1,1.8775,0.0611,30.7155,0.0000,1.7576,1.9974
F2,1.1912,0.0479,24.8689,0.0000,1.0973,1.2852
F3,0.3914,0.0441,8.8683,0.0000,0.3048,0.4779
Prior_Felony,0.5482,0.0364,15.0648,0.0000,0.4768,0.6195
Hispanic,0.1690,0.0403,4.1986,0.0000,0.0901,0.2480
Male,0.2087,0.0452,4.6135,0.0000,0.1200,0.2974
Age,0.0129,0.0016,8.2722,0.0000,0.0098,0.0159
Hired_Attorney,0.0193,0.0374,0.5154,0.6063,-0.0541,0.0926

0,1,2,3
Omnibus:,201.701,Durbin-Watson:,1.964
Prob(Omnibus):,0.0,Jarque-Bera (JB):,307.902
Skew:,0.631,Prob(JB):,0.0
Kurtosis:,4.185,Condition No.:,160.0


# Bond Amount and Counsel Type Influence Pre-Trial Status at Disposition  
A binary logistic regression model was used to identify factors that have a statistically significant relationship with being detained at disposition.  
Hiring a private attorney significantly reduces the likelihood of being detained at disposition, regardless of bond amount.

In [17]:
# Modelling frame for detention at disposition: the formula below cannot
# reference column names containing spaces, so those columns get underscore
# names first.
df_ptr = df.rename(columns={'Prior Felony': 'Prior_Felony',
                            'Hired Attorney': 'Hired_Attorney',
                            'Appointed Attorney': 'Appointed_Attorney'})
df_ptr['Bond Amount'] = df_ptr['BOND $'].astype(float)
# The bond amount enters the model on the natural-log scale.
df_ptr['Bond_Amount_ln'] = np.log(df_ptr['Bond Amount'])

detained_formula = 'Detained ~ Bond_Amount_ln + Hired_Attorney'
results = smf.logit(detained_formula, data=df_ptr).fit(full_output=False, disp=False)
results.summary2()

0,1,2,3
Model:,Logit,AIC:,2301.783
Dependent Variable:,Detained,BIC:,2319.2165
Date:,2018-04-22 23:25,Log-Likelihood:,-1147.9
No. Observations:,2468,LL-Null:,-1612.9
Df Model:,2,LLR p-value:,1.0818e-202
Df Residuals:,2465,Scale:,1.0
Pseudo R-squared:,0.288,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
Intercept,-3.4061,0.4608,-7.3918,0.0000,-4.3093,-2.5030
Bond_Amount_ln,0.5490,0.0507,10.8263,0.0000,0.4496,0.6484
Hired_Attorney,-2.9410,0.1143,-25.7238,0.0000,-3.1651,-2.7169


# Pre-Trial Status Influences Disposition Outcome  
A binary logistic regression model was used to identify factors that have a statistically significant relationship with a conviction disposition outcome.  
Being detained at disposition significantly increases the likelihood of being convicted, regardless of prior charges.

In [18]:
# Modelling frame for disposition outcome: the formula below cannot reference
# column names containing spaces, so those columns get underscore names first.
df_dispo = df.rename(columns={'Prior Felony': 'Prior_Felony',
                              'Hired Attorney': 'Hired_Attorney',
                              'Appointed Attorney': 'Appointed_Attorney'})
df_dispo['Bond Amount'] = df_dispo['BOND $'].astype(float)
# Kept on the frame for parity with the other models; this formula does not use it.
df_dispo['Bond_Amount_ln'] = np.log(df_dispo['Bond Amount'])

convicted_formula = 'Convicted ~ Detained + Prior_Felony'
results = smf.logit(convicted_formula, data=df_dispo).fit(full_output=False, disp=False)
results.summary2()

0,1,2,3
Model:,Logit,AIC:,2817.4463
Dependent Variable:,Convicted,BIC:,2834.8798
Date:,2018-04-22 23:25,Log-Likelihood:,-1405.7
No. Observations:,2468,LL-Null:,-1687.2
Df Model:,2,LLR p-value:,5.7448000000000005e-123
Df Residuals:,2465,Scale:,1.0
Pseudo R-squared:,0.167,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
Intercept,-1.2822,0.0866,-14.8047,0.0000,-1.4519,-1.1124
Detained,1.3143,0.0949,13.8541,0.0000,1.1284,1.5003
Prior_Felony,1.4601,0.0918,15.8978,0.0000,1.2801,1.6401
