# import packages

In [1]:
%matplotlib inline 
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
from scipy.stats import ttest_ind

# import bond data provided by Harris County

In [2]:
path = './fworrrevisedwheeler'

file_list = ['/DATA - Wheeler (2017 approvals - felony).xlsx',
'/DATA - Wheeler (2018 approvals - felony).xlsx']
df_list = []
for file in file_list: 
    for i in range(5):
        df_iter = pd.read_excel(path+file, sheet_name=i)
        df_list.append(df_iter)
    approvals = pd.concat(df_list)
    approvals['CaseNumber'] = approvals['CaseNumber'].astype(str)

file_list = ['/DATA - Wheeler (2017 forfeitures - felony).xlsx',
'/DATA - Wheeler (2018 forfeitures - felony).xlsx']
df_list = []
for file in file_list: 
    for i in range(5):
        df_iter = pd.read_excel(path+file, sheet_name=i)
        df_list.append(df_iter)
        forfeitures = pd.concat(df_list)
    forfeitures['CaseNumber'] = forfeitures['CaseNumber'].str[:-1]
    forfeitures['CaseNumber'] = forfeitures['CaseNumber'] + '0'
    forfeitures['ForfDate'] = forfeitures['ForfDate'].astype(int)
    forfeitures['ForfDate'] = pd.to_datetime(forfeitures['ForfDate'], format='%Y%m%d')


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




# determine who forfeitted

In [3]:
df = approvals.merge(forfeitures, on='CaseNumber', how='left', suffixes=('_app', '_for'))

df = df.dropna(subset=['BondAmtMade'])

df['forfeit'] = np.where(df['ForfDate'] > df['DateApproved'], 1, 0)

# 2017 only

In [4]:
df = df.loc[df['DateApproved'].dt.year==2017]

# bin BondAmtMade

In [5]:
df['BondAmtMade'].describe().round(0)

count     23311.0
mean      15212.0
std       21955.0
min         150.0
25%        2500.0
50%       10000.0
75%       20000.0
max      500000.0
Name: BondAmtMade, dtype: float64

In [6]:
df['bond_group'] = pd.cut(df['BondAmtMade'], 
       bins=[0, 2000, 2001, 5000, 10000, 20000, 20001, 500001], 
       right=False, 
       labels=['< $2,000',
              '$2,000',
              '$2,001-$4,999',
              '$5,000-$9,999',
              '$10,000-$19,999',
              '$20,000',
              '> $20,000'])

# calculate bond amount summary statistics by bond type, bond amount binned, and forfeiture status

In [9]:
df_mean = df.groupby(['CodeDesc', 'bond_group', 'forfeit']).agg({'BondAmtMade': 'mean'}).reset_index()

df_mean = df_mean.pivot_table(index=['CodeDesc', 'bond_group'], columns='forfeit', values='BondAmtMade')

df_mean.columns=['No Forfeit', 'Forfeit']

# test whether bond amounts are significantly different

In [10]:
df_list = []
for i in df['CodeDesc'].unique().tolist():
    df_code = df.copy(deep=True)
    df_code = df_code.loc[df_code['CodeDesc']==i]
    for x in df_code['bond_group'].unique().tolist():
        df_bin = df_code.copy(deep=True)
        df_bin = df_bin.loc[df_bin['bond_group']==x]
        bond_forf = df_bin.loc[(df_bin['CodeDesc']==i) & (df_bin['forfeit']==1)]['BondAmtMade']
        bond_no_forf = df_bin.loc[(df_bin['CodeDesc']==i) & (df_bin['forfeit']==0)]['BondAmtMade']
        t, p = ttest_ind(bond_forf, bond_no_forf, equal_var=False)
        d = {'code': i, 'bond_group': x}        
        tr = pd.DataFrame(data=d, index=[0, 1])        
        tr['t-stat']=round(t,2)
        tr['p-value']=round(p,2)
        df_list.append(tr)
tr = pd.concat(df_list)

tr = tr.rename(columns={'code': 'CodeDesc'})

tr = tr.drop_duplicates()

tr = tr.reset_index(drop=True)

  **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [11]:
df_mean = df_mean.reset_index()

In [12]:
# n cases

In [13]:
df_out = df_mean.merge(tr, on=['CodeDesc', 'bond_group'])

df_count = df.groupby(['CodeDesc', 'bond_group']).agg({'CaseNumber': 'count'}).reset_index()

df_out = df_out.merge(df_count, on=['CodeDesc', 'bond_group'])

df_out = df_out.rename(columns={'CaseNumber': 'N Cases'})

In [14]:
# n forfeits

In [15]:
df_out = df_out.merge(df.groupby(['CodeDesc', 'bond_group']).agg({'forfeit': 'sum'}).reset_index(), on=['CodeDesc', 'bond_group'])

df_out = df_out.rename(columns={'forfeit': 'N Forfeitures'})

In [22]:
df_out['t-stat'] = df_out['t-stat'].fillna(0)

df_out['p-value'] = df_out['p-value'].fillna(1)

df_out = df_out.set_index(['CodeDesc', 'bond_group'])

df_out['Forfeit'] = df_out['Forfeit'].round(2)

df_out['No Forfeit'] = df_out['No Forfeit'].round(2)

# to excel

df_out.to_excel('mean_bond_forfeit_felony.xlsx')