# Bail Bonds EDA
1. Merge case and offence
2. Find top offences that you get a bond for (felonies usually)
3. Find bond distributions for the top 3
4. Try visualizations with and without log scale

In [None]:
# Load the "clean" case-level and offense-level datasets.
import os
from pathlib import Path

import pandas as pd

# Project root. The default is the original author's checkout, so existing runs
# are unchanged; set the JUSTFAIR_DIR environment variable to run elsewhere.
PROJECT_DIR = Path(
    os.environ.get(
        "JUSTFAIR_DIR",
        "/Users/clarissaache/Documents/Capstone/JUSTFAIR_capstone",
    )
)
CLEAN_DATA_DIR = PROJECT_DIR / "10_clean_data"

# index_col=[0]: the first CSV column is used as the row index.
cases = pd.read_csv(
    CLEAN_DATA_DIR / "cases_v01.csv",
    low_memory=False,
    index_col=[0],
)
offenses = pd.read_csv(
    CLEAN_DATA_DIR / "offenses_v01.csv",
    low_memory=False,
    index_col=[0],
)


In [None]:
# Unit-of-analysis check: show every row of `cases` for one sample case_id.
cases.query("case_id == 9902021700564")


In [None]:
# The same sample case in `offenses`: one case = two offenses.
offenses[offenses["case_id"].eq(9902021700564)]


In [None]:
# (1) Merge: attach offense rows to their case; the left join keeps every case.
# NOTE(review): validate="m:m" performs no key-uniqueness check. If case_id is
# expected to be unique in `cases`, "1:m" would enforce that — confirm first.
case_off = cases.merge(
    offenses,
    how="left",
    on="case_id",
    validate="m:m",
    indicator=True,
)
# Count rows matched on both sides vs. present only in `cases`.
case_off["_merge"].value_counts()


In [None]:
# ------------------------ SAVE THE MERGE BC IT TOOK A FOREVER ------------------------ #
# Remove the merge-indicator column before persisting. Rebinding with
# errors="ignore" (instead of an in-place drop) keeps this cell re-runnable:
# a second execution no longer raises KeyError because "_merge" is already gone.
case_off = case_off.drop(columns="_merge", errors="ignore")

# case_off.to_csv("/Users/clarissaache/Documents/Capstone/JUSTFAIR_capstone/10_clean_data/case_offense_v01.csv")


In [None]:
# The sample case should appear twice after the merge (once per offense).
case_off.query("case_id == 9902021700564")


In [None]:
# Look-up table of offence codes and their descriptions.
offense_desc = pd.read_csv(
    "/Users/clarissaache/Documents/Capstone/JUSTFAIR_capstone/00_raw_data/ACIS_offence_codes.csv"
)

# Left join keeps every case/offense row even when its code has no description.
# NOTE(review): validate="m:m" performs no check; if CODE is meant to be unique
# in the look-up table, "m:1" would guard against row duplication — confirm.
case_off_desc = case_off.merge(
    offense_desc,
    how="left",
    left_on="charged_offense_code",
    right_on="CODE",
    validate="m:m",
    indicator=True,
)
# Matched vs. unmatched offense codes.
case_off_desc["_merge"].value_counts()


In [None]:
# Peek at three random rows of the offence-code look-up table.
offense_desc.sample(n=3)


CAREFUL HERE! There are 5997 codes that did not match.

For now, I will not do anything about that

In [None]:
# Top offences PLOT#1
top_offenses = pd.DataFrame(case_off_desc["Offense Description"].value_counts())

# It would be cool to show the driving-related ones in a different color...

import matplotlib.pyplot as plt
import numpy as np

%config InlineBackend.figure_format = 'retina'
top_offenses['%']=100 * top_offenses['Offense Description'] / np.sum(top_offenses['Offense Description'])

top_offenses.loc[top_offenses["%"] > 0.5]['%'].plot(
    kind="bar",
    figsize=(20, 8),
    xlabel="Offense Description",
    ylabel="% Frequency of Offense",
    title="Most Common Charged Offences in NC in years 2017-2021 \n(with more than 0.5% of total offense occurrences)",
    legend=False,
)


# Bond Types

In [None]:
# Percentage of rows per bond type, with missing bond types as their own bucket.
case_off_desc["bond_type"].value_counts(normalize=True, dropna=False).mul(100)


In [None]:
# Flag rows whose bond amount is recorded and at least $1.
case_off_desc["has_bond"] = (
    case_off_desc["bond_amount"].notna() & case_off_desc["bond_amount"].ge(1)
)
# Share of flagged rows within each bond type, highest first.
(
    case_off_desc.groupby("bond_type")["has_bond"]
    .mean()
    .sort_values(ascending=False)
)

In [None]:
# Summary statistics of bond amounts for "WPA" bond-type rows.
case_off_desc.loc[case_off_desc["bond_type"] == "WPA", "bond_amount"].describe()

In [None]:

# Among rows with a recorded bond amount, tabulate disposition and min_sentence.
# Jupyter renders only a cell's last expression, so the first table is shown
# explicitly with display() (available without import inside IPython kernels);
# previously it was computed and silently discarded.
display(
    case_off_desc.loc[case_off_desc["bond_amount"].notna(), "disposition"].value_counts()
)
case_off_desc.loc[case_off_desc["bond_amount"].notna(), "min_sentence"].value_counts()

In [None]:
# (2) Find top 5 offences you get a bond for
#           Note: Bond amount may be missing instead of being 0 which will skew the data.

import matplotlib.pyplot as plt
import numpy as np

# Keep only rows whose bond amount is not NA.
bond_data = case_off_desc[case_off_desc["bond_amount"].notna()]

# Offenses with the most bond-associated records. to_frame(name=...) pins the
# counts column's name: pandas >= 2.0 names the value_counts() result "count",
# so indexing a pd.DataFrame(...) wrapper by "Offense Description" raises
# KeyError there.
top_offences_with_bail = (
    bond_data["Offense Description"]
    .value_counts()
    .to_frame(name="Offense Description")
)
top_offences_with_bail["%"] = (
    100
    * top_offences_with_bail["Offense Description"]
    / top_offences_with_bail["Offense Description"].sum()
)

# The plotted values are percentages, so the y-axis is labelled as such
# (it previously claimed to show the number of occurrences).
top_offences_with_bail.loc[top_offences_with_bail["%"] > 0.5, "%"].plot(
    kind="bar",
    figsize=(20, 8),
    xlabel="Offense Description",
    ylabel="% Frequency of Offense",
    title="Most Common Charged Offences in NC that are assigned a bail bond (>=$0) in years 2017-2021 \n(with more than 0.5% of total offense occurrences)",
    legend=False,
)


# Deep dive into "DRIVING WHILE IMPAIRED"

In [None]:
# Offense codes of the five most common bond-setting charges; 5405 comes first.
top_five_offences_with_bonds = [5405, 2322, 3401, 9955, 2632]

# Start with the first code only.
case_bond = case_off_desc[
    case_off_desc["charged_offense_code"] == top_five_offences_with_bonds[0]
]

# Percentage of those rows that have no bond amount recorded.
percent_missing = case_bond["bond_amount"].isna().sum() * 100 / len(case_bond)
percent_missing

# This is a problem, the data may be skewed :(
# We can assume missing data is not caused by anything related to gender (missing for random columns or bc charges were dropped?)


In [None]:
# Compare the race breakdown of rows with a bond amount vs. rows without one.
# Both percentages use the full `case_bond` row count as denominator, so for
# each race the two bars add up to that race's share of all rows.
# to_frame(name="race") pins the counts column's name: pandas >= 2.0 names the
# value_counts() result "count", so the ["race"] lookups below would otherwise
# raise KeyError.

# not missing
not_missing_data_bail = (
    case_bond.loc[case_bond["bond_amount"].notna(), "race"]
    .value_counts()
    .to_frame(name="race")
)
not_missing_data_bail["%_notmissing"] = (
    100 * not_missing_data_bail["race"] / len(case_bond.index)
)

# missing
missing_data_bail = (
    case_bond.loc[case_bond["bond_amount"].isna(), "race"]
    .value_counts()
    .to_frame(name="race")
)
missing_data_bail["%_missing"] = 100 * missing_data_bail["race"] / len(case_bond.index)

# Align the two tables on race and plot the percentages side by side.
missing_data_bail = pd.concat([missing_data_bail, not_missing_data_bail], axis=1)
missing_data_bail[["%_missing", "%_notmissing"]].plot(
    kind="bar",
    figsize=(20, 8),
    xlabel="race",
    ylabel="% of entries",
    title="Are bail $ amounts missing for some races more than others?? \n (doesnt look like it!)",
    legend=True,
    # stacked=True,
)


In [None]:
# Row counts per bond type across all offenses.
case_off_desc.bond_type.value_counts()

In [None]:
# Row counts per bond type within the single-offense subset.
case_bond["bond_type"].value_counts()


Bond Type Code<br>
* CSH - Cash 
* CUS - Custody Release
* PTR - Pretrial Release  
* SEC - Secured
* UNS - Unsecured 
* WPA - Written Promise to Appear

"There are two types of bonds – secured and unsecured. A secured bond means that you actually pay money or bail property to secure your release. An unsecured bond or surety bond means you sign a document that says you will pay a certain amount of money if the defendant breaks his/her bond conditions."

In [None]:
# The race mix of rows with and without a bond amount looked fairly similar,
# so only rows with a recorded (non-negative) bond amount are kept.
case_bond = case_bond[case_bond["bond_amount"].ge(0)]
# case_bond = case_bond.loc[case_bond['bond_type'].isin(['CSH', 'CUS', 'PTR', 'SEC', 'UNS', 'WPA'])]

# Bond amounts for each race/gender group.
white_female = case_bond.loc[case_bond["race_gender"] == "White Female", "bond_amount"]
white_male = case_bond.loc[case_bond["race_gender"] == "White Male", "bond_amount"]
black_female = case_bond.loc[case_bond["race_gender"] == "Black Female", "bond_amount"]
black_male = case_bond.loc[case_bond["race_gender"] == "Black Male", "bond_amount"]
hispanic_female = case_bond.loc[
    case_bond["race_gender"] == "Hispanic Female", "bond_amount"
]
hispanic_male = case_bond.loc[case_bond["race_gender"] == "Hispanic Male", "bond_amount"]

# One box per group, in this order.
dataset = [
    white_female,
    white_male,
    black_female,
    black_male,
    hispanic_female,
    hispanic_male,
]

fig, ax = plt.subplots(figsize=(15, 7))

# No y-axis tick marks; faint horizontal gridlines instead.
ax.yaxis.set_ticks_position("none")
ax.grid(color="grey", axis="y", linestyle="-", linewidth=0.25, alpha=0.5)

ax.set_title("Distribution of Bail Bond Amounts (in $) by Race and Gender")
ax.set_xlabel("Race/Gender Combination")
ax.set_ylabel("Bond Amount in $")

# Fliers are hidden so the boxes stay readable on a linear dollar scale.
ax.boxplot(
    dataset,
    labels=[
        "white_female",
        "white_male",
        "black_female",
        "black_male",
        "hispanic_female",
        "hispanic_male",
    ],
    showfliers=False,
)
plt.show()


In [None]:
# Bond amounts recorded for "CUS" bond-type rows.
case_bond.query('bond_type == "CUS"')["bond_amount"]


In [None]:
# Prepare & generate datasets
# LOG
def _log10_positive_bond(group_name):
    """Return log10 of the strictly positive bond amounts for one race/gender group.

    `case_bond` keeps amounts >= 0, and log10(0) is -inf, which corrupts the
    box statistics; zero-dollar bonds are therefore left out of the log plot.
    """
    amounts = case_bond.loc[case_bond["race_gender"] == group_name, "bond_amount"]
    return np.log10(amounts[amounts > 0])


white_female = _log10_positive_bond("White Female")
white_male = _log10_positive_bond("White Male")
black_female = _log10_positive_bond("Black Female")
black_male = _log10_positive_bond("Black Male")
hispanic_female = _log10_positive_bond("Hispanic Female")
hispanic_male = _log10_positive_bond("Hispanic Male")

# One box per group, in this order.
dataset = [
    white_female,
    white_male,
    black_female,
    black_male,
    hispanic_female,
    hispanic_male,
]

fig, ax = plt.subplots(figsize=(15, 7))

# No y-axis tick marks; faint horizontal gridlines instead.
ax.yaxis.set_ticks_position("none")
ax.grid(color="grey", axis="y", linestyle="-", linewidth=0.25, alpha=0.5)

# The plotted values are log10-transformed, so title and axis say so.
ax.set_title("Distribution of log10 Bail Bond Amounts by Race and Gender")
ax.set_xlabel("Race/Gender Combination")
ax.set_ylabel("log10(Bond Amount in $)")

ax.boxplot(
    dataset,
    labels=[
        "white_female",
        "white_male",
        "black_female",
        "black_male",
        "hispanic_female",
        "hispanic_male",
    ],
    showfliers=True,
)
plt.show()


# WHAT IF I GET RID OF OUTLIERS?


In [None]:
# REMOVED OUTLIERS (CHEATING)
# Redraws the log10 series built in the previous cell with fliers hidden.
# showfliers=False only stops the outlier points from being drawn.
fig, ax = plt.subplots(figsize=(15, 7))

# No y-axis tick marks; faint horizontal gridlines instead.
ax.yaxis.set_ticks_position("none")
ax.grid(color="grey", axis="y", linestyle="-", linewidth=0.25, alpha=0.5)

# The series are log10-transformed, so title and axis say so (they previously
# claimed to show dollar amounts).
ax.set_title("Distribution of log10 Bail Bond Amounts by Race and Gender (outliers hidden)")
ax.set_xlabel("Race/Gender Combination")
ax.set_ylabel("log10(Bond Amount in $)")

# One box per group, in this order.
dataset = [
    white_female,
    white_male,
    black_female,
    black_male,
    hispanic_female,
    hispanic_male,
]
ax.boxplot(
    dataset,
    labels=[
        "white_female",
        "white_male",
        "black_female",
        "black_male",
        "hispanic_female",
        "hispanic_male",
    ],
    showfliers=False,
)
plt.show()


In [None]:
# WHAT IF I ONLY LOOK AT ONE TYPE OF BOND
# This cell previously never filtered on bond_type, so it reproduced the
# all-types log plot exactly. It now restricts to a single bond type.
# NOTE(review): "SEC" (secured) is the reviewer's pick — change BOND_TYPE_TO_PLOT
# to any other bond-type code to inspect that type instead.
BOND_TYPE_TO_PLOT = "SEC"


def _log10_positive_bond_of_type(group_name):
    """Return log10 of the positive bond amounts of one group for BOND_TYPE_TO_PLOT.

    Zero-dollar amounts are left out because log10(0) is -inf.
    """
    in_scope = (case_bond["race_gender"] == group_name) & (
        case_bond["bond_type"] == BOND_TYPE_TO_PLOT
    )
    amounts = case_bond.loc[in_scope, "bond_amount"]
    return np.log10(amounts[amounts > 0])


white_female = _log10_positive_bond_of_type("White Female")
white_male = _log10_positive_bond_of_type("White Male")
black_female = _log10_positive_bond_of_type("Black Female")
black_male = _log10_positive_bond_of_type("Black Male")
hispanic_female = _log10_positive_bond_of_type("Hispanic Female")
hispanic_male = _log10_positive_bond_of_type("Hispanic Male")

# One box per group, in this order.
dataset = [
    white_female,
    white_male,
    black_female,
    black_male,
    hispanic_female,
    hispanic_male,
]

fig, ax = plt.subplots(figsize=(15, 7))

# No y-axis tick marks; faint horizontal gridlines instead.
ax.yaxis.set_ticks_position("none")
ax.grid(color="grey", axis="y", linestyle="-", linewidth=0.25, alpha=0.5)

# The plotted values are log10-transformed and limited to one bond type.
ax.set_title(
    f"Distribution of log10 Bail Bond Amounts by Race and Gender ({BOND_TYPE_TO_PLOT} bonds only)"
)
ax.set_xlabel("Race/Gender Combination")
ax.set_ylabel("log10(Bond Amount in $)")

ax.boxplot(
    dataset,
    labels=[
        "white_female",
        "white_male",
        "black_female",
        "black_male",
        "hispanic_female",
        "hispanic_male",
    ],
    showfliers=True,
)
plt.show()


### Interpretation:

I want to say that these plots mean nothing, but if the NAs were truly random (and they seem to be), this is a hint that maybe the distributions aren't that different. The differences are subtle.
Note that:
* Distributions of bail amounts in % for each race-gender combo are NOT normal (they don't look normal)
* This is not statistical proof
* This is just one type of offense

In the next few boxes I am going to run a regression so we can see whether race and gender have significant effects on bail amounts, controlling for type of offense and city of address of the defendant (I know that there's a lot more to it, but this is the information available).

# Regression (are races / genders correlated with higher or lower bail amounts?)
### Only for the most common bail-setting offense = DRIVING WHILE IMPAIRED

In [None]:
import pandas as pd

# Show every column when a frame is displayed.
pd.set_option("display.max_columns", None)

# Reload the saved case/offense merge from disk.
# NOTE(review): the only line in this notebook that writes this file is the
# commented-out to_csv in the "SAVE THE MERGE" cell — confirm the CSV exists.
case_off_desc = pd.read_csv(
    filepath_or_buffer="/Users/clarissaache/Documents/Capstone/JUSTFAIR_capstone/10_clean_data/case_offense_v01.csv",
    low_memory=False,
)


In [None]:
pd.set_option("display.max_columns", None)

# Remove the saved index column and the fields not used in the regression.
case_off_desc = case_off_desc.drop(
    columns=[
        "Unnamed: 0",
        "key_county_num.1",
        "date_of_birth",
        "in_jail_indicator",
        "jail_release_date",
        "jail_commited_date",
    ]
)
case_off_desc.sample(n=5)


In [None]:
# Merge with offence code descriptions:
offense_desc = pd.read_csv(
    "/Users/clarissaache/Documents/Capstone/JUSTFAIR_capstone/00_raw_data/ACIS_offence_codes.csv"
)

# Preview the raw look-up table. Jupyter renders only a cell's last expression,
# so the sample is shown with display() (available without import inside
# IPython kernels); it was previously computed and discarded.
display(offense_desc.sample(5))

# Keep only the columns used downstream.
offense_desc = offense_desc[
    ["CODE", "Offense Description", "T", "NC General Statute", "CL"]
]


In [None]:
# Attach offence descriptions; the left join keeps rows whose code has no match.
# NOTE(review): validate="m:m" performs no check; if CODE is meant to be unique
# in `offense_desc`, "m:1" would guard against row duplication — confirm.
case_off_desc = case_off_desc.merge(
    offense_desc,
    how="left",
    left_on="charged_offense_code",
    right_on="CODE",
    validate="m:m",
    indicator=True,
)
# Matched vs. unmatched offense codes.
case_off_desc["_merge"].value_counts()


In [None]:
# Rows with a recorded bond amount only.
bond_regression = case_off_desc[case_off_desc["bond_amount"].notna()]
# Bond-type mix within each race, as row percentages rounded to 2 decimals.
(
    pd.crosstab(
        bond_regression["race"], bond_regression["bond_type"], normalize="index"
    )
    .mul(100)
    .round(2)
)

In [None]:
# Build the regression frame in one chain:
#   1. keep rows with a recorded bond amount,
#   2. keep the bond types treated as valid (CSH, SEC, UNS, WPA),
#   3. rename the description column to an identifier without a space.
bond_regression = (
    case_off_desc[case_off_desc["bond_amount"].notna()]
    .loc[lambda frame: frame["bond_type"].isin(["CSH", "SEC", "UNS", "WPA"])]
    .rename(columns={"Offense Description": "off_description"})
)


In [None]:
# Row counts per bond type remaining in the regression frame.
bond_regression.bond_type.value_counts()

In [None]:
# Bond-type mix within each race (row percentages, 2 decimals).
# TODO: interpretation of this table is still open.
(
    pd.crosstab(
        bond_regression["race"], bond_regression["bond_type"], normalize="index"
    )
    .mul(100)
    .round(2)
)

In [None]:
# OPTION 1 ------> Subset to top % of bail-requiring charges
import numpy as np

# Rows whose bond amount is recorded.
bond_data = case_off_desc.loc[case_off_desc["bond_amount"].notna()]


In [None]:

# Offenses with the most bond-associated records. to_frame(name=...) pins the
# counts column's name: pandas >= 2.0 names the value_counts() result "count",
# so indexing a pd.DataFrame(...) wrapper by "Offense Description" raises
# KeyError there.
top_offences_with_bail = (
    bond_data["Offense Description"]
    .value_counts()
    .to_frame(name="Offense Description")
)
top_offences_with_bail["%"] = (
    100
    * top_offences_with_bail["Offense Description"]
    / top_offences_with_bail["Offense Description"].sum()
)

# Subset to top % of bail-requiring charges: keep offenses that account for
# more than 1% of the bond-associated rows.
bond_regression = bond_regression.loc[
    bond_regression["off_description"].isin(
        top_offences_with_bail.loc[top_offences_with_bail["%"] > 1].index.to_list()
    )
]


In [None]:
# OPTION 2 ------> Subset for ONE Charge (Driving WHile Impaired)
# bond_regression = bond_regression.loc[bond_regression["charged_offense_code"] == 5405]


In [None]:
# Reducing gender to F and M
# Show the category counts before filtering. A mid-cell expression is not
# rendered by Jupyter, so display() is used (available without import inside
# IPython kernels).
display(bond_regression["sex"].value_counts())
bond_regression = bond_regression[bond_regression["sex"].isin(["M", "F"])]


In [None]:
# Reducing race to white, black, hispanic (other subsets may not be as representative)
# Show the category counts before filtering. A mid-cell expression is not
# rendered by Jupyter, so display() is used (available without import inside
# IPython kernels).
display(bond_regression["race"].value_counts())
bond_regression = bond_regression[bond_regression["race"].isin(["W", "B", "H"])]


In [None]:
# Cities with the largest amount of these crimes
# cities = pd.DataFrame(bond_regression.def_city_address.value_counts())
# cities['%']=100 * cities['def_city_address'] / np.sum(cities['def_city_address'])

# bond_regression = bond_regression.loc[bond_regression['def_city_address'].isin(cities.loc[cities["%"] > 1].index.to_list())]


In [None]:
# ADD county number (represents court?)
# Show the NA count; a mid-cell expression is not rendered by Jupyter, so
# display() is used (available without import inside IPython kernels).
display(bond_regression["key_county_num"].isna().sum())  # no NAs

# Cast the county number to string. assign() builds a new frame, so no
# SettingWithCopyWarning is raised from writing a column into a filtered frame.
bond_regression = bond_regression.assign(
    key_county_num=bond_regression["key_county_num"].astype(str)
)
bond_regression["key_county_num"].dtypes


In [None]:
# court type
# Show the NA count; a mid-cell expression is not rendered by Jupyter, so
# display() is used (available without import inside IPython kernels).
display(bond_regression["court_type"].isna().sum())  # no NAs
bond_regression["court_type"].value_counts()


In [None]:
# So, I thought because the offense is the same it would have the same class!

# Work on an explicit copy: bond_regression comes from earlier row filters, and
# writing into such a frame with .loc can emit SettingWithCopyWarning.
bond_regression = bond_regression.copy()

# Collapse the float-looking labels ("1.0") onto their integer form ("1").
for float_label, clean_label in (("1.0", "1"), ("2.0", "2"), ("3.0", "3")):
    bond_regression.loc[
        bond_regression["offense_class"] == float_label, "offense_class"
    ] = clean_label

# Keep only classes 1, 2 and 3. Rows with a missing class are removed here as
# well, so no NA bucket can appear in the counts below.
bond_regression = bond_regression.loc[
    bond_regression["offense_class"].isin(["1", "2", "3"])
]

# A mid-cell expression is not rendered by Jupyter, so the counts are shown with
# display() (available without import inside IPython kernels).
display(bond_regression["offense_class"].value_counts(dropna=False))
pd.crosstab(bond_regression["offense_class"], bond_regression["off_description"])


In [None]:
# offense_class is missing for too many rows, and it may also carry post-treatment
# bias (it reflects the perceived severity), so it is only inspected here.
bond_regression.offense_class.value_counts(dropna=False)

In [None]:
# Peek at four random rows of the regression frame.
bond_regression.sample(n=4)

In [None]:
# year!
# Work on an explicit copy: bond_regression comes from earlier row filters, and
# assigning columns into such a frame can emit SettingWithCopyWarning.
bond_regression = bond_regression.copy()

# Parse both date columns; values that do not match YYYY-MM-DD become NaT.
for date_col in ("case_creation_date", "process_service_date"):
    bond_regression[date_col] = pd.to_datetime(
        bond_regression[date_col], format="%Y-%m-%d", errors="coerce"
    )

# Calendar year of the process service date (NaN where the date is NaT).
bond_regression["year"] = bond_regression["process_service_date"].dt.year
bond_regression.sample(3)

In [None]:
import statsmodels.formula.api as smf

# OLS of bond amount on race x sex (with interaction), bond type, offense,
# county and court type as categorical controls, plus year as a numeric term.
# The stray doubled "+ +" before C(key_county_num) has been removed from the formula.
m = smf.ols(
    "bond_amount ~ C(race) * C(sex) + C(bond_type) + C(off_description) + C(key_county_num) + court_type + year",
    data=bond_regression,
).fit()
m.summary()

# NOT ADDING DISPOSITION BC THAT IS POST TREATMENT EFFECT


In [None]:
# Panel OLS: index the frame by (offense description, year) for the panel model.
bond_regression_multiindex = bond_regression.set_index(
    keys=["off_description", "year"]
)
bond_regression_multiindex.head(n=5)

In [None]:
from linearmodels import PanelOLS

# Panel regression on the (off_description, year)-indexed frame with entity and
# time effects; standard errors are clustered on both entity and time.
mod = PanelOLS.from_formula(
    "bond_amount ~ 1 + C(race) * C(sex) + C(bond_type) +  C(key_county_num)+ court_type + EntityEffects + TimeEffects",
    data=bond_regression_multiindex,
)
mod.fit(
    cov_type="clustered",
    cluster_entity=True,
    cluster_time=True,
)