In [None]:
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from random import choices
from scipy import stats

sys.path.insert(0, '../src/')

In [None]:
# Read the processed crime records from ../data/cleaned (the path is relative
# to the current working directory, so run the notebook from its own folder).
crimes = pd.read_csv('../data/cleaned/crime-processed.csv')

In [None]:
# Preview the first rows to check which columns were loaded.
crimes.head()

In [None]:
# Within each 'PredPol Deployed' group (2020 rows excluded), compute the share
# of every 'Crime Charge'; transpose so charges are rows and groups are columns.
types = (
    crimes[crimes['Year'] != 2020]
    .groupby('PredPol Deployed')['Crime Charge']
    .value_counts(normalize=True)
    .unstack()
    .transpose()
)

# Horizontal bars: one pair of bars per crime charge, one colour per group.
fig, ax = plt.subplots(figsize=(10, 6))
types.plot.barh(ax=ax)
ax.set_xlabel('Proportion')
ax.set_title('Distribution of Crime Types (2010-2019)')
plt.show()

In [None]:
# Show the proportion table: one row per 'Crime Charge' value, one column per
# 'PredPol Deployed' value.
types

In [None]:
# Fraction of all records (no year filter applied here) for each
# 'PredPol Deployed' value.
# NOTE(review): the default `pct_pp` of `test` looks like it was copied from
# this output — confirm, and note that `types` excludes 2020 while this does not.
crimes['PredPol Deployed'].value_counts(normalize=True)

In [None]:
def test(crime_tp, prop_pp, prop_nonpp, pct_pp=0.562854, n=100000):
    """
    Tests a single type of crime.
    """
    NUM_POP = n
    PCT_PREDPOL = pct_pp
    PCT_NONPREDPOL = 1-PCT_PREDPOL
    VAR_PREDPOL = 1.0
    VAR_NONPREDPOL = 1.0

    n_predpol = int(NUM_POP * PCT_PREDPOL)
    n_notpredpol = int(NUM_POP * PCT_NONPREDPOL)

    # Generate data
    M = np.array([0] * n_notpredpol + [1] * n_predpol) # generate predpol variable

    # generate error terms: using proportion of crime type
    N_PREDPOL = choices([1,0], [prop_pp,1-prop_pp], k=n_predpol)
    N_NONPREDPOL = choices([1,0], [prop_nonpp,1-prop_nonpp],k= n_notpredpol)
    N = np.append(N_NONPREDPOL, N_PREDPOL)

    df = pd.DataFrame({'PredPol Deployed': M, crime_tp: N})
    res = stats.ttest_ind(df[df['PredPol Deployed']==1][crime_tp], df[df['PredPol Deployed']==0][crime_tp])
    
    return res.statistic, res.pvalue

In [None]:
# Run the simulated t-test once per crime charge (one row of `types` each) and
# collect the rounded results into a summary table.
statvals = []
pvals = []
for tp, row in types.iterrows():
    print('Crime Type: ', tp)
    # Explicit positional access: position 1 is passed as the PredPol
    # proportion and position 0 as the non-PredPol proportion.
    # assumes the columns of `types` sort as (not deployed, deployed) — TODO confirm
    stat, pval = test(tp, row.iloc[1], row.iloc[0])
    statvals.append(stat.round(5))
    pvals.append(pval.round(5))
    print('Statistic = ', stat)
    print('P-Value = {}\n'.format(pval))

# Label the rows with the categories that were actually iterated instead of a
# hard-coded list, so the labels cannot drift out of sync with the data.
pd.DataFrame({'Statistic': statvals, 'P-Value': pvals}, index=types.index)

There is a significant decrease in 'Financial/Other' and 'Inchoate' crimes and a significant increase in 'Personal' crimes. The differences for the other categories are not statistically significant and are therefore inconclusive.

Next, we look at the distribution of crime types by division, which may help explain why certain divisions saw crime increase while others saw it decrease.

In [None]:
# Share of each 'Crime Charge' within every (division, PredPol state) pair,
# 2020 rows excluded; charges become the columns after unstacking.
div_types = (
    crimes[crimes['Year'] != 2020]
    .groupby(['AREA NAME', 'PredPol Deployed'])['Crime Charge']
    .value_counts(normalize=True)
    .unstack()
)
div_types.head()

In [None]:
# For every division, test each crime charge and encode the outcome as
#   1  -> significant (p <= 0.05) with a positive statistic,
#  -1  -> significant with a negative statistic,
#   0  -> not significant (or the statistic is exactly zero / undefined).
direction_by_div = {}
for div, df in div_types.groupby(level=0):
    print('Analyzing division: ', div)
    # Rows of `df` are this division's (division, PredPol state) pairs;
    # transposing gives one row per crime charge.
    new_df = df.T
    vals = []
    pvals = []
    statvals = []
    for tp, row in new_df.iterrows():
        print('Crime Charge: ', tp)
        # `row` is labelled by (division, PredPol state) tuples, so plain
        # `row[1]` only worked through pandas' deprecated integer-to-position
        # fallback; `.iloc` states the positional intent explicitly.
        # assumes both PredPol states exist for the division and sort as
        # (not deployed, deployed) — TODO confirm
        stat, pval = test(tp, row.iloc[1], row.iloc[0])
        pvals.append(pval)
        statvals.append(stat)
        # A NaN p-value compares False and therefore falls through to 0.
        vals.append(int(np.sign(stat)) if pval <= 0.05 else 0)
        print('Statistic = ', stat)
        print('P-Value = {}\n'.format(pval))
    direction_by_div[div] = vals
    print('-'*20)
    print('')

# Build the frame once: one column per division, one row per crime charge
# (rows keep a default integer index here).
results = pd.DataFrame(direction_by_div)

In [None]:
# Label the rows with the crime charges taken from `div_types` (the rows of
# `results` were produced in exactly that column order) rather than a
# hard-coded list, and reassign instead of mutating in place.
results = results.set_index(div_types.columns)
results

In [None]:
# Heatmap of the per-division outcome codes (rows: divisions, columns: crime
# charges after the transpose); every tick label is forced to be shown.
sns.heatmap(results.T, annot=False, xticklabels=True, yticklabels=True)