In [None]:
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from random import choices
from scipy import stats

# Put the ../src/ directory at the front of the import search path.
# Guarded so that re-running this cell does not stack duplicate entries on sys.path.
if '../src/' not in sys.path:
    sys.path.insert(0, '../src/')

In [None]:
# Read the processed crime CSV (relative path) into a DataFrame.
CRIME_DATA_PATH = '../data/cleaned/crime-processed.csv'
crimes = pd.read_csv(CRIME_DATA_PATH)

In [None]:
# Preview the first five rows of the loaded frame.
crimes.head(5)

In [None]:
# Share of each crime type within each 'PredPol Deployed' group, excluding 2020.
# After the transpose, rows are crime types and columns are the 'PredPol Deployed' values.
types = (
    crimes.loc[crimes['Year'] != 2020]
    .groupby('PredPol Deployed')['Crime Type']
    .value_counts(normalize=True)
    # A crime type that never occurs in one group has proportion 0, not NaN;
    # NaN here would later be fed to the simulation as a sampling weight.
    .unstack(fill_value=0)
    .T
)

# Horizontal grouped bars: one bar per 'PredPol Deployed' value for every crime type.
# NOTE(review): the title assumes the non-2020 data spans 2010-2019 — confirm.
fig, ax = plt.subplots(figsize=(10, 6))
types.plot(kind='barh', ax=ax)
ax.set_xlabel('Proportion')
ax.set_title('Distribution of Crime Types (2010-2019)')
plt.show()

In [None]:
# Display the proportion table (rows: crime types, columns: 'PredPol Deployed' values).
types

In [None]:
# Overall share of records in each 'PredPol Deployed' group.
# NOTE(review): unlike `types`, this does not exclude Year == 2020 — confirm that is intended.
crimes.loc[:, 'PredPol Deployed'].value_counts(normalize=True)

In [None]:
def test(crime_tp, prop_pp, prop_nonpp, pct_pp=0.562854, n=100000):
    """
    Tests a single type of crime.
    """
    NUM_POP = n
    PCT_PREDPOL = pct_pp
    PCT_NONPREDPOL = 1-PCT_PREDPOL
    VAR_PREDPOL = 1.0
    VAR_NONPREDPOL = 1.0

    n_predpol = int(NUM_POP * PCT_PREDPOL)
    n_notpredpol = int(NUM_POP * PCT_NONPREDPOL)

    # Generate data
    M = np.array([0] * n_notpredpol + [1] * n_predpol) # generate predpol variable

    # generate error terms: using proportion of crime type
    N_PREDPOL = choices([1,0], [prop_pp,1-prop_pp], k=n_predpol)
    N_NONPREDPOL = choices([1,0], [prop_nonpp,1-prop_nonpp],k= n_notpredpol)
    N = np.append(N_NONPREDPOL, N_PREDPOL)

    df = pd.DataFrame({'PredPol Deployed': M, crime_tp: N})
    res = stats.ttest_ind(df[df['PredPol Deployed']==1][crime_tp], df[df['PredPol Deployed']==0][crime_tp])
    
    return res.statistic, res.pvalue

In [None]:
# Run the simulated t-test once per crime type and print its statistic and p-value.
for tp, row in types.iterrows():
    print('Crime Type: ', tp)
    # row[1] is passed as the PredPol proportion, row[0] as the non-PredPol one.
    prop_deployed, prop_not_deployed = row[1], row[0]
    stat, pval = test(tp, prop_deployed, prop_not_deployed)
    print('Statistic = ', stat)
    print('P-Value = {}\n'.format(pval))