In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# Let pandas render up to 500 columns and 500 rows before truncating output.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, 500)

In [None]:
# Load the raw arrest records and discard rows that list no offenses.
arrests = pd.read_csv('arrests.csv').dropna(subset=['OFFENSES'])

# Show how many distinct PK and CCR values exist, then keep one row per CCR.
for id_column in ('PK', 'CCR'):
    print(arrests[id_column].unique().shape)
arrests = arrests.drop_duplicates(subset='CCR', keep='first')

# Parse the arrest timestamp and order the records chronologically.
arrests['ARRESTTIME'] = pd.to_datetime(arrests['ARRESTTIME'], yearfirst=True)
arrests = arrests.sort_values(by=['ARRESTTIME']).reset_index(drop=True)

# Report the number of arrests recorded in each calendar year.
print('arrest totals per year')
arrest_years = arrests['ARRESTTIME'].dt.year
for year in range(2014, 2020):
    print(str(year) + ': ' + str((arrest_years == year).sum()))

# Only arrests from September 2016 onward are analysed below.
from_sep_2016 = arrests['ARRESTTIME'] >= '2016-09'
arrests = arrests.loc[from_sep_2016].reset_index(drop=True)

In [None]:
# Police zone coordinates, written once as zone -> (latitude, longitude).
zone_coordinates = {
    'Zone 1': (40.4569565, -80.0160637),
    'Zone 2': (40.4435808, -79.9800111),
    'Zone 3': (40.421685, -79.9924763),
    'Zone 4': (40.4414787, -79.9212047),
    'Zone 5': (40.4742729, -79.9091618),
    'Zone 6': (40.4420734, -80.0322484),
}

# X holds the longitude and Y the latitude of each zone.
X_coord = {zone: longitude for zone, (latitude, longitude) in zone_coordinates.items()}
Y_coord = {zone: latitude for zone, (latitude, longitude) in zone_coordinates.items()}

# Look up the zone coordinates for rows whose INCIDENTLOCATION is exactly a zone name.
incident_location = arrests['INCIDENTLOCATION']
arrests["ZONEX"] = incident_location.map(X_coord)
arrests['ZONEY'] = incident_location.map(Y_coord)

# Fill missing X/Y values with the zone coordinates found above.
for coord_column, zone_column in (('X', 'ZONEX'), ('Y', 'ZONEY')):
    arrests[coord_column] = arrests[coord_column].fillna(arrests[zone_column])

# Number of ' / '-separated offenses listed on each arrest.
arrests['NUMOFFENSES'] = [len(offenses.split(' / ')) for offenses in arrests['OFFENSES']]

# Discard rows whose location is 'Zone OSC' or the bare 'Zone ' placeholder.
unwanted_locations = ['Zone OSC', 'Zone ']
arrests = arrests[~arrests['INCIDENTLOCATION'].isin(unwanted_locations)]

In [None]:
# Expand each arrest into one row per charge.
# OFFENSES lists every charge on an arrest separated by ' / '. Splitting the
# column into lists and exploding it yields one row per charge, with every
# other column repeated and the single charge stored in a new CHARGES column.
# This replaces a row-by-row iterrows() loop that took about 7.5 minutes.
arrests_cleaned = (
    arrests
    .assign(CHARGES=arrests['OFFENSES'].str.split(' / '))
    .explode('CHARGES')
    .reset_index(drop=True)
)

# Sanity check: the expanded frame has exactly one row per listed offense.
assert len(arrests_cleaned) == arrests['NUMOFFENSES'].sum(), \
    'row count after expanding charges does not match NUMOFFENSES'
arrests_cleaned.shape

In [None]:
# Split each charge string into its statute code and its description.

# Strip periods from the charge text. regex=False is required: as a regular
# expression '.' matches every character, which on pandas >= 2.0 erases the
# whole string instead of just the periods.
charges = arrests_cleaned['CHARGES'].str.replace('.', '', regex=False)

# Everything before the first space is the code; the remainder is the
# description. partition() always returns three parts, so a charge with no
# space gets an empty description instead of raising IndexError.
charge_parts = charges.str.partition(' ')
arrests_cleaned['ARRESTCODE'] = charge_parts[0]
arrests_cleaned['ARRESTDESCRIPT'] = charge_parts[2]

# Drop the raw text and helper columns that are no longer needed.
arrests_cleaned = arrests_cleaned.drop(['CHARGES', 'OFFENSES', 'ZONEX', 'ZONEY'], axis=1)

In [None]:
# Persist the per-charge table (without the index) to arrests_cleaned.csv.
arrests_cleaned.to_csv(path_or_buf='arrests_cleaned.csv', index=False)

In [None]:
# Reload the per-charge table. CSV stores timestamps as plain text, so
# ARRESTTIME is parsed back into datetimes here; the monthly resampling done
# later in the notebook needs real datetime values.
arrests_cleaned = pd.read_csv('arrests_cleaned.csv', parse_dates=['ARRESTTIME'])

In [None]:
# Index the per-charge table by arrest time.
# ARRESTTIME must hold real datetimes (not the strings a plain CSV load
# produces): resample('M') only works on a DatetimeIndex, and a later cell
# rebuilds an index from this column. to_datetime leaves values that are
# already datetimes unchanged.
arrests_cleaned['ARRESTTIME'] = pd.to_datetime(arrests_cleaned['ARRESTTIME'], yearfirst=True)
arrests_cleaned.index = arrests_cleaned['ARRESTTIME']

In [None]:
# Frequency table of charge descriptions, most common first.
# Columns: Crime (description), Count, pct (share of all charges in percent),
# cumsum (running count) and cumsumpct (running share in percent).
# The columns are named explicitly via rename_axis / reset_index(name=...)
# because the default labels produced by value_counts().reset_index() differ
# between pandas versions.
charge_counts = (
    arrests_cleaned['ARRESTDESCRIPT']
    .value_counts()
    .rename_axis('Crime')
    .reset_index(name='Count')
)
charge_counts['pct'] = (charge_counts['Count'] / charge_counts['Count'].sum()) * 100
charge_counts['cumsum'] = charge_counts['Count'].cumsum()
charge_counts['cumsumpct'] = charge_counts['pct'].cumsum()

# Keep the table as the last expression so the notebook displays it.
charge_counts.head(20)

In [None]:
# Horizontal bar chart of the 20 most frequent charges, with each bar
# annotated by that charge's share of all charges.
fig = plt.figure(figsize=(24, 24), dpi=200)
barplt = sns.barplot(y="Crime", x="Count", data=charge_counts.loc[0:19],
                     ax=fig.add_subplot(1, 1, 1))

# Title and axis labels share one large bold font.
heading_font = {'fontweight': 'bold', 'fontsize': 44}
barplt.set_title("Pittsburgh's Top 20 Most Reported Charges Upon \n Police Arrests for Sep 2016-Present",
                 **heading_font)
barplt.set_ylabel('', **heading_font)
barplt.set_xlabel('Count', **heading_font)

barplt.set_xticks(np.arange(0, 6000, 500))

# Tick labels: bold on both axes, slanted on the x axis.
for label in barplt.xaxis.get_ticklabels():
    label.set(rotation=-30, fontsize=36, fontweight='bold')
for label in barplt.yaxis.get_ticklabels():
    label.set(fontsize=36, fontweight='bold')

# Draw grid lines behind the bars.
barplt.grid(linewidth=1)
barplt.set_axisbelow(True)

# Force full opacity on any artists attached to the axes.
for patch in barplt.artists:
    red, green, blue, _alpha = patch.get_facecolor()
    patch.set_facecolor((red, green, blue, 1))

# Write the percentage next to the end of each bar; bars are in the same
# order as the rows of charge_counts.
for position, bar in enumerate(barplt.patches):
    share = round(charge_counts['pct'].iloc[position], 2)
    barplt.text(bar.get_width() + 75, bar.get_y() + .5,
                str(share) + '%',
                fontsize=36, fontweight='bold', color='black')

plt.savefig('arrests_top_20_total_pct.png', bbox_inches='tight')
# plt.show()

In [None]:
def _bold_tick_labels(axes):
    """Make every tick label on ``axes`` bold and set the x tick labels to 0 degrees rotation."""
    for tick in axes.xaxis.get_ticklabels():
        tick.set(rotation=0)
        tick.set_fontweight('bold')
    for tick in axes.yaxis.get_ticklabels():
        tick.set_fontweight('bold')


# --- Monthly totals for the single most frequent charge -------------------
crime_name = charge_counts['Crime'].iloc[0]
crime_spec = arrests_cleaned.loc[arrests_cleaned['ARRESTDESCRIPT'] == crime_name]
monarr = crime_spec.resample('M').agg(dict(ARRESTDESCRIPT='count'))
monarr['ARRESTTIME'] = monarr.index

mj = monarr.plot(figsize=(10, 6), x='ARRESTTIME', y='ARRESTDESCRIPT', legend=False,
                 marker='o', color='g', clip_on=False)
mj.set_xlabel('Month', fontsize=22, fontweight='bold')
mj.set_ylabel('Monthly Total', fontsize=22, fontweight='bold')
mj.set_title('Monthly Police Arrest Totals with \n' + crime_name + ' as a Charge',
             fontsize=20, fontweight='bold')
mj.tick_params(axis='both', which='major', labelsize=16)
mj.tick_params(axis='x', which='minor', labelsize=17)
_bold_tick_labels(mj)
mj.grid(linewidth=.5)
mj.set_yticks(np.arange(70, 160, 10))

plt.savefig('arrests_specific.png', bbox_inches='tight', dpi=200)

# --- How many offenses accompany this charge on each arrest ---------------
plt.figure(figsize=(8, 6))

mj = sns.countplot(x="NUMOFFENSES", data=crime_spec)
mj.set_xlabel('Number of Offenses Per Individual', fontsize=20, fontweight='bold')
mj.set_ylabel('Count', fontsize=20, fontweight='bold')
mj.set_title('Individuals Charged with \n' + crime_name + ' Upon Arrest',
             fontsize=20, fontweight='bold')
mj.tick_params(axis='both', which='major', labelsize=16)
mj.tick_params(axis='x', which='minor', labelsize=17)
_bold_tick_labels(mj)
mj.grid(linewidth=.5)
mj.set_axisbelow(True)
mj.set_yticks(np.arange(0, 2400, 200))

plt.savefig('arrestsmjbarg.png', bbox_inches='tight')

# --- Split monthly totals into "with other charges" vs "this charge only" --
# Pair every row carrying the charge with all charges filed under the same PK,
# keep the pairings whose other charge differs, and count each CCR once.
h = arrests_cleaned[['PK', 'ARRESTDESCRIPT']]

crime_merge = pd.merge(crime_spec, h, on='PK', how='inner')
crime_merge = crime_merge.set_index('ARRESTTIME', drop=False)
crime_merge = crime_merge[crime_merge['ARRESTDESCRIPT_y'] != crime_name]
crime_merge = crime_merge.drop_duplicates(subset='CCR', keep='first')

mj_many = crime_merge.resample('M').agg(dict(ARRESTDESCRIPT_x='count'))
# Align with the months in monarr so months that have no multi-charge arrests
# show up as 0 instead of being dropped from the comparison.
mj_many = mj_many.reindex(monarr.index, fill_value=0)
mj_many['ARRESTTIME'] = mj_many.index
mj_many['single_charge'] = monarr['ARRESTDESCRIPT'] - mj_many['ARRESTDESCRIPT_x']
# mj_many['single_charge'].sum() # checks out with barplot above

# Draw on a fresh figure: plt.gca() here would return the count-plot axes
# created above and overlay the lines on its bars.
explained_fig, ax = plt.subplots(figsize=(10, 6))
monarr.plot(x='ARRESTTIME', y='ARRESTDESCRIPT', label='Total arrests',
            marker='o', color='b', clip_on=False, ax=ax)
mj_many.plot(x='ARRESTTIME', y='ARRESTDESCRIPT_x',
             label='Arrests for ' + crime_name + ' \n + other charges',
             marker='o', color='g', clip_on=False, ax=ax)
mj_many.plot(x='ARRESTTIME', y='single_charge',
             label='Arrests for ' + crime_name + ' only',
             marker='o', color='r', clip_on=False, ax=ax)

ax.legend(loc=2, bbox_to_anchor=(1, 1), fontsize=14)
ax.set_xlabel('Month', fontsize=22, fontweight='bold')
ax.set_ylabel('Monthly Total', fontsize=22, fontweight='bold')
ax.set_title('Monthly Police Arrest Totals with \n' + crime_name + ' as Charge',
             fontsize=20, fontweight='bold')
ax.tick_params(axis='both', which='major', labelsize=16)
ax.tick_params(axis='x', which='minor', labelsize=17)
_bold_tick_labels(ax)
ax.grid(linewidth=.5)
ax.set_yticks(np.arange(0, 160, 10))

plt.savefig('mj_explained.png', bbox_inches='tight', dpi=200)
crime_name

In [None]:
# Small-multiples grid: one monthly-total line chart for each of the ten most
# frequent charges, laid out as 5 rows by 2 columns.
x1 = y1 = 1
fig = plt.figure(figsize=(32, 16), dpi=96)

top_ten_crimes = charge_counts['Crime'].iloc[0:10]
for pt, crime in enumerate(top_ten_crimes, start=1):
    plt.subplot(5, 2, pt)

    # Count the rows carrying this charge in each calendar month.
    has_crime = arrests_cleaned['ARRESTDESCRIPT'] == crime
    total_crime = arrests_cleaned.loc[has_crime]
    monthly_crime = total_crime.resample('M').agg({'ARRESTDESCRIPT': 'count'})
    monthly_crime['ARRESTTIME'] = monthly_crime.index

    mc = sns.lineplot(data=monthly_crime, x='ARRESTTIME', y='ARRESTDESCRIPT',
                      marker='o', color='g', clip_on=False, linewidth=3, markersize=8)

    # y ticks every 20 arrests, reaching past the busiest month.
    busiest_month = monthly_crime['ARRESTDESCRIPT'].max()
    mc.set_yticks(np.arange(0, busiest_month + 20, 20))

    mc.set_title(crime, fontsize=20, fontweight='bold')
    mc.set_ylabel('Monthly Total', fontsize=20, fontweight='bold')
    mc.set_xlabel('')

    mc.axes.get_xaxis().set_visible(True)

# plt.savefig('top20month.png',bbox_inches='tight')
plt.show()

In [None]:
# Compare the monthly totals of one charge before and after February 2017:
# box plot, Q-Q plots, Shapiro-Wilk normality tests and Welch's t-test.
import scipy.stats  # a bare `import scipy` does not guarantee scipy.stats is loaded

# NOTE(review): this cell used to slice `ax`, which at this point is the
# matplotlib Axes created by the previous cell and has no `.loc`. The monthly
# counts are rebuilt here from arrests_cleaned instead. The original titles
# hard-coded marijuana possession; the charge analysed is now `crime_name`,
# the one studied in the previous cell. To analyse a different charge, set
# target_crime to its exact ARRESTDESCRIPT value -- confirm which was intended.
target_crime = crime_name
monthly_totals = (
    arrests_cleaned.loc[arrests_cleaned['ARRESTDESCRIPT'] == target_crime]
    .resample('M')
    .agg({'ARRESTDESCRIPT': 'count'})
)

# .copy() so that adding the ID column does not write into a slice of monthly_totals.
upto_feb2017 = monthly_totals.loc['2016-01':'2017-02'].copy()
upto_feb2017['ID'] = 'up to feb 2017'
after_feb2017 = monthly_totals.loc['2017-03':].copy()
after_feb2017['ID'] = 'after feb 2017'
labeled = pd.concat([upto_feb2017, after_feb2017])

plt.figure(figsize=(6, 6), dpi=96)
sns.set_style("whitegrid")
boxplt = sns.boxplot(x='ID', y='ARRESTDESCRIPT',
                     data=labeled,
                     palette='Set1',
                     width=0.7,
                     linewidth=3,
                     fliersize=0,
                     whis=1.5)

fontsize = 18
boxplt.set_title('Monthly Police Arrest Totals with \n' + target_crime + ' as a Charge',
                 fontsize=18, fontweight='bold')
boxplt.set_ylabel('Monthly Total', fontweight='bold', fontsize=fontsize)
boxplt.set_xlabel('', fontweight='bold', fontsize=fontsize)

fontsize = 14
for tick in boxplt.xaxis.get_ticklabels():
    tick.set(rotation=0)
    tick.set_fontsize(fontsize)
    tick.set_fontweight('bold')
for tick in boxplt.yaxis.get_ticklabels():
    tick.set_fontsize(fontsize)
    tick.set_fontweight('bold')

# Force full opacity on any artists attached to the axes.
for patch in boxplt.artists:
    r, g, b, a = patch.get_facecolor()
    patch.set_facecolor((r, g, b, 1))

# Median and number of months per group, looked up by group label so the
# annotations cannot be attached to the wrong box. Box 0 is 'up to feb 2017'
# and box 1 is 'after feb 2017' (their order of appearance in `labeled`).
medians = labeled.groupby('ID')['ARRESTDESCRIPT'].median()
nobs = labeled.groupby('ID').size()
plt.text(0, medians['up to feb 2017'] - 3.5, 'months = ' + str(nobs['up to feb 2017']),
         horizontalalignment='center', color='w', fontsize=15, fontweight='bold')
plt.text(1, medians['after feb 2017'] + .5, 'months = ' + str(nobs['after feb 2017']),
         horizontalalignment='center', color='w', fontsize=15, fontweight='bold')

xtic = ['Before Feb 2017', 'After Feb 2017']
boxplt.set_xticklabels(xtic)
plt.savefig('boxplt2017.png', bbox_inches='tight')

plt.show()

# Q-Q plots against the normal distribution, one per period.
fig = plt.figure(figsize=(4, 4), dpi=96)
b4 = scipy.stats.probplot(upto_feb2017['ARRESTDESCRIPT'], plot=plt)
plt.title('Q-Q Plot of Monthly Total \n' + target_crime + ' Arrests Before Feb 2017',
          fontweight='bold')
plt.savefig('b4feb2017.png', bbox_inches='tight')
plt.show()

fig = plt.figure(figsize=(4, 4), dpi=96)
aft = scipy.stats.probplot(after_feb2017['ARRESTDESCRIPT'], plot=plt)
plt.title('Q-Q Plot of Monthly Total \n' + target_crime + ' Arrests After Feb 2017',
          fontweight='bold')
plt.savefig('afterfeb2017.png', bbox_inches='tight')
plt.show()

print('stats up to Feb 2017 : ', upto_feb2017.describe())
print(after_feb2017.describe())

# Shapiro-Wilk normality test per period; only the p-values are reported.
_stat_b4, p_value_b4 = scipy.stats.shapiro(upto_feb2017['ARRESTDESCRIPT'])
_stat_after, p_value_after = scipy.stats.shapiro(after_feb2017['ARRESTDESCRIPT'])

print(p_value_b4)
print(p_value_after)

# The two periods have unequal sample sizes and variances, so use Welch's t-test.
scipy.stats.ttest_ind(upto_feb2017['ARRESTDESCRIPT'], after_feb2017['ARRESTDESCRIPT'], equal_var=False)