## This notebook loads data from the Calls_for_service file and merges it with data from the max_cfs_ucr_categories file.
The following constraints are applied:
- Records year range = 2012 - 2016
- Records Disposition = RTF 
- Records CrimeType = Violent Crime

In [1]:
import os
import csv
import zipfile
import string
import pandas as pd
import datetime as dt
import numpy as np

## Data Loading

### Load Calls-for-Service Data

In [2]:
# Set location of file
path = os.path.join("../Datasets/Beat_Data/")
path

'../Datasets/Beat_Data/'

In [3]:
# Get filenames
filenames = os.listdir(path)
filenames

['stop_search_beat.csv', 'crime_beat.csv', 'qol_beat_3zip.csv', 'ReadMe']

In [4]:
cfs_df = pd.read_csv(path+"crime_beat.csv")

In [5]:
# Display top 5 rows
cfs_df.head()

Unnamed: 0,NOPD_Item,Type_,TypeText,Priority,InitialType,InitialTypeText,InitialPriority,MapX,MapY,TimeCreate,...,BLOCK_ADDRESS,Zip,PoliceDistrict,Location,CrimeType,Description,Date,latitude,longitude,Beat_New
0,A0514814,65.0,SIMPLE ROBBERY,1B,35.0,SIMPLE BATTERY,2A,3683377,531825,01/04/2014 11:42:30 PM,...,St Peter & River,70130,8,"(29.95627848, -90.06213187)",VIOLENT CRIME,SIMPLE ROBBERY,2014-01-04,29.956278,-90.062132,8B02
1,A1137614,65.0,SIMPLE ROBBERY,2A,65.0,SIMPLE ROBBERY,2A,3679066,525743,01/09/2014 08:20:55 PM,...,014XX Saint Charles Ave,70130,6,"(29.93968461, -90.07595579)",VIOLENT CRIME,SIMPLE ROBBERY,2014-01-09,29.939685,-90.075956,6G02
2,A1371814,65.0,SIMPLE ROBBERY,1B,35.0,SIMPLE BATTERY,1D,3682713,531408,01/11/2014 02:19:56 PM,...,Decatur St & St Louis St,70130,8,"(29.95515119, -90.06424170)",VIOLENT CRIME,SIMPLE ROBBERY,2014-01-11,29.955151,-90.064242,8B02
3,A2136614,65.0,SIMPLE ROBBERY,1B,65.0,SIMPLE ROBBERY,1B,3668999,535899,01/17/2014 12:10:08 PM,...,040XX S Carrollton Ave,70119,1,"(29.96791191, -90.10739402)",VIOLENT CRIME,SIMPLE ROBBERY,2014-01-17,29.967912,-90.107394,1K02
4,A2697014,65.0,SIMPLE ROBBERY,1B,65.0,SIMPLE ROBBERY,1B,3674619,534666,01/21/2014 08:43:35 PM,...,Jane Pl & Palmyra St,70119,1,"(29.96435362, -90.08968824)",VIOLENT CRIME,SIMPLE ROBBERY,2014-01-21,29.964354,-90.089688,1L01


In [6]:
# Number of records
len(cfs_df)

2586

### Parse Timestamp Columns

In [7]:
cfs_df.TimeArrive = pd.to_datetime(cfs_df.TimeArrive)
cfs_df.TimeCreate= pd.to_datetime(cfs_df.TimeCreate)
cfs_df.TimeClosed = pd.to_datetime(cfs_df.TimeClosed)
cfs_df.TimeDispatch = pd.to_datetime(cfs_df.TimeDispatch)

Which beats are the most notorious over the years?

In [8]:
# cfs_df.Zip.unique()
# cfs_df.groupby([cfs_df.Zip,cfs_df.Beat]).size().nlargest(5)
top_5_beats = cfs_df['Beat'].groupby(cfs_df['Zip']).value_counts()
top_5_beats.groupby(level=0).nlargest(5)

Zip    Zip    Beat
70117  70117  5G02    79
              8F04    73
              5G01    59
              5E01    58
              5C02    55
70119  70119  5K02    80
              1I03    60
              1K01    50
              1C02    47
              5K01    47
70130  70130  8C01    42
              8C02    39
              8B02    38
              6B02    36
              8G04    26
Name: Beat, dtype: int64

In [9]:
two_zips_df = cfs_df[(cfs_df.Zip==70117) | (cfs_df.Zip==70119)]

Creating two calls for service dataframe for two zips 70119 and 70117

In [10]:
zip_1_df = two_zips_df[two_zips_df.Zip==70117]
zip_1_df = zip_1_df.groupby(zip_1_df.TimeCreate).size().reset_index()
zip_2_df = two_zips_df[two_zips_df.Zip==70119]
zip_2_df = zip_2_df.groupby(zip_2_df.TimeCreate).size().reset_index()

In [11]:
zip_1_df.head()

Unnamed: 0,TimeCreate,0
0,2014-01-01 10:19:51,1
1,2014-01-01 22:49:14,1
2,2014-01-02 19:19:33,1
3,2014-01-04 08:50:16,1
4,2014-01-04 18:40:20,1


In [12]:
zip_2_df.head()

Unnamed: 0,TimeCreate,0
0,2014-01-01 00:35:13,1
1,2014-01-02 18:48:31,1
2,2014-01-03 10:23:54,1
3,2014-01-04 12:18:14,1
4,2014-01-05 22:21:56,1


Building two dataframes specific to crime prone beats

In [13]:
e_p_b_one = cfs_df[cfs_df.Beat=='5K02']
e_p_b_one = e_p_b_one.groupby(e_p_b_one.TimeCreate).size().reset_index()

In [14]:
e_p_b_two = cfs_df[cfs_df.Beat=='5G02']
e_p_b_two = e_p_b_two.groupby(e_p_b_two.TimeCreate).size().reset_index()

In [15]:
sns_path = "../Datasets/Beat_Data/stop_search_beat.csv"
s_n_s_df =  pd.read_csv(sns_path)

  interactivity=interactivity, compiler=compiler, result=result)


In [16]:
s_n_s_df.columns

Index([u'FieldInterviewID', u'NOPD_Item', u'EventDate', u'District', u'Zone',
       u'OfficerAssignment', u'StopDescription', u'ActionsTaken',
       u'VehicleYear', u'VehicleMake', u'VehicleModel', u'VehicleStyle',
       u'VehicleColor', u'SubjectID', u'SubjectRace', u'SubjectGender',
       u'SubjectAge', u'SubjectHasPhotoID', u'SubjectHeight', u'SubjectWeight',
       u'SubjectEyeColor', u'SubjectHairColor', u'SubjectDriverLicState',
       u'CreatedDateTime', u'LastModifiedDateTime', u'Longitude', u'Latitude',
       u'Zip', u'BlockAddress', u'Date', u'Beat'],
      dtype='object')

Now let's select the stops and searches made in the two zip codes of interest: 70119 and 70117.

In [17]:
zip_1_sns =  s_n_s_df[s_n_s_df.Zip==70117]
zip_2_sns  = s_n_s_df[s_n_s_df.Zip==70119]
zip_1_sns = zip_1_sns.groupby(zip_1_sns.Date).size().reset_index()
zip_2_sns = zip_2_sns.groupby(zip_2_sns.Date).size().reset_index()

Get the stop counts for the top two beats.

In [18]:
s_n_s_epb_one = s_n_s_df[s_n_s_df.Beat=='5K02']
s_n_s_epb_one = s_n_s_epb_one.groupby(s_n_s_epb_one.Date).size().reset_index()
s_n_s_epb_two = s_n_s_df[s_n_s_df.Beat=='5G02']
s_n_s_epb_two = s_n_s_epb_two.groupby(s_n_s_epb_two.Date).size().reset_index()

In [19]:
s_n_s_epb_one.rename(columns = {0:"Count"},inplace=True)

In [20]:
s_n_s_epb_two.rename(columns = {0:"Count"},inplace=True)

In [21]:
s_n_s_epb_one.Date = s_n_s_epb_one.Date.str.replace('-',"")

In [22]:
s_n_s_epb_one.Date = pd.to_datetime(s_n_s_epb_one.Date,format="%Y%m%d")

In [23]:
s_n_s_epb_two.Date = s_n_s_epb_two.Date.str.replace('-',"")
s_n_s_epb_two.Date = pd.to_datetime(s_n_s_epb_two.Date,format="%Y%m%d")

In [24]:
e_p_b_one.rename(columns = {0:"Count"},inplace=True)

In [25]:
e_p_b_two.rename(columns = {0:"Count"},inplace=True)

In [26]:
zip_1_df.rename(columns = {0:"Count"},inplace=True)
zip_2_df.rename(columns = {0:"Count"},inplace=True)
zip_1_sns.rename(columns = {0:"Count"},inplace=True)
zip_2_sns.rename(columns = {0:"Count"},inplace=True)

In [27]:
zip_1_sns.Date = zip_1_sns.Date.str.replace('-',"")
zip_1_sns.Date = pd.to_datetime(zip_1_sns.Date,format="%Y%m%d")
zip_2_sns.Date = zip_2_sns.Date.str.replace('-',"")
zip_2_sns.Date = pd.to_datetime(zip_2_sns.Date,format="%Y%m%d")

In [28]:
e_p_b_one.TimeCreate = e_p_b_one.TimeCreate.apply(lambda x:x.date())

In [29]:
e_p_b_two.TimeCreate = e_p_b_two.TimeCreate.apply(lambda x:x.date())

In [30]:
zip_1_df.TimeCreate = zip_1_df.TimeCreate.apply(lambda x:x.date())
zip_2_df.TimeCreate = zip_2_df.TimeCreate.apply(lambda x:x.date())

In [None]:
e_p_b_one.index = e_p_b_one.TimeCreate
del e_p_b_one['TimeCreate']


In [None]:
e_p_b_two.index = e_p_b_two.TimeCreate
del e_p_b_two['TimeCreate']


In [None]:
s_n_s_epb_one.index = s_n_s_epb_one.Date
del s_n_s_epb_one['Date']


In [None]:
s_n_s_epb_two.index = s_n_s_epb_two.Date
del s_n_s_epb_two['Date']

In [37]:
zip_1_df.index = zip_1_df.TimeCreate
del zip_1_df['TimeCreate']
zip_2_df.index = zip_2_df.TimeCreate
del zip_2_df['TimeCreate']
zip_1_sns.index = zip_1_sns.Date
del zip_1_sns['Date']
zip_2_sns.index = zip_2_sns.Date
del zip_2_sns['Date']

In [38]:
# Make sure every per-beat series spans 2014-01-01 .. 2016-12-31 by adding a
# zero-count row at each boundary -- but only when that date is missing, so a
# real count recorded on a boundary day is never overwritten with 0.
for beat_counts in (e_p_b_one, e_p_b_two, s_n_s_epb_one, s_n_s_epb_two):
    # The crime frames are indexed by datetime.date objects and the stop frames
    # by Timestamps; normalise so the membership test below is reliable.
    beat_counts.index = pd.to_datetime(beat_counts.index)
    for boundary in ("20140101", "20161231"):
        boundary_day = pd.to_datetime(boundary, format="%Y%m%d")
        if boundary_day not in beat_counts.index:
            beat_counts.loc[boundary_day] = [0]
    # Rows added by .loc land at the end; restore chronological order.
    beat_counts.sort_index(inplace=True)

There are now 8 dataframes in total:

1. zip_1_df  for zip code 70117 CFS
2. zip_2_df for zip code 70119 CFS
3. zip_1_sns for sns happened in 70117
4. zip_2_sns for sns happened in 70119
5. e_p_b_one gives cfs from beat area 5K02
6. e_p_b_two gives cfs from beat area 5G02
7. s_n_s_epb_one sns happened in beat area 5K02
8. s_n_s_epb_two sns happened in beat area 5G02

In [42]:
# adjusting to match start and end dates to 2014/01/01 and 2016/12/31
e_p_b_one.index = pd.to_datetime(e_p_b_one.index)
e_p_b_two.index = pd.to_datetime(e_p_b_two.index)
s_n_s_epb_one.index = pd.to_datetime(s_n_s_epb_one.index)
s_n_s_epb_two.index = pd.to_datetime(s_n_s_epb_two.index)
zip_1_df.index = pd.to_datetime(zip_1_df.index)
zip_2_df.index = pd.to_datetime(zip_2_df.index)
zip_1_sns.index =pd.to_datetime(zip_1_sns.index)
zip_2_sns.index = pd.to_datetime(zip_2_sns.index)

In [440]:
def stops_vs_crime(crimes_df,stops_df,crime_window,stops_window):
    """Summarise stops and crimes per resampling window.

    Parameters
    ----------
    crimes_df, stops_df : pandas.DataFrame
        Count frames indexed by a DatetimeIndex (one or more rows per date).
    crime_window : str
        Resampling rule for the "follow-up" crime summary ('MS', '14D',
        '7D' or 'D').
    stops_window : str
        Resampling rule for the stop summary ('MS', '14D' or '7D').

    Returns
    -------
    (total_crimes_in_window, total_stops_and_crimes_in_window)
        * ``total_crimes_in_window`` -- sum/avg/max of crimes per
          ``crime_window``, then thinned by position (see below) so that the
          rows line up with the stop windows.
        * ``total_stops_and_crimes_in_window`` -- sum/avg/max of stops and of
          crimes per ``stops_window``, joined on the window start date.

        Both frames have two-level columns: the statistic name
        (e.g. ``'sum_stops'``) on top and the original column name below.
    """
    total_crimes_in_window = _window_summary(crimes_df, crime_window, 'crimes')
    total_stops_in_window = _window_summary(stops_df, stops_window, 'stops')
    total_crimes_within_stop_window = _window_summary(crimes_df, stops_window, 'window_crimes')

    # Thin the crime summary by position so there is roughly one row per stop
    # window. NOTE(review): for 'MS' stops the 14D/7D strides assume a month is
    # exactly two/four crime windows long, which is only approximate -- confirm
    # this is the intended alignment.
    if stops_window == 'MS':
        if crime_window == 'MS':
            total_crimes_in_window = total_crimes_in_window[1:]
        elif crime_window == '14D':
            total_crimes_in_window = total_crimes_in_window[2::2]
        elif crime_window == '7D':
            total_crimes_in_window = total_crimes_in_window[4::4]
        elif crime_window == 'D':
            first_of_month = total_crimes_in_window.index.day == 1
            total_crimes_in_window = total_crimes_in_window[first_of_month]
    elif stops_window == '14D':
        if crime_window == '14D':
            total_crimes_in_window = total_crimes_in_window[1::]
        elif crime_window == '7D':
            total_crimes_in_window = total_crimes_in_window[2::2]
        elif crime_window == 'D':
            total_crimes_in_window = total_crimes_in_window[::14]
    elif stops_window == '7D' and crime_window == 'D':
        total_crimes_in_window = total_crimes_in_window[::7]

    total_stops_and_crimes_in_window = pd.concat(
        [total_stops_in_window, total_crimes_within_stop_window],
        axis=1, join='inner')

    return total_crimes_in_window,total_stops_and_crimes_in_window


def _window_summary(frame, window, label):
    """Return sum/avg/max of ``frame`` per ``window`` under 'sum_<label>' etc.

    The three statistics are concatenated under a top column level instead of
    using ``agg({'new_name': func})``: that renaming form of ``agg`` was
    deprecated and later removed from pandas, while ``concat`` yields the same
    two-level columns on every version.
    """
    bins = frame.resample(window, closed='left')
    summary = pd.concat(
        {
            'sum_' + label: bins.sum(),
            'avg_' + label: bins.mean(),
            'max_' + label: bins.max(),
        },
        axis=1)
    # Empty windows have no mean/max; report them as 0 like the totals.
    return summary.fillna(0)
    
    

In [413]:
crimes_in_window, stops_window = stops_vs_crime(zip_1_df,zip_1_sns,'MS','MS')


In [None]:
# Preview the follow-up crime summary returned by stops_vs_crime.
crimes_in_window.head()

In [396]:
from matplotlib import pyplot
% %matplotlib inline

import plotly
plotly.tools.set_credentials_file(username='karthikb', api_key='ubvM66f4apRzPA049z05')
import plotly.graph_objs as go


a_df = zip_1_df.groupby(zip_1_df.index).size().reset_index()



# Crime trends in 70117

In [398]:
a_df.index = a_df.TimeCreate
a_df.rename(columns={0:"Crime Count"},inplace=True)
# a_df[(a_df.index.month==5)| ]


In [400]:
data = [go.Scatter(x=a_df.index,y=a_df["Crime Count"])]
plotly.plotly.iplot(data)

# Impact of stops on crimes within the same window in 70117

In [418]:
stops_window.head()
# stops_per_crime_per_month =  stops_window.ix[:,'sum_stops']/stops_window.ix[:,'sum_window_crimes']
# stops_window.ix[:,stops_window['sum_stops']].plot(secondary_y=True,style='g')

Unnamed: 0_level_0,max_stops,sum_stops,avg_stops,max_window_crimes,avg_window_crimes,sum_window_crimes
Unnamed: 0_level_1,Count,Count,Count,Count,Count,Count
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
2014-01-01,30,420,13.548387,1,1,23
2014-02-01,22,324,11.571429,1,1,20
2014-03-01,18,276,8.903226,1,1,28
2014-04-01,22,210,7.0,1,1,21
2014-05-01,20,233,7.516129,1,1,30


In [426]:
# Learn about API authentication here: https://plot.ly/python/getting-started
# Find your api_key here: https://plot.ly/settings/api
def window_graph(stops_df):
    """Build a dual-axis line chart of stops versus crimes per window.

    Parameters
    ----------
    stops_df : pandas.DataFrame
        Frame with two-level columns that contains ('sum_stops', 'Count') and
        ('sum_window_crimes', 'Count'); its index supplies the x axis.

    Returns
    -------
    plotly.graph_objs.Figure
        Stops on the left y axis, crimes on a right-hand axis overlaid on it.
    """
    # Plain column selection replaces the `.ix` indexer, which newer pandas
    # releases no longer provide.
    stop_totals = list(stops_df['sum_stops']['Count'])
    crime_totals = list(stops_df['sum_window_crimes']['Count'])

    trace1 = go.Scatter(
        x=stops_df.index,
        y=stop_totals,
        name='Stops in a month window'
    )
    trace2 = go.Scatter(
        x=stops_df.index,
        y=crime_totals,
        name='Crimes in a month window',
        yaxis='y2'  # draw against the secondary (right-hand) axis
    )
    data = [trace1, trace2]
    layout = go.Layout(
        title='Stops vs Crimes',
        yaxis=dict(
            title='Number of Stops'
        ),
        yaxis2=dict(
            title='Number of crimes',
            titlefont=dict(
                color='rgb(148, 103, 189)'
            ),
            tickfont=dict(
                color='rgb(148, 103, 189)'
            ),
            overlaying='y',
            side='right'
        )
    )
    fig = go.Figure(data=data, layout=layout)
    return fig


In [427]:
fig  = window_graph(stops_df=stops_window)
plotly.plotly.iplot(fig)

# Impact of Stops on Future crime.

A correlation graph between the crimes that happened in the current month window and those in the previous month window.

In [433]:
crime_month_window, stops_month_window = stops_vs_crime(zip_1_df,zip_1_sns,'MS','MS')
crime_two_week_window, stops_month_window = stops_vs_crime(zip_1_df,zip_1_sns,'14D','MS')
crime_week_window, stops_month_window = stops_vs_crime(zip_1_df,zip_1_sns,'7D','MS')
crime_day_window,stops_month_window = stops_vs_crime(zip_1_df,zip_1_sns,'D','MS')

In [437]:
avg_stops_window_crimes =  int(np.average(stops_month_window['sum_window_crimes']))
avg_monthly_crimes = int(np.average(crime_month_window['sum_crimes']))
avg_bi_weekly_crimes = int(np.average(crime_two_week_window['sum_crimes']))
avg_weekly_crimes = int(np.average(crime_week_window['sum_crimes']))
avg_daily_crimes = int(np.average(crime_day_window['sum_crimes']))

In [439]:
# Error bars: standard deviation of the per-window crime totals behind each
# bar, one value per bar (the previous constants were placeholders and only
# covered three of the four bars).
during_spread = float(stops_month_window['sum_window_crimes'].values.std())
after_spread = [
    float(summary['sum_crimes'].values.std())
    for summary in (crime_month_window, crime_two_week_window,
                    crime_week_window, crime_day_window)
]

trace1 = go.Bar(
    x=['M vs M', 'M vs 2W', 'M vs W','M vs D'],
    y=[avg_stops_window_crimes]*4,
    name='Stops and Search Period',
    error_y=dict(
        type='data',
        array=[during_spread]*4,
        visible=True
    )
)

trace2 = go.Bar(
    x=['M vs M', 'M vs 2W', 'M vs W','M vs D'],
    y=[avg_monthly_crimes,avg_bi_weekly_crimes,avg_weekly_crimes,avg_daily_crimes],
    name='After Stops and Search',
    error_y=dict(
        type='data',
        array=after_spread,
        visible=True
    )
)

data = [trace1, trace2]
layout = go.Layout(
    barmode='group'
)
fig = go.Figure(data=data, layout=layout)
# NOTE(review): every bar chart in this notebook uploads under the same
# filename, so each one replaces the previous chart in the Plotly account.
plotly.plotly.iplot(fig, filename='error-bar-bar')


In [441]:
crime_tweek_tweek_window, stops_two_week_window = stops_vs_crime(zip_1_df,zip_1_sns,'14D','14D')
crime_week_tweek_window, stops_two_week_window = stops_vs_crime(zip_1_df,zip_1_sns,'7D','14D')
crime_day_tweek_window, stops_two_week_window = stops_vs_crime(zip_1_df,zip_1_sns,'D','14D')

In [442]:
avg_tweek_stops_window_crimes =  int(np.average(stops_two_week_window['sum_window_crimes']))
avg_tweek_tweek_crimes = int(np.average(crime_tweek_tweek_window['sum_crimes']))
avg_week_tweek_crimes = int(np.average(crime_week_tweek_window['sum_crimes']))
avg_datily_tweek_crimes = int(np.average(crime_day_tweek_window['sum_crimes']))


In [443]:
# Error bars: standard deviation of the per-window crime totals behind each
# bar (the previous constants were placeholders unrelated to the data).
during_spread = float(stops_two_week_window['sum_window_crimes'].values.std())
after_spread = [
    float(summary['sum_crimes'].values.std())
    for summary in (crime_tweek_tweek_window, crime_week_tweek_window,
                    crime_day_tweek_window)
]

trace1 = go.Bar(
    x=['2W vs 2W', '2W vs W', '2W vs D'],
    y=[avg_tweek_stops_window_crimes]*3,
    name='Stops and Search Period',
    error_y=dict(
        type='data',
        array=[during_spread]*3,
        visible=True
    )
)

trace2 = go.Bar(
    x=['2W vs 2W', '2W vs W', '2W vs D'],
    y=[avg_tweek_tweek_crimes,avg_week_tweek_crimes,avg_datily_tweek_crimes],
    name='After Stops and Search',
    error_y=dict(
        type='data',
        array=after_spread,
        visible=True
    )
)

data = [trace1, trace2]
layout = go.Layout(
    barmode='group'
)
fig = go.Figure(data=data, layout=layout)
plotly.plotly.iplot(fig, filename='error-bar-bar')


In [448]:
# Dual-axis chart of stops (left axis) and crimes (right axis) per two-week
# window in 70117. Plain column selection replaces the `.ix` indexer, which
# newer pandas releases no longer provide.
trace1 = go.Scatter(
    x=stops_two_week_window.index,
    y=list(stops_two_week_window['sum_stops']['Count']),
    name='Stops in a 2 week window'
)
trace2 = go.Scatter(
    x=stops_two_week_window.index,
    y=list(stops_two_week_window['sum_window_crimes']['Count']),
    name='Crimes in a 2 week window',
    yaxis='y2'
)
data = [trace1, trace2]
layout = go.Layout(
    title='Stops vs Crimes 2 Weeks',
    yaxis=dict(
        title='Number of Stops'
    ),
    yaxis2=dict(
        title='Number of crimes',
        titlefont=dict(
            color='rgb(148, 103, 189)'
        ),
        tickfont=dict(
            color='rgb(148, 103, 189)'
        ),
        overlaying='y',
        side='right'
    )
)
fig = go.Figure(data=data, layout=layout)
plotly.plotly.iplot(fig)

In [375]:
# Stops made per recorded crime in each monthly window for 70117. Defined here
# so the cell no longer depends on a variable left over from an earlier kernel
# session. Months with zero crimes produce inf in this ratio.
stops_per_crime_per_month = stops_month_window["sum_stops"].Count/stops_month_window["sum_window_crimes"].Count

# Average the ratio per calendar month across all years.
m_s_df = stops_per_crime_per_month.groupby(stops_per_crime_per_month.index.month).mean().reset_index()
# Assumes all twelve calendar months are present, in order.
m_s_df['month'] = ['Jan','Feb','Mar','Apr','May','Jun','July','Aug','Sep','Oct','Nov','Dec']
m_s_df.index = m_s_df.month
del m_s_df['month']

In [429]:
# m_s_df.ix[:,m_s_df.Count].plot()





data = [go.Bar(
            y = list(m_s_df.Count),
            x = list(m_s_df.index),
            name='Stops in a month window'
    )]

layout = go.Layout(
        title='Average Number of Stops per crime',
        yaxis=dict(
            title='Number of Stops'
        )
    )

plotly.plotly.iplot(data,layout=layout)

# 70119 

In [449]:
# Number of recorded crimes per calendar day in zip 70119 (built the same way
# as a_df for 70117; b_df was previously never created).
b_df = zip_2_df.groupby(zip_2_df.index).size().reset_index()
b_df.index = b_df.TimeCreate
b_df.rename(columns={0:"Crime Count in 70119"},inplace=True)
# a_df[(a_df.index.month==5)| ]


In [450]:
data = [go.Scatter(x=b_df.index,y=b_df["Crime Count in 70119"])]
plotly.plotly.iplot(data)

In [451]:
crime_month_window_, stops_month_window_ = stops_vs_crime(zip_2_df,zip_2_sns,'MS','MS')
crime_two_week_window_, stops_month_window_ = stops_vs_crime(zip_2_df,zip_2_sns,'14D','MS')
crime_week_window_, stops_month_window_ = stops_vs_crime(zip_2_df,zip_2_sns,'7D','MS')
crime_day_window_,stops_month_window_ = stops_vs_crime(zip_2_df,zip_2_sns,'D','MS')

In [452]:
avg_stops_window_crimes_ =  int(np.average(stops_month_window_['sum_window_crimes']))
avg_monthly_crimes_ = int(np.average(crime_month_window_['sum_crimes']))
avg_bi_weekly_crimes_ = int(np.average(crime_two_week_window_['sum_crimes']))
avg_weekly_crimes_ = int(np.average(crime_week_window_['sum_crimes']))
avg_daily_crimes_ = int(np.average(crime_day_window_['sum_crimes']))

In [453]:
# Same dual-axis stops-vs-crimes chart as for 70117; the traces and layout are
# identical to what window_graph builds, so reuse it instead of a pasted copy.
fig = window_graph(stops_df=stops_month_window_)
plotly.plotly.iplot(fig)

In [454]:
# Error bars: standard deviation of the per-window crime totals behind each
# bar, one value per bar (the previous constants were placeholders and only
# covered three of the four bars).
during_spread_ = float(stops_month_window_['sum_window_crimes'].values.std())
after_spread_ = [
    float(summary['sum_crimes'].values.std())
    for summary in (crime_month_window_, crime_two_week_window_,
                    crime_week_window_, crime_day_window_)
]

trace1 = go.Bar(
    x=['M vs M', 'M vs 2W', 'M vs W','M vs D'],
    y=[avg_stops_window_crimes_]*4,
    name='Stops and Search Period',
    error_y=dict(
        type='data',
        array=[during_spread_]*4,
        visible=True
    )
)

trace2 = go.Bar(
    x=['M vs M', 'M vs 2W', 'M vs W','M vs D'],
    y=[avg_monthly_crimes_,avg_bi_weekly_crimes_,avg_weekly_crimes_,avg_daily_crimes_],
    name='After Stops and Search',
    error_y=dict(
        type='data',
        array=after_spread_,
        visible=True
    )
)

data = [trace1, trace2]
layout = go.Layout(
    barmode='group'
)
fig = go.Figure(data=data, layout=layout)
plotly.plotly.iplot(fig, filename='error-bar-bar')

In [455]:
crime_tweek_tweek_window_, stops_two_week_window_ = stops_vs_crime(zip_2_df,zip_2_sns,'14D','14D')
crime_week_tweek_window_, stops_two_week_window_ = stops_vs_crime(zip_2_df,zip_2_sns,'7D','14D')
crime_day_tweek_window_, stops_two_week_window_ = stops_vs_crime(zip_2_df,zip_2_sns,'D','14D')

In [456]:
avg_tweek_stops_window_crimes_ =  int(np.average(stops_two_week_window_['sum_window_crimes']))
avg_tweek_tweek_crimes_ = int(np.average(crime_tweek_tweek_window_['sum_crimes']))
avg_week_tweek_crimes_ = int(np.average(crime_week_tweek_window_['sum_crimes']))
avg_datily_tweek_crimes_ = int(np.average(crime_day_tweek_window_['sum_crimes']))

In [457]:
# Error bars: standard deviation of the per-window crime totals behind each
# bar (the previous constants were placeholders unrelated to the data).
during_spread_ = float(stops_two_week_window_['sum_window_crimes'].values.std())
after_spread_ = [
    float(summary['sum_crimes'].values.std())
    for summary in (crime_tweek_tweek_window_, crime_week_tweek_window_,
                    crime_day_tweek_window_)
]

trace1 = go.Bar(
    x=['2W vs 2W', '2W vs W', '2W vs D'],
    y=[avg_tweek_stops_window_crimes_]*3,
    name='Stops and Search Period',
    error_y=dict(
        type='data',
        array=[during_spread_]*3,
        visible=True
    )
)

trace2 = go.Bar(
    x=['2W vs 2W', '2W vs W', '2W vs D'],
    y=[avg_tweek_tweek_crimes_,avg_week_tweek_crimes_,avg_datily_tweek_crimes_],
    name='After Stops and Search',
    error_y=dict(
        type='data',
        array=after_spread_,
        visible=True
    )
)

data = [trace1, trace2]
layout = go.Layout(
    barmode='group'
)
fig = go.Figure(data=data, layout=layout)
plotly.plotly.iplot(fig, filename='error-bar-bar')

In [459]:
# Dual-axis chart of stops (left axis) and crimes (right axis) per two-week
# window in 70119. Plain column selection replaces the `.ix` indexer, which
# newer pandas releases no longer provide.
trace1 = go.Scatter(
    x=stops_two_week_window_.index,
    y=list(stops_two_week_window_['sum_stops']['Count']),
    name='Stops in a 2 week window'
)
trace2 = go.Scatter(
    x=stops_two_week_window_.index,
    y=list(stops_two_week_window_['sum_window_crimes']['Count']),
    name='Crimes in a 2 week window',
    yaxis='y2'
)
data = [trace1, trace2]
layout = go.Layout(
    title='Stops vs Crimes 2 Weeks',
    yaxis=dict(
        title='Number of Stops'
    ),
    yaxis2=dict(
        title='Number of crimes',
        titlefont=dict(
            color='rgb(148, 103, 189)'
        ),
        tickfont=dict(
            color='rgb(148, 103, 189)'
        ),
        overlaying='y',
        side='right'
    )
)
fig = go.Figure(data=data, layout=layout)
plotly.plotly.iplot(fig)

In [467]:
stops_per_crime_per_month_ = stops_month_window_["sum_stops"].Count/stops_month_window_["sum_window_crimes"].Count

In [474]:

m_s_df_ = stops_per_crime_per_month_.groupby(stops_per_crime_per_month_.index.month).mean().reset_index()
m_s_df_['month'] = ['Jan','Feb','Mar','Apr','May','Jun','July','Aug','Sep','Oct','Nov','Dec']
m_s_df_.index = m_s_df_.month
del m_s_df_['month']

In [476]:
data = [go.Bar(
            y = list(m_s_df_.Count),
            x = list(m_s_df_.index),
            name='Stops in a month window'
    )]

layout = go.Layout(
        title='Average Number of Stops per crime',
        yaxis=dict(
            title='Number of Stops'
        )
    )

plotly.plotly.iplot(data,layout=layout)

# Get Beat for Stops and Search

# Merging with Stop and Search

In [None]:
def countCrimeStopInTimeWindow(crimeData, stopData, timeFrame, timeWindow):
    """Total stops per period and crimes per period plus a follow-up window.

    The span 2014-01-01 .. 2016-12-31 is cut into consecutive stop periods
    according to ``timeFrame``:

    * ``"month"``   -- calendar months;
    * ``"2 weeks"`` -- half months (1st-15th, 16th-end of month);
    * ``"week"``    -- consecutive 7-day blocks starting on 2014-01-01.

    For every period ``[start, end)`` the function records

    * the sum of ``stopData`` rows dated inside the period, and
    * the sum of ``crimeData`` rows dated in ``[start, end + window)``, where
      ``window`` is 30, 15, 7 or 1 days for a ``timeWindow`` of ``"month"``,
      ``"2 weeks"``, ``"week"`` or ``"day"``.

    Rows are selected by date on each frame's own index, so the two frames do
    not need to share the same dates, be sorted, or have one row per day.

    Parameters
    ----------
    crimeData, stopData : pandas.DataFrame or pandas.Series
        Counts indexed by date (anything ``pd.to_datetime`` accepts).
    timeFrame : str
        ``"month"``, ``"2 weeks"`` or ``"week"``. Any other value yields two
        empty lists.
    timeWindow : str
        Follow-up window; must be one of the values allowed for ``timeFrame``
        (month: month / 2 weeks / week; 2 weeks: 2 weeks / week / day;
        week: week / day).

    Returns
    -------
    (crimeSum, stopSum) : tuple of two equally long lists of int

    Raises
    ------
    ValueError
        If ``timeWindow`` is not valid for the given ``timeFrame``.
    """
    window_days = {
        "month": {"month": 30, "2 weeks": 15, "week": 7},
        "2 weeks": {"2 weeks": 15, "week": 7, "day": 1},
        "week": {"week": 7, "day": 1},
    }
    crimeSum = []
    stopSum = []

    if timeFrame not in window_days:
        return crimeSum, stopSum
    if timeWindow not in window_days[timeFrame]:
        raise ValueError("timeWindow %r is not valid for timeFrame %r"
                         % (timeWindow, timeFrame))
    window = dt.timedelta(days=window_days[timeFrame][timeWindow])

    for start, end in _stop_periods(timeFrame):
        stopSum.append(_sum_between(stopData, start, end))
        crimeSum.append(_sum_between(crimeData, start, end + window))

    return crimeSum, stopSum


def _stop_periods(timeFrame):
    """Return the (start, exclusive end) pairs covering 2014 through 2016."""
    periods = []
    if timeFrame == "week":
        week = dt.timedelta(days=7)
        start = dt.datetime(2014, 1, 1)
        while start < dt.datetime(2017, 1, 1):
            periods.append((start, start + week))
            start += week
        return periods

    for year in range(2014, 2017):
        for month in range(1, 13):
            month_start = dt.datetime(year, month, 1)
            # Using the first day of the next month as an exclusive end keeps
            # the month's last day (including Feb 29) inside the period.
            next_month = dt.datetime(year + month // 12, month % 12 + 1, 1)
            if timeFrame == "month":
                periods.append((month_start, next_month))
            else:
                mid_month = dt.datetime(year, month, 16)
                periods.append((month_start, mid_month))
                periods.append((mid_month, next_month))
    return periods


def _sum_between(frame, start, end):
    """Sum every value of ``frame`` whose index date lies in [start, end)."""
    dates = pd.to_datetime(frame.index)
    in_range = (dates >= pd.Timestamp(start)) & (dates < pd.Timestamp(end))
    return int(frame.values[in_range].sum())

In [None]:
# fill seaborn scatter here
a,b = countCrimeStopInTimeWindow(e_p_b_one,s_n_s_epb_one,"month","month")


In [None]:
def get_all_correlations(crimes_count,stops_count,beat):
    """Correlate stop totals with crime totals for every frame/window pairing.

    For each supported (stop period, follow-up window) combination the crime
    and stop totals from ``countCrimeStopInTimeWindow`` are correlated with
    ``np.corrcoef``.

    Parameters
    ----------
    crimes_count, stops_count
        Passed through unchanged to ``countCrimeStopInTimeWindow``.
    beat
        Label stored in the ``Beat`` column of the result.

    Returns
    -------
    pandas.DataFrame
        One row with the columns ``Beat``, ``mm_cor``, ``mtw_cor``,
        ``mw_cor``, ``twtw_cor``, ``tww_cor``, ``twd_cor``, ``ww_cor`` and
        ``wwd_cor``.
    """
    # (result column, stop period, follow-up window)
    pairings = [
        ("mm_cor", "month", "month"),
        ("mtw_cor", "month", "2 weeks"),
        ("mw_cor", "month", "week"),
        ("twtw_cor", "2 weeks", "2 weeks"),
        ("tww_cor", "2 weeks", "week"),
        # Two-week stops against a one-day window; this used to request
        # "week" again and so duplicated tww_cor.
        ("twd_cor", "2 weeks", "day"),
        ("ww_cor", "week", "week"),
        ("wwd_cor", "week", "day"),
    ]

    row = {"Beat": beat}
    for column, time_frame, time_window in pairings:
        crime_totals, stop_totals = countCrimeStopInTimeWindow(
            crimes_count, stops_count, time_frame, time_window)
        row[column] = np.corrcoef(crime_totals, stop_totals)[0, 1]

    # Explicit column list keeps the output order stable.
    columns = ["Beat"] + [column for column, _, _ in pairings]
    return pd.DataFrame([row], columns=columns)

In [None]:
# print(mm_cor, mtw_cor, mw_cor, twtw_cor, tww_cor, twd_cor, ww_cor, wwd_cor)

In [None]:
def correlation_for_each_zip(crime_count,stop_count):
    """Compute the stop/crime correlations separately for every beat.

    Parameters
    ----------
    crime_count, stop_count : pandas.DataFrame
        Frames with ``Beat``, ``Date`` and ``count`` columns.

    Returns
    -------
    pandas.DataFrame
        One row per beat found in ``crime_count``, as produced by
        ``get_all_correlations``; empty when there are no beats. The number
        of rows is also printed.
    """
    per_beat = []
    for beat in crime_count['Beat'].unique():
        # Keep only this beat's rows, indexed by date, for each source.
        beat_crimes = crime_count.loc[crime_count.Beat == beat, ['Date', 'count']]
        beat_crimes = beat_crimes.set_index(['Date'])
        beat_stops = stop_count.loc[stop_count.Beat == beat, ['Date', 'count']]
        beat_stops = beat_stops.set_index(['Date'])
        per_beat.append(get_all_correlations(beat_crimes, beat_stops, beat))

    # Concatenate once instead of growing a frame inside the loop.
    if per_beat:
        final_df = pd.concat(per_beat, ignore_index=True)
    else:
        final_df = pd.DataFrame()

    print(len(final_df))
    return final_df
        
        
        
        

In [None]:
# No leading zero on the month: `07` is an octal literal on Python 2 and a
# syntax error on Python 3.
a = dt.datetime(2014, 7, 31)

In [None]:
a

In [None]:
def aggregateDateByTimeWin(stops, crime, startDate, endDate):
    """Total crimes before, during and after each stop interval.

    For every row of ``stops`` the interval ``[start, end)`` is read from the
    ``startDate`` and ``endDate`` columns and its length in whole days is used
    as the size of the comparison windows:

    * before -- crimes in ``[start - length, start)``
    * during -- crimes in ``[start, end)``
    * after  -- crimes in ``[end, end + length)``

    Parameters
    ----------
    stops : pandas.DataFrame
        One row per stop interval, with datetime columns ``startDate`` and
        ``endDate``.
    crime : pandas.DataFrame or pandas.Series
        Crime counts indexed by date; the index must be sorted ascending
        because rows are located with ``searchsorted``.
    startDate, endDate : str
        Names of the interval boundary columns in ``stops``.

    Returns
    -------
    (before, during, after) : tuple of three lists of int, one entry per row
        of ``stops``.
    """
    before = []
    during = []
    after = []
    for i in range(len(stops)):
        start = stops[startDate].iloc[i]
        end = stops[endDate].iloc[i]
        duration = dt.timedelta(days=(end - start).days)

        lastStart = crime.index.searchsorted(start - duration)
        currStart = crime.index.searchsorted(start)
        currEnd = crime.index.searchsorted(end)
        nextEnd = crime.index.searchsorted(end + duration)

        # Positional, end-exclusive slices: each row falls in one window only.
        before.append(int(crime.iloc[lastStart:currStart].values.sum()))
        during.append(int(crime.iloc[currStart:currEnd].values.sum()))
        after.append(int(crime.iloc[currEnd:nextEnd].values.sum()))
    return (before, during, after)