## Setup

In [153]:
# Imports
from os import path
from scipy import stats
from sklearn.linear_model import LinearRegression
from statsmodels.compat import lzip
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

import os
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm

%matplotlib inline

## Load the Frames

In [154]:
# Read the bike-counter and police-call CSVs for both sites.
# The tuple order below must match the order of the names being unpacked.
fremont_bikes, fremont_calls, greenway_bikes, greenway_calls = map(
    pd.read_csv,
    (
        'fremont_bikes.csv',
        'fremont_calls.csv',
        'greenway_bikes.csv',
        'greenway_calls.csv',
    ),
)

## Hourly Police Calls vs. Bike Traffic

In [159]:
# unique_final_call_types = fremont_calls['final_call_type'].unique()
# print(unique_final_call_types)

In [162]:
# Police calls: parse the queued and arrival timestamps, then bucket each call
# by the hour of day taken from its arrival time.
for calls_frame in (fremont_calls, greenway_calls):
    calls_frame['call_time'] = pd.to_datetime(calls_frame['time_queued'])
    arrival = pd.to_datetime(calls_frame['arrived_time'])
    calls_frame['arrived_time'] = arrival
    calls_frame['hour'] = arrival.dt.hour

# Bike counters: bucket each reading by the hour of day of its timestamp.
for bikes_frame in (fremont_bikes, greenway_bikes):
    bikes_frame['hour'] = pd.to_datetime(bikes_frame['time']).dt.hour

In [163]:
# Substrings that mark a `final_call_type` as a violent crime.
violent_crime_keywords = [
    'HOMICIDE', 'ASSAULT', 'ROBBERY', 'RAPE', 'KIDNAP', 'SHOTS',
    'FIGHT - PHYSICAL', 'ARMED', 'WEAPON, PERSON WITH - GUN', 'THREATS',
    'BOMB', 'CARJACKING', 'PERSON SHOT OR SHOT AT',
]

# Build the alternation pattern once and reuse it for both sites.
violent_crime_pattern = '|'.join(violent_crime_keywords)

for calls_frame in (fremont_calls, greenway_calls):
    call_type = calls_frame['final_call_type']

    # na=False keeps the flag a clean boolean column: without it, a missing
    # call type yields NaN, which turns the column into object dtype and
    # propagates NaN into the regular-crime flag and the hourly sums.
    calls_frame['is_violent_crime'] = call_type.str.contains(
        violent_crime_pattern, case=False, na=False
    )

    # A call is "regular" only when its type is known and not violent, so calls
    # with a missing type are counted in neither category. Stored as 0/1
    # integers so that a groupby 'sum' yields a count.
    calls_frame['is_regular_crime'] = (
        call_type.notna() & ~calls_frame['is_violent_crime']
    ).astype('int64')

In [164]:
# Hourly summary of the Fremont police calls, one row per hour of day.
# Named aggregation gives each output column its final name directly.
police_calls_summary = (
    fremont_calls
    .groupby('hour')
    .agg(
        # Amount of police calls
        no_of_calls=('cad_num', 'count'),
        # Most common police response
        most_common_type=('final_call_type', lambda types: types.value_counts().idxmax()),
        # Number of violent crime responses
        no_of_violent_crimes=('is_violent_crime', 'sum'),
        # Number of regular crime responses
        no_of_regular_crimes=('is_regular_crime', 'sum'),
    )
    .reset_index()
)

In [165]:
# Total Fremont bike traffic per hour of day, as a two-column frame
# ('hour', 'total_bike').
bike_traffic_summary = (
    fremont_bikes
    .groupby('hour')['total']
    .sum()
    .rename('total_bike')
    .reset_index()
)

In [166]:
# Join the two hourly summaries on the hour of day. The join is an inner join,
# so only hours present in both summaries are kept.
merged_data = police_calls_summary.merge(bike_traffic_summary, on='hour', how='inner')

# Pearson correlation of hourly bike traffic against each hourly call count.
correlation_police_calls_bike = merged_data['no_of_calls'].corr(
    merged_data['total_bike'], method='pearson'
)
correlation_bike_violent_crime = merged_data['no_of_violent_crimes'].corr(
    merged_data['total_bike'], method='pearson'
)
correlation_bike_regular_crime = merged_data['no_of_regular_crimes'].corr(
    merged_data['total_bike'], method='pearson'
)

In [167]:
# Print the merged hourly summary as plain text for a quick visual check.
print(merged_data) 

    hour  no_of_calls                          most_common_type  \
0      0        10462  --SUSPICIOUS CIRCUM. - SUSPICIOUS PERSON   
1      1         9853  --SUSPICIOUS CIRCUM. - SUSPICIOUS PERSON   
2      2         7911  --SUSPICIOUS CIRCUM. - SUSPICIOUS PERSON   
3      3         4880  --SUSPICIOUS CIRCUM. - SUSPICIOUS PERSON   
4      4        11623  --SUSPICIOUS CIRCUM. - SUSPICIOUS PERSON   
5      5         8197  --SUSPICIOUS CIRCUM. - SUSPICIOUS PERSON   
6      6         7855  --SUSPICIOUS CIRCUM. - SUSPICIOUS PERSON   
7      7        12353                      -OFF DUTY EMPLOYMENT   
8      8        12583    --TRAFFIC - MV COLLISION INVESTIGATION   
9      9        14499              --TRAFFIC - MOVING VIOLATION   
10    10        14901              --TRAFFIC - MOVING VIOLATION   
11    11        13227              --TRAFFIC - MOVING VIOLATION   
12    12        22182              --TRAFFIC - MOVING VIOLATION   
13    13        18862              --TRAFFIC - MOVING VIOLATIO

In [168]:
# Report the three correlation coefficients, one per line.
print(
    f'Correlation between police calls and bike traffic: {correlation_police_calls_bike}',
    f'Correlation between bike traffic and violent crime: {correlation_bike_violent_crime}',
    f'Correlation between bike traffic and regular crime: {correlation_bike_regular_crime}',
    sep='\n',
)

Correlation between police calls and bike traffic: 0.37413954576263747
Correlation between bike traffic and violent crime: 0.32559956583172833
Correlation between bike traffic and regular crime: 0.3746957728911043


# What's Next

I think a good next step is to summarize the police call data by hour. We have hourly reports for the bike counters, so we could reduce the police call data to hourly reports as well, so that the two can be compared directly.

This might be something like summarizing the number of police calls in the hour, the most common type of police response in that hour, the percentage of violent crimes in that hour, etc. Then we can look for correlations. For example, we could create a new data frame that has the number of traffic-related police responses per hour, and see if there's a correlation between traffic responses and bike traffic.

## Ideas for hourly reports

* Most common type of police response in that hour (this would be a classification problem)
* Call time to response time delay (would have to parse the original time and arrival time; I just did arrival time in this notebook)
* Number of violent crime responses in that hour
* Number of traffic related calls in that hour
* Number of calls overall in that area