In [175]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error

In [61]:
# This cell was ran once and the data was saved into a csv
# results_ml = client.get("wg3w-h783", limit=500000, order="incident_date DESC")
# df = pd.DataFrame.from_records(results_ml)
# df.to_csv("sfpd_reports.csv", index=False)

In [62]:
df = pd.read_csv("sfpd_reports.csv")

First we make remove any rows with missing data:

In [63]:
df['incident_datetime'] = pd.to_datetime(df['incident_datetime'])
df = df[~df['incident_category'].isnull()]
df = df[~df['latitude'].isnull()]
df = df[~df['police_district'].isnull()]
df = df[["incident_datetime", "incident_day_of_week", "police_district", "incident_category"]]

We want to predict the daily crime rate so we group the response by date. In other words, we resample the dataframe into one day bins.

In [64]:
group_by_pd = df.copy()
group_by_pd["Count"] = 1
group_by_pd = group_by_pd.set_index(group_by_pd["incident_datetime"]).drop(columns=["incident_datetime"])
group_by_pd = group_by_pd.groupby(['police_district']).resample('1D').sum()
group_by_pd = group_by_pd.reset_index()

In [65]:
X = group_by_pd.copy()

## Feature Engineering

Here, we one-hot encode the police district feature and create a feature for both the day and month:

In [66]:
X = pd.concat([X, pd.get_dummies(X["police_district"])], axis=1).drop(columns=["police_district"])

In [67]:
# X['year'] = pd.to_datetime(X['incident_datetime']).dt.year
X['month'] = pd.to_datetime(X['incident_datetime']).dt.month
X['day'] = pd.to_datetime(X['incident_datetime']).dt.day

Given that the latest incidents will be approved later in the future we cannot use the last days as training data.

In [68]:
X = X.iloc[:-10]

In [69]:
y = X[["Count"]]
X = X.drop(['incident_datetime', "Count"], axis=1)

In [70]:
X.shape

(12250, 13)

In [71]:
from sklearn.model_selection import train_test_split

In [72]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42)

In [73]:
X_train

Unnamed: 0,Bayview,Central,Ingleside,Mission,Northern,Out of SF,Park,Richmond,Southern,Taraval,Tenderloin,month,day
4738,0,0,0,0,1,0,0,0,0,0,0,10,6
10664,0,0,0,0,0,0,0,0,0,1,0,9,27
6286,0,0,0,0,0,1,0,0,0,0,0,12,13
4785,0,0,0,0,1,0,0,0,0,0,0,11,22
9858,0,0,0,0,0,0,0,0,1,0,0,8,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
11964,0,0,0,0,0,0,0,0,0,0,1,3,30
5191,0,0,0,0,1,0,0,0,0,0,0,1,2
5390,0,0,0,0,1,0,0,0,0,0,0,7,19
860,1,0,0,0,0,0,0,0,0,0,0,5,10


## Training a DT Regressor

Here we use the scikit-learn implementation to train a DT model.

In [74]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [203]:
scoring = {'MAE': 'neg_mean_absolute_error', 'MSE': 'neg_mean_squared_error'}

In [204]:
dtr = DecisionTreeRegressor()


# Parameters of pipelines can be set using ‘__’ separated parameter names:
parameters_dt = {
    'max_depth':np.linspace(1, 20, 20).astype(int), 
    'min_samples_split':[2, 3, 4]
}


search_dt = GridSearchCV(dtr, parameters_dt, scoring=scoring, refit='MAE', n_jobs=-1, cv=None, return_train_score=True)
search_dt.fit(X_train, y_train)
# search_dt.fit(X, y)
print("Best parameter (CV score=%0.3f):" % search_dt.best_score_)
print(search_dt.best_params_)

Best parameter (CV score=-7.846):
{'max_depth': 9, 'min_samples_split': 4}


In [205]:
DT_RESULTS = pd.DataFrame(search_dt.cv_results_)

In [206]:
mae_train = mean_absolute_error(y_train, search_dt.predict(X_train))
mae_test = mean_absolute_error(y_test, search_dt.predict(X_test))
print("Train MAE: {}".format(mae_train))
print("Test MAE: {}\n".format(mae_test))

Train MAE: 7.453653117176461
Test MAE: 7.862959066077534



## Train a SVM Model

In [84]:
from sklearn.svm import SVR

In [170]:
svr = SVR()
std = StandardScaler()

pipe_svr = Pipeline(steps=[('std', std), ('svm', svr)])

# Parameters of pipelines can be set using ‘__’ separated parameter names:
parameters_svr = {
    'svm__kernel':('poly', 'rbf'), 
    'svm__C':[100], 
    "svm__coef0":[3], 
    'svm__degree':[3]}


search_svr = GridSearchCV(pipe_svr, parameters_svr, scoring=scoring, refit='MAE', n_jobs=-1, cv=None, return_train_score=True)
search_svr.fit(X_train, y_train)
# search_svr.fit(X, y)
print("Best parameter (CV score=%0.3f):" % search_svr.best_score_)
print(search_svr.best_params_)

  return f(**kwargs)


Best parameter (CV score=-7.637):
{'svm__C': 100, 'svm__coef0': 3, 'svm__degree': 3, 'svm__kernel': 'poly'}


In [171]:
mae_train = mean_absolute_error(y_train, search_svr.predict(X_train))
mae_test = mean_absolute_error(y_test, search_svr.predict(X_test))
print("Train MAE: {}".format(mae_train))
print("Test MAE: {}\n".format(mae_test))

Train MAE: 7.544628862921769
Test MAE: 7.666044208319964



In [173]:
svr = SVR()
std = StandardScaler()

pipe_svr_all = Pipeline(steps=[('std', std), ('svm', svr)])

# Parameters of pipelines can be set using ‘__’ separated parameter names:
parameters_svr_all = {
    'svm__kernel':('poly', 'rbf'), 
    'svm__C':[1, 50, 100], 
    "svm__coef0":[2, 3, 4, 5], 
    'svm__degree':[2, 3, 4]}


search_svr_all = GridSearchCV(pipe_svr_all, parameters_svr_all, scoring=scoring, refit='MAE', n_jobs=-1, cv=None, return_train_score=True)
search_svr_all.fit(X_train, y_train)
# search_svr_all.fit(X, y)
print("Best parameter (CV score=%0.3f):" % search_svr_all.best_score_)
print(search_svr_all.best_params_)

  return f(**kwargs)


Best parameter (CV score=-7.633):
{'svm__C': 1, 'svm__coef0': 4, 'svm__degree': 4, 'svm__kernel': 'poly'}


In [174]:
mae_train = mean_absolute_error(y_train, search_svr_all.predict(X_train))
mae_test = mean_absolute_error(y_test, search_svr_all.predict(X_test))
print("Train MAE: {}".format(mae_train))
print("Test MAE: {}\n".format(mae_test))

Train MAE: 7.53641311396518
Test MAE: 7.674502378219805



## Training a KNN Model

In [158]:
from sklearn import neighbors

In [159]:
knn = neighbors.KNeighborsRegressor()
std = StandardScaler()

pipe_knn = Pipeline(steps=[('std', std), ('knn', knn)])

# Parameters of pipelines can be set using ‘__’ separated parameter names:
parameters_knn = {
    'knn__weights':('uniform', 'distance'), 
    'knn__n_neighbors':np.linspace(1, 20, 20).astype(int), 
    "knn__p":[1, 2]
}


search_knn = GridSearchCV(pipe_knn, parameters_knn, scoring=scoring, refit='MAE', n_jobs=-1, cv=None, return_train_score=True)
# search_knn.fit(X_train, y_train)
search_knn.fit(X, y)
print("Best parameter (CV score=%0.3f):" % search_knn.best_score_)
print(search_knn.best_params_)

Best parameter (CV score=-14.148):
{'knn__n_neighbors': 20, 'knn__p': 2, 'knn__weights': 'uniform'}


In [160]:
mae_train = mean_absolute_error(y_train, search_knn.predict(X_train))
mae_test = mean_absolute_error(y_test, search_knn.predict(X_test))
print("Train MAE: {}".format(mae_train))
print("Test MAE: {}\n".format(mae_test))

Train MAE: 7.373072886297375
Test MAE: 7.451795918367346



In [209]:
# MAX_DEPTH = []
# TRAIN_MAE = []
# TEST_MAE = []
# MSS = []

# for i in np.linspace(1, 20, 20).astype(int):
#     for mss in [2, 3, 4]:
#         regr_1 = DecisionTreeRegressor(max_depth=i, min_samples_split=mss)
#         regr_1.fit(X_train, y_train)
#         mae_train = mean_absolute_error(y_train, regr_1.predict(X_train))
#         mae_test = mean_absolute_error(y_test, regr_1.predict(X_test))
#         MAX_DEPTH.append(i)
#         MSS.append(mss)
#         TRAIN_MAE.append(mae_train)
#         TEST_MAE.append(mae_test)

# RESULTS_DF = pd.DataFrame({"MaxDepth":MAX_DEPTH, "MSS":MSS, "TrainMAE":TRAIN_MAE, "TestMAE":TEST_MAE})

# RESULTS_DF.sort_values(by="TrainMAE").head()

# RESULTS_DF.sort_values(by="TestMAE").head()

# svr_rbf = SVR(kernel='rbf', C=100, gamma='auto')
# svr_rbf_pipeline = make_pipeline(StandardScaler(), svr_rbf)

# svr_rbf_pipeline.fit(X_train, y_train)

# mae_train = mean_absolute_error(y_train, svr_rbf_pipeline.predict(X_train))
# mae_test = mean_absolute_error(y_test, svr_rbf_pipeline.predict(X_test))
# print("Train MAE: {}".format(mae_train))
# print("Test MAE: {}\n".format(mae_test))

# svr_poly = SVR(kernel='poly', C=100, gamma='auto', degree=3, coef0=3)
# svr_poly_pipeline = make_pipeline(StandardScaler(), svr_poly)

# svr_poly_pipeline.fit(X_train, y_train)

# mae_train = mean_absolute_error(y_train, svr_poly_pipeline.predict(X_train))
# mae_test = mean_absolute_error(y_test, svr_poly_pipeline.predict(X_test))
# print("Train MAE: {}".format(mae_train))
# print("Test MAE: {}\n".format(mae_test))

# knn = neighbors.KNeighborsRegressor()
# std = StandardScaler()

# pipe_knn_comp = Pipeline(steps=[('std', std), ('knn', knn)])

# # Parameters of pipelines can be set using ‘__’ separated parameter names:
# parameters_knn_comp = {
#     'knn__weights':('uniform', 'distance'), 
#     'knn__n_neighbors':np.linspace(1, 20, 20).astype(int), 
# }


# search_knn_comp = GridSearchCV(pipe_knn_comp, parameters_knn_comp, scoring=scoring, refit='MAE', n_jobs=-1, cv=None, return_train_score=True)
# search_knn_comp.fit(X_train, y_train)
# print("Best parameter (CV score=%0.3f):" % search_knn_comp.best_score_)
# print(search_knn_comp.best_params_)

# mae_train = mean_absolute_error(y_train, search_knn_comp.predict(X_train))
# mae_test = mean_absolute_error(y_test, search_knn_comp.predict(X_test))
# print("Train MAE: {}".format(mae_train))
# print("Test MAE: {}\n".format(mae_test))

# K_NUMBER = []
# WEIGHTS = []
# TRAIN_MAE_KNN = []
# TEST_MAE_KNN = []

# for weights in ["uniform", "distance"]:
#     for n_neighbors in range(1, 20):
#         knn = neighbors.KNeighborsRegressor(n_neighbors, weights=weights)
#         knn.fit(X_train, y_train)
        
#         mae_train = mean_absolute_error(y_train, knn.predict(X_train))
#         mae_test = mean_absolute_error(y_test, knn.predict(X_test))
#         K_NUMBER.append(n_neighbors)
#         WEIGHTS.append(weights)
#         TRAIN_MAE_KNN.append(mae_train)
#         TEST_MAE_KNN.append(mae_test)

# RESULTS_KNN = pd.DataFrame({"KNumber":K_NUMBER, "Weights":WEIGHTS, "TrainMAE":TRAIN_MAE_KNN, "TestMAE":TEST_MAE_KNN})

# RESULTS_KNN.sort_values(by="TrainMAE").head()

# RESULTS_KNN.sort_values(by="TestMAE").head()