In [2]:
# make sure to install this package before running:
#!pip install sodapy
#!pip install scikit-learn
#!pip install statsmodels
#!pip install folium
#!pip install imblearn

##if the libraries are not compatible, you might get an error --
#ensure compatible version of the two:
#!pip install -U scikit-learn imbalanced-learn

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.formula.api as smf 
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier
import folium
from folium.plugins import HeatMap
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score,  confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from imblearn.over_sampling import SMOTE

# San Francisco Police Crime Data 
# Access the dataset via API

Currently works without a token. It queries only 2000 records. We need to connect with a token so we can query every night. This dataset is updated every night at 10pm.

In [2]:
from sodapy import Socrata

# Unauthenticated client only works with public data sets. Note 'None'
# in place of application token, and no username or password:
client = Socrata("data.sfgov.org", None)

# First 2000 results, returned as JSON from API / converted to Python list of
# dictionaries by sodapy.
SFC = client.get("wg3w-h783", limit=2000)

# Convert to pandas DataFrame
SFC_df = pd.DataFrame.from_records(SFC)



In [3]:
# Load the SF data
sf_file_path = 'data/SF_Crime_Reports.csv'
sf_data = pd.read_csv(sf_file_path)

In [4]:
sf_data

Unnamed: 0,Incident Datetime,Incident Date,Incident Time,Incident Year,Incident Day of Week,Report Datetime,Row ID,Incident ID,Incident Number,CAD Number,...,Longitude,Point,Neighborhoods,ESNCAG - Boundary File,Central Market/Tenderloin Boundary Polygon - Updated,Civic Center Harm Reduction Project Boundary,HSOC Zones as of 2018-06-05,Invest In Neighborhoods (IIN) Areas,Current Supervisor Districts,Current Police Districts
0,2023/03/13 11:41:00 PM,2023/03/13,23:41,2023,Monday,2023/03/13 11:41:00 PM,125373607041,1253736,230167874,,...,,,,,,,,,,
1,2023/03/01 05:02:00 AM,2023/03/01,05:02,2023,Wednesday,2023/03/11 03:40:00 PM,125379506374,1253795,236046151,,...,,,,,,,,,,
2,2023/03/13 01:16:00 PM,2023/03/13,13:16,2023,Monday,2023/03/13 01:17:00 PM,125357107041,1253571,220343896,,...,,,,,,,,,,
3,2023/03/13 10:59:00 AM,2023/03/13,10:59,2023,Monday,2023/03/13 11:00:00 AM,125355107041,1253551,230174885,,...,,,,,,,,,,
4,2023/03/14 06:44:00 PM,2023/03/14,18:44,2023,Tuesday,2023/03/14 06:45:00 PM,125402407041,1254024,230176728,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
797679,2023/03/22 01:30:00 PM,2023/03/22,13:30,2023,Wednesday,2023/03/22 01:35:00 PM,125685303401,1256853,230201949,230811847.0,...,-122.406336,POINT (-122.40633623830558 37.75300402962228),54.0,,,,3.0,,2.0,3.0
797680,2023/03/22 12:13:00 AM,2023/03/22,00:13,2023,Wednesday,2023/03/22 12:13:00 AM,125665206371,1256652,230200838,230810020.0,...,-122.440624,POINT (-122.44062361999508 37.78899525864335),102.0,,,,,,6.0,4.0
797681,2023/03/21 09:19:00 PM,2023/03/21,21:19,2023,Tuesday,2023/03/21 10:16:00 PM,125664272000,1256642,230200800,230803734.0,...,-122.452665,POINT (-122.45266472219633 37.70880633652071),66.0,,,,,,1.0,9.0
797682,2023/03/22 03:28:00 PM,2023/03/22,15:28,2023,Wednesday,2023/03/22 05:51:00 PM,125687306303,1256873,230202486,230812680.0,...,-122.405574,POINT (-122.40557374633903 37.790565236529574),19.0,,,,,,3.0,6.0


In [5]:
# Loading data
sf_data = sf_data[['Incident Date', 'Incident Time', 'Incident Year', 'Incident Day of Week', 
                   'Incident Category', 'Police District', 'Latitude', 'Longitude', 
                   'Neighborhoods', 'Resolution']].dropna()

# Create a copy and format 'Incident_Time' and extract 'Day' and 'Month'
df = sf_data.copy()
df[['Month', 'Day']] = df['Incident Date'].str.split('/', expand=True).iloc[:, 1:3].astype(int)
df['Incident Time'] = pd.to_datetime(df['Incident Time'], format='%H:%M')
df['Time_in_Minutes'] = df['Incident Time'].dt.hour * 60 + df['Incident Time'].dt.minute

# One-hot encode selected columns
one_hot_columns = ['Incident Day of Week', 'Incident Category', 'Police District', 'Resolution']
for col in one_hot_columns:
    df = df.join(pd.get_dummies(df[col], prefix=col).astype(int))


# Standardize column names
df.columns = df.columns.str.replace(' ', '_').str.replace('[?,()-]+', '', regex=True)

df


Unnamed: 0,Incident_Date,Incident_Time,Incident_Year,Incident_Day_of_Week,Incident_Category,Police_District,Latitude,Longitude,Neighborhoods,Resolution,...,Police_District_Out_of_SF,Police_District_Park,Police_District_Richmond,Police_District_Southern,Police_District_Taraval,Police_District_Tenderloin,Resolution_Cite_or_Arrest_Adult,Resolution_Exceptional_Adult,Resolution_Open_or_Active,Resolution_Unfounded
11,2022/06/27,1900-01-01 12:00:00,2022,Monday,Lost Property,Central,37.787359,-122.408227,19.0,Open or Active,...,0,0,0,0,0,0,0,0,1,0
13,2023/03/16,1900-01-01 17:30:00,2023,Thursday,Assault,Bayview,37.762290,-122.401324,54.0,Open or Active,...,0,0,0,0,0,0,0,0,1,0
33,2023/03/21,1900-01-01 15:50:00,2023,Tuesday,Non-Criminal,Northern,37.787038,-122.418271,50.0,Open or Active,...,0,0,0,0,0,0,0,0,1,0
61,2021/08/22,1900-01-01 09:40:00,2021,Sunday,Warrant,Northern,37.793977,-122.429804,102.0,Open or Active,...,0,0,0,0,0,0,0,0,1,0
87,2022/07/02,1900-01-01 22:53:00,2022,Saturday,Assault,Bayview,37.719298,-122.390020,88.0,Open or Active,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
797679,2023/03/22,1900-01-01 13:30:00,2023,Wednesday,Robbery,Mission,37.753004,-122.406336,54.0,Cite or Arrest Adult,...,0,0,0,0,0,0,1,0,0,0
797680,2023/03/22,1900-01-01 00:13:00,2023,Wednesday,Larceny Theft,Northern,37.788995,-122.440624,102.0,Cite or Arrest Adult,...,0,0,0,0,0,0,1,0,0,0
797681,2023/03/21,1900-01-01 21:19:00,2023,Tuesday,Non-Criminal,Ingleside,37.708806,-122.452665,66.0,Open or Active,...,0,0,0,0,0,0,0,0,1,0
797682,2023/03/22,1900-01-01 15:28:00,2023,Wednesday,Larceny Theft,Central,37.790565,-122.405574,19.0,Open or Active,...,0,0,0,0,0,0,0,0,1,0


#### Dropping NAs
8% of rows removed; 92% of the original rows kept;

Before:
`797684 rows`

After:
`737791 rows`



# PREDICT WHETHER THE NEXT CRIME WILL BE AN ARREST

In [6]:
# Define time conditions
morning_condition = (300 < df['Time_in_Minutes']) & (df['Time_in_Minutes'] <= 780)
afternoon_condition = (780 < df['Time_in_Minutes']) & (df['Time_in_Minutes'] <= 1140)
night_condition = (0 < df['Time_in_Minutes']) & (df['Time_in_Minutes'] <= 300) | (1140 < df['Time_in_Minutes']) & (df['Time_in_Minutes'] <= 1440)

# Assign values based on time of day
df['Morning'] = np.where(morning_condition, 1, 0)
df['Afternoon'] = np.where(afternoon_condition, 1, 0)
df['Night'] = np.where(night_condition, 1, 0)

# 'Is_Arrest' column
df['Is_Arrest'] = (df['Resolution'] == 'Cite or Arrest Adult').astype(int)

# Calculate the rolling 7-day sum and the previous day's arrests
daily_arrests = df.groupby('Incident_Date')['Is_Arrest'].sum()
rolling_arrests = daily_arrests.rolling(window=7, min_periods=1).sum()
previous_day_arrests = daily_arrests.shift(1).fillna(0).astype(int)

# Map the rolling sums and the previous day's arrests back to the DataFrame
df['Number_Arrests_7days'] = df['Incident_Date'].map(rolling_arrests).astype(int)
df['Number_Arrests_Previous_Day'] = df['Incident_Date'].map(previous_day_arrests)

df.head()


Unnamed: 0,Incident_Date,Incident_Time,Incident_Year,Incident_Day_of_Week,Incident_Category,Police_District,Latitude,Longitude,Neighborhoods,Resolution,...,Resolution_Cite_or_Arrest_Adult,Resolution_Exceptional_Adult,Resolution_Open_or_Active,Resolution_Unfounded,Morning,Afternoon,Night,Is_Arrest,Number_Arrests_7days,Number_Arrests_Previous_Day
11,2022/06/27,1900-01-01 12:00:00,2022,Monday,Lost Property,Central,37.787359,-122.408227,19.0,Open or Active,...,0,0,1,0,1,0,0,0,349,41
13,2023/03/16,1900-01-01 17:30:00,2023,Thursday,Assault,Bayview,37.76229,-122.401324,54.0,Open or Active,...,0,0,1,0,0,1,0,0,475,95
33,2023/03/21,1900-01-01 15:50:00,2023,Tuesday,Non-Criminal,Northern,37.787038,-122.418271,50.0,Open or Active,...,0,0,1,0,0,1,0,0,484,62
61,2021/08/22,1900-01-01 09:40:00,2021,Sunday,Warrant,Northern,37.793977,-122.429804,102.0,Open or Active,...,0,0,1,0,1,0,0,0,371,47
87,2022/07/02,1900-01-01 22:53:00,2022,Saturday,Assault,Bayview,37.719298,-122.39002,88.0,Open or Active,...,0,0,1,0,0,0,1,0,415,65


#### Write csv for next model

In [12]:
# Write DataFrame 'df' to a CSV file
df.to_csv('data/SF_Crime_Report_cleaned.csv', index=False)


In [13]:
df.columns

Index(['Incident_Date', 'Incident_Time', 'Incident_Year',
       'Incident_Day_of_Week', 'Incident_Category', 'Police_District',
       'Latitude', 'Longitude', 'Neighborhoods', 'Resolution', 'Month', 'Day',
       'Time_in_Minutes', 'Incident_Day_of_Week_Friday',
       'Incident_Day_of_Week_Monday', 'Incident_Day_of_Week_Saturday',
       'Incident_Day_of_Week_Sunday', 'Incident_Day_of_Week_Thursday',
       'Incident_Day_of_Week_Tuesday', 'Incident_Day_of_Week_Wednesday',
       'Incident_Category_Arson', 'Incident_Category_Assault',
       'Incident_Category_Burglary', 'Incident_Category_Case_Closure',
       'Incident_Category_Civil_Sidewalks',
       'Incident_Category_Courtesy_Report',
       'Incident_Category_Disorderly_Conduct',
       'Incident_Category_Drug_Offense', 'Incident_Category_Drug_Violation',
       'Incident_Category_Embezzlement', 'Incident_Category_Fire_Report',
       'Incident_Category_Forgery_And_Counterfeiting',
       'Incident_Category_Fraud', 'Incident_C

In [14]:
predict_arrest_df = df.drop(columns= ['Incident_Date', 'Incident_Time', 'Incident_Year',
       'Incident_Day_of_Week', 'Incident_Category', 'Police_District','Neighborhoods',
       'Resolution','Day','Time_in_Minutes',  'Incident_Category_Motor_Vehicle_Theft','Resolution_Cite_or_Arrest_Adult', 
       'Resolution_Exceptional_Adult','Resolution_Open_or_Active', 'Resolution_Unfounded'])

# Split Data Into Train and Test
Using random state 88. Test contains 30% of data.

In [15]:

sf_data_arrest_train, sf_data_arrest_test = train_test_split(predict_arrest_df, test_size=0.2, random_state=88)


## RANDOM FOREST, PREDICTION COLUMN: ARREST

In [25]:
# Prepare the features (X) and target (y)
X_train = sf_data_arrest_train.drop(columns='Is_Arrest')
y_train = sf_data_arrest_train['Is_Arrest']

X_test = sf_data_arrest_test.drop(columns='Is_Arrest')
y_test = sf_data_arrest_test['Is_Arrest']

#Define parameters
param_grid_rf = {
    'n_estimators': [100, 200],
    'max_depth': [5, 10, 20, None],
    'random_state': [88]
}

# Create a Random Forest Classifier
rf_classifier_arrest = RandomForestClassifier(random_state=42, n_jobs=-1)

# Create GridSearchCV object
grid_search_rf = GridSearchCV(rf_classifier_arrest, param_grid_rf, cv=5, n_jobs=-1)

# Train the model using GridSearchCV
grid_search_rf.fit(X_train, y_train)

# Predict on test data using the best model
y_pred_arrest = grid_search_rf.predict(X_test)

# Evaluate the best model
print("Random Forest Classifier with GridSearchCV:")
print(f"Best Parameters: {grid_search_rf.best_params_}")
print(f"Accuracy: {accuracy_score(y_test, y_pred_arrest)}")
print("Classification Report:")
print(classification_report(y_test, y_pred_arrest))

# To inspect feature importances of the best model
feature_importances = grid_search_rf.best_estimator_.feature_importances_

KeyboardInterrupt: 

## Time takes to run + performance with best parameter
#### Using 10 core CPUs
* max_depth [10], n_estimators = 100; took 23s

Random Forest Classifier with GridSearchCV:
Best Parameters: {'max_depth': 10, 'n_estimators': 100, 'random_state': 88}
Accuracy: 0.8695979235424475
Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.99      0.92    118663
           1       0.93      0.36      0.52     28896

    accuracy                           0.87    147559
    macro avg      0.90      0.68      0.72    147559
    weighted avg   0.88      0.87      0.85    147559



* max_depth [None, 10], n_estimators = 100; took 1m 20s

Random Forest Classifier with GridSearchCV:
Best Parameters: {'max_depth': None, 'n_estimators': 100, 'random_state': 88}
Accuracy: 0.8970513489519446
Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.97      0.94    118663
           1       0.81      0.62      0.70     28896

    accuracy                           0.90    147559
    macro avg      0.86      0.79      0.82    147559
    weighted avg   0.89      0.90      0.89    147559


* max_depth [10], n_estimators = [100, 200]; took: 58 sec

Random Forest Classifier with GridSearchCV:
Best Parameters: {'max_depth': 10, 'n_estimators': 100, 'random_state': 88}
Accuracy: 0.8695979235424475
Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.99      0.92    118663
           1       0.93      0.36      0.52     28896

    accuracy                           0.87    147559
    macro avg      0.90      0.68      0.72    147559
    weighted avg   0.88      0.87      0.85    147559


*  max_depth [10], n_estimators = [100], max_features = [0.2, 0.5]; took: 1min 32 sec

Random Forest Classifier with GridSearchCV:
Best Parameters: {'max_depth': 10, 'max_features': 0.5, 'n_estimators': 100, 'random_state': 88}
Accuracy: 0.8792415237294913
Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.97      0.93    118663
           1       0.82      0.49      0.61     28896

    accuracy                           0.88    147559
    macro avg      0.85      0.73      0.77    147559
    weighted avg   0.87      0.88      0.87    147559

*  max_depth [10], n_estimators = [100],  'bootstrap': [True, False],; took: 44 sec

    Random Forest Classifier with GridSearchCV:
Best Parameters: {'bootstrap': True, 'max_depth': 10, 'n_estimators': 100, 'random_state': 88}
Accuracy: 0.8695979235424475
Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.99      0.92    118663
           1       0.93      0.36      0.52     28896

    accuracy                           0.87    147559
    macro avg      0.90      0.68      0.72    147559
    weighted avg   0.88      0.87      0.85    147559

In [None]:
important_features_arrest = pd.DataFrame({'Feature' : X_train.columns, 
              'Importance score': 100*feature_importances}).round(1)

important_features_arrest.sort_values(by = 'Importance score', ascending=False)