# Putting Purpose to the Kaggle San Francisco Crime Data
## Keenan Zucker and Joe Sutker

We used the data from the Kaggle San Francisco Crime prediction contest both to compete in the contest and to predict the safety of certain places, possibly as a tool for tourists.

For our submission for the contest, we used the following code:

In [None]:
## Part 1: Grab all of the data
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.naive_bayes import BernoulliNB

try:
    # Current home of train_test_split (scikit-learn >= 0.18).
    from sklearn.model_selection import train_test_split
except ImportError:
    # Legacy location; this module was removed in scikit-learn 0.20.
    from sklearn.cross_validation import train_test_split
 
%matplotlib inline

# Read the training and test CSVs, parsing the 'Dates' column as datetimes.
train, test = (
    pd.read_csv(csv_path, parse_dates=['Dates'])
    for csv_path in ('train.csv', 'test.csv')
)

# Discard training rows whose X or Y value lies more than three standard
# deviations from that column's mean. The filters run one after the other,
# so the Y mean/std are computed on the rows that survived the X filter.
x_offset = (train.X - train.X.mean()).abs()
train = train[x_offset <= 3 * train.X.std()]
y_offset = (train.Y - train.Y.mean()).abs()
train = train[y_offset <= 3 * train.Y.std()]

## Part 2: Process the data
# Encode the crime category strings as integer class labels. le_crime is kept
# so the labels can be mapped back to category names when writing results.
le_crime = preprocessing.LabelEncoder()
crime = le_crime.fit_transform(train.Category)

# One-hot encode weekday, police district and hour of day for the training rows.
days = pd.get_dummies(train.DayOfWeek)
district = pd.get_dummies(train.PdDistrict)
hour = pd.get_dummies(train.Dates.dt.hour)

# Training frame: indicator columns (hour, weekday, district) plus the target.
train_data = pd.concat([hour, days, district], axis=1)
train_data['crime'] = crime

# Same encoding for the test rows.
days = pd.get_dummies(test.DayOfWeek)
district = pd.get_dummies(test.PdDistrict)
hour = pd.get_dummies(test.Dates.dt.hour)

test_data = pd.concat([hour, days, district], axis=1)

# get_dummies only creates a column for values that actually occur, so the test
# frame can lack (or add) columns relative to the training frame. Align it to
# the training feature columns: missing indicators become all-zero columns and
# columns the model never saw are dropped.
test_data = test_data.reindex(
    columns=train_data.columns.drop('crime'), fill_value=0
)

# NOTE: the 60/40 train_test_split that used to end this part was removed; its
# result was overwritten by the split in Part 3 before anything read it.

## Part 3: Train the model
# Indicator columns used as model inputs: weekdays first, then police districts.
weekday_columns = [
    'Friday', 'Monday', 'Saturday', 'Sunday', 'Thursday', 'Tuesday',
    'Wednesday',
]
district_columns = [
    'BAYVIEW', 'CENTRAL', 'INGLESIDE', 'MISSION', 'NORTHERN', 'PARK',
    'RICHMOND', 'SOUTHERN', 'TARAVAL', 'TENDERLOIN',
]
features = weekday_columns + district_columns

# The hour-of-day indicator columns are labelled with the integers 0..23;
# append them after the weekday and district columns.
features2 = list(range(24))
features = [*features, *features2]

# Naive Bayes was the model chosen after trying several models. This 50/50
# split provides the training/validation halves for that kind of comparison;
# the submission code that follows fits on the full train_data instead.
training, validation = train_test_split(train_data, train_size=0.5)

## Part 4: Results
# The selected columns mix string labels (weekdays, districts) with integer
# labels (hours 0..23). scikit-learn >= 1.2 raises a TypeError when a
# DataFrame's column labels are not all strings, so stringify the labels on
# the frames handed to the model. The values themselves are unchanged.
train_features = train_data[features].rename(columns=str)
test_features = test_data[features].rename(columns=str)

# Fit Bernoulli Naive Bayes on every training row and predict the per-class
# probabilities for each test row.
model = BernoulliNB()
model.fit(train_features, train_data['crime'])
predicted = model.predict_proba(test_features)

# Write results: one probability column per crime category, named with the
# original category strings, and the row index written out as the 'Id' column.
# NOTE(review): assumes the test rows are in Id order starting at 0 - confirm.
result = pd.DataFrame(predicted, columns=le_crime.classes_)
result.to_csv('testResultBNB.csv', index=True, index_label='Id')

This code, which uses the Naive Bayes model to make predictions based on the day of the week, the district, and the hour, earned our submission a score of 2.58376, placing us at ~530th out of ~1250.