## San Francisco Crime Classification

In [1]:
#import lib
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
from sklearn.cross_validation import train_test_split
from sklearn import preprocessing
from sklearn.metrics import log_loss

In [3]:
#Load Data with pandas, and parse the first column into datetime

train = pd.read_csv('train.csv', parse_dates = ['Dates'])
test = pd.read_csv('test.csv', parse_dates = ['Dates'])

In [6]:
train.head()

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541


In [4]:
#convert Category(label) to number
label_cat = preprocessing.LabelEncoder()
crime = label_cat.fit_transform(train.Category)

In [5]:
# For Train data

#Date
day = train.Dates.dt.day
month = train.Dates.dt.month
year = train.Dates.dt.year
hour = train.Dates.dt.hour
minute = train.Dates.dt.minute
seconds = train.Dates.dt.second

In [7]:
#intersection

inter = train.Address.apply(lambda x: 1 if '/' in x else 0)

In [8]:
# crime at night or not

crime_time = train.Dates.dt.hour.apply(lambda x: 1 if x>22 and x<6 else 0)

In [9]:
# weekday/weekend crime

crime_day = train.Dates.dt.dayofweek.apply(lambda x: 1 if (x==5) or (x==6) else 0)

In [16]:
# PdDistrict

district = pd.get_dummies(train.PdDistrict)
district.columns

Index([u'BAYVIEW', u'CENTRAL', u'INGLESIDE', u'MISSION', u'NORTHERN', u'PARK',
       u'RICHMOND', u'SOUTHERN', u'TARAVAL', u'TENDERLOIN'],
      dtype='object')

In [19]:
#train_data = pd.concat([hour, days, month, district, loc], axis=1)
train_data = pd.concat([hour, minute, inter, crime_time, crime_day, train.X, train.Y, district], axis=1)
train_data['crime'] = crime
train_data.columns = ['hour', 'minute', 'inter', 'crime_time', 'crime_day', 'X', 'Y']+list(district.columns)+['crime']

In [20]:
training, validation = train_test_split(train_data, train_size=.60)

In [21]:
training.columns

Index([u'hour', u'minute', u'inter', u'crime_time', u'crime_day', u'X', u'Y',
       u'BAYVIEW', u'CENTRAL', u'INGLESIDE', u'MISSION', u'NORTHERN', u'PARK',
       u'RICHMOND', u'SOUTHERN', u'TARAVAL', u'TENDERLOIN', u'crime'],
      dtype='object')

In [22]:
features = [u'hour', u'minute', u'inter', u'crime_time', u'crime_day', u'X', u'Y',
       u'BAYVIEW', u'CENTRAL', u'INGLESIDE', u'MISSION', u'NORTHERN', u'PARK',
       u'RICHMOND', u'SOUTHERN', u'TARAVAL', u'TENDERLOIN']

In [23]:
from sklearn.naive_bayes import BernoulliNB

model = BernoulliNB()

model.fit(training[features], training['crime'])
predicted = np.array(model.predict_proba(validation[features]))
log_loss(validation['crime'], predicted) 

2.5179572079122372

In [24]:
from sklearn.ensemble import RandomForestClassifier

In [25]:
model = RandomForestClassifier(max_depth=18,min_samples_split=3,n_estimators=22)
model.fit(training[features], training['crime'])

from sklearn.metrics import accuracy_score
predict = model.predict(validation[features])
print "accuracy score :",accuracy_score(predict,validation['crime'])

predicted = np.array(model.predict_proba(validation[features]))
print "log loss : ",log_loss(validation['crime'], predicted) 

accuracy score : 0.302687774045
log loss :  2.49293813114
