## San Francisco Crime Classification

In [1]:
#import lib
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; train_test_split now lives in sklearn.model_selection. Try the current
# location first and fall back so old installations keep working.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
from sklearn import preprocessing
from sklearn.metrics import log_loss

In [3]:
# Read the train and test CSVs with pandas, parsing the 'Dates' column
# into datetimes so the .dt accessors used below are available.
train, test = (
    pd.read_csv(csv_name, parse_dates=['Dates'])
    for csv_name in ('train.csv', 'test.csv')
)

##### Preprocessing

In [4]:
#train.head()

In [5]:
# Map each Category string to an integer class id.
# The fitted encoder is kept: label_cat.classes_ names the probability
# columns when the submission file is written.
label_cat = preprocessing.LabelEncoder().fit(train.Category)
crime = label_cat.transform(train.Category)

In [6]:
# One-hot encode the two categorical columns ...
days = pd.get_dummies(train.DayOfWeek)
district = pd.get_dummies(train.PdDistrict)
# ... and the hour / year parts of the timestamp (column labels are the
# raw hour and year values).
hour = pd.get_dummies(train.Dates.dt.hour)
year = pd.get_dummies(train.Dates.dt.year)

In [7]:
# One-hot encode the month and relabel the 12 columns as 24..35
# (presumably so they do not clash with the hour dummies, whose labels
# are the hour values themselves).
month = pd.get_dummies(train.Dates.dt.month)
month.columns = list(range(24, 36))

In [8]:
# One-hot encode the day of the month and relabel the 31 columns as 41..71.
#date.unique()
date = pd.get_dummies(train.Dates.dt.day)
date.columns = list(range(41, 41 + 31))

In [9]:
#loc_X = np.array(train.X)
#loc_Y = np.array(train.Y)

In [10]:
#loc_mul = np.multiply(loc_X,loc_Y)
#loc_sum = np.add(loc_X,loc_Y)
#loc = pd.DataFrame(np.divide(loc_mul,loc_sum))
#loc.columns = ['loc']

In [11]:
#intersection

inter = train.Address.apply(lambda x: 1 if '/' in x else 0)

# crime at night or not

crime_time = train.Dates.dt.hour.apply(lambda x: 1 if (x>22) and (x<6) else 0)

# weekday/weekend crime

crime_day = train.Dates.dt.dayofweek.apply(lambda x: 1 if (x==5) or (x==6) else 0)

In [22]:
#train_data = pd.concat([hour, days, month, district, loc], axis=1)
train_data = pd.concat([hour, date, days, month, year, district, inter, train.X, train.Y, crime_time, crime_day], axis=1)
train_data['crime'] = crime

In [24]:
train_data.columns = list(train_data.columns)[:-3]+['crime_time','crime_day','crime']

In [25]:
train_data.columns

Index([            0,             1,             2,             3,
                   4,             5,             6,             7,
                   8,             9,
       ...
         u'RICHMOND',   u'SOUTHERN',    u'TARAVAL', u'TENDERLOIN',
          u'Address',          u'X',          u'Y', u'crime_time',
        u'crime_day',      u'crime'],
      dtype='object', length=103)

In [26]:
#for test data
days = pd.get_dummies(test.DayOfWeek)
district = pd.get_dummies(test.PdDistrict)
hour = test.Dates.dt.hour
hour = pd.get_dummies(hour)  
year = test.Dates.dt.year
year = pd.get_dummies(year)
month = test.Dates.dt.month
month = pd.get_dummies(month)
month.columns = [24,25,26,27,28,29,30,31,32,33,34,35]

In [27]:
date = test.Dates.dt.day
date = pd.get_dummies(date)
date.columns = list(range(41,72))

#intersection

inter = test.Address.apply(lambda x: 1 if '/' in x else 0)

# crime at night or not

crime_time = test.Dates.dt.hour.apply(lambda x: 1 if (x>22) and (x<6) else 0)

# weekday/weekend crime

crime_day = test.Dates.dt.dayofweek.apply(lambda x: 1 if (x==5) or (x==6) else 0)

In [28]:
'''
loc_X = np.array(test.X)
loc_Y = np.array(test.Y)
loc_mul = np.multiply(loc_X,loc_Y)
loc_sum = np.add(loc_X,loc_Y)
loc = pd.DataFrame(np.divide(loc_mul,loc_sum))
loc.columns = ['loc']

test_data = pd.concat([hour, days, month, district, loc], axis=1)

test_data = pd.concat([hour, date, days, month, year, district, test.X, test.Y], axis=1)
'''

# Assemble the test table in the same column order as train_data (without the
# label), then copy train_data's column labels onto it by position.
test_data = pd.concat(
    [hour, date, days, month, year, district,
     inter, test.X, test.Y,
     crime_time, crime_day],
    axis=1,
)

test_data.columns = list(train_data.columns)[:-3] + ['crime_time', 'crime_day']

In [29]:
# Every column except the last one ('crime', the label) is a model input.
features = list(train_data.columns[:-1])


In [32]:

# 60% of the rows go to training, the remainder to validation.
# A fixed random_state makes the split identical on every run, so the
# validation scores of different models are compared on the same rows.
training, validation = train_test_split(train_data, train_size=.60, random_state=0)

In [33]:
from sklearn.naive_bayes import BernoulliNB

# Baseline: Bernoulli naive Bayes fitted on the training split; the cell's
# value is the multi-class log loss on the validation split.
model = BernoulliNB().fit(training[features], training['crime'])
predicted = np.array(model.predict_proba(validation[features]))
log_loss(validation['crime'], predicted)

2.5218167199386596

In [34]:
from sklearn.ensemble import RandomForestClassifier

In [35]:
# Fit a random forest on the training split and report accuracy and log loss
# on the validation split.
model = RandomForestClassifier(max_depth=18,min_samples_split=3,n_estimators=22,max_features=34)
model.fit(training[features], training['crime'])

from sklearn.metrics import accuracy_score
predict = model.predict(validation[features])
# accuracy_score is documented as (y_true, y_pred). The print calls take one
# pre-formatted string so they give the same text on Python 2 and Python 3.
print("accuracy score : %s" % accuracy_score(validation['crime'], predict))

predicted = np.array(model.predict_proba(validation[features]))
print("log loss :  %s" % log_loss(validation['crime'], predicted))

accuracy score : 0.293482717385
log loss :  2.37828938819


In [36]:
# Refit the same forest configuration on the whole training table; this is the
# model used for the test-set predictions.
model = RandomForestClassifier(
    n_estimators=22,
    max_depth=18,
    max_features=34,
    min_samples_split=3,
)
model.fit(train_data[features], train_data['crime'])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=18, max_features=34, max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=3,
            min_weight_fraction_leaf=0.0, n_estimators=22, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [None]:
# Class-probability matrix for the test rows (one column per fitted class).
predicted = model.predict_proba(test_data.loc[:, features])

In [None]:
# Write the submission: one probability column per crime category (named via
# the label encoder) and the row index saved as the 'Id' column.
result = pd.DataFrame(data=predicted, columns=label_cat.classes_)
result.to_csv('testResult2.csv', index_label='Id', index=True)