## San Francisco Crime Classification

In [1]:
#import lib
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
from sklearn.cross_validation import train_test_split
from sklearn import preprocessing
from sklearn.metrics import log_loss
from sklearn.naive_bayes import BernoulliNB

In [3]:
# Read the training and test CSVs with pandas; the 'Dates' column is parsed
# into datetime values while loading.
train, test = (
    pd.read_csv(csv_path, parse_dates=['Dates'])
    for csv_path in ('train.csv', 'test.csv')
)

##### Preprocessing

In [4]:
# Peek at the first five rows of the raw training frame.
train.head(n=5)

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541


In [5]:
# Encode the Category strings (the label to predict) as integer class ids.
# The fitted encoder is kept so the ids can be mapped back to category names.
label_cat = preprocessing.LabelEncoder()
label_cat.fit(train['Category'])
crime = label_cat.transform(train['Category'])

In [6]:
# One-hot encode the categorical training features: weekday name, police
# district, and the hour of day extracted from the timestamp.
days = pd.get_dummies(train['DayOfWeek'])
district = pd.get_dummies(train['PdDistrict'])
hour = pd.get_dummies(train['Dates'].dt.hour)

In [7]:
# One-hot encode the calendar month of each training incident.
month = pd.get_dummies(train.Dates.dt.month)
# The hour dummies already occupy the integer column labels 0-23, so the month
# labels (1-12) are shifted by 23 to become 24-35 and avoid a clash.
# The labels are derived from the columns that are actually present instead of
# assigning a fixed 12-item list: each label stays tied to its own month, and
# the assignment no longer fails if some month is missing from the data.
month.columns = [int(m) + 23 for m in month.columns]

In [8]:
# Disabled experiment: extract the raw X / Y coordinate columns as NumPy arrays.
# Nothing active uses these; the feature matrix takes train.X and train.Y directly.
#loc_X = np.array(train.X)
#loc_Y = np.array(train.Y)

In [9]:
# Disabled experiment (builds on the commented-out loc_X / loc_Y arrays):
# collapse the two coordinates into one feature, loc = (X * Y) / (X + Y).
#loc_mul = np.multiply(loc_X,loc_Y)
#loc_sum = np.add(loc_X,loc_Y)
#loc = pd.DataFrame(np.divide(loc_mul,loc_sum))
#loc.columns = ['loc']

In [10]:
# Training design matrix: hour, weekday, month and district dummies followed by
# the raw X / Y coordinates, with the encoded label appended as 'crime'.
# Earlier variant that used the combined `loc` feature instead of X / Y:
#train_data = pd.concat([hour, days, month, district, loc], axis=1)
train_data = pd.concat(
    [hour, days, month, district, train['X'], train['Y']], axis=1
).assign(crime=crime)

In [11]:
# Build the same one-hot features for the test set.
# NOTE: this cell reuses the names days / district / hour / month, so the
# training versions are overwritten here. train_data must already be built, and
# rebuilding it afterwards requires re-running the training feature cells first.
days = pd.get_dummies(test.DayOfWeek)
district = pd.get_dummies(test.PdDistrict)
hour = pd.get_dummies(test.Dates.dt.hour)
month = pd.get_dummies(test.Dates.dt.month)
# Shift the month labels (1-12) by 23 to 24-35 so they do not clash with the
# hour labels 0-23. Deriving the labels from the columns that are present,
# rather than assigning a fixed 12-item list, keeps each label tied to its own
# month and does not fail if a month is absent from the test data.
month.columns = [int(m) + 23 for m in month.columns]

In [12]:
# Earlier variant, kept for reference only: fold X and Y into a single `loc`
# feature, loc = (X * Y) / (X + Y), instead of using the raw coordinates.
#
#   loc_X = np.array(test.X)
#   loc_Y = np.array(test.Y)
#   loc_mul = np.multiply(loc_X,loc_Y)
#   loc_sum = np.add(loc_X,loc_Y)
#   loc = pd.DataFrame(np.divide(loc_mul,loc_sum))
#   loc.columns = ['loc']
#
#   test_data = pd.concat([hour, days, month, district, loc], axis=1)

# Test-set design matrix, assembled column-wise in the same order as train_data
# (hour, weekday, month, district dummies, then the raw X / Y coordinates).
test_data = pd.concat(
    objs=[hour, days, month, district, test['X'], test['Y']],
    axis=1,
)

In [13]:
# Hold out 40% of the labelled rows for validation. Fixing random_state makes
# the split, and therefore the validation log-loss values below, repeatable
# across kernel restarts.
# NOTE(review): train_test_split is imported from sklearn.cross_validation at the
# top of the notebook; that module was removed in later scikit-learn releases
# (the function now lives in sklearn.model_selection), so update the import when upgrading.
training, validation = train_test_split(train_data, train_size=.60, random_state=42)

In [14]:
# Display the column labels of the training split; the feature list defined in
# the next cell mirrors this output without the 'crime' label column.
training.columns

Index([            0,             1,             2,             3,
                   4,             5,             6,             7,
                   8,             9,            10,            11,
                  12,            13,            14,            15,
                  16,            17,            18,            19,
                  20,            21,            22,            23,
           u'Friday',     u'Monday',   u'Saturday',     u'Sunday',
         u'Thursday',    u'Tuesday',  u'Wednesday',            24,
                  25,            26,            27,            28,
                  29,            30,            31,            32,
                  33,            34,            35,    u'BAYVIEW',
          u'CENTRAL',  u'INGLESIDE',    u'MISSION',   u'NORTHERN',
             u'PARK',   u'RICHMOND',   u'SOUTHERN',    u'TARAVAL',
       u'TENDERLOIN',          u'X',          u'Y',      u'crime'],
      dtype='object')

In [15]:
# Column labels fed to the models: every column of the design matrix except the
# 'crime' label. Order: hour dummies (0-23), weekday dummies, month dummies
# (24-35), police-district dummies, then the raw X / Y coordinates.
features = (
    list(range(0, 24))
    + [u'Friday', u'Monday', u'Saturday', u'Sunday',
       u'Thursday', u'Tuesday', u'Wednesday']
    + list(range(24, 36))
    + [u'BAYVIEW', u'CENTRAL', u'INGLESIDE', u'MISSION', u'NORTHERN',
       u'PARK', u'RICHMOND', u'SOUTHERN', u'TARAVAL', u'TENDERLOIN']
    + [u'X', u'Y']
)


In [16]:
# Baseline: Bernoulli naive Bayes fitted on the training split and scored by
# multi-class log loss on the validation split (lower is better).
model = BernoulliNB().fit(training[features], training['crime'])
predicted = np.array(model.predict_proba(validation[features]))
log_loss(y_true=validation['crime'], y_pred=predicted)

2.5812800984864954

In [17]:
from sklearn.ensemble import RandomForestClassifier

In [31]:
# Random forest fitted on the training split and scored by multi-class log loss
# on the validation split. random_state fixes the forest's internal randomness
# so the reported score is repeatable from run to run.
model = RandomForestClassifier(
    max_depth=16,
    min_samples_split=3,
    n_estimators=15,
    max_features=25,
    random_state=42,
)
model.fit(training[features], training['crime'])
predicted = np.array(model.predict_proba(validation[features]))
log_loss(validation['crime'], predicted)

2.4361693006755387

In [32]:
# Final model: same hyper-parameters as the validation run, refitted on all of
# the labelled data, then used to predict class probabilities for the test set.
# random_state fixes the forest's internal randomness so the submission can be
# regenerated exactly.
model = RandomForestClassifier(
    max_depth=16,
    min_samples_split=3,
    n_estimators=15,
    max_features=25,
    random_state=42,
)
model.fit(train_data[features], train_data['crime'])
predicted = model.predict_proba(test_data[features])

In [33]:
# Write results: one row of per-category probabilities per test record, with
# columns named after the original category strings and the row index saved
# under the header 'Id'.
result = pd.DataFrame(data=predicted, columns=label_cat.classes_)
result.to_csv('testResult2.csv', index_label='Id', index=True)