In [1]:
import pandas as pd
from sklearn.cross_validation import train_test_split
from sklearn import preprocessing
from sklearn.metrics import log_loss
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
import numpy as np

# Load the data

[link](https://www.kaggle.com/c/sf-crime/data?test.csv.zip)

In [2]:
#Load Data with pandas, and parse the first column into datetime
train_csv = r'data/train.csv'
train=pd.read_csv(train_csv, parse_dates = ['Dates'])

test_csv = r'data/test.csv'
test=pd.read_csv(test_csv, parse_dates = ['Dates'])

# Encode labels

In [12]:
labelEncoder = preprocessing.LabelEncoder()
encodedCategory = labelEncoder.fit_transform(train.Category)

encodedDays = pd.get_dummies(train.DayOfWeek)
encodedDistrict = pd.get_dummies(train.PdDistrict)
trainDf = pd.concat([encodedDays, encodedDistrict], axis=1)

trainDf['Y']=train['Y']
trainDf['X']=train['X']
trainDf['Category']=encodedCategory

encodedDays = pd.get_dummies(test.DayOfWeek)
encodedDistrict = pd.get_dummies(test.PdDistrict)
testDf = pd.concat([encodedDays, encodedDistrict], axis=1)

testDf['Y']=test['Y']
testDf['X']=test['X']

In [5]:
trainDf.head()

Unnamed: 0,Friday,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday,BAYVIEW,CENTRAL,INGLESIDE,MISSION,NORTHERN,PARK,RICHMOND,SOUTHERN,TARAVAL,TENDERLOIN,Y,X,Category
0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,37.774599,-122.425892,37
1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,37.774599,-122.425892,21
2,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,37.800414,-122.424363,21
3,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,37.800873,-122.426995,16
4,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,37.771541,-122.438738,16


# Naive Bayes classifier on day and kind

In [6]:
features = ['Y', 'X']
features = np.append(features, train.DayOfWeek.unique())
features = np.append(features, train.PdDistrict.unique())

In [7]:
def naiveBayesClassifier (df, size, features, withLogLoss):
    # Split arrays or matrices into random train and test subsets
    training, validation = train_test_split(df, train_size=size)

    # Naive Bayes classifier for multivariate Bernoulli models.
    model = BernoulliNB()
    model.fit(training[features], training['Category'])
    predicted = model.predict_proba(validation[features])

    if withLogLoss:
        print('(size, log_loss) : ',(size,log_loss(validation['Category'], np.array(predicted))))
        
naiveBayesClassifier(trainDf,0.7,features,True)

(size, log_loss) :  (0.7, 2.6133676750054557)


In [8]:
# Train final model of whole training dataset
model = BernoulliNB()
model.fit(trainDf[features], trainDf['Category'])
predicted = model.predict_proba(testDf[features])

# results
result=pd.DataFrame(predicted, columns=labelEncoder.classes_)
#result.to_csv('testResult.csv', index = True, index_label = 'Id')

# Naive Bayes classifier on day, hour and kind

In [9]:
trainDf['Hour']=train.Dates.dt.hour
testDf['Hour']=test.Dates.dt.hour
featuresWithHours = np.append(features, 'Hour')

In [10]:
naiveBayesClassifier(trainDf,0.7,featuresWithHours,True)

(size, log_loss) :  (0.7, 2.6101498910875884)


In [11]:
# Train final model of whole training dataset
model = BernoulliNB()
model.fit(trainDf[featuresWithHours], trainDf['Category'])
predicted = model.predict_proba(testDf[featuresWithHours])

# results
result=pd.DataFrame(predicted, columns=labelEncoder.classes_)
#result.to_csv('testResult.csv', index = True, index_label = 'Id')

# Naive Bayes classifier on day, hour and kind + different training/test split size

In [None]:
naiveBayesClassifier(trainDf,0.6,featuresWithHours,True)
naiveBayesClassifier(trainDf,0.7,featuresWithHours,True)
naiveBayesClassifier(trainDf,0.75,featuresWithHours,True)
naiveBayesClassifier(trainDf,0.77,featuresWithHours,True)
naiveBayesClassifier(trainDf,0.8,featuresWithHours,True)