Import appropriate libraries

In [33]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# General libraries.
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import csv
import datetime

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import MinMaxScaler

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import log_loss

# Set Numpy to print all lines of arrays
np.set_printoptions(threshold='nan')

Read data into Pandas dataframe for ease of use

In [2]:
data = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [3]:
# Extract new features here because it's easier in Pandas than NumPy
def time_features(data):
    data['DateTime'] = pd.to_datetime(data['Dates'])
    data['Year'] = pd.DatetimeIndex(data['DateTime']).year
    data['Month'] = pd.DatetimeIndex(data['DateTime']).month
    data['Day'] = pd.DatetimeIndex(data['DateTime']).day
    data['Hour'] = pd.DatetimeIndex(data['DateTime']).hour
    data['SecondsDelta'] = (data.DateTime - pd.Timestamp('2013-01-01')) / np.timedelta64(1,'s')
    data['Weekend'] = (data.DayOfWeek == "Saturday") | (data.DayOfWeek == "Sunday")
    years = pd.get_dummies(data.Year)
    months = pd.get_dummies(data.Month)
    months.columns = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    days = pd.get_dummies(data.Day)
    daysofweek = pd.get_dummies(data.DayOfWeek)
    hours = pd.get_dummies(data.Hour)
    hours.columns = ['12AM', '1AM', '2AM', '3AM', '4AM', '5AM',
                     '6AM', '7AM', '8AM', '9AM', '10AM', '11AM',
                     '12PM', '1PM', '2PM', '3PM', '4PM', '5PM',
                     '6PM', '7PM', '8PM', '9PM', '10PM', '11PM']
    districts = pd.get_dummies(data.PdDistrict)
    new_data = pd.concat([data, years, months, days, daysofweek, hours, districts], axis=1)
    return new_data

data = time_features(data)
test = time_features(test)

print data.columns.values

['Dates' 'Category' 'Descript' 'DayOfWeek' 'PdDistrict' 'Resolution'
 'Address' 'X' 'Y' 'DateTime' 'Year' 'Month' 'Day' 'Hour' 'SecondsDelta'
 'Weekend' 2003L 2004L 2005L 2006L 2007L 2008L 2009L 2010L 2011L 2012L
 2013L 2014L 2015L 'Jan' 'Feb' 'Mar' 'Apr' 'May' 'Jun' 'Jul' 'Aug' 'Sep'
 'Oct' 'Nov' 'Dec' 1L 2L 3L 4L 5L 6L 7L 8L 9L 10L 11L 12L 13L 14L 15L 16L
 17L 18L 19L 20L 21L 22L 23L 24L 25L 26L 27L 28L 29L 30L 31L 'Friday'
 'Monday' 'Saturday' 'Sunday' 'Thursday' 'Tuesday' 'Wednesday' '12AM' '1AM'
 '2AM' '3AM' '4AM' '5AM' '6AM' '7AM' '8AM' '9AM' '10AM' '11AM' '12PM' '1PM'
 '2PM' '3PM' '4PM' '5PM' '6PM' '7PM' '8PM' '9PM' '10PM' '11PM' 'BAYVIEW'
 'CENTRAL' 'INGLESIDE' 'MISSION' 'NORTHERN' 'PARK' 'RICHMOND' 'SOUTHERN'
 'TARAVAL' 'TENDERLOIN']


In [4]:
# # Convert Pandas dataframe into Numpy array
# data_np = np.array(data.values)
# test_np = np.array(test.values)

In [5]:
# # Scale features between minimum and maximum values
# def scale_features(data):
#     scaled_data = np.arange(data.shape[0])
#     for col in range(1, data.shape[1]):
#         if (type(data[0, col]) == int) | (type(data[0, col]) == float):
#             mms = MinMaxScaler()
#             new_col = mms.fit_transform(data[:, col])
#             scaled_data = np.column_stack((scaled_data, new_col))
#         else:
#             scaled_data = np.column_stack((scaled_data, data[:, col]))
#     return scaled_data

# scale = False
# if scale:
#     data_np2 = scale_features(data_np)
#     test_np2 = scale_features(test_np)
# else:
#     data_np2 = np.copy(data_np)
#     test_np2 = np.copy(test_np)

In [6]:
# Separate labels
labels = data.Category

# Drop Category, Descript and Resolution columns since we cannot use them to predict
train_data = data.drop(['Category', 'Descript', 'Resolution'], axis=1)
train_names = train_data.columns.values.tolist()
test_names = test.columns.values.tolist()


In [25]:
# Shuffle the input: create a random permutation of the integers between 0 and the number of data points and apply this
# permutation to features.
# NOTE: Each time you run this cell, you'll re-shuffle the data, resulting in a different ordering.

shuffle = np.random.permutation(np.arange(train_data.shape[0]))
train_data = train_data.reindex(shuffle)
labels = labels.reindex(shuffle)
num_examples = train_data.shape[0]

# Split the feature and label sets into train and dev sets
mini_train_data = train_data[:5000]
mini_train_labels = labels[:5000]

reg_train_data = train_data[5001:num_examples/2]
reg_train_labels = labels[5001:num_examples/2]

dev_data = train_data[num_examples/2 + 1:]
dev_labels = labels[num_examples/2 + 1:]

test_data = test.copy()

print "Mini Train Data:", mini_train_data.shape
print "Mini Train Labels:", mini_train_labels.shape
print "Regular Train Data:", reg_train_data.shape
print "Regular Train Labels:", reg_train_labels.shape
print "Dev Data:", dev_data.shape
print "Dev Labels:", dev_labels.shape
print "Test Data:", test_data.shape
print "Columns in use:", train_names

Mini Train Data: (5000, 110)
Mini Train Labels: (5000L,)
Regular Train Data: (434023, 110)
Regular Train Labels: (434023L,)
Dev Data: (439024, 110)
Dev Labels: (439024L,)
Test Data: (884262, 111)
Columns in use: ['Dates', 'DayOfWeek', 'PdDistrict', 'Address', 'X', 'Y', 'DateTime', 'Year', 'Month', 'Day', 'Hour', 'SecondsDelta', 'Weekend', 2003L, 2004L, 2005L, 2006L, 2007L, 2008L, 2009L, 2010L, 2011L, 2012L, 2013L, 2014L, 2015L, 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec', 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 25L, 26L, 27L, 28L, 29L, 30L, 31L, 'Friday', 'Monday', 'Saturday', 'Sunday', 'Thursday', 'Tuesday', 'Wednesday', '12AM', '1AM', '2AM', '3AM', '4AM', '5AM', '6AM', '7AM', '8AM', '9AM', '10AM', '11AM', '12PM', '1PM', '2PM', '3PM', '4PM', '5PM', '6PM', '7PM', '8PM', '9PM', '10PM', '11PM', 'BAYVIEW', 'CENTRAL', 'INGLESIDE', 'MISSION', 'NORTHERN', 'PARK', 'RICHMOND', 'SOUTHER

In [36]:
features_to_use = [2003L, 2004L, 2005L, 2006L, 2007L, 2008L, 2009L, 2010L, 2011L, 2012L, 2013L, 2014L, 2015L, 
                   'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec', 
                   1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 
                   16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 25L, 26L, 27L, 28L, 29L, 30L, 31L, 
                   'Friday', 'Monday', 'Saturday', 'Sunday', 'Thursday', 'Tuesday', 'Wednesday', 
                   '12AM', '1AM', '2AM', '3AM', '4AM', '5AM', '6AM', '7AM', '8AM', '9AM', '10AM', '11AM', 
                   '12PM', '1PM', '2PM', '3PM', '4PM', '5PM', '6PM', '7PM', '8PM', '9PM', '10PM', '11PM', 
                   'BAYVIEW', 'CENTRAL', 'INGLESIDE', 'MISSION', 'NORTHERN', 'PARK', 'RICHMOND', 
                   'SOUTHERN', 'TARAVAL', 'TENDERLOIN'
                   ]

bnb = BernoulliNB()
bnb.fit(reg_train_data[features_to_use], reg_train_labels)

predictions = bnb.predict_proba(test_data[features_to_use])
print "BernoulliNB accuracy:", bnb.score(dev_data[features_to_use], dev_labels)
print "Log Loss:", log_loss(dev_labels, bnb.predict_proba(dev_data[features_to_use])) 

BernoulliNB accuracy: 0.227131090783
Log Loss: 2.56620491578


In [38]:
def create_submission(preds):
    labels = ["Id",
                "ARSON",
                "ASSAULT",
                "BAD CHECKS",
                "BRIBERY",
                "BURGLARY",
                "DISORDERLY CONDUCT",
                "DRIVING UNDER THE INFLUENCE",
                "DRUG/NARCOTIC",
                "DRUNKENNESS",
                "EMBEZZLEMENT",
                "EXTORTION",
                "FAMILY OFFENSES",
                "FORGERY/COUNTERFEITING",
                "FRAUD",
                "GAMBLING",
                "KIDNAPPING",
                "LARCENY/THEFT",
                "LIQUOR LAWS",
                "LOITERING",
                "MISSING PERSON",
                "NON-CRIMINAL",
                "OTHER OFFENSES",
                "PORNOGRAPHY/OBSCENE MAT",
                "PROSTITUTION",
                "RECOVERED VEHICLE",
                "ROBBERY",
                "RUNAWAY",
                "SECONDARY CODES",
                "SEX OFFENSES FORCIBLE",
                "SEX OFFENSES NON FORCIBLE",
                "STOLEN PROPERTY",
                "SUICIDE",
                "SUSPICIOUS OCC",
                "TREA",
                "TRESPASS",
                "VANDALISM",
                "VEHICLE THEFT",
                "WARRANTS",
                "WEAPON LAWS"
              ]
    head_str = ','.join(labels)

    num_cats = len(labels)
    
    # Make a dummy row to append to
    ids = np.arange(preds.shape[0])[np.newaxis].transpose()
    
    results = np.column_stack((ids, preds))

    num_form = ['%6f'] * (num_cats - 1)
    num_form.insert(0, '%d')
    # Write results to csv
    np.savetxt('sample.csv', results, fmt=num_form, delimiter=',', header=head_str, comments='')

#     return results

In [39]:
create_submission(predictions)