In [1]:
import pandas as pd
import helpers

from IPython.display import display
from sklearn.cross_validation import train_test_split
from sklearn import preprocessing
# from sklearn.metrics import log_loss
# from sklearn.naive_bayes import BernoulliNB
# from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import KFold
import numpy as np

# Load the provided data, parsing the 'Dates' column from string to datetime.
train = pd.read_csv('train.csv', parse_dates=['Dates'])
test = pd.read_csv('test.csv', parse_dates=['Dates'])

# Print the first 2 rows of the training dataframe as a quick sanity check.
print(train.head(2))


                Dates        Category                  Descript  DayOfWeek  \
0 2015-05-13 23:53:00        WARRANTS            WARRANT ARREST  Wednesday   
1 2015-05-13 23:53:00  OTHER OFFENSES  TRAFFIC VIOLATION ARREST  Wednesday   

  PdDistrict      Resolution             Address           X          Y  
0   NORTHERN  ARREST, BOOKED  OAK ST / LAGUNA ST -122.425892  37.774599  
1   NORTHERN  ARREST, BOOKED  OAK ST / LAGUNA ST -122.425892  37.774599  


In [2]:
def filter_cats(data, min_count=13000):
    """Keep only the rows whose crime category is frequent enough.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a "Category" column.
    min_count : int, optional
        A category is kept only when it has strictly more than this many
        rows. Defaults to 13000, the threshold this notebook has always used.

    Returns
    -------
    filtered_df : pandas.DataFrame
        Rows belonging to the kept categories, grouped category by category
        in order of first appearance (original index labels are preserved).
        Empty (same columns as ``data``) when no category passes.
    keep_cats : list
        Labels of the kept categories, in order of first appearance.
    """
    # Count every category once up front instead of materialising each group.
    # A plain dict also makes the lookup safe for missing (NaN) categories,
    # which value_counts() leaves out: they count as 0 and are dropped.
    counts = data["Category"].value_counts().to_dict()
    keep_cats = [c for c in data["Category"].unique()
                 if counts.get(c, 0) > min_count]

    if not keep_cats:
        # pd.concat raises on an empty list; return an empty frame instead.
        return data.iloc[0:0], keep_cats

    filtered_df = pd.concat([data[data["Category"] == c] for c in keep_cats])
    return filtered_df, keep_cats

# Keep only the frequent crime categories.
filtered_train, keep_cats = filter_cats(train)

# 60% of the filtered rows go to `training`; `validation` receives the held-out rows.
# A fixed seed makes the split (and every score derived from it) reproducible.
training, validation = train_test_split(filtered_train, train_size=.60, random_state=1)

In [3]:
def prep(data):
    """Build the model's feature frame from a crime dataframe.

    Reads the "Dates", "DayOfWeek" and "PdDistrict" columns of ``data`` and
    returns a new frame made of, left to right: the hour taken from "Dates"
    (the column keeps the name "Dates"), one indicator column per distinct
    "DayOfWeek" value, and one indicator column per distinct "PdDistrict"
    value. Only values present in ``data`` get an indicator column.
    """
    # Hour portion of the timestamp goes first.
    feature_parts = [data["Dates"].dt.hour]

    # One-hot encode each categorical column and queue it after the hour.
    for column in ("DayOfWeek", "PdDistrict"):
        feature_parts.append(pd.get_dummies(data[column]))

    # Glue the pieces together side by side, aligned on the row index.
    return pd.concat(feature_parts, axis=1)


# Build the feature frame for the training split only (not the whole file).
preped_filtered_train = prep(training)
# Expect 18 columns: hour + 7 weekdays + 10 districts. The row count equals len(training);
# the 878049 figure noted here earlier presumably referred to the full train.csv -- TODO confirm.
display(preped_filtered_train.head())

Unnamed: 0,Dates,Friday,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday,BAYVIEW,CENTRAL,INGLESIDE,MISSION,NORTHERN,PARK,RICHMOND,SOUTHERN,TARAVAL,TENDERLOIN
779309,18,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0
30821,14,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0
65133,14,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
6669,11,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
452456,10,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0


In [4]:
# Turn the crime category names of the training split into integer class ids.
le_crime = preprocessing.LabelEncoder()
encodedCrime = le_crime.fit_transform(training["Category"])

# Round-trip the ids back to names to eyeball that the mapping is sane.
humanReadableCrime = le_crime.inverse_transform(encodedCrime)
print(humanReadableCrime)
print(encodedCrime)

['LARCENY/THEFT' 'VEHICLE THEFT' 'ROBBERY' ..., 'OTHER OFFENSES'
 'OTHER OFFENSES' 'BURGLARY']
[ 4 11  8 ...,  7  7  1]


In [7]:
from sklearn import tree

# Features: hour plus one-hot weekday/district columns; target: integer-encoded crime category.
X = preped_filtered_train
Y = encodedCrime

# Seed the tree so its random choices between candidate splits are repeatable across runs.
clf = tree.DecisionTreeClassifier(random_state=1)
clf = clf.fit(X, Y)

In [8]:
# Regroup the validation rows category by category, in keep_cats order.
keep_cats_dataframes = [validation[validation["Category"] == c] for c in keep_cats]
filtered_validation_df = pd.concat(keep_cats_dataframes)

# prep() only creates indicator columns for values present in the frame it is given,
# so align the validation features to the exact columns (and order) the tree was
# trained on; an indicator absent from this split is filled with 0.
preped_filtered_validation = prep(filtered_validation_df).reindex(
    columns=preped_filtered_train.columns, fill_value=0)
validationPredictions = clf.predict(preped_filtered_validation)

In [9]:
# Translate the predicted integer class ids back into crime category names.
humanReadableValidationPredictions = le_crime.inverse_transform(validationPredictions)

In [11]:
# Encode the true validation labels with the mapping learned on the training split.
# transform (not fit_transform) is deliberate: re-fitting here would rebuild the
# label -> id mapping from the validation labels, which could silently disagree with
# the ids the classifier was trained on and would also overwrite le_crime.
validationTrueLabels = le_crime.transform(filtered_validation_df.Category)

In [15]:
# Peek at the first 20 predictions next to their true labels.
print(validationPredictions[0:20])
print(validationTrueLabels[0:20])

# Accuracy = share of validation rows whose predicted class id equals the true one.
total = len(validationPredictions)
counter = sum(1 for i in range(total)
              if validationPredictions[i] == validationTrueLabels[i])
accuracy = counter / float(total)
print(accuracy)

[0 7 7 4 4 4 7 4 7 4 7 4 4 4 2 4 4 7 2 4]
[12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12]
0.24578737841


In [22]:
from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier

# Filter the full training file once and reuse the result for both the features
# and the labels, instead of running filter_cats twice over the whole frame.
cvFilteredTrain = filter_cats(train)[0]
preppedTrainData = prep(cvFilteredTrain)

alg = RandomForestClassifier(random_state=1, n_estimators=10, min_samples_split=4, min_samples_leaf=1)
scores = cross_validation.cross_val_score(alg, preppedTrainData, cvFilteredTrain["Category"], cv=3)

# Mean of the per-fold scores over the 3 folds.
print(scores.mean())

0.242081142127


After testing both a simple decision tree and a RandomForest algorithm from sklearn, we ended up with a fairly low accuracy score. We believe that the one-hot-encoding method we used (prep()) to encode categorical data, such as days of the week and 