Import appropriate libraries

In [1]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# General libraries.
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import csv
import datetime

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.grid_search import GridSearchCV

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

# Set Numpy to print all lines of arrays
np.set_printoptions(threshold='nan')

Read data into Pandas dataframe for ease of use

In [14]:
data = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Convert Pandas dataframe into Numpy array
data_np = np.array(data.values)
test_np = np.array(test.values)


In [15]:
# Preserve column names since I had a hard time figuring out how to name Numpy columns
col_names_train = data.columns.values.tolist()
col_names_test = test.columns.values.tolist()

# Separate labels
labels = data_np[:, 1]

# Add index number and remove unusable columns from train data to keep consistent with test data
data_np = np.column_stack((np.arange(data.shape[0]), data_np))
col_names_train.insert(0, "Id")

train_cols_keep = [0, 1, 4, 5, 7, 8, 9]

train_data = data_np[:, train_cols_keep]
train_names = [col_names_train[i] for i in range(len(col_names_train)) if i in train_cols_keep]

test_names = col_names_test[:]

print train_names
print train_data[0], "\n"
print test_names
print test_np[0]

['Id', 'Dates', 'DayOfWeek', 'PdDistrict', 'Address', 'X', 'Y']
[0 '2015-05-13 23:53:00' 'Wednesday' 'NORTHERN' 'OAK ST / LAGUNA ST'
 -122.425891675136 37.7745985956747] 

['Id', 'Dates', 'DayOfWeek', 'PdDistrict', 'Address', 'X', 'Y']
[0L '2015-05-10 23:59:00' 'Sunday' 'BAYVIEW' '2000 Block of THOMAS AV'
 -122.39958770418998 37.7350510103906]


In [21]:
# Shuffle the input: create a random permutation of the integers between 0 and the number of data points and apply this
# permutation to features.
# NOTE: Each time you run this cell, you'll re-shuffle the data, resulting in a different ordering.

shuffle = np.random.permutation(np.arange(train_data.shape[0]))
train_data = train_data[shuffle]
num_examples = train_data.shape[0]

# Which columns are we going to use for the model?
# For the basleine, let's only use the X and Y coordinates
base_features = [5, 6]

features = train_data[:, base_features]

# Split the feature and label sets into train and dev sets
base_mini_train_data = features[:5000]
base_mini_train_labels = labels[:5000]

base_train_data = features[5001:num_examples/2]
base_train_labels = labels[5001:num_examples/2]

base_dev_data = features[num_examples/2 + 1:]
base_dev_labels = labels[num_examples/2 + 1:]

base_test_data = test_np[:, base_features]

base_names = [train_names[i] for i in range(len(train_names)) if i in base_features]

print "Base Mini Train Data:", base_mini_train_data.shape
print "Base Mini Train Labels:", base_mini_train_labels.shape
print "Base Train Data:", base_train_data.shape
print "Base Train Labels:", base_train_labels.shape
print "Base Dev Data:", base_dev_data.shape
print "Base Dev Labels:", base_dev_labels.shape
print "Base Test Data:", base_test_data.shape
print "Columns in use:", base_names

Base Mini Train Data: (5000L, 2L)
Base Mini Train Labels: (5000L,)
Base Train Data: (434023L, 2L)
Base Train Labels: (434023L,)
Base Dev Data: (439024L, 2L)
Base Dev Labels: (439024L,)
Base Test Data: (884262L, 2L)
Columns in use: ['X', 'Y']


In [17]:
def create_submission(preds):
    labels = ["Id",
                "ARSON",
                "ASSAULT",
                "BAD CHECKS",
                "BRIBERY",
                "BURGLARY",
                "DISORDERLY CONDUCT",
                "DRIVING UNDER THE INFLUENCE",
                "DRUG/NARCOTIC",
                "DRUNKENNESS",
                "EMBEZZLEMENT",
                "EXTORTION",
                "FAMILY OFFENSES",
                "FORGERY/COUNTERFEITING",
                "FRAUD",
                "GAMBLING",
                "KIDNAPPING",
                "LARCENY/THEFT",
                "LIQUOR LAWS",
                "LOITERING",
                "MISSING PERSON",
                "NON-CRIMINAL",
                "OTHER OFFENSES",
                "PORNOGRAPHY/OBSCENE MAT",
                "PROSTITUTION",
                "RECOVERED VEHICLE",
                "ROBBERY",
                "RUNAWAY",
                "SECONDARY CODES",
                "SEX OFFENSES FORCIBLE",
                "SEX OFFENSES NON FORCIBLE",
                "STOLEN PROPERTY",
                "SUICIDE",
                "SUSPICIOUS OCC",
                "TREA",
                "TRESPASS",
                "VANDALISM",
                "VEHICLE THEFT",
                "WARRANTS",
                "WEAPON LAWS"
              ]

    num_cats = len(labels)
    results = np.arange(num_cats)
    for i in range(len(preds)):
        new_row = (preds[i] == np.array(labels[1:])) * 1
        new_row = np.append(i, new_row)
        results = np.vstack([results, new_row])

    results = results[1:,]
    print results

    with open('sample.csv', 'wb') as f:
        writer = csv.writer(f)
        writer.writerow(labels)
        writer.writerows(results)

In [22]:
# Baseline features only using location of crime
# Use k-nearest neighbors as baseline model

knn = KNeighborsClassifier(n_neighbors = 1)
knn.fit(base_train_data, base_train_labels)
print "Baseline score on dev set: {:.2%}".format(knn.score(base_dev_data, base_dev_labels))


Baseline score on dev set: 9.64%


In [None]:
predictions = knn.predict(base_test_data)
create_submission(predictions)