In this porject a dataset containing crime statistics in the San Francisco area from X to Y is analyzed with the goal of creating a model to predict the nature of a crime based only on its time and location. 

More information on the project and its dataset can be found here:
https://www.kaggle.com/c/sf-crime

In [1]:
import sys, os
import pandas
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

from pandas.tseries.holiday import USFederalHolidayCalendar as holidaysCalendar
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

#### Loading data into training, dev, and test sets using headings provided from the sample submission

In [11]:
# Define header (from sample submission)
header = ["ID", "ARSON", "ASSAULT", "BAD CHECKS", "BRIBERY", "BURGLARY",
          "DISORDERLY CONDUCT", "DRIVING UNDER THE INFLUENCE", "DRUG/NARCOTIC",
          "DRUNKENNESS", "EMBEZZLEMENT", "EXTORTION", "FAMILY OFFENSES",
          "FORGERY/COUNTERFEITING", "FRAUD", "GAMBLING", "KIDNAPPING", "LARCENY/THEFT",
          "LIQUOR LAWS", "LOITERING", "MISSING PERSON", "NON-CRIMINAL", "OTHER OFFENSES",
          "PORNOGRAPHY/OBSCENE MAT", "PROSTITUTION", "RECOVERED VEHICLE", "ROBBERY",
          "RUNAWAY", "SECONDARY CODES", "SEX OFFENSES FORCIBLE", "SEX OFFENSES NON FORCIBLE",
          "STOLEN PROPERTY", "SUICIDE", "SUSPICIOUS OCC", "TREA", "TRESPASS", "VANDALISM",
          "VEHICLE THEFT", "WARRANTS", "WEAPON LAWS"]

# Load training data
df = pandas.read_csv("train.csv")
# Drop duplicates
df = df.drop_duplicates()

# Binarize with dummy variables
dummies_df = df[['DayOfWeek', 'PdDistrict']]
dummies_df = pandas.get_dummies(dummies_df)

# Format datetime
## Time of day
df['Time'] = pandas.to_datetime(df['Dates']).dt.hour
## Week of year
df['Week'] = pandas.to_datetime(df['Dates']).dt.week
## Holiday
#cal = holidaysCalendar()
#holidays = cal.holidays(start=pandas.to_datetime(df['Dates']).min(), end=pandas.to_datetime(df['Dates']).max())
#df['Holiday'] = pandas.to_datetime(df['Dates']).isin(holidays)

# Drop unnecessary columns
df = df.drop(['Address', 'Resolution', 'Descript', 'Dates', 'DayOfWeek', 'PdDistrict'],axis=1)

# Format Coords
df['Y'] = df['Y'].apply(lambda x: round(x,4))
df['X'] = df['X'].apply(lambda x: round(x,4))
#df['Coords'] = df['X'].astype(str) + ", " + df['Y'].astype(str)

parsed_df = df.join(dummies_df)

# Split into data/labels, train/dev
train_data = parsed_df[parsed_df.columns.difference(['Category'])][:50000]
train_labels = parsed_df['Category'][:50000]

dev_data = parsed_df[parsed_df.columns.difference(['Category'])][-5000:]
dev_labels = parsed_df['Category'][-5000:]

# Labels list
labels = list(set(train_labels))
train_labels = train_labels.apply(lambda x: labels.index(x))
dev_labels = dev_labels.apply(lambda x: labels.index(x))

In [None]:
# Load test data
df_test = pandas.read_csv("test.csv")
# Drop duplicates
df_test = df_test.drop_duplicates()

# Binarize with dummy variables
dummies_df = df_test[['DayOfWeek', 'PdDistrict']]
dummies_df = pandas.get_dummies(dummies_df)

# Format datetime
## Time of day
df_test['Time'] = pandas.to_datetime(df_test['Dates']).dt.hour
## Week of year
df_test['Week'] = pandas.to_datetime(df_test['Dates']).dt.week
## Holiday
#cal = holidaysCalendar()
#holidays = cal.holidays(start=pandas.to_datetime(df['Dates']).min(), end=pandas.to_datetime(df['Dates']).max())
#df['Holiday'] = pandas.to_datetime(df['Dates']).isin(holidays)

# Drop unnecessary columns
df_test = df_test.drop(['Address', 'Dates', 'DayOfWeek', 'PdDistrict', 'Id'],axis=1)

# Format Coords
df_test['Y'] = df_test['Y'].apply(lambda x: round(x,4))
df_test['X'] = df_test['X'].apply(lambda x: round(x,4))
#df_test['Coords'] = df_test['X'].astype(str) + ", " + df_test['Y'].astype(str)

parsed_df = df_test.join(dummies_df)

# Clean
df = dummies_df = parsed_df = None

#### Train and fit a K-NN classifier using training data. Predict test set labels and output to a compressed CSV file

In [None]:
# Train KNN Classifier
neigh = KNeighborsClassifier(n_neighbors=15)

# Fit
neigh.fit(train_data, train_labels) 

# Predict
result = neigh.predict(test_data)
result = pandas.DataFrame(result)
result = pandas.get_dummies(result, prefix='', prefix_sep='')
# Add null categories to make kaggle happy
result = result.T.reindex(header).T.fillna(0)

# Output
result.to_csv("output_knn.csv", compression='gzip', chunksize=1000)

# Return score
#neigh.score(dev_data, dev_labels)

# Train RandomForestClassifier
#rfc = RandomForestClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=0)
# Fit
#rfc.fit(train_data, train_labels)
# Predict
#rfc.score(dev_data, dev_labels)

#result = rfc.predict(test_data)
#result = pandas.DataFrame(result)
#result = pandas.get_dummies(result)

#result.to_csv("output_rfc.csv", compression='gzip', chunksize=1000)

#### Neural-Network

In [12]:
numFeatures = len(train_data.columns)
numClasses = len(labels)
numSamples = len(train_data)
numTestExamples = len(dev_data)

print('Features = %d' %(numFeatures))
print('Classes = %d' %(numClasses))
print('Train set = %d' %(numSamples))
print('Test set = %d' %(numTestExamples))

Features = 21
Classes = 37
Train set = 50000
Test set = 5000


In [13]:
# (1) Parameters
tf.reset_default_graph()

# Constants
testX = tf.constant(dev_data.values, dtype=tf.float32)
hiddenlayer1_size = 2
hiddenlayer2_size = 1
miniBatchSize = 1

# placeholders
x_ = tf.placeholder(tf.float32, shape=[None, numFeatures], name='x')
y_ = tf.placeholder(tf.int32, shape=[None], name='y')

# and Variables
w1 = tf.get_variable('w1', shape=[numFeatures, hiddenlayer1_size])
b1 = tf.get_variable('b1', shape=[hiddenlayer1_size])
w2 = tf.get_variable('w2', shape=[hiddenlayer1_size, numClasses])
b2 = tf.get_variable('b3', shape=[numClasses])


# (2) Model
def model(input_layer):
    hidden_layer1 = tf.nn.sigmoid(tf.matmul(input_layer, w1) + b1)
    output_layer = tf.nn.softmax(tf.matmul(hidden_layer1, w2) + b2)
    return output_layer

# (2) Model
def model_r(input_layer):
    hidden_layer1 = tf.nn.relu(tf.matmul(input_layer, w1) + b1)
    output_layer = tf.nn.softmax(tf.matmul(hidden_layer1, w2) + b2)
    return output_layer
    

# (3) Cost
def cost(data, labels):
    cc = tf.sqrt(tf.square(labels - model(data)))
    return  cc

# (4) Ojbective (and solver)
y_one_hot = tf.one_hot(y_, numClasses)
cc = cost(x_, y_one_hot)
gd = tf.train.GradientDescentOptimizer(0.1)
step = gd.minimize(cc)
test_preds = model(testX)
test_preds_r = model_r(testX)
output = ""
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    cost_vec = []
    cost_vec_r = []
    for i in range(15):
        print (i)
        for start, end in zip(range(0, numSamples, miniBatchSize), range(miniBatchSize, numSamples, miniBatchSize)):
            batch = train_data.values[start:end], train_labels[start:end]
            _, cost, test__preds_r = sess.run([step, cc, test_preds_r], feed_dict={x_: batch[0], y_: batch[1]})
    
    prediction=tf.argmax(test_preds_r,axis=1)
    output = prediction.eval(feed_dict={x_: test_data.values})
    print ("hi")

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14


NameError: name 'test_data' is not defined