In this project a dataset containing crime statistics in the San Francisco area from X to Y is analyzed with the goal of creating a model to predict the nature of a crime based only on its time and location. 

More information on the project and its dataset can be found here:
https://www.kaggle.com/c/sf-crime

In [3]:
import sys, os
import numpy as np
import pandas
import matplotlib.pyplot as plt

from pandas.tseries.holiday import USFederalHolidayCalendar as holidaysCalendar

#### Load data

In [5]:
# Define header (from sample submission)
header = ["ID", "ARSON", "ASSAULT", "BAD CHECKS", "BRIBERY", "BURGLARY",
          "DISORDERLY CONDUCT", "DRIVING UNDER THE INFLUENCE", "DRUG/NARCOTIC",
          "DRUNKENNESS", "EMBEZZLEMENT", "EXTORTION", "FAMILY OFFENSES",
          "FORGERY/COUNTERFEITING", "FRAUD", "GAMBLING", "KIDNAPPING", "LARCENY/THEFT",
          "LIQUOR LAWS", "LOITERING", "MISSING PERSON", "NON-CRIMINAL", "OTHER OFFENSES",
          "PORNOGRAPHY/OBSCENE MAT", "PROSTITUTION", "RECOVERED VEHICLE", "ROBBERY",
          "RUNAWAY", "SECONDARY CODES", "SEX OFFENSES FORCIBLE", "SEX OFFENSES NON FORCIBLE",
          "STOLEN PROPERTY", "SUICIDE", "SUSPICIOUS OCC", "TREA", "TRESPASS", "VANDALISM",
          "VEHICLE THEFT", "WARRANTS", "WEAPON LAWS"]

# Load data
df_train = pandas.read_csv("train.csv")
df_test = pandas.read_csv("test.csv")

# Drop duplicates
df_train = df_train.drop_duplicates()
df_test = df_test.drop_duplicates()

# Binarize with dummy variables
dummies_df_train = df_train[['DayOfWeek', 'PdDistrict']]
dummies_df_train = pandas.get_dummies(dummies_df_train)
dummies_df_test = df_test[['DayOfWeek', 'PdDistrict']]
dummies_df_test = pandas.get_dummies(dummies_df_test)

# Format Dates
## Time of day
df_train['Time'] = pandas.to_datetime(df_train['Dates']).dt.hour
df_test['Time'] = pandas.to_datetime(df_test['Dates']).dt.hour
## Week of year
df_train['Week'] = pandas.to_datetime(df_train['Dates']).dt.week
df_test['Week'] = pandas.to_datetime(df_test['Dates']).dt.week
## Holidays
#cal = holidaysCalendar()
#holidays = cal.holidays(start=pandas.to_datetime(df['Dates']).min(), end=pandas.to_datetime(df['Dates']).max())
#df['Holiday'] = pandas.to_datetime(df['Dates']).isin(holidays)

# Drop unnecessary columns
df_train = df_train.drop(['Address', 'Resolution', 'Descript', 'Dates', 'DayOfWeek', 'PdDistrict'],axis=1)
df_test = df_test.drop(['Address', 'Dates', 'DayOfWeek', 'PdDistrict'],axis=1)

# Format Coords
df_train['Y'] = df_train['Y'].apply(lambda x: round(x,4))
df_train['X'] = df_train['X'].apply(lambda x: round(x,4))
#df['Coords'] = df['X'].astype(str) + ", " + df['Y'].astype(str)
df_test['Y'] = df_test['Y'].apply(lambda x: round(x,4))
df_test['X'] = df_test['X'].apply(lambda x: round(x,4))
#df['Coords'] = df['X'].astype(str) + ", " + df['Y'].astype(str)

# Join
parsed_df_train = df_train.join(dummies_df_train)
parsed_df_test = df_test.join(dummies_df_test)

# Split into data/labels, train/dev
train_data = parsed_df_train[parsed_df_train.columns.difference(['Category'])][:50000]
train_labels = parsed_df_train['Category'][:50000]

dev_data = parsed_df_train[parsed_df_train.columns.difference(['Category'])][-5000:]
dev_labels = parsed_df_train['Category'][-5000:]

# Labels list
labels = list(set(train_labels))
train_labels = train_labels.apply(lambda x: labels.index(x))
dev_labels = dev_labels.apply(lambda x: labels.index(x))

### Baseline Model
##### K-Nearest Neighbor with k=50

In [None]:
from sklearn.neighbors import KNeighborsClassifier


# Train KNN Classifier
neigh = KNeighborsClassifier(n_neighbors=50)

# Fit
neigh.fit(train_data, train_labels) 

# Score
neigh.score(dev_data, dev_labels)

# Predict
neigh_result = neigh.predict(test_data)

# Output
neigh_result = pandas.DataFrame(neigh_result)
neigh_result = pandas.get_dummies(neigh_result, prefix='', prefix_sep='')
# Add null categories to make kaggle happy
neigh_result = neigh_result.T.reindex(header).T.fillna(0)
neigh_result.to_csv("output_knn.csv", compression='gzip', chunksize=1000)

### Logistic Regression
##### Final Parameters:

### Support Vector Machines
##### Final Parameters:

### Random Forests
##### Final Parameters:

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Train RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=0)

# Fit
rfc.fit(train_data, train_labels)

# Score
rfc.score(dev_data, dev_labels)

# Predict
rfc_result = rfc.predict(test_data)

# Output
rfc_result = pandas.DataFrame(rfc_result)
rfc_result = pandas.get_dummies(rfc_result, prefix='', prefix_sep='')
# Add null categories to make kaggle happy
rfc_result = rfc_result.T.reindex(header).T.fillna(0)
rfc_result.to_csv("output_rfc.csv", compression='gzip', chunksize=1000)

### Neural-Network
##### Final Parameters:

In [6]:
import tensorflow as tf

numFeatures = len(train_data.columns)
numClasses = len(labels)
numSamples = len(train_data)
numTestExamples = len(dev_data)

#### Sequential

In [39]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten, Reshape

# Define
sequential_model = Sequential()
sequential_model.add(Dense(units=64, activation='relu', input_shape=(numFeatures,)))
## model.add(Dense(32, input_shape=(16,)))
## Now the model will take as input arrays of shape (*, 16)
## and output arrays of shape (*, 32)
sequential_model.add(Dense(units=10, activation='softmax'))
## After the first layer, you don't need to specify
## the size of the input anymore
# Compile
sequential_model.compile(loss='categorical_crossentropy',
              optimizer='sgd',
              metrics=['accuracy'])

# Fit
sequential_model.fit(train_data.values.reshape(-1,21), train_labels.values.reshape(-1,21), epochs=10, batch_size=100)

# Score
sequential_model.evaluate(dev_data, dev_labels, batch_size=128)

# Predict
keras_seq_result = sequential_model.predict(test_data, batch_size=128)

# Output
keras_seq_result = pandas.DataFrame(keras_seq_result)
keras_seq_result = pandas.get_dummies(keras_seq_result, prefix='', prefix_sep='')
# Add null categories to make kaggle happy
keras_seq_result = keras_seq_result.T.reindex(header).T.fillna(0)
keras_seq_result.to_csv("output_keras_seq.csv", compression='gzip', chunksize=1000)

ValueError: cannot reshape array of size 50000 into shape (21)

#### Dense

In [None]:


# Define
dense_model = Sequential([
    Dense(32, input_dim=2),
    Activation('relu'),
    Dense(10),
    Activation('softmax'),
])

# Compile
dense_model.compile(loss='categorical_crossentropy',
              optimizer='sgd',
              metrics=['accuracy'])

# Fit
dense_model.fit(x_train, y_train, epochs=5, batch_size=32)

# Score
dense_model.evaluate(x_test, y_test, batch_size=128)

# Predict
classes = dense_model.predict(x_test, batch_size=128)

In [None]:
# (1) Parameters
tf.reset_default_graph()

# Constants
testX = tf.constant(dev_data.values, dtype=tf.float32)
hiddenlayer1_size = 2
hiddenlayer2_size = 1
miniBatchSize = 1

# placeholders
x_ = tf.placeholder(tf.float32, shape=[None, numFeatures], name='x')
y_ = tf.placeholder(tf.int32, shape=[None], name='y')

# and Variables
w1 = tf.get_variable('w1', shape=[numFeatures, hiddenlayer1_size])
b1 = tf.get_variable('b1', shape=[hiddenlayer1_size])
w2 = tf.get_variable('w2', shape=[hiddenlayer1_size, numClasses])
b2 = tf.get_variable('b3', shape=[numClasses])


# (2) Model
def model(input_layer):
    hidden_layer1 = tf.nn.sigmoid(tf.matmul(input_layer, w1) + b1)
    output_layer = tf.nn.softmax(tf.matmul(hidden_layer1, w2) + b2)
    return output_layer

# (2) Model
def model_r(input_layer):
    hidden_layer1 = tf.nn.relu(tf.matmul(input_layer, w1) + b1)
    output_layer = tf.nn.softmax(tf.matmul(hidden_layer1, w2) + b2)
    return output_layer
    

# (3) Cost
def cost(data, labels):
    cc = tf.sqrt(tf.square(labels - model(data)))
    return  cc

# (4) Ojbective (and solver)
y_one_hot = tf.one_hot(y_, numClasses)
cc = cost(x_, y_one_hot)
gd = tf.train.GradientDescentOptimizer(0.1)
step = gd.minimize(cc)
test_preds = model(testX)
test_preds_r = model_r(testX)
output = ""
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    cost_vec = []
    cost_vec_r = []
    for i in range(15):
        print (i)
        for start, end in zip(range(0, numSamples, miniBatchSize), range(miniBatchSize, numSamples, miniBatchSize)):
            batch = train_data.values[start:end], train_labels[start:end]
            _, cost, test__preds_r = sess.run([step, cc, test_preds_r], feed_dict={x_: batch[0], y_: batch[1]})
    
    prediction=tf.argmax(test_preds_r,axis=1)
    output = prediction.eval(feed_dict={x_: test_data.values})
    print ("hi")