In [115]:
%matplotlib inline
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import sklearn
import scipy.stats as st
import random
from datetime import datetime
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from patsy import dmatrix, dmatrices
from sklearn import cross_validation
from sklearn.cross_validation import train_test_split
from sklearn.metrics import log_loss
from sklearn.preprocessing import MultiLabelBinarizer

In [116]:
cur_dir = os.path.dirname('__file__')

train = pd.read_csv(os.path.join(cur_dir, "data", "train.csv"))
test = pd.read_csv(os.path.join(cur_dir, "data", "test.csv"))
mapdata = np.loadtxt(os.path.join(cur_dir, "data", "sf_map.txt"))

In [117]:
def get_random_subset(df, n=5000):
    sub = random.sample(xrange(len(df)), min(n, len(df)))
    return df.iloc[sub]

def preprocess(df):
    res = df.copy()
    res = res[res.X != res.X.max()]
    datetimes = res.Dates.apply(get_datetime)
    res['Hour'] = datetimes.apply(lambda dt: dt.hour)
    res['Month'] = datetimes.apply(lambda dt: dt.month)
    res['Hour_Minutes'] = datetimes.apply(lambda dt: dt.hour + dt.minute / 60.0)
    res['Minutes_Since_03'] = datetimes.apply(lambda dt: (dt-datetime(2003, 1, 1)).total_seconds() / 60)
    res['Minutes_Since_New_Year'] = datetimes.apply(lambda dt: (dt-datetime(dt.year, 1, 1)).total_seconds() / 60)
    res['DOW'] = train.DayOfWeek.apply(lambda x: dow.index(x))
    res['Street_Corner'] = res['Address'].apply(lambda x: 1 if '/' in x else 0)
    return res

def get_datetime(s):
    dt = datetime.strptime(s, "%Y-%m-%d %H:%M:%S")
    return dt

dow = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

def isNight(hour):
    if hour in [0, 1, 2, 3, 4, 5, 6, 19, 20, 21, 22, 23]:
        return "Night"
    else:
        return "Day"

In [124]:
train_df = preprocess(get_random_subset(train, len(train)))

In [5]:
print train_df.describe()

                 X            Y         Hour        Month  Hour_Minutes  \
count  4998.000000  4998.000000  4998.000000  4998.000000   4998.000000   
mean   -122.422492    37.766937    13.433774     6.475590     13.769411   
std       0.025019     0.024146     6.527471     3.417003      6.543438   
min    -122.513642    37.707922     0.000000     1.000000      0.016667   
25%    -122.431952    37.752173     9.000000     4.000000      9.500000   
50%    -122.416446    37.775243    14.000000     6.000000     14.725000   
75%    -122.407244    37.784459    19.000000     9.000000     19.000000   
max    -122.365565    37.809671    23.000000    12.000000     23.983333   

       Minutes_Since_03  Minutes_Since_New_Year          DOW  Street_Corner  
count       4998.000000             4998.000000  4998.000000    4998.000000  
mean     3241349.093838           261053.295518     3.053621       0.295918  
std      1898358.447366           150700.518203     1.962202       0.456500  
min         

In [130]:
training, validation = train_test_split(train_df, train_size=.60)

formula_ml = 'X+Y+Hour'
formula_ml = 'C(DayOfWeek) + C(PdDistrict) + Street_Corner+Hour_Minutes'
x_train = dmatrix(formula_ml, data=training, return_type='dataframe')
print x_train
y_train = training.Category

x_validation = dmatrix(formula_ml, data=validation, return_type='dataframe')
y_validation = validation.Category

x_validation = x_validation[y_validation.isin(y_train.values)]
y_validation = y_validation[y_validation.isin(y_train.values)]

alg = RandomForestClassifier()
# alg = BernoulliNB()
# print y_train.shape
# print x_train.shape

alg.fit(x_train, y_train)

mlb = MultiLabelBinarizer(classes=alg.classes_)
y_validation = mlb.fit_transform(np.array([y_validation]).T)
predictions = np.array(alg.predict_proba(x_validation))
print log_loss(y_validation, predictions)

# scores1 = cross_validation.cross_val_score(alg, x_train, train_df['Category'], cv=3)
# scores2 = cross_validation.cross_val_score(alg, train_df[['X', 'Y']], train_df['Category'], cv=3)

# print 'Score: '
# print str(scores1.mean())
# print str(scores2.mean())

        Intercept  C(DayOfWeek)[T.Monday]  C(DayOfWeek)[T.Saturday]  \
802387          1                       0                         0   
112500          1                       1                         0   
67286           1                       0                         0   
343777          1                       0                         0   
515355          1                       0                         0   
291892          1                       0                         0   
239599          1                       0                         0   
722366          1                       1                         0   
560159          1                       0                         0   
130605          1                       0                         0   
191008          1                       0                         0   
163782          1                       0                         0   
337862          1                       0                         0   
131401