In [1]:
%matplotlib inline
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import sklearn
import scipy.stats as st
import random
from datetime import datetime



In [2]:
# NOTE: '__file__' is a quoted string literal, so dirname() returns '' and
# every path below is resolved relative to the current working directory.
cur_dir = os.path.dirname('__file__')
data_dir = os.path.join(cur_dir, "data")

# Tabular inputs.
train = pd.read_csv(os.path.join(data_dir, "train.csv"))
test = pd.read_csv(os.path.join(data_dir, "test.csv"))
# Numeric text file loaded into a NumPy array.
mapdata = np.loadtxt(os.path.join(data_dir, "sf_map.txt"))

In [5]:
def get_random_subset(df, n=5000, seed=None):
    """Return up to ``n`` rows of ``df`` drawn at random without replacement.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to sample from; it is not modified.
    n : int, optional
        Maximum number of rows to return. When ``n`` exceeds ``len(df)``
        every row is returned (in shuffled order).
    seed : optional
        When given, rows are drawn from a private ``random.Random(seed)`` so
        repeated calls return the same rows. When ``None`` (the default) the
        module-level ``random`` generator is used.

    Returns
    -------
    pandas.DataFrame
        The selected rows, picked by position with ``iloc`` in sampled order.

    Raises
    ------
    ValueError
        If ``n`` is negative (raised by ``sample``).
    """
    rng = random if seed is None else random.Random(seed)
    # range() instead of xrange(): valid on both Python 2 and Python 3.
    positions = rng.sample(range(len(df)), min(n, len(df)))
    return df.iloc[positions]

def preprocess(df):
    """Return a filtered copy of ``df`` with derived time and address features.

    Rows whose ``X`` equals the maximum ``X`` in ``df`` are dropped
    (presumably to remove mis-geocoded outliers -- note that this always
    removes the right-most rows, even when no outlier is present).

    Reads the columns ``X``, ``Dates``, ``DayOfWeek`` and ``Address`` and adds:

    * ``Hour`` / ``Month`` -- hour of day and month of the timestamp.
    * ``Hour_Minutes`` -- hour of day as a fraction (hour + minute / 60).
    * ``Minutes_Since_03`` -- minutes elapsed since 2003-01-01 00:00.
    * ``Minutes_Since_New_Year`` -- minutes elapsed since Jan 1 of the row's year.
    * ``DOW`` -- position of ``DayOfWeek`` in ``dow`` (Monday == 0).
    * ``Street_Corner`` -- 1 if ``Address`` contains '/', else 0.

    The input frame is left unmodified.

    Raises
    ------
    ValueError
        If a ``Dates`` value does not match the expected format, or a
        ``DayOfWeek`` value is not listed in ``dow``.
    """
    # Filter first, then copy, so the new columns are assigned onto an
    # explicit copy rather than onto a filtered slice.
    res = df[df.X != df.X.max()].copy()
    datetimes = res.Dates.apply(get_datetime)
    res['Hour'] = datetimes.apply(lambda dt: dt.hour)
    res['Month'] = datetimes.apply(lambda dt: dt.month)
    res['Hour_Minutes'] = datetimes.apply(lambda dt: dt.hour + dt.minute / 60.0)
    res['Minutes_Since_03'] = datetimes.apply(lambda dt: (dt-datetime(2003, 1, 1)).total_seconds() / 60)
    res['Minutes_Since_New_Year'] = datetimes.apply(lambda dt: (dt-datetime(dt.year, 1, 1)).total_seconds() / 60)
    # Use the frame being processed, not the global ``train``: the old code
    # only worked when ``df`` was a slice of ``train`` and was wrong for any
    # other frame (e.g. the test set).
    res['DOW'] = res.DayOfWeek.apply(lambda x: dow.index(x))
    res['Street_Corner'] = res['Address'].apply(lambda x: 1 if '/' in x else 0)
    return res

def get_datetime(s):
    """Parse a ``'YYYY-MM-DD HH:MM:SS'`` string into a ``datetime``."""
    return datetime.strptime(s, "%Y-%m-%d %H:%M:%S")

# Day names ordered so that a name's list position is its day number (Monday == 0).
dow = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

def isNight(hour):
    """Label an hour of the day as "Night" (00-06 and 19-23) or "Day".

    Any value that is not one of those whole hours is labelled "Day".
    """
    night_hours = tuple(range(0, 7)) + tuple(range(19, 24))
    return "Night" if hour in night_hours else "Day"

In [6]:
# Work on a random sample of the training data (default sample size) and
# derive the engineered feature columns from it.
train_sample = get_random_subset(train)
train_df = preprocess(train_sample)

In [7]:
# Summary statistics of the numeric columns of the preprocessed sample.
# Call form of print: identical output on Python 2, valid syntax on Python 3.
print(train_df.describe())

                 X            Y         Hour        Month  Hour_Minutes  \
count  4997.000000  4997.000000  4997.000000  4997.000000   4997.000000   
mean   -122.422484    37.766955    13.328397     6.470883     13.659599   
std       0.025249     0.024173     6.584094     3.441057      6.593897   
min    -122.513642    37.708154     0.000000     1.000000      0.016667   
25%    -122.432612    37.752647     9.000000     4.000000      9.433333   
50%    -122.416078    37.775421    14.000000     6.000000     14.750000   
75%    -122.406539    37.784385    19.000000    10.000000     19.000000   
max    -122.365565    37.809671    23.000000    12.000000     23.983333   

       Minutes_Since_03  Minutes_Since_New_Year          DOW  Street_Corner  
count       4997.000000             4997.000000  4997.000000    4997.000000  
mean     3263014.941165           260852.971983     2.979988       0.292175  
std      1914601.777390           151348.441090     1.991274       0.454808  
min         

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from patsy import dmatrices
# Newer scikit-learn releases provide cross_val_score in
# sklearn.model_selection and no longer ship sklearn.cross_validation.
# Prefer the new module, fall back to the old one, and keep the
# `cross_validation` name bound so code that uses it keeps working.
try:
    from sklearn import model_selection as cross_validation
except ImportError:
    from sklearn import cross_validation

# Build the design matrix for predicting Category from location and hour.
# Only x_train is used below; the raw Category column is passed as the target.
formula_ml = 'Category~X+Y+Hour'
y_train, x_train = dmatrices(formula_ml, data=train_df, return_type='dataframe')

# NOTE(review): no random_state is set, so scores vary from run to run.
alg = RandomForestClassifier()

# 3-fold cross-validation: formula features vs. coordinates only.
scores1 = cross_validation.cross_val_score(alg, x_train, train_df['Category'], cv=3)
scores2 = cross_validation.cross_val_score(alg, train_df[['X', 'Y']], train_df['Category'], cv=3)

# Call form of print: identical output on Python 2, valid syntax on Python 3.
print('Score: ')
print(str(scores1.mean()))
print(str(scores2.mean()))