In [1]:
%matplotlib inline
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import sklearn
import scipy.stats as st
import random
from datetime import datetime

In [2]:
# NOTE: '__file__' is a string literal here (not the module attribute), so
# dirname() returns '' and every path below resolves relative to the current
# working directory.
cur_dir = os.path.dirname('__file__')
data_dir = os.path.join(cur_dir, "data")

# Raw inputs: the two CSV tables as DataFrames, the map file as a numeric array.
train = pd.read_csv(os.path.join(data_dir, "train.csv"))
test = pd.read_csv(os.path.join(data_dir, "test.csv"))
mapdata = np.loadtxt(os.path.join(data_dir, "sf_map.txt"))

In [3]:
def get_random_subset(df, n=5000, seed=None):
    """Return up to ``n`` rows of ``df`` drawn at random without replacement.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to sample from.
    n : int, optional
        Maximum number of rows to return. When ``df`` has fewer than ``n``
        rows, every row is returned (in random order).
    seed : hashable, optional
        When given, the draw uses a private ``random.Random(seed)`` so the
        same seed always selects the same rows. When ``None`` (default) the
        module-level ``random`` generator is used, as before.

    Returns
    -------
    pandas.DataFrame
        The selected rows, in the order they were drawn.

    Raises
    ------
    ValueError
        If ``n`` is negative (raised by ``random.sample``).
    """
    # A private generator keeps a seeded draw from disturbing global random state.
    rng = random if seed is None else random.Random(seed)
    # `range` works on both Python 2 and Python 3; `xrange` exists only on Python 2.
    sub = rng.sample(range(len(df)), min(n, len(df)))
    return df.iloc[sub]

# Day names in calendar order; a name's position in this list is its DOW code
# (Monday -> 0 ... Sunday -> 6).
dow = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']


def get_datetime(s):
    """Parse a ``'YYYY-MM-DD HH:MM:SS'`` string into a ``datetime``.

    Raises ``ValueError`` (from ``strptime``) when ``s`` does not match.
    """
    dt = datetime.strptime(s, "%Y-%m-%d %H:%M:%S")
    return dt


def preprocess(df):
    """Return a filtered copy of ``df`` with derived time and address features.

    The input frame is not modified. ``df`` must provide the columns ``X``,
    ``Dates`` (strings accepted by ``get_datetime``), ``DayOfWeek`` (values
    from ``dow``) and ``Address`` (strings).

    Added columns:
      * ``Hour`` -- hour of day.
      * ``Month`` -- month number.
      * ``Hour_Minutes`` -- hour of day with the minutes as a fraction.
      * ``Minutes_Since_03`` -- minutes elapsed since 2003-01-01 00:00.
      * ``Minutes_Since_New_Year`` -- minutes elapsed since 1 January of the
        row's own year.
      * ``DOW`` -- position of ``DayOfWeek`` in ``dow``.
      * ``Street_Corner`` -- 1 when ``Address`` contains ``'/'``, else 0.

    Raises ``ValueError`` when a ``Dates`` value cannot be parsed or a
    ``DayOfWeek`` value is not in ``dow``.
    """
    # Drop every row whose X equals the frame's maximum X, then copy once so
    # the column assignments below write to an independent frame.
    # NOTE(review): presumably meant to strip out-of-range coordinates; on a
    # frame that has none it removes a legitimate row -- confirm the intent.
    res = df[df.X != df.X.max()].copy()

    datetimes = res.Dates.apply(get_datetime)
    res['Hour'] = datetimes.apply(lambda dt: dt.hour)
    res['Month'] = datetimes.apply(lambda dt: dt.month)
    res['Hour_Minutes'] = datetimes.apply(lambda dt: dt.hour + dt.minute / 60.0)
    res['Minutes_Since_03'] = datetimes.apply(
        lambda dt: (dt - datetime(2003, 1, 1)).total_seconds() / 60)
    res['Minutes_Since_New_Year'] = datetimes.apply(
        lambda dt: (dt - datetime(dt.year, 1, 1)).total_seconds() / 60)
    # Encode the day name of the frame being processed (not the global
    # `train` frame), so the function is correct for any input frame.
    res['DOW'] = res.DayOfWeek.apply(lambda x: dow.index(x))
    res['Street_Corner'] = res['Address'].apply(lambda x: 1 if '/' in x else 0)
    return res

def isNight(hour):
    """Label an hour of day as ``"Night"`` or ``"Day"``.

    Hours 0-6 and 19-23 are ``"Night"``; any other value is ``"Day"``.
    """
    night_hours = (0, 1, 2, 3, 4, 5, 6, 19, 20, 21, 22, 23)
    return "Night" if hour in night_hours else "Day"

In [4]:
# Draw a random sample of the training data (default sample size), then
# derive the feature columns on that sample only.
train_sample = get_random_subset(train)
train_df = preprocess(train_sample)

In [5]:
# Summary statistics of the numeric columns of the preprocessed sample.
print(train_df.describe())

                 X            Y         Hour        Month  Hour_Minutes  \
count  4999.000000  4999.000000  4999.000000  4999.000000   4999.000000   
mean   -122.423268    37.767410    13.395279     6.425485     13.727372   
std       0.025634     0.024204     6.575643     3.406617      6.589602   
min    -122.513642    37.707922     0.000000     1.000000      0.016667   
25%    -122.433911    37.753007     9.000000     4.000000      9.500000   
50%    -122.416491    37.775421    14.000000     6.000000     14.500000   
75%    -122.407246    37.784422    19.000000     9.000000     19.000000   
max    -122.365565    37.819975    23.000000    12.000000     23.983333   

       Minutes_Since_03  Minutes_Since_New_Year          DOW  Street_Corner  
count       4999.000000             4999.000000  4999.000000    4999.000000  
mean     3258941.313863           258826.186837     3.013203       0.298060  
std      1918458.993609           150295.264468     1.953600       0.457452  
min         

In [6]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from patsy import dmatrix
from sklearn import cross_validation
from sklearn.cross_validation import train_test_split

# 60/40 split of the preprocessed sample (not used by the scoring below).
training, validation = train_test_split(train_df, train_size=.60)

# Design matrix built from the coordinate columns via a patsy formula.
formula_ml = 'X+Y'
x_train = dmatrix(formula_ml, data=train_df, return_type='dataframe')
y_train = train_df['Category']

alg = RandomForestClassifier()

# 3-fold cross-validated scores: once on the patsy design matrix and once on
# the raw coordinate columns. These must be computed in this cell -- the
# prints below previously referenced names that no live code defined.
scores1 = cross_validation.cross_val_score(alg, x_train, y_train, cv=3)
scores2 = cross_validation.cross_val_score(alg, train_df[['X', 'Y']], y_train, cv=3)

# fit() needs the features and the target; the bare `alg.fit()` raised a
# TypeError. cross_val_score works on clones, so fit `alg` itself here.
alg.fit(x_train, y_train)

print('Score: ')
print(str(scores1.mean()))
print(str(scores2.mean()))

Score: 
0.15925902435
0.161399772597
