In [7]:
%matplotlib inline
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import sklearn
import scipy.stats as st
import random
from datetime import datetime
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegressionCV
from patsy import dmatrix, dmatrices
from sklearn import cross_validation
from sklearn.cross_validation import train_test_split
from sklearn.metrics import log_loss
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.grid_search import GridSearchCV
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression

In [8]:
# '__file__' is a string literal here, not the module attribute, so
# os.path.dirname returns '' and every path below resolves relative to the
# current working directory (presumably a workaround for __file__ not being
# defined inside a notebook kernel).
cur_dir = os.path.dirname('__file__')
data_dir = os.path.join(cur_dir, "data")

# Read the two CSV tables and the whitespace-delimited map array from data/.
train = pd.read_csv(os.path.join(data_dir, "train.csv"))
test = pd.read_csv(os.path.join(data_dir, "test.csv"))
mapdata = np.loadtxt(os.path.join(data_dir, "sf_map.txt"))

In [9]:
def get_random_subset(df, n=5000, seed=None):
    """Return up to ``n`` rows of ``df`` drawn at random without replacement.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to sample rows from.
    n : int, optional
        Maximum number of rows to return; capped at ``len(df)``.
    seed : hashable, optional
        When given, a private ``random.Random(seed)`` generator is used so
        the same rows are returned on every call. When ``None`` (the
        default) the module-level ``random`` generator is used, exactly as
        before this parameter existed.

    Returns
    -------
    pandas.DataFrame
        The selected rows in the randomly drawn order, keeping their
        original index labels.
    """
    rng = random if seed is None else random.Random(seed)
    # ``range`` instead of the Python-2-only ``xrange`` so the helper runs on
    # both Python 2 and Python 3; ``sample`` only needs len() and indexing.
    positions = rng.sample(range(len(df)), min(n, len(df)))
    return df.iloc[positions]

def preprocess(df):
    """Return a filtered copy of ``df`` with engineered feature columns.

    ``df`` itself is not modified. It must provide the columns ``X``,
    ``Dates`` (strings accepted by ``get_datetime``), ``DayOfWeek`` (day
    names listed in ``dow``) and ``Address`` (strings).

    Columns added to the result:

    * ``Hour`` / ``Month`` -- hour and month of ``Dates``.
    * ``Hour_Minutes`` -- hour plus the minutes as a fraction of an hour.
    * ``Minutes_Since_03`` -- minutes elapsed since 2003-01-01 00:00.
    * ``Minutes_Since_New_Year`` -- minutes elapsed since 1 January of the
      row's own year.
    * ``DOW`` -- position of ``DayOfWeek`` in ``dow`` (Monday == 0).
    * ``Street_Corner`` -- 1 when ``Address`` contains ``'/'``, else 0.

    Raises
    ------
    ValueError
        If a ``Dates`` value does not parse or a ``DayOfWeek`` value is not
        in ``dow``.
    """
    # Drop every row whose X equals the column maximum, then copy so the
    # assignments below write to an independent frame rather than a slice.
    # NOTE(review): presumably this discards outlier coordinates; on a
    # subset that contains no such outliers it removes valid rows -- confirm.
    res = df[df.X != df.X.max()].copy()

    datetimes = res.Dates.apply(get_datetime)
    res['Hour'] = datetimes.apply(lambda dt: dt.hour)
    res['Month'] = datetimes.apply(lambda dt: dt.month)
    res['Hour_Minutes'] = datetimes.apply(lambda dt: dt.hour + dt.minute / 60.0)
    res['Minutes_Since_03'] = datetimes.apply(
        lambda dt: (dt - datetime(2003, 1, 1)).total_seconds() / 60)
    res['Minutes_Since_New_Year'] = datetimes.apply(
        lambda dt: (dt - datetime(dt.year, 1, 1)).total_seconds() / 60)

    # Derive the weekday index from the frame being processed. The previous
    # code read the global ``train`` frame, so any other input (e.g. the
    # test set) received training weekdays matched by index label.
    res['DOW'] = res.DayOfWeek.apply(dow.index)
    res['Street_Corner'] = res['Address'].apply(lambda x: 1 if '/' in x else 0)
    return res

def get_datetime(s):
    """Parse a ``YYYY-MM-DD HH:MM:SS`` string into a ``datetime`` object."""
    return datetime.strptime(s, "%Y-%m-%d %H:%M:%S")

# Day names in calendar order; a name's position in this list (Monday == 0)
# is the value preprocess() stores in the DOW column.
dow = [
    'Monday',
    'Tuesday',
    'Wednesday',
    'Thursday',
    'Friday',
    'Saturday',
    'Sunday',
]

def isNight(hour):
    """Label an hour of the day as ``"Night"`` or ``"Day"``.

    Hours 0-6 and 19-23 are ``"Night"``; any other value, including values
    outside 0-23, is ``"Day"``.
    """
    night_hours = list(range(0, 7)) + list(range(19, 24))
    return "Night" if hour in night_hours else "Day"

In [22]:
# Work on a random sample of at most 50,000 training rows; to preprocess
# the full training set instead, use: train_df = preprocess(train)
train_sample = get_random_subset(train, 50000)
train_df = preprocess(train_sample)

In [None]:
# Grid-search the random forest's minimum leaf size, scored by log loss.
formula_ml = 'C(DayOfWeek) + C(PdDistrict) + Street_Corner + X+Y+Hour+Month'

# Design matrix built by patsy from the formula; the target is Category.
x_vals = dmatrix(formula_ml, data=train_df, return_type='dataframe')
y_vals = train_df.Category

# Six log-spaced leaf sizes from 10 to ~3162, converted to plain ints:
# rounding alone leaves floats, and scikit-learn interprets a float
# min_samples_leaf as a fraction of the samples rather than a row count.
min_leaves = np.round(np.logspace(1, 3.5, num=6)).astype(int).tolist()

parameters = {'min_samples_leaf': min_leaves}

# NOTE(review): unlike the logistic-regression search in a later cell,
# categories with fewer than 3 rows are not dropped here -- confirm whether
# the cross-validated log-loss scoring needs the same filter.
clf = GridSearchCV(RandomForestClassifier(), parameters, scoring='log_loss')
clf.fit(x_vals, y_vals)

# One line per candidate: mean score, std and the parameters tried.
for f in clf.grid_scores_:
    print(f)

In [23]:
# Grid-search the logistic-regression regularisation strength C, scored by
# log loss.
formula_ml = 'C(DayOfWeek) + C(PdDistrict) + Street_Corner + X+Y+Hour+Month'

# Design matrix built by patsy from the formula; the target is Category.
x_vals = dmatrix(formula_ml, data=train_df, return_type='dataframe')
y_vals = train_df.Category

# Remove every category that has fewer than 3 rows, using one boolean mask
# instead of re-filtering both objects once per rare category.
# NOTE(review): the threshold presumably matches the number of CV folds
# used by GridSearchCV -- confirm if the fold count is ever changed.
vals = y_vals.value_counts()
rare_categories = vals[vals < 3].index
keep = ~y_vals.isin(rare_categories)
x_vals = x_vals[keep]
y_vals = y_vals[keep]

# Nine candidate values for C, one per decade from 1e-4 to 1e4.
Cs = np.logspace(-4, 4, 9)

parameters = {'C': Cs}

clf = GridSearchCV(LogisticRegression(), parameters, scoring='log_loss')
clf.fit(x_vals, y_vals)

# One line per candidate: mean score, std and the parameters tried.
for f in clf.grid_scores_:
    print(f)

mean: -2.66714, std: 0.00102, params: {'C': 0.0001}
mean: -2.63735, std: 0.00096, params: {'C': 0.001}
mean: -2.58922, std: 0.00141, params: {'C': 0.01}
mean: -2.56882, std: 0.00159, params: {'C': 0.10000000000000001}
mean: -2.56632, std: 0.00113, params: {'C': 1.0}
mean: -2.56766, std: 0.00060, params: {'C': 10.0}
mean: -2.56952, std: 0.00059, params: {'C': 100.0}
mean: -2.57143, std: 0.00107, params: {'C': 1000.0}
mean: -2.57284, std: 0.00151, params: {'C': 10000.0}


In [None]:
# NOTE(review): `lg` is not assigned in any cell visible here, so this cell
# raises NameError on a fresh kernel. Presumably it refers to a fitted
# LogisticRegressionCV (imported above), whose `C_` attribute is iterated
# below -- confirm and add the cell that creates it.
for c_value in lg.C_:
    print(c_value)