In [1]:
%matplotlib inline
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import sklearn
import scipy.stats as st
import random
from datetime import datetime
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from patsy import dmatrix, dmatrices
from sklearn import cross_validation
from sklearn.cross_validation import train_test_split
from sklearn.metrics import log_loss
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.grid_search import GridSearchCV
from sklearn.naive_bayes import BernoulliNB

In [2]:
# os.path.dirname is applied to the literal string '__file__' (not a real
# __file__ variable), which evaluates to ''. Every path below is therefore
# relative to the kernel's current working directory.
cur_dir = os.path.dirname('__file__')
data_dir = os.path.join(cur_dir, "data")

# Competition CSVs loaded as DataFrames.
train = pd.read_csv(os.path.join(data_dir, "train.csv"))
test = pd.read_csv(os.path.join(data_dir, "test.csv"))

# Numeric text file parsed with np.loadtxt's default settings.
mapdata = np.loadtxt(os.path.join(data_dir, "sf_map.txt"))

In [3]:
def get_random_subset(df, n=5000):
    """Return up to ``n`` rows of ``df`` drawn at random without replacement.

    Row positions are drawn with ``random.sample``, so the rows come back in
    random order and the draw is controlled by ``random.seed``. When ``n`` is
    at least ``len(df)`` every row is returned, shuffled.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to sample from.
    n : int, optional
        Maximum number of rows to return (default 5000).

    Returns
    -------
    pandas.DataFrame
        The selected rows, keeping their original index labels.
    """
    # range (rather than the Python-2-only xrange) keeps this working on
    # Python 3 as well; random.sample accepts either.
    positions = random.sample(range(len(df)), min(n, len(df)))
    return df.iloc[positions]

def preprocess(df):
    """Return a filtered copy of ``df`` with engineered feature columns added.

    The input frame is not modified. Rows whose ``X`` equals the column
    maximum are dropped, then these columns are added:

    * ``Hour``, ``Month`` -- taken from the parsed ``Dates`` timestamp.
    * ``Hour_Minutes`` -- hour of day as a float (hour + minute / 60).
    * ``Minutes_Since_03`` -- minutes elapsed since 2003-01-01 00:00.
    * ``Minutes_Since_New_Year`` -- minutes elapsed since Jan 1 of the row's year.
    * ``DOW`` -- position of ``DayOfWeek`` in the module-level ``dow`` list.
    * ``Street_Corner`` -- 1 if ``Address`` contains '/', else 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``X``, ``Dates``, ``DayOfWeek`` and ``Address`` columns.

    Returns
    -------
    pandas.DataFrame
        New frame with the rows kept and the columns above appended.
    """
    # NOTE(review): dropping the max-X rows presumably removes placeholder
    # coordinates; be aware it also drops the genuine maximum when the frame
    # contains no such outliers -- confirm against the data.
    # The explicit .copy() makes `res` an independent frame so the column
    # assignments below do not write into a slice of `df`.
    res = df[df.X != df.X.max()].copy()

    datetimes = res.Dates.apply(get_datetime)
    res['Hour'] = datetimes.apply(lambda dt: dt.hour)
    res['Month'] = datetimes.apply(lambda dt: dt.month)
    res['Hour_Minutes'] = datetimes.apply(lambda dt: dt.hour + dt.minute / 60.0)
    res['Minutes_Since_03'] = datetimes.apply(lambda dt: (dt-datetime(2003, 1, 1)).total_seconds() / 60)
    res['Minutes_Since_New_Year'] = datetimes.apply(lambda dt: (dt-datetime(dt.year, 1, 1)).total_seconds() / 60)

    # Encode the weekday from the frame being processed. Reading it from the
    # global `train` only worked for frames index-aligned with `train`.
    res['DOW'] = res.DayOfWeek.apply(dow.index)
    res['Street_Corner'] = res['Address'].apply(lambda x: 1 if '/' in x else 0)
    return res

def get_datetime(s):
    """Parse a ``YYYY-MM-DD HH:MM:SS`` string into a ``datetime`` object.

    Raises ``ValueError`` (from ``strptime``) if ``s`` does not match.
    """
    return datetime.strptime(s, "%Y-%m-%d %H:%M:%S")

# Weekday names in calendar order. A name's position in this list
# (Monday = 0 ... Sunday = 6) is the integer preprocess() stores in 'DOW'.
dow = [
    'Monday', 'Tuesday', 'Wednesday', 'Thursday',
    'Friday', 'Saturday', 'Sunday',
]

def isNight(hour):
    """Classify an hour of the day as ``"Night"`` or ``"Day"``.

    Hours 0-6 and 19-23 are night; every other value yields ``"Day"``.
    """
    night_hours = (0, 1, 2, 3, 4, 5, 6, 19, 20, 21, 22, 23)
    return "Night" if hour in night_hours else "Day"

In [4]:
# Sampling len(train) rows out of len(train) picks every row exactly once in
# random order, so this shuffles the full training set before preprocessing.
n_rows = len(train)
train_df = preprocess(get_random_subset(train, n=n_rows))

In [None]:
# Summary statistics of the preprocessed training frame. Written as a call
# with parentheses so the cell runs on both Python 2 and Python 3.
print(train_df.describe())

In [None]:
# Sweep forest size and minimum leaf size, scoring each combination by
# log loss on a 40% hold-out split of the preprocessed training data.
training, validation = train_test_split(train_df, train_size=.60)

formula_ml = 'C(DayOfWeek) + C(PdDistrict) + Street_Corner + X+Y+Hour+Month'
x_train = dmatrix(formula_ml, data=training, return_type='dataframe')
# Defined here so the cell runs on a fresh kernel instead of relying on a
# leftover `y_train` from an earlier session.
y_train = training.Category

x_validation = dmatrix(formula_ml, data=validation, return_type='dataframe')

# Keep only validation rows whose category also occurs in the training split:
# the fitted model has no probability column for classes it never saw.
seen_in_training = validation.Category.isin(y_train.values)
x_validation = x_validation[seen_in_training]
y_validation_labels = validation.Category[seen_in_training]

num_trees = [5, 10, 50, 250]
min_leaves = [10, 50, 500, 2500, 10000, 50000]

for trees in num_trees:
    scores = []
    for l in min_leaves:
        # Pass the tree count through; without n_estimators every curve
        # trained the same default-sized forest regardless of its label.
        alg = RandomForestClassifier(n_estimators=trees, min_samples_leaf=l)
        alg.fit(x_train, y_train)

        # One-hot encode the true labels with columns in alg.classes_ order
        # so they line up with the columns of predict_proba.
        mlb = MultiLabelBinarizer(classes=alg.classes_)
        y_validation = mlb.fit_transform(np.array([y_validation_labels]).T)

        predictions = np.array(alg.predict_proba(x_validation))
        scores.append(log_loss(y_validation, predictions))
    plt.plot(min_leaves, scores, label=(str(trees) + " trees"))

plt.xlabel("min_samples_leaf")
plt.ylabel("validation log loss")
plt.legend()
plt.gca().set_xscale('log')

In [None]:
# Blend random-forest and naive-Bayes class probabilities and plot validation
# log loss as a function of the weight given to the forest.
training, validation = train_test_split(train_df, train_size=.60)

formula_ml = 'C(DayOfWeek) + C(PdDistrict) + Street_Corner + X+Y+Hour+Month'
x_train = dmatrix(formula_ml, data=training, return_type='dataframe')
# Defined here so the cell runs on a fresh kernel instead of relying on a
# leftover `y_train` from an earlier session.
y_train = training.Category

x_validation = dmatrix(formula_ml, data=validation, return_type='dataframe')

# Keep only validation rows whose category also occurs in the training split:
# the fitted models have no probability column for classes they never saw.
seen_in_training = validation.Category.isin(y_train.values)
x_validation = x_validation[seen_in_training]

# Fit both models once. Neither depends on the blend weight, and refitting
# the (randomised) forest per weight would score every weight against a
# different model, mixing refit noise into the curve.
alg1 = RandomForestClassifier(min_samples_leaf=1000)
alg2 = BernoulliNB()
alg1.fit(x_train, y_train)
alg2.fit(x_train, y_train)

# One-hot encode the true labels with columns in alg1.classes_ order so they
# line up with the columns of predict_proba.
mlb = MultiLabelBinarizer(classes=alg1.classes_)
y_validation = mlb.fit_transform(np.array([validation.Category[seen_in_training]]).T)

predictions1 = np.array(alg1.predict_proba(x_validation))
predictions2 = np.array(alg2.predict_proba(x_validation))

weights = np.linspace(0.7, 1, 10)
scores = []

for w in weights:
    # Convex combination: w on the forest, (1 - w) on naive Bayes.
    predictions = (w * predictions1 + (1-w) * predictions2)
    score = log_loss(y_validation, predictions)
    scores.append(score)

plt.plot(weights, scores)
plt.xlabel("Percentage forest")
plt.ylabel("score")

In [5]:
# Cross-validated search over the forest's minimum leaf size, using the whole
# preprocessed training frame.
formula_ml = 'C(DayOfWeek) + C(PdDistrict) + Street_Corner + X+Y+Hour+Month'

x_vals = dmatrix(formula_ml, data=train_df, return_type='dataframe')
y_vals = train_df.Category

min_leaves = [10, 50, 500, 2500, 10000, 50000]
parameters = {'min_samples_leaf': min_leaves}

clf = GridSearchCV(RandomForestClassifier(), parameters, scoring='log_loss')
clf.fit(x_vals, y_vals)

# The recorded output shows negative means, i.e. the scorer reports negated
# log loss, so values closer to zero are better.
# Written as a call with parentheses so the cell runs on Python 2 and 3.
print(clf.grid_scores_)

[mean: -2.95541, std: 0.03623, params: {'min_samples_leaf': 10}, mean: -2.48741, std: 0.00330, params: {'min_samples_leaf': 50}, mean: -2.46742, std: 0.00162, params: {'min_samples_leaf': 500}, mean: -2.50952, std: 0.00074, params: {'min_samples_leaf': 2500}, mean: -2.55335, std: 0.00147, params: {'min_samples_leaf': 10000}, mean: -2.62467, std: 0.00877, params: {'min_samples_leaf': 50000}]
