In [1]:
%matplotlib inline
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import sklearn
import scipy.stats as st
import random
from datetime import datetime
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegressionCV
from patsy import dmatrix, dmatrices
from sklearn import cross_validation
from sklearn.cross_validation import train_test_split, cross_val_score, cross_val_predict
from sklearn.metrics import log_loss
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.grid_search import GridSearchCV
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.base import BaseEstimator
from sklearn.cross_validation import StratifiedKFold

In [2]:
# NOTE: '__file__' is passed as a string literal, so dirname() returns '' and
# every path below is resolved relative to the kernel's working directory.
cur_dir = os.path.dirname('__file__')
data_dir = os.path.join(cur_dir, "data")

train = pd.read_csv(os.path.join(data_dir, "train.csv"))
test = pd.read_csv(os.path.join(data_dir, "test.csv"))
mapdata = np.loadtxt(os.path.join(data_dir, "sf_map.txt"))

In [3]:
def get_random_subset(df, n=None):
    """Return up to ``n`` randomly chosen rows of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to sample from.
    n : int or None
        Number of rows wanted. ``None`` (the default) returns ``df`` itself,
        not a copy. Values larger than ``len(df)`` are capped, in which case
        every row is returned in shuffled order.

    Returns
    -------
    pandas.DataFrame
        The selected rows, in the random order they were drawn.

    Notes
    -----
    Sampling uses the standard-library ``random`` module, so call
    ``random.seed(...)`` beforehand for reproducible subsets.
    """
    if n is None:
        return df
    # range() instead of xrange(): xrange only exists on Python 2, while
    # range() is accepted by random.sample on both Python 2 and 3.
    positions = random.sample(range(len(df)), min(n, len(df)))
    return df.iloc[positions]

def preprocess(df):
    """Return a filtered copy of ``df`` with derived time and address features.

    The input frame is left untouched. Rows whose ``X`` equals the column
    maximum are removed, then these columns are added:

    * ``Hour`` / ``Month`` -- hour and month of the parsed ``Dates`` value.
    * ``Hour_Minutes`` -- fractional hour of day (``hour + minute / 60``).
    * ``Minutes_Since_03`` -- minutes elapsed since 2003-01-01 00:00.
    * ``Minutes_Since_New_Year`` -- minutes elapsed since 1 January of the
      row's own year.
    * ``DOW`` -- position of ``DayOfWeek`` in the module-level ``dow`` list.
    * ``Street_Corner`` -- 1 when ``Address`` contains ``'/'``, otherwise 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``X``, ``Dates``, ``DayOfWeek`` and
        ``Address``.

    Returns
    -------
    pandas.DataFrame
        New frame holding the surviving rows plus the derived columns.

    Raises
    ------
    ValueError
        If a ``Dates`` string does not match the format used by
        ``get_datetime`` or a ``DayOfWeek`` value is not present in ``dow``.
    """
    # NOTE(review): presumably this strips a bad-coordinate outlier, but it
    # removes every row sitting at max(X) even when the data is clean -- confirm.
    # Copying after the filter gives an independent frame, so the column
    # assignments below cannot trigger SettingWithCopyWarning.
    res = df[df.X != df.X.max()].copy()

    datetimes = res.Dates.apply(get_datetime)
    res['Hour'] = datetimes.apply(lambda dt: dt.hour)
    res['Month'] = datetimes.apply(lambda dt: dt.month)
    res['Hour_Minutes'] = datetimes.apply(lambda dt: dt.hour + dt.minute / 60.0)
    res['Minutes_Since_03'] = datetimes.apply(lambda dt: (dt-datetime(2003, 1, 1)).total_seconds() / 60)
    res['Minutes_Since_New_Year'] = datetimes.apply(lambda dt: (dt-datetime(dt.year, 1, 1)).total_seconds() / 60)
    # Use the frame being processed, not the global `train`: the old code
    # aligned train's weekdays onto `res` by index, which is wrong for any
    # frame other than (a subset of) train, e.g. the test set.
    res['DOW'] = res.DayOfWeek.apply(lambda x: dow.index(x))
    res['Street_Corner'] = res['Address'].apply(lambda x: 1 if '/' in x else 0)
    return res

def get_datetime(s):
    """Parse a ``'YYYY-MM-DD HH:MM:SS'`` string into a ``datetime``.

    Raises ``ValueError`` (from ``strptime``) when ``s`` does not match.
    """
    return datetime.strptime(s, "%Y-%m-%d %H:%M:%S")

# Weekday names in calendar order; preprocess() uses each name's position in
# this list as its integer DOW code (so 'Monday' maps to 0).
dow = [
    'Monday',
    'Tuesday',
    'Wednesday',
    'Thursday',
    'Friday',
    'Saturday',
    'Sunday',
]

def isNight(hour):
    """Label an hour of the day as ``"Night"`` or ``"Day"``.

    The whole hours 19-23 and 0-6 are night. Every other value -- including
    anything that is not equal to one of those integers -- is day.
    """
    night_hours = (0, 1, 2, 3, 4, 5, 6, 19, 20, 21, 22, 23)
    return "Night" if hour in night_hours else "Day"

In [22]:
# With no sample size, get_random_subset hands back `train` unchanged, so the
# whole training set is preprocessed; pass an integer n to work on a sample.
train_df = preprocess(get_random_subset(train, n=None))

In [5]:
def train_models(models, x_train, y_train):
    """Fit each estimator in ``models`` in place on the same training data.

    Nothing is returned; the caller keeps using the (now fitted) objects it
    passed in.
    """
    for estimator in models:
        estimator.fit(x_train, y_train)
        
def get_x_y_matrices(df, formula, target):
    """Build the feature matrix and target column for ``df``.

    ``formula`` is handed to patsy's ``dmatrix`` to produce a DataFrame of
    features, ``target`` names the label column of ``df``, and the pair is
    passed through ``filter_infrequent`` (default threshold) before being
    returned as ``(x, y)``.
    """
    features = dmatrix(formula, data=df, return_type='dataframe')
    labels = df[target]
    return filter_infrequent(features, labels)
    
def filter_infrequent(x, y, threshold=3):
    """Drop the samples whose label occurs fewer than ``threshold`` times.

    Parameters
    ----------
    x : pandas.DataFrame
        Feature rows, indexed like ``y``.
    y : pandas.Series
        Label of each row.
    threshold : int
        Minimum number of occurrences a label needs in ``y`` to be kept.

    Returns
    -------
    (x, y) : tuple
        The inputs restricted to rows whose label is frequent enough. Rows
        with a missing label are kept, because ``value_counts`` never reports
        NaN as a rare category.
    """
    counts = y.value_counts()
    # Honour the `threshold` argument (it used to be ignored in favour of a
    # hard-coded 3) and build one boolean mask instead of re-filtering both
    # objects once per rare category.
    rare_labels = counts.index[counts < threshold]
    keep = ~y.isin(rare_labels)
    return x[keep], y[keep]

In [23]:
class Weighted_Averager(BaseEstimator):
    """Blend the probability outputs of several classifiers with fixed weights.

    Parameters
    ----------
    models : list
        Estimators exposing ``fit`` and ``predict_proba``.
    weights : list of float
        One weight per model, in the same order as ``models``.

    Attributes
    ----------
    probas : list or None
        Per-model cross-validated predictions stored by ``predict_probas_cv``;
        ``None`` until that method has been called.
    """

    def __init__(self, models, weights):
        self.models = models
        self.weights = weights
        # None marks "not computed yet". The previous placeholder
        # ([len(models)]) was silently blended as if it were a prediction.
        self.probas = None

    @staticmethod
    def _weighted_sum(arrays, weights):
        """Return sum(w * a) over the paired inputs, or None if there are no pairs."""
        total = None
        for arr, w in zip(arrays, weights):
            term = w * arr
            total = term if total is None else total + term
        return total

    def fit(self, x, y):
        """Fit every underlying model on ``(x, y)`` and return ``self``."""
        for m in self.models:
            m.fit(x, y)
        # Returning self follows the scikit-learn estimator convention and
        # allows call chaining; callers that ignored the old None are unaffected.
        return self

    def predict_probas_cv(self, x, y):
        """Store out-of-fold predictions of every model in ``self.probas``.

        ``cross_val_predict`` is called with its defaults. For the stored
        values to be probabilities, each model's ``predict`` has to return
        them (as the ``Proba*`` wrapper classes in this notebook do).
        """
        self.probas = [cross_val_predict(m, x, y) for m in self.models]

    def calc_log_loss(self, y, weights=None):
        """Return the log loss of the weighted blend of the stored predictions.

        Parameters
        ----------
        y : array-like
            True labels for the rows passed to ``predict_probas_cv``.
        weights : list of float, optional
            Weights to try instead of ``self.weights``.

        Raises
        ------
        ValueError
            If ``predict_probas_cv`` has not been called yet.
        """
        if self.probas is None:
            raise ValueError("call predict_probas_cv(x, y) before calc_log_loss")
        if weights is None:
            weights = self.weights
        return log_loss(y, self._weighted_sum(self.probas, weights))

    def predict_proba(self, x):
        """Return the weighted sum of each model's ``predict_proba(x)``.

        The result is ``None`` when there are no models or no weights.
        """
        per_model = (m.predict_proba(x) for m in self.models)
        return self._weighted_sum(per_model, self.weights)
    
class ProbaRandomForestClassifier(RandomForestClassifier):
    """Random forest whose ``predict`` returns class probabilities, not labels."""

    def predict(self, X):
        """Return exactly what ``predict_proba(X)`` returns."""
        probabilities = self.predict_proba(X)
        return probabilities
        
class ProbaLogisticRegression(LogisticRegression):
    """Logistic regression whose ``predict`` returns class probabilities, not labels."""

    def predict(self, X):
        """Return exactly what ``predict_proba(X)`` returns."""
        probabilities = self.predict_proba(X)
        return probabilities

In [24]:
# patsy formula for the feature matrix: DayOfWeek and PdDistrict enter as
# categoricals (C(...)), the remaining columns as plain numeric terms.
formula_ml = 'C(DayOfWeek) + C(PdDistrict) + Street_Corner + X+Y+Hour+Month'

# Base classifiers; both wrappers return probabilities from predict().
# NOTE(review): no random_state is passed, so the forest differs between runs.
clf_logistic = ProbaLogisticRegression()
clf_forest = ProbaRandomForestClassifier(min_samples_leaf=300)

x_vals, y_vals = get_x_y_matrices(train_df, formula_ml, target='Category')

In [None]:
# The cross-validated predictions are computed once per model; only the blend
# weights change during the sweep below.
w_a = Weighted_Averager([clf_forest, clf_logistic], [0.5, 0.5])
w_a.predict_probas_cv(x_vals, y_vals)

# Give the forest a share of 0.1 ... 0.9 and the logistic model the remainder.
weights = np.linspace(0.1, 0.9, 9)
scores = [w_a.calc_log_loss(y_vals, weights=[w, 1-w]) for w in weights]

plt.plot(weights, scores)