In [53]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import StratifiedKFold, KFold
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression, LinearRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from xgboost.sklearn import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from joblib import Memory

from ml_metrics import quadratic_weighted_kappa

def eval_wrapper(yhat, y):
    """Score predictions against true labels with quadratic weighted kappa.

    Parameters
    ----------
    yhat : array-like
        Predicted values; they are rounded to the nearest integer and
        clipped to the range spanned by `y` before scoring.
    y : array-like
        True labels; cast to int.

    Returns
    -------
    The value returned by `quadratic_weighted_kappa(predictions, labels)`.
    """
    labels = np.array(y).astype(int)
    lowest, highest = np.min(labels), np.max(labels)
    # Round first, then clip, so out-of-range predictions land on the
    # nearest label value that actually occurs in `y`.
    rounded = np.round(np.array(yhat))
    predictions = np.clip(rounded, lowest, highest).astype(int)
    return quadratic_weighted_kappa(predictions, labels)

In [2]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Fit ONE encoder on the categories of both frames so that a given
# Product_Info_2 value maps to the same integer in train and in test.
# Re-fitting the encoder on the test frame assigns codes from the test
# frame's own category set, which silently disagrees with the training
# codes whenever the two files do not contain exactly the same categories.
encoder = LabelEncoder()
encoder.fit(pd.concat([train['Product_Info_2'], test['Product_Info_2']]))
train['Product_Info_2'] = encoder.transform(train['Product_Info_2'])
test['Product_Info_2'] = encoder.transform(test['Product_Info_2'])

# Impute missing values with each frame's own per-column median.
train.fillna(train.median(), inplace=True)
test.fillna(test.median(), inplace=True);

In [3]:
# Features are the test frame's columns minus the first one (presumably the
# row id — confirm), selected from both frames so they line up column for
# column; the target is the training frame's Response column.
y = train['Response']
X_actual_test = np.array(test[test.columns[1:]])
X = np.array(train[test.columns[1:]])

In [4]:
# Materialise the six stratified (train_idx, test_idx) splits once, as a list,
# so every function below iterates over exactly the same folds.
# NOTE(review): shuffle is not enabled here, so random_state presumably has no
# effect on the splits — confirm against the StratifiedKFold documentation.
train_test_folds = list(StratifiedKFold(y, n_folds=6, random_state=0))

In [49]:
# One joblib on-disk cache per kind of expensive result. A generator (not a
# list comprehension) is used so no loop variable leaks into the namespace.
train_cache, test_cache, train_p_proba = (
    Memory(cachedir=cache_dir, verbose=0)
    for cache_dir in ("cache/train", "cache/test", "cache/train_pb")
)

# NOTE(review): a separate `train_p_proba` cache is created above but no
# visible function uses it; it may have been intended for this function.
@train_cache.cache
def train_pred_pb(model):
    """Out-of-fold class-probability predictions for every training row.

    For each (fit, hold-out) split in `train_test_folds` the model is refit
    on the fit rows and `predict_proba` is evaluated on the hold-out rows.

    Parameters
    ----------
    model : estimator exposing `fit` and `predict_proba`
        Refit in place once per fold.

    Returns
    -------
    numpy.ndarray
        One `predict_proba` row per training row, ordered by row position.
    """
    oof_by_row = {}
    for fit_idx, holdout_idx in train_test_folds:
        model.fit(X[fit_idx], y[fit_idx])
        oof_by_row.update(zip(holdout_idx, model.predict_proba(X[holdout_idx])))

    # Re-assemble the per-fold results in original row order.
    return np.array([oof_by_row[row] for row in range(len(y))])


@train_cache.cache
def train_predictions(model):
    """Out-of-fold label predictions for every training row.

    For each (fit, hold-out) split in `train_test_folds` the model is refit
    on the fit rows and `predict` is evaluated on the hold-out rows.

    Parameters
    ----------
    model : estimator exposing `fit` and `predict`
        Refit in place once per fold.

    Returns
    -------
    numpy.ndarray
        One prediction per training row, ordered by row position.
    """
    oof_by_row = {}
    for fit_idx, holdout_idx in train_test_folds:
        model.fit(X[fit_idx], y[fit_idx])
        oof_by_row.update(zip(holdout_idx, model.predict(X[holdout_idx])))

    # Re-assemble the per-fold results in original row order.
    return np.array([oof_by_row[row] for row in range(len(y))])

@test_cache.cache
def test_predictions(model):
    """Fit `model` on the full training set and predict the real test set.

    Parameters
    ----------
    model : estimator exposing `fit` and `predict`
        Refit in place on all of `X`, `y`.

    Returns
    -------
    The result of `model.predict(X_actual_test)`.
    """
    model.fit(X, y)
    holdout_predictions = model.predict(X_actual_test)
    return holdout_predictions


# Separate joblib on-disk caches for the second-level (stacked) model results.
stacker_train_cache, stacker_test_cache = (
    Memory(cachedir=cache_dir, verbose=0)
    for cache_dir in ("cache/stacker_train", "cache/stacker_test")
)

@stacker_train_cache.cache
def stacker_train_predictions(stacker, base_clfs):
    """Out-of-fold predictions of a stacker trained on X plus base predictions.

    The stacked design matrix is `X` with one extra column per base
    classifier, holding that classifier's `train_predictions` output. The
    stacker is then refit per fold of `train_test_folds` and evaluated on
    the fold's hold-out rows.

    Parameters
    ----------
    stacker : estimator exposing `fit` and `predict`
        Second-level model; refit in place once per fold.
    base_clfs : list of estimators
        First-level models whose out-of-fold predictions become features.

    Returns
    -------
    numpy.ndarray
        One stacker prediction per training row, ordered by row position.
    """
    n_rows = len(y)
    base_columns = [train_predictions(clf).reshape(n_rows, 1) for clf in base_clfs]
    stacked_X = np.hstack([X] + base_columns)

    oof_by_row = {}
    for fit_idx, holdout_idx in train_test_folds:
        stacker.fit(stacked_X[fit_idx], y[fit_idx])
        oof_by_row.update(zip(holdout_idx, stacker.predict(stacked_X[holdout_idx])))

    # Re-assemble the per-fold results in original row order.
    return np.array([oof_by_row[row] for row in range(n_rows)])

@stacker_test_cache.cache
def stacker_test_predictions(stacker, base_clfs):
    """Fit the stacker on the full training set and predict the real test set.

    The training design matrix is `X` with one extra column per base
    classifier (its out-of-fold `train_predictions`). The test design matrix
    is built the same way from `X_actual_test` and each base classifier's
    `test_predictions`, so both matrices have the same number of columns.
    Predicting on the bare `X_actual_test` would hand the stacker fewer
    features than it was fit on.

    Parameters
    ----------
    stacker : estimator exposing `fit` and `predict`
        Second-level model; refit in place on all training rows.
    base_clfs : list of estimators
        First-level models whose predictions become extra features.

    Returns
    -------
    The result of `stacker.predict` on the stacked test matrix.
    """
    n_train = len(y)
    n_test = X_actual_test.shape[0]

    train_columns = [train_predictions(clf).reshape(n_train, 1) for clf in base_clfs]
    stacked_X = np.hstack([X] + train_columns)

    # Same column layout as stacked_X: raw features first, then one column
    # of base-model predictions per classifier, in the same order.
    test_columns = [np.asarray(test_predictions(clf)).reshape(n_test, 1) for clf in base_clfs]
    stacked_X_test = np.hstack([X_actual_test] + test_columns)

    stacker.fit(stacked_X, y)
    return stacker.predict(stacked_X_test)

In [50]:
def benchmark(model):
    """Return the quadratic weighted kappa of `model`'s out-of-fold predictions.

    Predictions come from `train_predictions(model)` and are scored against
    the training target `y` with `eval_wrapper`.
    """
    return eval_wrapper(train_predictions(model), y)

def benchmark_stacker(model, base_clfs):
    """Return the quadratic weighted kappa of a stacked model.

    `model` is the second-level estimator and `base_clfs` the list of
    first-level estimators; out-of-fold predictions come from
    `stacker_train_predictions` and are scored against `y` with
    `eval_wrapper`.
    """
    return eval_wrapper(stacker_train_predictions(model, base_clfs), y)

In [9]:
%%time
# Out-of-fold kappa: random forest, 200 trees, Gini criterion.
benchmark(
    RandomForestClassifier(criterion='gini', n_estimators=200)
)

CPU times: user 53.6 ms, sys: 7.99 ms, total: 61.6 ms
Wall time: 60.2 ms


0.5314921540324228

In [10]:
%%time
# Out-of-fold kappa: random forest, 200 trees, entropy criterion.
benchmark(
    RandomForestClassifier(criterion='entropy', n_estimators=200)
)

CPU times: user 4min 54s, sys: 172 ms, total: 4min 54s
Wall time: 4min 54s


0.5217668847898264

In [None]:
%%time
# Out-of-fold kappa: random forest, 400 trees, Gini criterion.
benchmark(
    RandomForestClassifier(criterion='gini', n_estimators=400)
)

In [None]:
%%time
# Out-of-fold kappa: random forest, 400 trees, entropy criterion.
benchmark(
    RandomForestClassifier(criterion='entropy', n_estimators=400)
)

In [52]:
%%time
# Out-of-fold kappa for an XGBoost classifier.
# NOTE(review): objective="reg:linear" names a regression objective but is
# passed to a classifier — confirm this is intended and actually takes effect.
benchmark(XGBClassifier(objective="reg:linear", min_child_weight=80, subsample=0.85, colsample_bytree=0.30, silent=1, max_depth=9))

CPU times: user 61.1 ms, sys: 13 µs, total: 61.1 ms
Wall time: 394 ms


0.5450288528827849

In [None]:
%%time
# Out-of-fold kappa for a support vector classifier with default settings.
benchmark(SVC())

In [44]:
%%time
# Out-of-fold kappa for logistic regression with default settings.
benchmark(LogisticRegression())

Wall time: 7min 11s


0.50381380405400522

In [51]:
%%time
# Out-of-fold kappa for an SGD stacker fed the raw features plus the
# predictions of a 200-tree Gini random forest.
benchmark_stacker(
    SGDClassifier(),
    [RandomForestClassifier(criterion='gini', n_estimators=200)],
)

CPU times: user 3.18 s, sys: 1.91 s, total: 5.09 s
Wall time: 3.04 s


0.203645064116726

In [11]:
# benchmark_stacker requires a list of base classifiers as its second
# argument; calling it with the stacker alone raises TypeError. Stack on the
# same 200-tree Gini random forest used for the SGD stacker run.
benchmark_stacker(LogisticRegression(), [RandomForestClassifier(n_estimators=200, criterion='gini')])

In [21]:
# NOTE(review): StackedClassifier is neither defined nor imported in any
# visible cell; these lines presumably relied on a definition from a cell that
# has since been removed — confirm before re-running on a fresh kernel.
c = StackedClassifier([RandomForestClassifier()])
c1 = StackedClassifier([LogisticRegression()])

In [14]:
# Throw-away cache used only to probe how joblib memoises calls whose
# argument is an estimator object.
ct = Memory(cachedir="cachetest/train", verbose=0)


@ct.cache
def kupa(clf):
    """Scratch probe for joblib caching keyed on an estimator argument.

    Prints `clf` and returns whatever `input()` yields, so a repeated call
    that skips the prompt shows the result was served from the cache.

    NOTE(review): under Python 2 `input()` evaluates the typed text, while
    under Python 3 it returns the raw string.
    """
    # Call form of print: valid on both Python 2 and Python 3, unlike the
    # Python-2-only `print clf` statement.
    print(clf)
    return input()

In [24]:
# Call the cached probe with the `c` instance created above.
kupa(c)

'dfsg'

In [28]:
# First 100 training rows and their labels, for quick experiments.
small_x, small_y = X[:100], y[:100]

In [27]:
# Check the dimensions of the sample feature matrix.
small_x.shape

(100, 126)

In [29]:
# Fit a random forest with default settings on the 100-row sample.
# No random_state is passed, so the fitted trees can differ between runs.
clf = RandomForestClassifier()
clf.fit(small_x, small_y)

RandomForestClassifier(bootstrap=True, compute_importances=None,
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_density=None, min_samples_leaf=1,
            min_samples_split=2, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0)

In [40]:
# Append the forest's predictions to the sample as one extra feature column.
# reshape(-1, 1) derives the row count from the data instead of repeating the
# sample size, so this keeps working if the slice length above changes.
stacked = np.hstack([small_x, clf.predict(small_x).reshape(-1, 1)])

In [41]:
# Check that exactly one column was added to the sample features.
stacked.shape

(100, 127)

In [39]:
# Show the sample predictions as a 100x1 column, the layout np.hstack needs.
clf.predict(small_x).reshape(100, 1)

array([[8],
       [8],
       [8],
       [8],
       [8],
       [8],
       [8],
       [1],
       [8],
       [1],
       [6],
       [2],
       [7],
       [3],
       [8],
       [5],
       [8],
       [7],
       [2],
       [8],
       [8],
       [5],
       [5],
       [8],
       [6],
       [8],
       [6],
       [8],
       [7],
       [6],
       [4],
       [7],
       [7],
       [5],
       [2],
       [7],
       [8],
       [1],
       [1],
       [6],
       [6],
       [2],
       [5],
       [7],
       [2],
       [7],
       [6],
       [8],
       [5],
       [2],
       [2],
       [7],
       [6],
       [4],
       [7],
       [5],
       [2],
       [7],
       [8],
       [6],
       [8],
       [7],
       [3],
       [5],
       [8],
       [6],
       [8],
       [7],
       [7],
       [6],
       [6],
       [8],
       [5],
       [7],
       [6],
       [7],
       [6],
       [6],
       [6],
       [5],
       [4],
       [1],
       [7],
    

In [42]:
# Show the per-class probability estimates for the sample rows.
clf.predict_proba(small_x)

array([[ 0. ,  0. ,  0. ,  0.1,  0. ,  0. ,  0. ,  0.9],
       [ 0. ,  0. ,  0. ,  0.4,  0.1,  0. ,  0. ,  0.5],
       [ 0. ,  0.1,  0. ,  0. ,  0. ,  0.1,  0.1,  0.7],
       [ 0. ,  0. ,  0. ,  0. ,  0.1,  0. ,  0.1,  0.8],
       [ 0. ,  0. ,  0. ,  0.1,  0. ,  0.1,  0.1,  0.7],
       [ 0. ,  0. ,  0. ,  0. ,  0. ,  0.1,  0.1,  0.8],
       [ 0. ,  0. ,  0. ,  0. ,  0.1,  0.1,  0. ,  0.8],
       [ 0.8,  0. ,  0. ,  0. ,  0.1,  0. ,  0.1,  0. ],
       [ 0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  1. ],
       [ 0.5,  0. ,  0. ,  0. ,  0.1,  0.1,  0.2,  0.1],
       [ 0.1,  0. ,  0. ,  0. ,  0. ,  0.7,  0.1,  0.1],
       [ 0. ,  0.7,  0. ,  0. ,  0.1,  0. ,  0.1,  0.1],
       [ 0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0.7,  0.3],
       [ 0.2,  0. ,  0.6,  0.1,  0. ,  0.1,  0. ,  0. ],
       [ 0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  1. ],
       [ 0. ,  0. ,  0. ,  0. ,  0.7,  0.2,  0. ,  0.1],
       [ 0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  1. ],
       [ 0. ,  0. ,  0. ,  0. ,