In [72]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import StratifiedKFold, KFold
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost.sklearn import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from joblib import Memory

from ml_metrics import quadratic_weighted_kappa

def eval_wrapper(yhat, y):  
    y = np.array(y)
    y = y.astype(int)
    yhat = np.array(yhat)
    yhat = np.clip(np.round(yhat), np.min(y), np.max(y)).astype(int)   
    return quadratic_weighted_kappa(yhat, y)

In [37]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

encoder = LabelEncoder()
train['Product_Info_2'] = encoder.fit_transform(train['Product_Info_2'])
test['Product_Info_2'] = encoder.fit_transform(test['Product_Info_2'])

train.fillna(train.median(), inplace=True)
test.fillna(test.median(), inplace=True);

In [38]:
X = np.array(train[test.columns[1:]])
y = train.Response
X_actual_test = np.array(test[test.columns[1:]])

In [39]:
train_test_folds = list(StratifiedKFold(y, n_folds=6, random_state=0))

In [65]:
train_cache = Memory(cachedir="cache/train", verbose=0)
test_cache = Memory(cachedir="cache/test", verbose=0)

@train_cache.cache
def train_predictions(model):
    ind2pred = {}
    for train, test in train_test_folds:
        model.fit(X[train], y[train])
        preds = model.predict(X[test])
        for i, p in zip(test, preds):
            ind2pred[i] = p
    
    return np.array([ind2pred[i] for i in range(len(y))])

@test_cache.cache
def test_predictions(model):
    model.fit(X, y)
    return model.fit(X_actual_test)


In [70]:
def benchmark(model):
    pred = train_predictions(model)
    return np.mean(pred == y)

In [69]:
%%time
benchmark(RandomForestClassifier(n_estimators=200, criterion='gini'))

CPU times: user 11.9 ms, sys: 7.98 ms, total: 19.9 ms
Wall time: 15.4 ms


0.34647446152809819

In [None]:
%%time
benchmark(RandomForestClassifier(n_estimators=200, criterion='entropy'))

In [None]:
%%time
benchmark(RandomForestClassifier(n_estimators=400, criterion='gini'))

In [None]:
%%time
benchmark(RandomForestClassifier(n_estimators=400, criterion='entropy'))

In [82]:
%%time
benchmark(XGBClassifier(objective="reg:linear", min_child_weight=80, subsample=0.85, colsample_bytree=0.30, silent=1, max_depth=9))

CPU times: user 6.25 ms, sys: 4.11 ms, total: 10.4 ms
Wall time: 7.91 ms


0.58112864384230645

In [None]:
%%time
benchmark(SVC())

In [44]:
%%time
benchmark(LogisticRegression(), X, y)

Wall time: 7min 11s


0.50381380405400522

In [None]:
%%time
benchmark(SVC(), X, y)

In [5]:
model = SVC()

In [None]:
%%time
n = 40000
model.fit(X[:n], y[:n])

In [8]:
len(X)

59381

In [25]:
memory = Memory(cachedir="masze", verbose=0)

In [26]:
@memory.cache
def f(n):
    x = input()
    return str(n) +" "+x

In [31]:
f(RandomForestClassifier(n_estimators=11))

"Sdfg"


"RandomForestClassifier(bootstrap=True, compute_importances=None,\n            criterion='gini', max_depth=None, max_features='auto',\n            max_leaf_nodes=None, min_density=None, min_samples_leaf=1,\n            min_samples_split=2, n_estimators=11, n_jobs=1,\n            oob_score=False, random_state=None, verbose=0) Sdfg"

In [77]:
RandomForestClassifier?

In [22]:
x = input()

fgjh


NameError: name 'fgjh' is not defined

In [13]:
model = RandomForestClassifier()

In [14]:
print model

RandomForestClassifier(bootstrap=True, compute_importances=None,
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_density=None, min_samples_leaf=1,
            min_samples_split=2, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0)


In [16]:
yyy = np.array([0,1,2,3,3,2,1,0,0,0,0,1,1,1,2,2,2,3,3,3])

In [19]:
for train, test in StratifiedKFold(yyy, n_folds=3):
    print train
    print test
    print "--"

[ 8  9 10 11 12 13 14 15 16 17 18 19]
[0 1 2 3 4 5 6 7]
--
[ 0  1  2  3  4  5  6  7 10 13 16 19]
[ 8  9 11 12 14 15 17 18]
--
[ 0  1  2  3  4  5  6  7  8  9 11 12 14 15 17 18]
[10 13 16 19]
--


In [33]:
list(StratifiedKFold(yyy, n_folds=3))

[(array([ 8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]),
  array([0, 1, 2, 3, 4, 5, 6, 7])),
 (array([ 0,  1,  2,  3,  4,  5,  6,  7, 10, 13, 16, 19]),
  array([ 8,  9, 11, 12, 14, 15, 17, 18])),
 (array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 11, 12, 14, 15, 17, 18]),
  array([10, 13, 16, 19]))]

In [80]:
XGBClassifier(objective="reg:linear", min_child_weight=80, subsample=0.85, colsample_bytree=0.30, silent=1, max_depth=9)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.3,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=9,
       min_child_weight=80, missing=None, n_estimators=100, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=1, subsample=0.85)

In [76]:
%%time 
n = 10000
XGBClassifier().fit(X[:n], y[:n])

CPU times: user 42.4 s, sys: 171 ms, total: 42.5 s
Wall time: 13.9 s
