In [97]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import StratifiedKFold, KFold
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression, LinearRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from xgboost.sklearn import XGBClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from joblib import Memory
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from ml_metrics import quadratic_weighted_kappa

def eval_wrapper(yhat, y):
    """Score predictions against labels with quadratic weighted kappa.

    The labels are cast to int.  The predictions are rounded to the nearest
    integer and clipped to the [min, max] range of the labels before both
    are handed to ``quadratic_weighted_kappa``.
    """
    labels = np.array(y).astype(int)
    lowest, highest = np.min(labels), np.max(labels)
    rounded = np.round(np.array(yhat))
    bounded = np.clip(rounded, lowest, highest).astype(int)
    return quadratic_weighted_kappa(bounded, labels)

#TODO: check which ones are really worth encoding and which can even be dropped (some may contain non-overlapping values
#between the test and training sets)
# Columns treated as categorical, described as {column prefix: numeric suffixes}.
# Medical_History_38 is intentionally absent (it is commented out of the column list).
_categorical_suffixes = {
    'Product_Info': [1, 2, 3, 5, 6, 7],
    'Employment_Info': [2, 3, 5],
    'InsuredInfo': [1, 2, 3, 4, 5, 6, 7],
    'Insurance_History': [1, 2, 3, 4, 7, 8, 9],
    'Family_Hist': [1],
    'Medical_History': list(range(1, 38)) + [39, 40, 41],
}
categorical = {'%s_%d' % (prefix, suffix)
               for prefix, suffixes in _categorical_suffixes.items()
               for suffix in suffixes}

In [98]:
# Load both splits and impute missing values with per-column medians computed
# over train and test together.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
total = pd.concat([train, test])
# numeric_only=True: a median only exists for numeric columns; any other
# column is simply left out of the fill values.
median = total.median(numeric_only=True)
# fillna(..., inplace=True) returns None, so assigning its result would
# replace the frame with None.  Use the copying form everywhere instead.
train = train.fillna(median)
test = test.fillna(median)
total = total.fillna(median)

# Label-encode every categorical column.  The encoder is fitted on the
# imputed train+test values so that each value it is later asked to
# transform (including a filled-in median) is a known label.
encoder = LabelEncoder()
for f in categorical:
    encoder.fit(total[f])
    train[f] = encoder.transform(train[f])
    test[f] = encoder.transform(test[f])

In [99]:

# Every test column except the first one is used as a model feature
# (the first column is presumably the Id -- confirm against test.csv).
feature_cols = test.columns[1:]

# Positions, within feature_cols, of the columns that must be one-hot encoded.
categorical_inds = []
for position, column_name in enumerate(feature_cols):
    if column_name in categorical:
        categorical_inds.append(position)

oh_encoder = OneHotEncoder(categorical_features=categorical_inds)



In [100]:
# Plain numpy views of the data: training features, training target and the
# features of the held-out test set (same column selection as the training set).
_feature_columns = test.columns[1:]
X = np.array(train.loc[:, _feature_columns])
y = np.array(train['Response'])
X_actual_test = np.array(test.loc[:, feature_cols])

In [102]:
# Learn the one-hot layout from the training matrix, then apply that same
# layout to the test matrix.  Both results are converted from sparse to dense.
X = oh_encoder.fit_transform(X).todense()
X_actual_test = oh_encoder.transform(X_actual_test).todense()

In [103]:
# Materialise the six stratified splits once so that every model below is
# evaluated on exactly the same (train indices, test indices) pairs.
_stratified_splitter = StratifiedKFold(y, n_folds=6, random_state=0)
train_test_folds = list(_stratified_splitter)

In [106]:
# On-disk joblib caches for the base-model prediction helpers defined below.
# train_cache backs the cross-validated helpers, test_cache the full-fit one.
train_cache = Memory(cachedir="cache/train", verbose=0)
test_cache = Memory(cachedir="cache/test", verbose=0)
# NOTE(review): train_p_proba is not referenced by any function visible in this
# notebook (train_pred_pb is decorated with train_cache) -- confirm whether
# that is intended.
train_p_proba = Memory(cachedir="cache/train_pb", verbose=0)

@train_cache.cache
def train_pred_pb(model):
    """Return cross-validated class probabilities for every training row.

    For each split in ``train_test_folds`` the model is refitted on the
    training indices and ``predict_proba`` is evaluated on the held-out
    indices.  The rows of the returned array follow the row order of ``y``.
    The result is memoised by ``train_cache``.
    """
    held_out_rows = {}
    for fit_idx, holdout_idx in train_test_folds:
        model.fit(X[fit_idx], y[fit_idx])
        fold_probabilities = model.predict_proba(X[holdout_idx])
        held_out_rows.update(zip(holdout_idx, fold_probabilities))

    n_rows = len(y)
    return np.array([held_out_rows[row] for row in range(n_rows)])


@train_cache.cache
def train_predictions(model):
    """Return cross-validated label predictions for every training row.

    The model is refitted on the training part of each split in
    ``train_test_folds`` and asked to ``predict`` the held-out part; the
    predictions are returned in the row order of ``y``.

    NOTE(review): the result is memoised by ``train_cache`` with ``model`` as
    the only argument, while ``X``, ``y`` and ``train_test_folds`` are read
    from globals -- presumably the cache must be cleared by hand when the
    data preparation changes; confirm.
    """
    held_out = {}
    for fit_idx, holdout_idx in train_test_folds:
        model.fit(X[fit_idx], y[fit_idx])
        fold_labels = model.predict(X[holdout_idx])
        held_out.update(zip(holdout_idx, fold_labels))

    n_rows = len(y)
    return np.array([held_out[row] for row in range(n_rows)])

@test_cache.cache
def test_predictions(model):
    """Fit ``model`` on the full training data and predict the test matrix.

    The result is memoised by ``test_cache``.
    """
    model.fit(X, y)
    predicted = model.predict(X_actual_test)
    return predicted


# On-disk joblib caches for the stackers that are fed hard label predictions
# of the base models (stacker_train_predictions / stacker_test_predictions).
stacker_train_cache = Memory(cachedir="cache/stacker_train", verbose=0)
stacker_test_cache = Memory(cachedir="cache/stacker_test", verbose=0)

@stacker_train_cache.cache
def stacker_train_predictions(stacker, base_clfs):
    """Return cross-validated predictions of a stacker on the training rows.

    The stacker's input is ``X`` with one extra column per base classifier,
    holding that classifier's ``train_predictions`` output.  The stacker is
    then refitted and evaluated over ``train_test_folds``; predictions are
    returned in the row order of ``y``.  Memoised by ``stacker_train_cache``.
    """
    n = len(y)
    base_columns = [train_predictions(clf).reshape(n, 1) for clf in base_clfs]
    stacked_X = np.hstack([X] + base_columns)

    held_out = {}
    for fit_idx, holdout_idx in train_test_folds:
        stacker.fit(stacked_X[fit_idx], y[fit_idx])
        fold_labels = stacker.predict(stacked_X[holdout_idx])
        held_out.update(zip(holdout_idx, fold_labels))

    return np.array([held_out[row] for row in range(n)])

@stacker_test_cache.cache
def stacker_test_predictions(stacker, base_clfs):
    """Fit a stacker on the full training data and predict the test set.

    Training input: ``X`` plus one column per base classifier holding its
    cross-validated ``train_predictions``.
    Test input: ``X_actual_test`` plus one column per base classifier holding
    its ``test_predictions`` (base model fitted on all training rows).

    The test matrix must carry the same extra columns as the training
    matrix; predicting on the bare ``X_actual_test`` would hand the stacker
    fewer features than it was fitted with.

    Memoised by ``stacker_test_cache``.
    """
    n_train = len(y)
    n_test = X_actual_test.shape[0]

    stacked_X = np.hstack(
        [X] + [train_predictions(clf).reshape(n_train, 1) for clf in base_clfs])
    stacked_X_test = np.hstack(
        [X_actual_test]
        + [np.asarray(test_predictions(clf)).reshape(n_test, 1) for clf in base_clfs])

    stacker.fit(stacked_X, y)
    return stacker.predict(stacked_X_test)



# On-disk joblib caches for the stackers that are fed class probabilities of
# the base models (stacker_pb_train_predictions / stacker_pb_test_predictions).
stacker_pb_train_cache = Memory(cachedir="cache/stacker_pb_train", verbose=0)
stacker_pb_test_cache = Memory(cachedir="cache/stacker_pb_test", verbose=0)

@stacker_pb_train_cache.cache
def stacker_pb_train_predictions(stacker, base_clfs):
    """Return cross-validated predictions of a probability-fed stacker.

    The stacker's input is ``X`` extended with the ``train_pred_pb`` output
    of every base classifier.  The stacker is refitted and evaluated over
    ``train_test_folds``; predictions are returned in the row order of ``y``.
    Memoised by ``stacker_pb_train_cache``.
    """
    n = len(y)
    probability_blocks = [train_pred_pb(clf) for clf in base_clfs]
    stacked_X = np.hstack([X] + probability_blocks)

    held_out = {}
    for fit_idx, holdout_idx in train_test_folds:
        stacker.fit(stacked_X[fit_idx], y[fit_idx])
        fold_labels = stacker.predict(stacked_X[holdout_idx])
        held_out.update(zip(holdout_idx, fold_labels))

    return np.array([held_out[row] for row in range(n)])

@test_cache.cache
def _test_pred_pb(model):
    """Fit ``model`` on the full training data; return class probabilities for the test matrix."""
    model.fit(X, y)
    return model.predict_proba(X_actual_test)


@stacker_pb_test_cache.cache
def stacker_pb_test_predictions(stacker, base_clfs):
    """Fit a probability-fed stacker on all training rows and predict the test set.

    Training input: ``X`` extended with every base classifier's
    cross-validated ``train_pred_pb`` probabilities.
    Test input: ``X_actual_test`` extended with the probabilities each base
    classifier produces for the test matrix after being fitted on all
    training rows.

    The test matrix must carry the same extra columns as the training
    matrix; predicting on the bare ``X_actual_test`` would hand the stacker
    fewer features than it was fitted with.

    Memoised by ``stacker_pb_test_cache``.
    """
    stacked_X = np.hstack([X] + [train_pred_pb(clf) for clf in base_clfs])
    stacked_X_test = np.hstack(
        [X_actual_test] + [_test_pred_pb(clf) for clf in base_clfs])

    stacker.fit(stacked_X, y)
    return stacker.predict(stacked_X_test)

In [121]:
def benchmark(model):
    """Return the ``eval_wrapper`` score of ``model``'s cross-validated training predictions."""
    return eval_wrapper(train_predictions(model), y)

def make_predictions(model):
    """Fit ``model`` on the full training data and predict the test matrix (no caching)."""
    model.fit(X, y)
    test_labels = model.predict(X_actual_test)
    return test_labels

def benchmark_stacker(model, base_clfs):
    """Return the ``eval_wrapper`` score of a label-fed stacker's cross-validated predictions."""
    return eval_wrapper(stacker_train_predictions(model, base_clfs), y)

def benchmark_stacker_pb(model, base_clfs):
    """Return the ``eval_wrapper`` score of a probability-fed stacker's cross-validated predictions."""
    return eval_wrapper(stacker_pb_train_predictions(model, base_clfs), y)



In [None]:
%%time
# Random forest, 200 trees, entropy split criterion.
candidate = RandomForestClassifier(n_estimators=200, criterion='entropy')
benchmark(candidate)

In [119]:
%%time
# Random forest, 400 trees, gini split criterion.
candidate = RandomForestClassifier(n_estimators=400, criterion='gini')
benchmark(candidate)

KeyboardInterrupt: 

In [None]:
%%time
# Random forest, 400 trees, entropy split criterion.
candidate = RandomForestClassifier(n_estimators=400, criterion='entropy')
benchmark(candidate)

In [None]:
%%time
# Extra-trees ensemble, 400 trees, entropy split criterion.
candidate = ExtraTreesClassifier(n_estimators=400, criterion='entropy')
benchmark(candidate)

In [None]:
%%time
# Extra-trees ensemble, 400 trees, gini split criterion.
candidate = ExtraTreesClassifier(n_estimators=400, criterion='gini')
benchmark(candidate)

In [None]:
%%time
# Gradient-boosted trees.
# NOTE(review): objective="reg:linear" on a classifier looks unintended -- confirm.
xgb_settings = dict(
    objective="reg:linear",
    min_child_weight=80,
    subsample=0.85,
    colsample_bytree=0.30,
    silent=1,
    max_depth=9,
)
benchmark(XGBClassifier(**xgb_settings))

In [None]:
%%time
# One-vs-rest wrapper around a default SGD linear classifier.
candidate = OneVsRestClassifier(SGDClassifier())
benchmark(candidate)

In [None]:
%%time
# Entropy random forest stacked on the hard predictions of four base models.
stacker_model = RandomForestClassifier(n_estimators=400, criterion="entropy")
base_models = [
    RandomForestClassifier(n_estimators=400, criterion='gini'),
    ExtraTreesClassifier(n_estimators=400, criterion='gini'),
    LogisticRegression(),
    SGDClassifier(),
]
benchmark_stacker(stacker_model, base_models)

In [None]:
%%time
# Support vector classifier with default settings.
candidate = SVC()
benchmark(candidate)

In [None]:
%%time
# Logistic regression with default settings.
candidate = LogisticRegression()
benchmark(candidate)

In [None]:
%%time
# SGD linear classifier stacked on the two tree ensembles.
stacker_model = SGDClassifier()
base_models = [
    RandomForestClassifier(n_estimators=400, criterion='gini'),
    ExtraTreesClassifier(n_estimators=400, criterion='gini'),
]
benchmark_stacker(stacker_model, base_models)

In [None]:
%%time
# Entropy random forest stacked on the hard predictions of the tree ensembles,
# logistic regression and XGBoost.
stacker_model = RandomForestClassifier(n_estimators=400, criterion="entropy")
base_models = [
    RandomForestClassifier(n_estimators=400, criterion='gini'),
    ExtraTreesClassifier(n_estimators=400, criterion='gini'),
    LogisticRegression(),
    XGBClassifier(objective="reg:linear", min_child_weight=80, subsample=0.85,
                  colsample_bytree=0.30, silent=1, max_depth=9),
]
benchmark_stacker(stacker_model, base_models)

In [109]:
%%time
# Entropy random forest stacked on logistic regression alone.
stacker_model = RandomForestClassifier(n_estimators=400, criterion="entropy")
base_models = [LogisticRegression()]
benchmark_stacker(stacker_model, base_models)

CPU times: user 20min 55s, sys: 4.82 s, total: 21min
Wall time: 20min 57s


0.5321576811775488

In [110]:
# (rows, columns) of the one-hot encoded training matrix.
np.shape(X)

(59381, 1726)

In [113]:
%%time
# Entropy random forest stacked on the class probabilities of the tree
# ensembles, logistic regression and XGBoost.
stacker_model = RandomForestClassifier(n_estimators=400, criterion="entropy")
base_models = [
    RandomForestClassifier(n_estimators=400, criterion='gini'),
    ExtraTreesClassifier(n_estimators=400, criterion='gini'),
    LogisticRegression(),
    XGBClassifier(objective="reg:linear", min_child_weight=80, subsample=0.85,
                  colsample_bytree=0.30, silent=1, max_depth=9),
]
benchmark_stacker_pb(stacker_model, base_models)

CPU times: user 4h 23min 41s, sys: 16 s, total: 4h 23min 57s
Wall time: 1h 54min 53s


0.5651373638545598

In [116]:
%%bash
# Show the first lines of the expected submission format.  Reading the file
# directly with head avoids the "cat: write error: Broken pipe" message that
# appears when head closes the pipe while cat is still writing.
head sample_submission.csv

"Id","Response"
1,8
3,8
4,8
9,8
12,8
13,8
21,8
28,8
30,8


cat: write error: Broken pipe


In [122]:
# Final model: fit on all training rows and pair its test predictions with
# the test Ids, in the column order used by sample_submission.csv.
model = RandomForestClassifier(n_estimators=400, criterion='gini')

df = pd.DataFrame(
    {'Id': test.Id, 'Response': make_predictions(model)},
    columns=['Id', 'Response'],
)

In [125]:
# Write the submission without the DataFrame index column.
submission_path = "my_submission.csv"
df.to_csv(submission_path, index=False)