In [31]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import StratifiedKFold, KFold
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression, LinearRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from xgboost.sklearn import XGBClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from joblib import Memory
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from ml_metrics import quadratic_weighted_kappa

def eval_wrapper(yhat, y):  
    y = np.array(y)
    y = y.astype(int)
    yhat = np.array(yhat)
    yhat = np.clip(np.round(yhat), np.min(y), np.max(y)).astype(int)   
    return quadratic_weighted_kappa(yhat, y)

#TODO: check which ones are really worth encoding and which can be even dropped (some may contain nonoverlapping values between
#test and training sest)
categorical = {'Product_Info_1', 'Product_Info_2', 'Product_Info_3', 'Product_Info_5', 'Product_Info_6', 
               'Product_Info_7', 'Employment_Info_2', 'Employment_Info_3', 'Employment_Info_5', 
               'InsuredInfo_1', 'InsuredInfo_2', 'InsuredInfo_3', 'InsuredInfo_4', 'InsuredInfo_5', 
               'InsuredInfo_6', 'InsuredInfo_7', 'Insurance_History_1', 'Insurance_History_2', 
               'Insurance_History_3', 'Insurance_History_4', 'Insurance_History_7', 'Insurance_History_8', 
               'Insurance_History_9', 'Family_Hist_1', 'Medical_History_2', 'Medical_History_3', 
               'Medical_History_4', 'Medical_History_5', 'Medical_History_6', 'Medical_History_7', 
               'Medical_History_8', 'Medical_History_9', 'Medical_History_10', 'Medical_History_11', 
               'Medical_History_12', 'Medical_History_13', 'Medical_History_14', 'Medical_History_16', 
               'Medical_History_17', 'Medical_History_18', 'Medical_History_19', 'Medical_History_20', 
               'Medical_History_21', 'Medical_History_22', 'Medical_History_23', 'Medical_History_25', 
               'Medical_History_26', 'Medical_History_27', 'Medical_History_28', 'Medical_History_29', 
               'Medical_History_30', 'Medical_History_31', 'Medical_History_33', 'Medical_History_34', 
               'Medical_History_35', 'Medical_History_36', 'Medical_History_37', 
#                'Medical_History_38', 
               'Medical_History_39', 'Medical_History_40', 'Medical_History_41','Medical_History_1', 
               'Medical_History_15', 'Medical_History_24', 'Medical_History_32'}

In [32]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
total = pd.concat([train, test])
median = total.median()
train.fillna(median, inplace=True)
test = test.fillna(median, inplace=True)
encoder = LabelEncoder()
for f in categorical:
    encoder.fit(total[f])
    train[f] = encoder.transform(train[f])
    test[f] = encoder.transform(test[f])

In [33]:

feature_cols = test.columns[1:]
categorical_inds = [i for i, col in enumerate(feature_cols) if col in categorical]
encoder = OneHotEncoder(categorical_features=categorical_inds)



In [51]:
X = np.array(train[test.columns[1:]])
y = np.array(train.Response)
X_actual_test = np.array(test[feature_cols])


In [35]:
encoder.fit(X)
X = encoder.transform(X)
X_actual_test = encoder.transform(X_actual_test)

In [52]:
train_test_folds = list(StratifiedKFold(y, n_folds=6, random_state=0))

In [55]:
train_cache = Memory(cachedir="cache/train", verbose=0)
test_cache = Memory(cachedir="cache/test", verbose=0)
train_p_proba = Memory(cachedir="cache/train_pb", verbose=0)

@train_cache.cache
def train_pred_pb(model):
    ind2pred = {}
    for train, test in train_test_folds:
        model.fit(X[train], y[train])
        preds = model.predict_proba(X[test])
        for i, p in zip(test, preds):
            ind2pred[i] = p
    
    return np.array([ind2pred[i] for i in range(len(y))])


@train_cache.cache
def train_predictions(model):
    ind2pred = {}
    for train, test in train_test_folds:
        model.fit(X[train], y[train])
        preds = model.predict(X[test])
        for i, p in zip(test, preds):
            ind2pred[i] = p
    
    return np.array([ind2pred[i] for i in range(len(y))])

@test_cache.cache
def test_predictions(model):
    model.fit(X, y)
    return model.predict(X_actual_test)


stacker_train_cache = Memory(cachedir="cache/stacker_train", verbose=0)
stacker_test_cache = Memory(cachedir="cache/stacker_test", verbose=0)

@stacker_train_cache.cache
def stacker_train_predictions(stacker, base_clfs):
    n = len(y)
    stacked_X = np.hstack([X] + [train_predictions(clf).reshape(n, 1) for clf in base_clfs])
    
    ind2pred = {}
    for train, test in train_test_folds:
        stacker.fit(stacked_X[train], y[train])
        preds = stacker.predict(stacked_X[test])
        for i, p in zip(test, preds):
            ind2pred[i] = p
    
    return np.array([ind2pred[i] for i in range(len(y))])

@stacker_test_cache.cache
def stacker_test_predictions(stacker, base_clfs):
    n = len(y)
    stacked_X = np.hstack([X] + [train_predictions(clf).reshape(n, 1) for clf in base_clfs])
    stacker.fit(stacked_X, y)
    return stacker.predict(X_actual_test)

In [56]:
def benchmark(model):
    pred = train_predictions(model)
    return eval_wrapper(pred, y)

def benchmark_stacker(model, base_clfs):
    pred = stacker_train_predictions(model, base_clfs)
    return eval_wrapper(pred, y)

In [57]:
%%time
benchmark(RandomForestClassifier(n_estimators=200, criterion='gini'))

CPU times: user 2min 59s, sys: 344 ms, total: 2min 59s
Wall time: 2min 59s


0.5317997483351157

In [8]:
%%time
benchmark(RandomForestClassifier(n_estimators=200, criterion='entropy'))

CPU times: user 55 ms, sys: 8.66 ms, total: 63.7 ms
Wall time: 61.7 ms


0.5217668847898264

In [9]:
%%time
benchmark(RandomForestClassifier(n_estimators=400, criterion='gini'))

CPU times: user 57.7 ms, sys: 4.37 ms, total: 62.1 ms
Wall time: 60.9 ms


0.531654920209772

In [10]:
%%time
benchmark(RandomForestClassifier(n_estimators=400, criterion='entropy'))

CPU times: user 54.3 ms, sys: 7.64 ms, total: 61.9 ms
Wall time: 60.8 ms


0.5208181789333791

In [11]:
%%time
benchmark(ExtraTreesClassifier(n_estimators=400, criterion='entropy'))

CPU times: user 58.6 ms, sys: 4.54 ms, total: 63.1 ms
Wall time: 61.4 ms


0.5320694468816863

In [12]:
%%time
benchmark(ExtraTreesClassifier(n_estimators=400, criterion='gini'))

CPU times: user 58.8 ms, sys: 4.12 ms, total: 62.9 ms
Wall time: 61.3 ms


0.543342637939733

In [13]:
%%time
benchmark(XGBClassifier(objective="reg:linear", min_child_weight=80, subsample=0.85, colsample_bytree=0.30, silent=1, max_depth=9))

CPU times: user 63 ms, sys: 0 ns, total: 63 ms
Wall time: 61 ms


0.5450288528827849

In [16]:
%%time
benchmark(OneVsRestClassifier(SGDClassifier()))

CPU times: user 3.77 s, sys: 684 ms, total: 4.45 s
Wall time: 3.73 s


0.08047617442355048

In [17]:
%%time
benchmark_stacker(RandomForestClassifier(n_estimators=400, criterion="entropy"), [
        RandomForestClassifier(n_estimators=400, criterion='gini'), 
        ExtraTreesClassifier(n_estimators=400, criterion='gini'),
        LogisticRegression(),
        SGDClassifier()])

CPU times: user 9min 40s, sys: 633 ms, total: 9min 40s
Wall time: 9min 40s


0.5498961836335924

In [None]:
%%time
benchmark(SVC())

In [17]:
%%time
benchmark(LogisticRegression())

CPU times: user 62 ms, sys: 6 µs, total: 62 ms
Wall time: 276 ms


0.5032159302745367

In [14]:
%%time
benchmark_stacker(SGDClassifier(), [
        RandomForestClassifier(n_estimators=400, criterion='gini'), 
        ExtraTreesClassifier(n_estimators=400, criterion='gini')])

CPU times: user 3.1 s, sys: 690 ms, total: 3.79 s
Wall time: 6.62 s


0.20516379124252315

In [11]:
benchmark_stacker(LogisticRegression(),)

In [21]:
c = StackedClassifier([RandomForestClassifier()])
c1 = StackedClassifier([LogisticRegression()])

In [14]:
ct = Memory(cachedir="cachetest/train", verbose=0)


@ct.cache
def kupa(clf):
    print clf
    return input()

In [24]:
kupa(c)

'dfsg'

In [18]:
small_x = X[:100]
small_y = y[:100]

In [19]:
small_x.shape

(100, 126)

In [29]:
clf = RandomForestClassifier()
clf.fit(small_x, small_y)

RandomForestClassifier(bootstrap=True, compute_importances=None,
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_density=None, min_samples_leaf=1,
            min_samples_split=2, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0)

In [40]:
stacked = np.hstack([small_x, clf.predict(small_x).reshape(100, 1)])

In [41]:
stacked.shape

(100, 127)

In [31]:
train.columns

Index([u'Id', u'Product_Info_1', u'Product_Info_2', u'Product_Info_3', u'Product_Info_4', u'Product_Info_5', u'Product_Info_6', u'Product_Info_7', u'Ins_Age', u'Ht', u'Wt', u'BMI', u'Employment_Info_1', u'Employment_Info_2', u'Employment_Info_3', u'Employment_Info_4', u'Employment_Info_5', u'Employment_Info_6', u'InsuredInfo_1', u'InsuredInfo_2', u'InsuredInfo_3', u'InsuredInfo_4', u'InsuredInfo_5', u'InsuredInfo_6', u'InsuredInfo_7', u'Insurance_History_1', u'Insurance_History_2', u'Insurance_History_3', u'Insurance_History_4', u'Insurance_History_5', u'Insurance_History_7', u'Insurance_History_8', u'Insurance_History_9', u'Family_Hist_1', u'Family_Hist_2', u'Family_Hist_3', u'Family_Hist_4', u'Family_Hist_5', u'Medical_History_1', u'Medical_History_2', u'Medical_History_3', u'Medical_History_4', u'Medical_History_5', u'Medical_History_6', u'Medical_History_7', u'Medical_History_8', u'Medical_History_9', u'Medical_History_10', u'Medical_History_11', u'Medical_History_12', u'Medical_Hi

In [30]:
set(train.Product_Info_8)

AttributeError: 'DataFrame' object has no attribute 'Product_Info_8'

In [33]:
train.Employment_Info_2

0     12
1      1
2      9
3      9
4      9
5     15
6      1
7     12
8      9
9      1
10     9
11     3
12     9
13     9
14     3
...
59366    14
59367     9
59368     9
59369     1
59370     9
59371    12
59372    14
59373     9
59374     9
59375     9
59376     1
59377     9
59378     9
59379     9
59380     9
Name: Employment_Info_2, Length: 59381, dtype: int64

In [40]:
import pandas as pd 
import numpy as np

train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
# sample_submission = pd.read_csv("../input/sample_submission.csv")

#We transform categorical values to dummies 0/1

categorical = ['Product_Info_1', 'Product_Info_2', 'Product_Info_3', 'Product_Info_5', 'Product_Info_6', 
               'Product_Info_7', 'Employment_Info_2', 'Employment_Info_3', 'Employment_Info_5', 
               'InsuredInfo_1', 'InsuredInfo_2', 'InsuredInfo_3', 'InsuredInfo_4', 'InsuredInfo_5', 
               'InsuredInfo_6', 'InsuredInfo_7', 'Insurance_History_1', 'Insurance_History_2', 
               'Insurance_History_3', 'Insurance_History_4', 'Insurance_History_7', 'Insurance_History_8', 
               'Insurance_History_9', 'Family_Hist_1', 'Medical_History_2', 'Medical_History_3', 
               'Medical_History_4', 'Medical_History_5', 'Medical_History_6', 'Medical_History_7', 
               'Medical_History_8', 'Medical_History_9', 'Medical_History_10', 'Medical_History_11', 
               'Medical_History_12', 'Medical_History_13', 'Medical_History_14', 'Medical_History_16', 
               'Medical_History_17', 'Medical_History_18', 'Medical_History_19', 'Medical_History_20', 
               'Medical_History_21', 'Medical_History_22', 'Medical_History_23', 'Medical_History_25', 
               'Medical_History_26', 'Medical_History_27', 'Medical_History_28', 'Medical_History_29', 
               'Medical_History_30', 'Medical_History_31', 'Medical_History_33', 'Medical_History_34', 
               'Medical_History_35', 'Medical_History_36', 'Medical_History_37', 'Medical_History_38', 
               'Medical_History_39', 'Medical_History_40', 'Medical_History_41','Medical_History_1', 
               'Medical_History_15', 'Medical_History_24', 'Medical_History_32']

from sklearn.feature_extraction import DictVectorizer
def one_hot_dataframe(data, cols, replace=False):
    """ Takes a dataframe and a list of columns that need to be encoded.
        Returns a 3-tuple comprising the data, the vectorized data,
        and the fitted vectorizor.
    """
    vec = DictVectorizer()
    mkdict = lambda row: dict((col, row[col]) for col in cols)
    vecData = pd.DataFrame(vec.fit_transform(data[cols].apply(mkdict, axis=1)).toarray())
    vecData.columns = vec.get_feature_names()
    vecData.index = data.index
    if replace is True:
        data = data.drop(cols, axis=1)
        data = data.join(vecData)
    return (data, vecData, vec)
    
train_ohd,_,_=one_hot_dataframe(train,categorical,replace=True)
test_ohd,_,_=one_hot_dataframe(test,categorical,replace=True)

AttributeError: 'builtin_function_or_method' object has no attribute 'iteritems'

In [65]:
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(categorical_features=[0])
enc.fit(X)
new_X = enc.transform(X)

In [66]:
X.shape

(59381, 126)

In [67]:
new_X.shape

(59381, 127)

In [57]:
%%bash
ls

download.sh
keras.ipynb
oldcache
sample_submission.csv
test.csv
train.csv
training.ipynb
xgboost.ipynb


In [60]:
set(train.Product_Info_1)

{1, 2}

In [64]:
set(X[:,0])

{1.0, 2.0}