In [7]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold, train_test_split, cross_val_score
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.linear_model import LinearRegression

from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, BatchNormalization
from keras.wrappers.scikit_learn import KerasRegressor
from keras.utils import np_utils

import xgboost as xgb

import matplotlib.pyplot as plt 
import seaborn as sns
%matplotlib inline

In [50]:
def get_data(data_dir='../input'):
    """Load the raw train and test CSV files.

    Parameters
    ----------
    data_dir : str, optional
        Directory containing ``train.csv`` and ``test.csv``. Defaults to
        ``'../input'``, the location this notebook originally read from.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The train frame followed by the test frame, as parsed by
        ``pd.read_csv`` with default options.
    """
    train = pd.read_csv('{}/train.csv'.format(data_dir))
    test = pd.read_csv('{}/test.csv'.format(data_dir))

    return train, test

def one_hot(train_df, test_df):
    """Drop constant features and one-hot encode the categorical columns.

    Train and test are encoded together so both end up with the same dummy
    columns, then split back apart by position.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training frame; must contain an ``'ID'`` column and the target ``'y'``.
    test_df : pandas.DataFrame
        Test frame; must contain an ``'ID'`` column.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train, test)`` indexed by ``ID``. ``train`` still contains ``'y'``;
        ``test`` has it removed. Both are independent copies, so the caller
        can modify them freely.

    Notes
    -----
    The input frames are not modified.
    """
    ntrain = len(train_df)

    # Remove useless variables: numeric features that take a single value in
    # the training set carry no information. Using nunique() catches any
    # constant column (not only all-zero ones) and cannot raise KeyError the
    # way a label lookup on value_counts() does.
    constant_cols = [
        c for c in train_df.columns
        if c not in ('ID', 'y')
        and pd.api.types.is_numeric_dtype(train_df[c])
        and train_df[c].nunique(dropna=False) <= 1
    ]
    # Non in-place drops: the caller's frames stay untouched.
    train_df = train_df.drop(constant_cols, axis=1)
    test_df = test_df.drop(constant_cols, axis=1, errors='ignore')

    all_df = pd.concat([train_df, test_df]).set_index('ID')

    cat = [c for c in all_df.columns if all_df[c].dtype == 'object']
    print(cat)

    if cat:
        dummies = pd.get_dummies(all_df[cat])
        all_df = pd.concat([all_df.drop(cat, axis=1), dummies], axis=1)

    # Copy the slices so later in-place edits by the caller do not hit a view.
    train_out = all_df.iloc[:ntrain].copy()
    test_out = all_df.iloc[ntrain:].drop('y', axis=1).copy()

    return train_out, test_out

In [51]:
# Read the raw CSVs, then one-hot encode the categorical columns.
train_df, test_df = one_hot(*get_data())

# Split the target off so train_df holds features only. Rebinding the name
# (instead of an in-place drop) avoids modifying a frame that may be a slice
# of another frame, which pandas flags with SettingWithCopyWarning.
y = train_df['y'].values
train_df = train_df.drop('y', axis=1)

['X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8']


In [13]:
class SklearnWrapper(object):
    """Uniform ``fit``/``predict`` wrapper around a scikit-learn style estimator.

    Parameters
    ----------
    clf : callable
        Estimator class (or factory) that is called with the remaining
        entries of ``params`` as keyword arguments.
    seed : int, optional
        Passed as ``random_state`` when the estimator's constructor accepts
        that argument and ``params`` does not already set it.
    params : dict, optional
        Constructor keyword arguments plus two wrapper-only entries:
        ``'kbest'`` (number of features to keep, default ``'all'``) and
        ``'label'`` (display name, default the estimator's ``__name__``).
        The dict is copied, so the caller's object is never modified and can
        be reused to build another wrapper.

    Attributes
    ----------
    kbest, label : taken from ``params`` as described above.
    clf : the constructed estimator instance.
    """

    def __init__(self, clf, seed=0, params=None):
        import inspect  # local import: only needed to probe the constructor

        # Work on a copy: popping from the caller's dict would make a second
        # construction from the same dict fail with KeyError.
        params = dict(params) if params is not None else {}
        self.kbest = params.pop('kbest', 'all')
        self.label = params.pop('label', getattr(clf, '__name__', str(clf)))

        # Honour `seed` only for estimators that understand random_state.
        try:
            accepts_seed = 'random_state' in inspect.signature(clf).parameters
        except (TypeError, ValueError):
            accepts_seed = False
        if accepts_seed:
            params.setdefault('random_state', seed)

        self.clf = clf(**params)

    def fit(self, x_train, y_train):
        """Fit the wrapped estimator and return this wrapper."""
        self.clf.fit(x_train, y_train)
        return self

    def predict(self, x):
        """Return the wrapped estimator's predictions for ``x``."""
        return self.clf.predict(x)
    
class XgbWrapper(object):
    """Uniform ``fit``/``predict`` wrapper around ``xgb.train``.

    Parameters
    ----------
    seed : int, optional
        Stored as the booster's ``'seed'`` parameter unless ``params``
        already provides one.
    params : dict, optional
        Booster parameters plus three wrapper-only entries that are removed
        before the dict is handed to xgboost: ``'nrounds'`` (boosting rounds,
        default 10, the same default ``xgb.train`` uses), ``'kbest'``
        (number of features to keep, default ``'all'``) and ``'label'``
        (display name, default ``'XGBoost'``). The dict is copied, so the
        caller's object is never modified.

    Attributes
    ----------
    param : dict of booster parameters passed to ``xgb.train``.
    nrounds, kbest, label : taken from ``params`` as described above.
    gbdt : the trained booster, or ``None`` before ``fit`` is called.
    """

    def __init__(self, seed=0, params=None):
        # Work on a copy: popping from the caller's dict would make a second
        # construction from the same dict fail with KeyError.
        params = dict(params) if params is not None else {}
        self.nrounds = params.pop('nrounds', 10)
        self.kbest = params.pop('kbest', 'all')
        self.label = params.pop('label', 'XGBoost')
        params.setdefault('seed', seed)
        self.param = params
        self.gbdt = None

    def fit(self, x_train, y_train):
        """Train a booster on ``(x_train, y_train)`` and return this wrapper."""
        dtrain = xgb.DMatrix(x_train, label=y_train)
        watchlist = [(dtrain, 'train')]
        self.gbdt = xgb.train(self.param, dtrain, self.nrounds, watchlist, verbose_eval=False)
        return self

    def predict(self, x):
        """Return the trained booster's predictions for ``x``."""
        return self.gbdt.predict(xgb.DMatrix(x))
    
class KerasWrapper(object):
    """Adapter giving an already-built regressor the same attributes
    (``clf``, ``kbest``, ``label``) and methods as the other wrappers.

    Parameters
    ----------
    model : object exposing ``fit(x, y)`` and ``predict(x)``.
    kbest : number of features to keep for this model.
    label : display name for this model.
    """

    def __init__(self, model, kbest, label):
        self.clf, self.kbest, self.label = model, kbest, label

    def fit(self, x_train, y_train):
        """Delegate training to the wrapped model; returns nothing."""
        wrapped = self.clf
        wrapped.fit(x_train, y_train)

    def predict(self, x):
        """Delegate prediction to the wrapped model and return its output."""
        predictions = self.clf.predict(x)
        return predictions

### Decide hyperparameters (I chose these as quickly as possible after some CV experimentation)

In [14]:
# Per-model settings. Besides the estimator's own arguments, every dict carries
# two wrapper-only keys: 'kbest' (how many features to select for this model)
# and 'label' (name used when reporting scores). The XGBoost dict additionally
# carries 'nrounds' (number of boosting rounds).

xgb_params1 = {
    'colsample_bytree': 0.3,
    'silent': 1,
    'subsample': 0.8,
    'learning_rate': 0.01,
    'objective': 'reg:linear',
    'max_depth': 4,
    'min_child_weight': 1,
    'gamma': .055,
    'eval_metric': 'rmse',
    'nrounds': 500,
    'kbest': 380,
    'label': 'GBM 1'
}

rf_params1 = {
    'n_jobs': -1,
    'n_estimators': 100,
    'max_features': 'sqrt',
    'max_depth': 40,
    'min_samples_split': 10,
    # This key used to appear twice (5, then 1). Python keeps the last value
    # of a duplicated key, so 1 is what was actually in effect; the dead
    # entry is removed to make that explicit.
    'min_samples_leaf': 1,
    'kbest': 380,
    'label': 'Random Forest 1'
}

et_params1 = {
    'n_jobs': -1,
    'n_estimators': 100,
    'max_features': 'sqrt',
    'max_depth': 50,
    'min_samples_split': 10,
    'min_samples_leaf': 5,
    'kbest': 350,
    'label': 'Extra Randomized Trees 1'
}

lr_params1 = {
    'label': 'Linear Regression 1',
    'kbest': 410
}

KBEST_KERAS = 400


def build_keras():
    """Build and compile the feed-forward regression network.

    The network is a stack of five identical-shaped blocks, each made of
    Dense(relu) -> BatchNormalization -> Dropout -> Activation("linear"),
    followed by a single-unit Dense output. The first block is
    ``KBEST_KERAS`` units wide with dropout 0.3 and declares the input size;
    the remaining four are half that width with dropout 0.75.

    Returns
    -------
    A compiled ``Sequential`` model (rmsprop optimizer, mse loss and metric).
    """
    # The input width has to equal the number of selected features.
    width = KBEST_KERAS

    # (units, dropout rate) for every block, in the order they are stacked.
    block_specs = [(width, 0.3)] + [(width // 2, 0.75)] * 4

    net = Sequential()
    for position, (units, drop_rate) in enumerate(block_specs):
        if position == 0:
            # Only the first layer declares the input dimension.
            net.add(Dense(units, input_dim=width, activation='relu'))
        else:
            net.add(Dense(units, activation='relu'))
        net.add(BatchNormalization())
        net.add(Dropout(drop_rate))
        net.add(Activation("linear"))

    # Single-unit output for the regression target.
    net.add(Dense(1))

    net.compile(optimizer='rmsprop', loss='mse', metrics=['mse'])

    return net

### Initialize models

In [15]:
# Number of cross-validation folds used when building level-one predictions.
NFOLDS = 5

# One wrapper per base learner; each exposes .fit, .predict, .kbest and .label.
xg1 = XgbWrapper(params=xgb_params1)
keras_nn = KerasWrapper(
    KerasRegressor(build_fn=build_keras, epochs=75, batch_size=20, verbose=False),
    kbest=KBEST_KERAS,
    label='Nueral Net 1',
)
et1 = SklearnWrapper(clf=ExtraTreesRegressor, params=et_params1)
rf1 = SklearnWrapper(clf=RandomForestRegressor, params=rf_params1)
lr1 = SklearnWrapper(clf=LinearRegression, params=lr_params1)

# Models that take part in the stack (lr1 is built but not included).
regs = [
    et1,
    xg1,
    keras_nn,
    rf1,
]

### Preprocess for NN or Linear Regression (unnecessary because all our values are currently 0 and 1)

In [16]:
# Learn the mean/variance on the training features only, then apply that same
# transform to the test features. Calling fit_transform on the test set would
# refit the scaler, so train and test would be scaled with different
# statistics and a model fit on one would see shifted inputs on the other.
sc = StandardScaler()
train_df = sc.fit_transform(train_df)
test_df = sc.transform(test_df)

### Level 1

In [17]:
# Out-of-fold predictions on the training rows (one column per base model)
# and fold-averaged predictions on the test rows.
level_one_train = np.zeros((train_df.shape[0], len(regs)))
level_one_test = np.zeros((test_df.shape[0], len(regs)))

# A fixed random_state makes the shuffled split reproducible and gives every
# base model exactly the same folds; without it each kf.split() call below
# would reshuffle and every model would be evaluated on different partitions.
kf = KFold(NFOLDS, shuffle=True, random_state=0)
for j, reg in enumerate(regs):
    cv_scores = []
    dataset_blend_test = np.zeros((test_df.shape[0], NFOLDS))

    # train_index / test_index are row positions into train_df
    for i, (train_index, test_index) in enumerate(kf.split(train_df)):
        print("Fold {}".format(i+1))
        x_train, x_test = train_df[train_index], train_df[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # mutual-information feature selection, fitted inside the fold so the
        # held-out rows never influence which features are kept
        skb = SelectKBest(mutual_info_regression, k=reg.kbest)
        x_train = skb.fit_transform(x_train, y_train)
        x_test = skb.transform(x_test)
        bestk_test_df = skb.transform(test_df)

        reg.fit(x_train, y_train)
        y_submission = reg.predict(x_test)
        # track scores of 'validation' sets
        cv_scores.append(r2_score(y_test, y_submission))
        # predict on all of the test data
        test_preds = reg.predict(bestk_test_df)
        # store predictions from full test data set
        dataset_blend_test[:, i] = test_preds
        # fill in the held-out training rows with this fold's predictions
        level_one_train[test_index, j] = y_submission

    print('{} scored {}'.format(reg.label, np.mean(cv_scores)))

    # average the per-fold test predictions into one column for this model
    level_one_test[:, j] = dataset_blend_test.mean(1)

Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
Extra Randomized Trees 1 scored 0.5488133609858593
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
GBM 1 scored 0.5671183098292236
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
Nueral Net 1 scored 0.5285278860818452
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
Random Forest 1 scored 0.5412753362743201


When we were trying to combine each other's models near the end of the competition, we would save the above level_one_train and level_one_test along with their local cross-validation score and public score on the Kaggle leaderboard.

# Level 2

Ideally we would do several second-level models, then just a few NNs and/or GBMs on the third level, then just average their predictions on the fourth and final level to get our final results. However, that would take a while, so I just used a GBM to create final predictions because I don't feel like tuning a bunch of models we won't want to use in the end.

In [47]:
def xgb_r2_score(preds, dtrain):
    """Custom evaluation function for xgboost reporting the negated R^2.

    Parameters
    ----------
    preds : predictions for the rows held in ``dtrain``.
    dtrain : object exposing ``get_label()`` with the true targets.

    Returns
    -------
    tuple of (str, float)
        The metric name ``'r2'`` and ``-r2_score(labels, preds)``. The sign
        is flipped, so lower values correspond to a better fit.
    """
    negated_r2 = -r2_score(dtrain.get_label(), preds)
    return 'r2', negated_r2

# Booster settings for the level-two model (depth-1 trees, small step size).
xgb_params = dict(
    eta=0.01,
    max_depth=1,
    subsample=0.9,
    colsample_bytree=0.8,
    objective='reg:linear',
    silent=1,
)

# Level-one predictions become the features of the level-two model.
dtrain = xgb.DMatrix(level_one_train, label=y)
dtest = xgb.DMatrix(level_one_test)

# The round cap is deliberately huge; early stopping on the custom metric
# decides when cross-validation actually stops.
best_rounds = xgb.cv(
    xgb_params,
    dtrain,
    num_boost_round=1000000,
    nfold=5,
    feval=xgb_r2_score,
    early_stopping_rounds=10,
    verbose_eval=25,
)

[0]	train-r2:61.1731+1.30461	test-r2:61.6841+4.87426
[25]	train-r2:36.8449+0.787824	test-r2:37.1529+2.9167
[50]	train-r2:22.1139+0.474696	test-r2:22.3008+1.74412
[75]	train-r2:13.1943+0.283595	test-r2:13.3074+1.0388
[100]	train-r2:7.79093+0.168902	test-r2:7.85801+0.608648
[125]	train-r2:4.51435+0.0986645	test-r2:4.55475+0.351342
[150]	train-r2:2.5255+0.0563797	test-r2:2.55001+0.198071
[175]	train-r2:1.31682+0.0309934	test-r2:1.33204+0.108697
[200]	train-r2:0.582201+0.0166953	test-r2:0.591885+0.0601057
[225]	train-r2:0.135672+0.0102629	test-r2:0.142242+0.0409123
[250]	train-r2:-0.136728+0.0093062	test-r2:-0.131774+0.0389819
[275]	train-r2:-0.303112+0.0104391	test-r2:-0.299157+0.0420503
[300]	train-r2:-0.405012+0.0116387	test-r2:-0.401401+0.0452696
[325]	train-r2:-0.46758+0.0125893	test-r2:-0.463922+0.0473694
[350]	train-r2:-0.506259+0.0131183	test-r2:-0.502405+0.0486951
[375]	train-r2:-0.530367+0.0134596	test-r2:-0.526184+0.0494519
[400]	train-r2:-0.545504+0.0136814	test-r2:-0.541049+0.

In [54]:
# The cross-validation above tuned how many trees to grow. Each CV model saw
# only 80% of the rows (20% held out), so scale the round count up by 20%
# before training on the full data.
num_boost_rounds = int(len(best_rounds) * 1.2)

In [58]:
# Fit the level-two model on every training row and score the test rows.
final_model = xgb.train(
    xgb_params,
    dtrain,
    num_boost_round=num_boost_rounds,
    verbose_eval=False,
)
final_preds = final_model.predict(dtest)

# Re-read the raw test file to recover the IDs, then write ID -> y to disk.
test_df = pd.read_csv('../input/test.csv')
sub = pd.DataFrame({'ID': test_df['ID'], 'y': final_preds}).set_index('ID')
sub.to_csv('../output/first_stack.csv')

The leaderboard score improves by about 0.003, moving from the top 55% to the top 43%, which is decent but could be improved significantly by improving the base models' accuracy and diversity.