In [24]:
'''
Purpose: This script tries to implement a technique called stacking/blending/stacked generalization.

See: https://github.com/emanuele/kaggle_pbr/blob/master/blend.py
     http://mlwave.com/kaggle-ensembling-guide/ 
     https://github.com/MLWave/Kaggle-Ensemble-Guide

'''

from __future__ import division
import numpy as np
from sklearn.cross_validation import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
import pandas as pd


In [3]:
### Pick your paramters
n_folds = 5
verbose = True
shuffle = False


In [25]:
### Load some data
from sklearn.datasets import load_digits 
digits = load_digits()
print(digits.data.shape)


(1797, 64)


In [4]:
pd.DataFrame(digits['target']).head()


Unnamed: 0,0
0,0
1,1
2,2
3,3
4,4


In [5]:
pd.DataFrame(digits['data']).head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,54,55,56,57,58,59,60,61,62,63
0,0,0,5,13,9,1,0,0,0,0,...,0,0,0,0,6,13,10,0,0,0
1,0,0,0,12,13,5,0,0,0,0,...,0,0,0,0,0,11,16,10,0,0
2,0,0,0,4,15,12,0,0,0,0,...,5,0,0,0,0,3,11,16,9,0
3,0,0,7,15,13,1,0,0,0,8,...,9,0,0,0,7,13,13,9,0,0
4,0,0,0,1,11,0,0,0,0,0,...,0,0,0,0,0,2,16,4,0,0


In [6]:
### Just break it up to some train/test set
X, y = digits['data'], digits['target']
X, X_submission = X[0:1300],X[1300:]
y, y_submission_actual = y[0:1300],y[1300:]

In [26]:
print np.shape(X)
print np.shape(X_submission)
print np.shape(y)

(1300, 64)
(497, 64)
(1300,)


In [8]:
### Can introduce another round of randomness by shuffleing around the indices
if shuffle:
    idx = np.random.permutation(y.size)
    X = X[idx]
    y = y[idx]
    

In [9]:
## Returns train/test inidices for n_folds using StratifiedKFold
skf = list(StratifiedKFold(y, n_folds))
skf


[(array([ 256,  258,  264, ..., 1297, 1298, 1299]),
  array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
          13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
          26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
          39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
          52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
          65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
          78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
          91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
         104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
         117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
         130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
         143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
         156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 1

In [10]:
## Create a list of models to run
clfs = [RandomForestClassifier(n_estimators=10, n_jobs=-1, criterion='gini'),
        RandomForestClassifier(n_estimators=10, n_jobs=-1, criterion='entropy'),
        ExtraTreesClassifier(n_estimators=10, n_jobs=-1, criterion='gini'),
        ExtraTreesClassifier(n_estimators=10, n_jobs=-1, criterion='entropy'),
        GradientBoostingClassifier(subsample=0.5, max_depth=6, n_estimators=10)]

print "Creating train and test sets for blending."


Creating train and test sets for blending.


In [11]:
### Pre-allocate the data
### For each model, add a column with N rows for each model
dataset_blend_train = np.zeros((X.shape[0], len(clfs)))
dataset_blend_test = np.zeros((X_submission.shape[0], len(clfs)))


In [13]:
print np.shape(dataset_blend_train)
print np.shape(dataset_blend_test)
pd.DataFrame(dataset_blend_train).head()


(1300, 5)
(497, 5)


Unnamed: 0,0,1,2,3,4
0,0,0,0,0,0
1,0,0,0,0,0
2,0,0,0,0,0
3,0,0,0,0,0
4,0,0,0,0,0


In [14]:
### Loop over the models
for j, clf in enumerate(clfs):
    print j, clf
    
    ### Create a tempory array that is (Holdout_Size, N_Folds).
    ### Number of testing data x Number of folds , we will take the mean of the predictions later
    dataset_blend_test_j = np.zeros((X_submission.shape[0], len(skf)))
    print np.shape(dataset_blend_test_j)
    
    ### Loop over the folds
    for i, (train, test) in enumerate(skf):
        print "Fold", i
        
        ### Extract and fit the train/test section for each fold
        X_train = X[train]
        y_train = y[train]
        X_test  = X[test]
        y_test  = y[test]
        clf.fit(X_train, y_train)
        
        ### Predict the probability of current folds test set and store results.
        ### This output will be the basis for our blended classifier to train against,
        ### which is also the output of our classifiers
        dataset_blend_train[test, j] = clf.predict_proba(X_test)[:,1]
        
        ### Predict the probabilty for the holdout set and store results
        dataset_blend_test_j[:, i] = clf.predict_proba(X_submission)[:,1]

    ### Take mean of final holdout set folds
    dataset_blend_test[:,j] = dataset_blend_test_j.mean(1)
    

0 RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
(497, 5)
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4
1 RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
(497, 5)
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4
2 ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_samples_leaf=1, min_samples_

In [15]:
pd.DataFrame(dataset_blend_test).head()


Unnamed: 0,0,1,2,3,4
0,0.0,0.0,0.0,0.0,0.022828
1,0.02,0.02,0.0,0.02,0.049683
2,0.0,0.0,0.0,0.0,0.02501
3,0.12,0.12,0.08,0.12,0.104259
4,0.0,0.04,0.0,0.0,0.020743


In [16]:
pd.DataFrame(dataset_blend_train).head()


Unnamed: 0,0,1,2,3,4
0,0.0,0.0,0.0,0.0,0.021835
1,0.7,0.8,0.8,0.9,0.863027
2,0.3,0.1,0.1,0.2,0.030591
3,0.0,0.0,0.0,0.0,0.027669
4,0.0,0.1,0.1,0.0,0.072673


In [18]:
### We now have a new dataset with dimensions (N_train X N_models)
### Fit a logistic regression and predict on blended holdout set
print "Blending...."
clf = LogisticRegression()
clf.fit(dataset_blend_train, y)
y_submission = clf.predict_proba(dataset_blend_test)[:,1]


Blending....


In [20]:
pd.DataFrame(y_submission).head()


Unnamed: 0,0
0,0.007378
1,0.00976
2,0.007434
3,0.035971
4,0.008266


In [22]:
print "Linear stretch of predictions to [0,1]"
y_submission = (y_submission - y_submission.min()) / (y_submission.max() - y_submission.min())


Linear stretch of predictions to [0,1]


In [23]:
pd.DataFrame(y_submission).head()


Unnamed: 0,0
0,0.000164
1,0.002936
2,0.000229
3,0.033435
4,0.001197
