In [49]:
'''
[Model Stacking -- CV]


Overview: Use CV on the training data to generate new meta-features
for the training and test sets. The output is used as the input to the final model.

Purpose: Two-stage model. Multiple classifiers at the base stage generate additional (prediction) features.
A single meta-classifier then uses those features to make the final prediction.


See: https://github.com/emanuele/kaggle_pbr/blob/master/blend.py
     http://mlwave.com/kaggle-ensembling-guide/ 
     https://github.com/MLWave/Kaggle-Ensemble-Guide

'''

from __future__ import division
import numpy as np
from sklearn.cross_validation import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
import pandas as pd
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.metrics import accuracy_score, auc, roc_curve, roc_auc_score 

In [18]:
### Pick your parameters
n_folds = 4      # number of stratified CV folds used to build the meta-features
verbose = True   # NOTE(review): not read by any cell visible in this notebook
shuffle = False  # if True, the training rows are permuted before the folds are built
seed = 0         # fixed seed so that Restart & Run All reproduces the same numbers

### Seed NumPy's global generator once, up front. Anything in the notebook that
### draws from it (np.random.permutation, and -- per the scikit-learn docs --
### estimators / dataset generators left at random_state=None) becomes repeatable.
np.random.seed(seed)

In [19]:
### Generate a synthetic binary-classification problem (2000 rows, 20 features)
X, y = make_classification(n_classes=2, n_samples=2000, n_features=20,
                           n_informative=12, n_redundant=2, n_repeated=0)

### The first 1300 rows become the training set; the remaining rows are held
### out as the "submission" set, with their true labels kept for scoring.
X, X_submission = np.split(X, [1300])
y, y_submission_actual = np.split(y, [1300])


In [34]:
### Peek at the first five rows of the training features
pd.DataFrame(X).head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,-0.587458,2.312875,-1.795392,2.070093,0.801752,-2.002577,-1.73209,-0.570551,-0.052151,1.235671,-0.8489,-1.413777,0.630016,-1.049096,-1.999829,-3.107775,-0.256758,-0.150798,2.592104,-5.70415
1,0.371422,1.277746,1.291201,-1.48495,3.532555,-3.019542,-2.833346,-1.703076,1.685095,-1.029635,-0.826717,-2.870259,0.128208,0.671259,-1.659256,0.312633,1.260473,-0.259492,3.438935,0.815347
2,4.526131,2.704997,3.61039,-3.143306,-5.375422,-1.416968,-2.041065,-1.967601,1.186167,-1.532093,0.802911,0.671255,0.828644,-0.860931,0.511966,2.398668,1.907236,-0.5498,0.293058,9.708642
3,-0.302004,-0.527073,-2.546504,-2.752022,6.53397,-1.704109,0.155528,-2.996518,0.31871,-2.268114,-0.544799,1.665569,0.605971,-1.500614,-2.761442,0.210916,2.985868,-0.892415,3.704173,-0.891367
4,1.545038,-1.035951,3.783695,-0.708187,-3.243478,-0.012637,-1.327421,1.90075,-1.582615,-0.425066,-0.632326,3.039312,-1.456517,-1.861158,0.80957,-1.627826,-0.284479,-0.659042,2.986714,6.540541


In [20]:
### Sanity-check the shapes of the training features, holdout features and training labels.
### Single-argument print(...) runs on both Python 2 and Python 3 with identical output.
print(np.shape(X))
print(np.shape(X_submission))
print(np.shape(y))

(1300, 20)
(700, 20)
(1300,)


In [21]:
### Optional extra round of randomness: reorder the training rows and their
### labels with one shared random permutation so they stay aligned.
if shuffle:
    idx = np.random.permutation(len(y))
    X, y = X[idx], y[idx]
    

In [22]:
## Materialise the stratified folds as a list of (train_indices, test_indices)
## pairs so that the same splits can be iterated once per base model.
skf = list(StratifiedKFold(y, n_folds=n_folds))

In [23]:
## Base-stage models: each one contributes one column of predicted
## probabilities to the blended (meta-feature) datasets.
clfs = [
    RandomForestClassifier(n_estimators=10, n_jobs=-1, criterion='gini'),
    RandomForestClassifier(n_estimators=10, n_jobs=-1, criterion='entropy'),
    ExtraTreesClassifier(n_estimators=10, n_jobs=-1, criterion='gini'),
    ExtraTreesClassifier(n_estimators=10, n_jobs=-1, criterion='entropy'),
    GradientBoostingClassifier(subsample=0.5, max_depth=6, n_estimators=10),
]

## Single-argument print(...) runs on both Python 2 and Python 3.
print("Creating train and test sets for blending.")

Creating train and test sets for blending.


In [24]:
### Pre-allocate the meta-feature matrices: one row per sample and one
### column per base model, for the training and the holdout set respectively.
dataset_blend_train = np.zeros((len(X), len(clfs)))
dataset_blend_test = np.zeros((len(X_submission), len(clfs)))

In [25]:
### Confirm the pre-allocated shapes (still all zeros at this point).
### Single-argument print(...) runs on both Python 2 and Python 3.
print(np.shape(dataset_blend_train))
print(np.shape(dataset_blend_test))
pd.DataFrame(dataset_blend_train).head()


(1300, 5)
(700, 5)


Unnamed: 0,0,1,2,3,4
0,0,0,0,0,0
1,0,0,0,0,0
2,0,0,0,0,0
3,0,0,0,0,0
4,0,0,0,0,0


In [26]:
### Build the meta-features, one column per base model.
### The print calls use a single pre-formatted argument so the cell runs on
### both Python 2 and Python 3 and emits the same text as before.
for j, clf in enumerate(clfs):
    print("%s %s" % (j, clf))

    ### Temporary array of shape (holdout_size, n_folds): every fold's fitted
    ### model predicts the whole holdout set; the columns are averaged below.
    dataset_blend_test_j = np.zeros((X_submission.shape[0], len(skf)))
    print(np.shape(dataset_blend_test_j))

    ### Loop over the folds
    for i, (train, test) in enumerate(skf):
        print("Fold %s" % i)

        ### Refit the model on this fold's training rows only
        X_train = X[train]
        y_train = y[train]
        X_test = X[test]
        clf.fit(X_train, y_train)

        ### Out-of-fold probabilities of class 1: each training row is scored
        ### by a model that did not see it during fitting. This column is what
        ### the blending classifier is later trained on.
        dataset_blend_train[test, j] = clf.predict_proba(X_test)[:, 1]

        ### Class-1 probabilities for the holdout set from this fold's model
        dataset_blend_test_j[:, i] = clf.predict_proba(X_submission)[:, 1]

    ### Average the per-fold holdout predictions into this model's column
    dataset_blend_test[:, j] = dataset_blend_test_j.mean(axis=1)
    

0 RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
(700, 4)
Fold 0
Fold 1
Fold 2
Fold 3
1 RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
(700, 4)
Fold 0
Fold 1
Fold 2
Fold 3
2 ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_samples_leaf=1, min_samples_split=2,
     

In [30]:
### Holdout meta-features: one fold-averaged probability column per base model.
### Single-argument print(...) runs on both Python 2 and Python 3.
print(pd.DataFrame(dataset_blend_test).shape)
pd.DataFrame(dataset_blend_test).head()


(700, 5)


Unnamed: 0,0,1,2,3,4
0,0.3,0.25,0.325,0.325,0.26513
1,0.35,0.175,0.3,0.225,0.238063
2,0.175,0.125,0.2,0.025,0.185415
3,0.8,0.95,0.8,0.825,0.726178
4,0.2,0.1,0.225,0.3,0.295455


In [31]:
### Training meta-features: one out-of-fold probability column per base model.
### Single-argument print(...) runs on both Python 2 and Python 3.
print(pd.DataFrame(dataset_blend_train).shape)
pd.DataFrame(dataset_blend_train).head()


(1300, 5)


Unnamed: 0,0,1,2,3,4
0,0.3,0.6,0.7,0.7,0.559007
1,0.8,0.9,1.0,1.0,0.802332
2,0.8,0.8,0.7,0.7,0.634365
3,0.9,0.8,0.9,0.9,0.614409
4,0.3,0.5,0.5,0.3,0.431464


In [14]:
### We now have a new dataset with dimensions (N_train X N_models).
### Fit a logistic regression on the out-of-fold predictions and use it to
### predict class-1 probabilities for the blended holdout set.
### Single-argument print(...) runs on both Python 2 and Python 3.
print("Blending....")
clf = LogisticRegression()
clf.fit(dataset_blend_train, y)
y_submission = clf.predict_proba(dataset_blend_test)[:, 1]

Blending....


In [15]:
### First five blended probabilities for the holdout set
pd.DataFrame(y_submission).head(5)

Unnamed: 0,0
0,0.097907
1,0.726669
2,0.297982
3,0.963579
4,0.059891


In [16]:
### Min-max rescale the blended probabilities so they span [0, 1].
### Single-argument print(...) runs on both Python 2 and Python 3.
print("Linear stretch of predictions to [0,1]")
score_min = y_submission.min()
score_range = y_submission.max() - score_min
### If every prediction is identical the range is zero; dividing would turn
### the whole array into NaN, so leave the predictions untouched in that case.
if score_range > 0:
    y_submission = (y_submission - score_min) / score_range

Linear stretch of predictions to [0,1]


In [17]:
### First five blended probabilities after the linear stretch
pd.DataFrame(y_submission).head(5)

Unnamed: 0,0
0,0.090006
1,0.729737
2,0.293571
3,0.97078
4,0.051327


In [51]:
### Score the blended predictions against the held-out labels.
### NOTE(review): the recorded output of this cell is ~0.51 (chance level). The
### blending cells carry lower execution counts (In [14]-[17]) than the
### data-generation cell (In [19]), so y_submission was presumably computed on
### an earlier random dataset than y_submission_actual. Restart & Run All to confirm.
### The message is pre-formatted so the cell runs on both Python 2 and Python 3.
print("AUC Score: %s" % roc_auc_score(y_submission_actual, y_submission))

AUC Score: 0.510005801649


In [29]:
###############################################################
#
# We can extend this by also feeding the original features,
# alongside the base-model predictions, into the final-stage model.
#
###############################################################

In [35]:
### Combine the original features with the base-model predictions for the
### training set: columns 0-19 are the raw features, 20-24 the meta-features.
### Single-argument print(...) runs on both Python 2 and Python 3.
X_meta = np.column_stack((X, dataset_blend_train))
print(X_meta.shape)
pd.DataFrame(X_meta).head()

(1300, 25)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
0,-0.587458,2.312875,-1.795392,2.070093,0.801752,-2.002577,-1.73209,-0.570551,-0.052151,1.235671,...,-3.107775,-0.256758,-0.150798,2.592104,-5.70415,0.3,0.6,0.7,0.7,0.559007
1,0.371422,1.277746,1.291201,-1.48495,3.532555,-3.019542,-2.833346,-1.703076,1.685095,-1.029635,...,0.312633,1.260473,-0.259492,3.438935,0.815347,0.8,0.9,1.0,1.0,0.802332
2,4.526131,2.704997,3.61039,-3.143306,-5.375422,-1.416968,-2.041065,-1.967601,1.186167,-1.532093,...,2.398668,1.907236,-0.5498,0.293058,9.708642,0.8,0.8,0.7,0.7,0.634365
3,-0.302004,-0.527073,-2.546504,-2.752022,6.53397,-1.704109,0.155528,-2.996518,0.31871,-2.268114,...,0.210916,2.985868,-0.892415,3.704173,-0.891367,0.9,0.8,0.9,0.9,0.614409
4,1.545038,-1.035951,3.783695,-0.708187,-3.243478,-0.012637,-1.327421,1.90075,-1.582615,-0.425066,...,-1.627826,-0.284479,-0.659042,2.986714,6.540541,0.3,0.5,0.5,0.3,0.431464


In [38]:
### Combine the original features with the base-model predictions for the
### holdout set, in the same column order as the training matrix.
### Single-argument print(...) runs on both Python 2 and Python 3.
X_submission_meta = np.column_stack((X_submission, dataset_blend_test))
print(X_submission_meta.shape)
pd.DataFrame(X_submission_meta).head()

(700, 25)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
0,0.950449,-2.273414,-2.107039,-1.005434,2.268483,2.40679,1.369929,0.120819,2.143894,-0.14211,...,-0.993104,-3.441775,0.138657,3.179888,-0.184415,0.3,0.25,0.325,0.325,0.26513
1,4.066744,-1.076676,0.436699,-2.441629,-3.110171,0.184484,1.799227,-0.119878,-0.194908,1.707842,...,-1.018622,1.297823,-0.519726,1.065371,7.983361,0.35,0.175,0.3,0.225,0.238063
2,2.28991,-1.216001,6.32661,-0.27902,-9.066829,0.982662,2.42715,-1.366244,-0.304032,-0.750019,...,-1.954249,-1.130042,0.012989,1.604757,9.827646,0.175,0.125,0.2,0.025,0.185415
3,2.015615,-2.531496,1.523174,0.417231,1.243495,-2.403119,-3.699992,0.158596,0.505097,0.06598,...,-0.198233,1.519218,-1.055598,1.981167,1.36716,0.8,0.95,0.8,0.825,0.726178
4,2.25657,-4.075487,-0.184724,3.616682,-0.015257,0.885972,0.701395,0.508195,-0.015729,-1.092625,...,-1.59547,0.564483,-1.150473,-3.081613,-8.849062,0.2,0.1,0.225,0.3,0.295455


In [41]:
### Final-stage model, trained on the original features plus the base-model
### predictions. Several final-stage models could also be fitted here and
### ensembled together.
meta_classifier = RandomForestClassifier(criterion='gini', n_estimators=10, n_jobs=-1)

### fit() returns the fitted estimator, so the class-1 probabilities for the
### holdout set can be read off in the same expression.
y_submission_meta = meta_classifier.fit(X_meta, y).predict_proba(X_submission_meta)[:, 1]

In [42]:
### First five holdout probabilities from the final-stage model
pd.DataFrame(y_submission_meta).head(5)

Unnamed: 0,0
0,0.0
1,0.1
2,0.0
3,1.0
4,0.0


In [52]:
### Score the final-stage predictions against the held-out labels.
### The message is pre-formatted so the cell runs on both Python 2 and Python 3.
print("AUC Score: %s" % roc_auc_score(y_submission_actual, y_submission_meta))

AUC Score: 0.959065689375
