In [65]:
'''
Purpose: Two-stage (stacked) model. Multiple classifiers at the base stage generate additional (prediction) features;
a single meta-classifier then uses them to make the final prediction.

See: https://github.com/daxiongshu/kaggle-tradeshift-winning-solution/blob/master/TradeshiftTextClassification.pdf
'''

from __future__ import division
import numpy as np
from sklearn.cross_validation import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
import pandas as pd



In [66]:
### Load the scikit-learn handwritten-digits dataset and report its size
from sklearn.datasets import load_digits
digits = load_digits()
# Shape of the feature matrix, printed as (n_samples, n_features).
print(digits['data'].shape)


(1797, 64)


In [67]:
### Just break it up to some train/test set
# The first N_TRAIN rows are used for training (base + meta); the remainder is
# held out as the "submission" set, together with its true labels.
N_TRAIN = 1300
X, y = digits['data'], digits['target']
X, X_submission = X[:N_TRAIN], X[N_TRAIN:]
y, y_submission_actual = y[:N_TRAIN], y[N_TRAIN:]
# This cell originally bound the held-out features only under the misspelled
# name `X_sumbission`, while other cells read `X_submission` (NameError on a
# fresh kernel). Both spellings are referenced elsewhere in the notebook, so the
# misspelling is kept as an alias of the same array.
X_sumbission = X_submission


In [68]:
# Sanity-check the split: training features, held-out features, training labels.
# print(...) with a single argument emits the same text on Python 2 and 3.
for arr in (X, X_submission, y):
    print(np.shape(arr))


(1300, 64)
(497, 64)
(1300,)


In [69]:
'''
Split the training set into 'base' and 'meta' components.
This can be done three ways:
    1) Random 50-50 split
    2) First half for base, second half for meta
    3) First half for meta, second half for base

You should test all three ways and pick whichever gives the best scores
    -OR- just do all three and ensemble them together at the end
'''

## For now, just do the random 50% split (random_state fixed so the split is repeatable)
X_base, X_meta, y_base, y_meta = train_test_split(X, y, test_size=0.5, random_state=42)

# Report the shape of each piece, one per line: base/meta features, then base/meta labels.
# print(...) with a single argument emits the same text on Python 2 and 3.
for part in (X_base, X_meta, y_base, y_meta):
    print(np.shape(part))


(650, 64)
(650, 64)
(650,)
(650,)


In [70]:
'''
Train on base data and predict on meta

Here we can train multiple models; all parameters should be tuned using a grid search and CV
'''

### We can use as many classifiers as we want
# random_state is pinned (same value as the one passed to train_test_split) so
# the base predictions, and therefore the meta-features, are reproducible
# across Restart & Run All.
clf_base_rf = RandomForestClassifier(n_estimators=10, n_jobs=-1, criterion='gini', random_state=42)
clf_base_et = ExtraTreesClassifier(n_estimators=10, n_jobs=-1, criterion='gini', random_state=42)

### We only train on the 'base' part of the data
clf_base_rf.fit(X_base, y_base)
clf_base_et.fit(X_base, y_base)

print("Done training....")

# Meta-features: each base model's predicted probability for the 'meta' rows.
# NOTE(review): [:,1] keeps only column 1 of predict_proba, i.e. the probability
# of the second entry in clf.classes_. The target is the multi-class digits
# label, so every other class column is discarded -- presumably carried over
# from a binary problem; confirm whether all columns should be stacked. The
# test-set predictions are sliced the same way, so any change must be made in
# both places at once.
X_meta_pred_rf = clf_base_rf.predict_proba(X_meta)[:,1]
X_meta_pred_et = clf_base_et.predict_proba(X_meta)[:,1]


Done training....


In [71]:
'''
Train on the meta data with raw features and meta features (predictions).
This only uses ONE classifier and combines all the raw + meta features
'''

### Here we build only a single classifier
# random_state is pinned so the fitted meta model is reproducible on re-run.
clf_meta = RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='gini', random_state=42)

### The meta classifier is trained with all raw and prediction features
# Column order is: raw features, RF prediction, ET prediction. Whatever is
# passed to clf_meta at prediction time must use the same columns in the same order.
X_meta_stacked = np.column_stack((X_meta, X_meta_pred_rf, X_meta_pred_et))
clf_meta.fit(X_meta_stacked, y_meta)

print("Done training....")


Done training....


In [72]:
'''
For the test set, we want to train on ALL the available data!
In this way, the meta features of the test set become more informative,
hence generating more accurate final predictions.
'''

### Refit the base models on the full training data (base + meta rows together)
# fit() is called on the same estimator objects that were earlier fitted on the
# 'base' half only, so from here on clf_base_rf / clf_base_et refer to the
# refitted models.
clf_base_rf.fit(X, y)
clf_base_et.fit(X, y)

print("Done training....")

# Test-set meta-features, sliced with [:,1] exactly like the meta-set
# predictions so the meta classifier sees the same columns.
# `X_sumbission` (sic) is the name the train/test split cell assigns to the
# held-out feature rows.
X_test_pred_rf = clf_base_rf.predict_proba(X_sumbission)[:,1]
X_test_pred_et = clf_base_et.predict_proba(X_sumbission)[:,1]


Done training....


In [73]:
'''
Final prediction using the meta model
'''

# Assemble the test-set design matrix: raw features first, then the RF and ET
# base-model prediction columns.
X_test_stacked = np.column_stack((X_submission, X_test_pred_rf, X_test_pred_et))
# Keep column 1 of the meta model's class-probability output.
pred = clf_meta.predict_proba(X_test_stacked)[:,1]

In [78]:
#'''
#We can also use training on the full set of data with other classifiers
#'''
# NOTE(review): disabled sketch, kept for reference. clf_meta is fitted on the
# raw features plus two prediction columns (rf, et); the column_stack below adds
# a third (lr), so as written it would pass clf_meta a different number of
# feature columns than it was trained on. Presumably the meta classifier would
# first need to be retrained with a matching LR column for X_meta -- confirm
# before enabling. The sketch also mixes the `X_sumbission` / `X_submission`
# spellings.
#clf_full_lr = LogisticRegression()
#clf_full_lr.fit(X,y)
#X_test_pred_full_lf = clf_full_lr.predict_proba(X_sumbission)[:,1]
#
#pred2 = clf_meta.predict_proba( np.column_stack((X_submission, X_test_pred_rf, X_test_pred_et, X_test_pred_full_lf)) )[:,1]
#