# Part III: Ensembles and Final Result

## AdaBoost

Train an AdaBoost classifier using Decision Tree stubs as weak learners. Compare its performance to results obtained in Part II using 10 fold CV.

In [1]:
%load_ext autoreload
%autoreload 2

In [3]:
import proj2_lib.preprocess as preprocess
train_X, train_y = preprocess.load_train_data()
print(train_X.shape)
print(train_y.shape)

(90526, 101)
(90526,)


In [5]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier

stump = DecisionTreeClassifier(max_depth=1)
adaboost = AdaBoostClassifier(stump)

In [6]:
from sklearn.model_selection import cross_val_score
adaboost_auc = cross_val_score(adaboost, train_X, train_y,
                              scoring='roc_auc')


In [7]:
import numpy as np

# mean AUC of adaboost
np.mean(adaboost_auc)

0.72590750897804091

## Stacking

Choose a set of 5 or so classifiers. Write a function that trains an ensemble using stacking

In [122]:
from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin
from sklearn.pipeline import make_union
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedShuffleSplit

class ClassificationTransformer(TransformerMixin):
    def __init__(self, estimator=DecisionTreeClassifier(),
                func='predict'):
        self.estimator = estimator
        self.func = func
        
    def fit(self, X, y):
        self.estimator = self.estimator.fit(X, y)
        return self
        
    def transform(self, X):
        func = None
        if self.func == 'predict':
            func = self.estimator.predict
        elif self.func == 'decision_function':
            func = self.estimator.decision_function
        elif self.func == 'predict_proba':
            func = lambda X: self.estimator.predict_proba(X)[:,0]
        else:
            func = self.func = self.estimator.predict
        
        return func(X)
    
class StackedClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, estimators=[(DecisionTreeClassifier(), 'predict_proba')]):
        self.estimators = estimators
        transformers = []
        for estimator, func in estimators:
            transformer = ClassificationTransformer(estimator=estimator, func=func)
            transformers.append(transformer)
        
        self.nestimators_ = len(transformers)
        self.pipeline_ = make_union(*transformers)
        self.logreg_ = LogisticRegression()
        
    def transform_(self, X):
        nobs = X.shape[0]
        XX = self.pipeline_.transform(X)
        return XX.reshape((nobs, self.nestimators_))
    
    def fit(self, X, y):
        ssplit = StratifiedShuffleSplit(n_splits=1, test_size=.2)
        for train_index, test_index in ssplit.split(X, y):
            train_X = X[train_index,:]
            validation_X = X[test_index, :]
            
            train_y = y[train_index]
            validation_y = y[test_index]
        
        self.pipeline_ = self.pipeline_.fit(train_X, train_y)
        
        XX = self.transform_(validation_X)
        self.logreg_ = self.logreg_.fit(XX, validation_y)
        return self
    
    def decision_function(self, X):
        XX = self.transform_(X)
        return self.logreg_.decision_function(XX)
    
    def predict(self, X):
        return np.sign(self.decision_function(X))
    

In [123]:
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.svm import LinearSVC

estimators = [
    (AdaBoostClassifier(), 'decision_function'),
    (LinearSVC(C=0.01), 'decision_function'),
    (LinearSVC(C=1.0), 'decision_function'),
    (LinearSVC(C=100.0), 'decision_function'),
    (RandomForestClassifier(n_estimators=10), 'predict_proba'),
    (RandomForestClassifier(n_estimators=100), 'predict_proba')
]

stack = StackedClassifier(estimators=estimators)
stack = stack.fit(train_X[:1000,:], train_y[:1000])

In [124]:
res = stack.predict(train_X)
res[:10]

array([-1., -1., -1., -1., -1., -1., -1., -1., -1., -1.])

Use 10-fold cross validation to measure performance of your stacked classifier. See Part II solution to see how to roll your own sklearn classifier along with http://scikit-learn.org/stable/developers/contributing.html#rolling-your-own-estimator

In [125]:
from sklearn.model_selection import cross_val_score

stack = StackedClassifier(estimators=estimators)
stack_auc = cross_val_score(stack, train_X, train_y,
                            n_jobs=4, cv=10, verbose=1,
                            scoring='roc_auc')

[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:  1.9min finished


In [126]:
np.mean(stack_auc)

0.50490947504689243

## Final Result

Choose a single model based on all previous project steps. Train this model on the complete training dataset and measure it's performance on the held out test set.

Compare to the 10-fold CV estimate you got previously.

In [127]:
# final result goes here
adaboost = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1))
adaboost = adaboost.fit(train_X, train_y)

In [129]:
test_X, test_y = preprocess.load_test_data()

In [131]:
pred_y = adaboost.predict_proba(test_X)

In [134]:
from sklearn.metrics import roc_auc_score
roc_auc_score(test_y, pred_y[:,1])

0.72970096598574885