In [1]:
%load_ext autoreload
%autoreload 2
import noshow_lib.util as utils

In [2]:
utils.file_config

{'raw_data_path': 'data',
 'raw_data_csv': 'KaggleV2-May-2016.csv',
 'processed_data_path': 'processed_data',
 'train_csv': 'train_set.csv',
 'test_csv': 'test_set.csv',
 'objstore_path': 'objects',
 'feature_pipeline_file': 'feature_pipeline.pkl',
 'labels_pipeline_file': 'labels_pipeline.pkl'}

In [3]:
file_config = utils.file_config

In [4]:
import noshow_lib.preprocess as preprocess
train_X, train_y = preprocess.load_train_data(config=file_config)

In [5]:
print(train_X.shape)
print(train_y.shape)

(90526, 113)
(90526,)


In [6]:
test_X, test_y = preprocess.load_test_data(config=file_config)

In [7]:
print(test_X.shape)
print(test_y.shape)

(20000, 113)
(20000,)


# Part III: Ensembles and Final Result

## AdaBoost

Train an AdaBoost classifier and compare its performance to results obtained in Part II using 10 fold CV.

In [8]:
import numpy as np

SEED = 7
np.random.seed(SEED)

In [9]:
# AdaBoost code goes here
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import cross_val_score

num_trees = 70

model = AdaBoostClassifier(n_estimators=num_trees, random_state=SEED)
results = cross_val_score(model, train_X, train_y, cv=10)
print(results.mean())

0.7973068170408475


## xgBoost

Train an xgBoost classifier and compare its performance to results in Part II using 10 fold CV. `sklearn` has a gradient boosting model included http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html which you can use. The `xgboost` package https://xgboost.readthedocs.io/en/latest/python/python_intro.htmlhas a wrapper you can use with sklearn as well https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn. The latter is more efficient at training time.

In [10]:
# xgboost code here
from sklearn.ensemble import GradientBoostingClassifier

num_trees = 100

model = GradientBoostingClassifier(n_estimators=num_trees, random_state=SEED)
results = cross_val_score(model, train_X, train_y, cv=10)
print(results.mean())

0.7981574345977211


## Stacking

Choose a set of 5 or so classifiers. Write a function that trains an ensemble using stacking

In [11]:
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, 
                              GradientBoostingClassifier,ExtraTreesClassifier)

def get_models():
    #Generate a library of base learners
    rfc = RandomForestClassifier(max_features= 'sqrt', n_jobs=4, n_estimators=150, oob_score = True) 
    gb = GradientBoostingClassifier(n_estimators=100, random_state=SEED)
    rf = RandomForestClassifier(n_estimators=150, n_jobs=4, max_features=20, random_state=SEED)
    ab = AdaBoostClassifier(n_estimators=70, random_state=SEED)
    et = ExtraTreesClassifier(n_estimators=10, criterion='entropy')

    models = {'random forest-sqrt': rfc,
              'GradientBoosting': gb,
              'random forest-20': rf,
              'AdaBoost': ab,
              'ExtraTreesClassifier': et,
              }

    return models

In [12]:
def train_base_learners(base_learners, X, y):
    for i, (name, m) in enumerate(base_learners.items()):
        m.fit(X, y)

In [13]:
def predict_base_learners(pred_base_learners, X):
    P = np.zeros((X.shape[0], len(pred_base_learners)))

    for i, (name, m) in enumerate(pred_base_learners.items()):
        p = m.predict_proba(X)
        P[:, i] = p[:, 1]

    return P

In [14]:
from sklearn.base import clone
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedShuffleSplit

lr = LogisticRegression(random_state=SEED)

def build_stack_ensemble(X, y):
    
    base_learners = get_models()
    lr_learner = clone(lr)    
    
    train_base_learners(base_learners, X, y)

    print("Generating cross-validated predictions...")
    cv_preds, cv_y = [], []
    
    generator = StratifiedShuffleSplit(n_splits=1)
    for train_index, test_index in generator.split(X, y):
        fold_xtrain, fold_xtest = X[train_index], X[test_index]
        fold_ytrain, fold_ytest = y[train_index], y[test_index]

        fold_base_learners = {name: clone(model)
                              for name, model in base_learners.items()}
        
        train_base_learners(fold_base_learners, fold_xtrain, fold_ytrain)

        fold_P_base = predict_base_learners(fold_base_learners, fold_xtest)

        cv_preds.append(fold_P_base)
        cv_y.append(fold_ytest)

    cv_preds = np.vstack(cv_preds)
    cv_y = np.hstack(cv_y)

    # Train meta learner
    lr_learner.fit(cv_preds, cv_y)
    
    print("Done")

    return base_learners, lr_learner

Use 10-fold cross validation to measure performance of your stacked classifier. See Part II solution to see how to roll your own sklearn classifier along with http://scikit-learn.org/stable/developers/contributing.html#rolling-your-own-estimator

## Final Result

Choose a single model based on all previous project steps. Train this model on the complete training dataset and measure it's performance on the held out test set.

Compare to the 10-fold CV estimate you got previously.

In [15]:
def ensemble_predict(base_learners, meta_learner, inpX):
    P_pred = predict_base_learners(base_learners, inpX)
    return P_pred, meta_learner.predict_proba(P_pred)[:, 1]

In [16]:
# final result goes here
# Train with stacking
cv_base_learners, cv_meta_learner = build_stack_ensemble(train_X, train_y)

P_pred, p = ensemble_predict(cv_base_learners, cv_meta_learner, test_X)

Generating cross-validated predictions...
Done


In [17]:
from sklearn.metrics import roc_curve, auc, roc_auc_score

print("\nEnsemble ROC-AUC score: %.3f" % roc_auc_score(test_y, p))


Ensemble ROC-AUC score: 0.749


In [18]:
from sklearn.model_selection import cross_val_score

results = cross_val_score(cv_meta_learner, train_X, train_y, cv=10)
print("\nEnsemble mean AUC score: %.3f" % results.mean())


Ensemble mean AUC score: 0.795


### Conclusion
My stacking doesn't have a better performance than AdaBoost and GradientBoosting.

In [25]:
### Try with python library
from mlxtend.classifier import StackingClassifier

base_learners = get_models()

lr = LogisticRegression(random_state=SEED)

sclf = StackingClassifier(classifiers=list(base_learners.values()), meta_classifier=lr)

scores = cross_val_score(sclf, train_X, train_y, cv=10, scoring='accuracy')
print("StackingClassifier Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std()))

StackingClassifier Accuracy: 0.78 (+/- 0.00)


### Conclusion
Python stacking library performance is even worse than mine. Why??