# 12. Approaching Ensembling and Stacking

**Ensembling** in short is combination of different models. We end up with a result by averaging the outputs of every model in simplest usage. Models shouldn't be correlated for better results. Two main ways for averagins:

- If results are probabilities, then a simple mean might be used. 
- If results are predictions, then a majority voting might be used.



In [1]:
%matplotlib inline
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats

Every column indicates the prediction of a model for that instance (datapoint)

|Instance | Model 1 | Model 2 | Model 3|
|---|---|---|---|
|$x_{1}$| 0.1| 0.3|0.1|
|$x_{2}$| 0.9| 0.7|0.6|
|$x_{2}$| 0.3| 0.3|0.4|


```python
# python code for the results
results = np.array([[0.1, 0.3, 0.1],
                    [0.9, 0.7, 0.6],
                    [0.3, 0.3, 0.4]])
```

In [16]:

result_lab = np.array([[0,0,1],
                    [0,1,2],
                    [2,2,2]])

result_prob = np.array([[0.1, 0.3, 0.1],
                    [0.9, 0.7, 0.6],
                    [0.3, 0.3, 0.4]])


def mean_pred(probas):
    return np.mean(probas, axis=1)

# The implementation in the book outputs maximum. This is majority voting
def max_voting(preds):
    result = [0] * preds.shape[0]
    for idx, item in enumerate(preds):
        c = Counter(item)
        result[idx] = c.most_common(1)[0][0]
    return result


def rank_mean(probas):
    
    ranked = []
    for i in range(probas.shape[1]):
        rank_data = stats.rankdata(probas[:,i])
        ranked.append(rank_data)
    ranked = np.column_stack(ranked)
    return np.mean(ranked, axis=1)


print(mean_pred(result_prob))
print(max_voting(result_lab))
print(rank_mean(result_prob))


[0.16666667 0.73333333 0.33333333]
[0, 0, 2]
[1.16666667 3.         1.83333333]


In [13]:
from functools import partial
from scipy.optimize import fmin
from sklearn import metrics

class OptimizeAUC:
    
    def __init__(self):
        self.coef_ = 0
        
        
    def _auc(self, coef, X, y):
        
        x_coef = X * coef
        predictions = np.sum(x_coef, axis=1)
        if len(set(y)) > 2:
            auc_score = metrics.roc_auc_score(y, predictions, multi_class='ovr')
        else:
            auc_score = metrics.roc_auc_score(y, predictions)
        
        return -1.0 * auc_score
    
    def fit(self, X, y):
        loss_partial = partial(self._auc, X=X, y=y)
        initial_coef =  np.random.dirichlet(np.ones(X.shape[1]), size=1)
        self.coef_ = fmin(loss_partial, initial_coef, disp=True)
        
    def predict(self, X):
        x_coef = X * self.coef_
        predictions = np.sum(x_coef, axis=1)
        return predictions


In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.metrics import accuracy_score

# Creating the toy dataset
X, y = make_classification(n_samples=10_000, n_features=30)

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.1, 
                                                    shuffle=True, 
                                                    stratify=y)

clf = LogisticRegression(solver='sag')

results = cross_validate(clf, X_train, y_train, 
                         cv=5, 
                         scoring='accuracy', 
                         n_jobs=-1, 
                         return_estimator=True)

for idx, item in results.items():
    print(f'{idx}: {item}')
    
test_score = np.array(results['test_score'])
model_idx = np.argmax(test_score)
model = results['estimator'][model_idx]
print('\n', model.get_params())
preds = model.predict(X_test)

accuracy = accuracy_score(y_test, preds)
print(f'Test accuracy: {accuracy}')

fit_time: [0.12819815 0.13134813 0.12959719 0.11404514 0.09048676]
score_time: [0.00116539 0.0011251  0.00106859 0.00079179 0.00072479]
estimator: [LogisticRegression(solver='sag'), LogisticRegression(solver='sag'), LogisticRegression(solver='sag'), LogisticRegression(solver='sag'), LogisticRegression(solver='sag')]
test_score: [0.90055556 0.89166667 0.90666667 0.89555556 0.90277778]

 {'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'sag', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}
Test accuracy: 0.897


In [15]:
import xgboost as xgb
from sklearn.datasets import make_classification
from sklearn import ensemble
from sklearn import linear_model
from sklearn import metrics
from sklearn.model_selection import train_test_split

X, y  = make_classification(n_samples=10_000, n_features=25)

X_fold1, X_fold2, y_fold1, y_fold2 = train_test_split(X, y, test_size=0.5, stratify=y)

logreg = linear_model.LogisticRegression()
rf = ensemble.RandomForestClassifier()
xgbc = xgb.XGBClassifier()

logreg.fit(X_fold1, y_fold1)
rf.fit(X_fold1, y_fold1)
xgbc.fit(X_fold1, y_fold1)

pred_logreg = logreg.predict_proba(X_fold2)[:, 1]
pred_xgbc = xgbc.predict_proba(X_fold2)[:, 1]
pred_rf = rf.predict_proba(X_fold2)[:, 1]

avg_pred = (pred_logreg + pred_xgbc + pred_rf) / 3


fold2_preds = np.column_stack((pred_logreg, 
                               pred_xgbc, 
                               pred_rf,
                              avg_pred))

auc_fold2 = []
for i in range(fold2_preds.shape[1]):
    auc = metrics.roc_auc_score(y_fold2, fold2_preds[:,i])
    auc_fold2.append(auc)
    
print(f"Fold-2:LR AUC = {auc_fold2[0]}")
print(f"Fold-2:RF AUC = {auc_fold2[1]}")
print(f"Fold-2:XGB AUC = {auc_fold2[2]}")
print(f"Fold-2:Average Pred AUC = {auc_fold2[3]}")

logreg = linear_model.LogisticRegression()
rf = ensemble.RandomForestClassifier()
xgbc = xgb.XGBClassifier()

logreg.fit(X_fold2, y_fold2)
rf.fit(X_fold2, y_fold2)
xgbc.fit(X_fold2, y_fold2)

pred_logreg = logreg.predict_proba(X_fold1)[:, 1]
pred_xgbc = xgbc.predict_proba(X_fold1)[:, 1]
pred_rf = rf.predict_proba(X_fold1)[:, 1]

avg_pred = (pred_logreg + pred_xgbc + pred_rf) / 3


fold1_preds = np.column_stack((pred_logreg, 
                               pred_xgbc, 
                               pred_rf,
                              avg_pred))

auc_fold1 = []
for i in range(fold1_preds.shape[1]):
    auc = metrics.roc_auc_score(y_fold1, fold1_preds[:, i])
    auc_fold1.append(auc)
    
print(f"Fold-1:LR AUC = {auc_fold1[0]}")
print(f"Fold-1:RF AUC = {auc_fold1[1]}")
print(f"Fold-1:XGB AUC = {auc_fold1[2]}")
print(f"Fold-1:Average Pred AUC = {auc_fold1[3]}")

opt = OptimizeAUC()
opt.fit(fold1_preds[:, :-1], y_fold1)
opt_preds_fold2 = opt.predict(fold2_preds[:,:-1])
auc = metrics.roc_auc_score(y_fold2, opt_preds_fold2)

print(f"Optimized AUC, Fold 2 = {auc}")
print(f"Coefficients = {opt.coef_}")

opt = OptimizeAUC()
opt.fit(fold2_preds[:, :-1], y_fold2)
opt_preds_fold1 = opt.predict(fold1_preds[:, :-1])
auc = metrics.roc_auc_score(y_fold1, opt_preds_fold1)
print(f"Optimized AUC, Fold 1 = {auc}")
print(f"Coefficients = {opt.coef_}")






Fold-2:LR AUC = 0.9819228771076604
Fold-2:RF AUC = 0.9932872789259646
Fold-2:XGB AUC = 0.9923511987761919
Fold-2:Average Pred AUC = 0.9918726386996221




Fold-1:LR AUC = 0.9878219180515069
Fold-1:RF AUC = 0.9954627192740352
Fold-1:XGB AUC = 0.9954403992704638
Fold-1:Average Pred AUC = 0.995201119232179
Optimization terminated successfully.
         Current function value: -0.995642
         Iterations: 41
         Function evaluations: 87
Optimized AUC, Fold 2 = 0.9926998388319743
Coefficients = [-0.02345547  0.04296401  0.27837574]
Optimization terminated successfully.
         Current function value: -0.992966
         Iterations: 39
         Function evaluations: 82
Optimized AUC, Fold 1 = 0.995186719229875
Coefficients = [-0.02957242  0.62947233  0.14069287]


## Scikit-Learn Example

This is a more clean example of using voting between models. 

In [18]:
from sklearn import datasets
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier

iris = datasets.load_iris()
X, y = iris.data[:, 1:3], iris.target
X, y  = make_classification(n_samples=10_000, n_features=25)


clf1 = LogisticRegression(random_state=1)
clf2 = RandomForestClassifier(n_estimators=50, random_state=1)
clf3 = GaussianNB()

eclf = VotingClassifier(
    estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)],
    voting='hard')
clfs_and_names = zip([clf1, clf2, clf3, eclf], 
    ['Logistic Regression', 'Random Forest', 'naive Bayes', 'Ensemble'])

for clf, label in clfs_and_names:
    scores = cross_val_score(clf, X, y, scoring='accuracy', cv=2)
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

Accuracy: 0.86 (+/- 0.01) [Logistic Regression]
Accuracy: 0.92 (+/- 0.01) [Random Forest]
Accuracy: 0.87 (+/- 0.01) [naive Bayes]
Accuracy: 0.87 (+/- 0.01) [Ensemble]
