# Ensemble Learning 

In [9]:
from sklearn import datasets
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier
from mlxtend.classifier import EnsembleVoteClassifier
from sklearn.metrics import accuracy_score
import xgboost as xgb
import numpy as np
import pandas as pd

## Majority Voting

The algorithm that we are going to implement in this section will allow us to combine different classification
algorithms associated with individual weights for confidence. Our goal is to build a stronger
meta-classifier that balances out the individual classifiers’ weaknesses on a particular dataset.

In [2]:
# Import the iris dataset
iris = datasets.load_iris()


X, y = iris.data[50:, [1,2]], iris.target[50:]


# Initial the label Encoder
le = LabelEncoder()

# Transform the labels
y = le.fit_transform(y)


# Split the data into test and training set
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.5, 
                                                    random_state=1, 
                                                    stratify=y)

# Create the classifiers
clf_1 = LogisticRegression(penalty='l2', C=0.001, 
                           solver='lbfgs', 
                           random_state=1)


clf_2 = DecisionTreeClassifier(max_depth=1,
                               criterion='entropy',
                               random_state=0
                               )


clf_3 = KNeighborsClassifier(n_neighbors=1,
                             p=2,
                             metric='minkowski'
                             )


# Ensemble Classifier
eclf = EnsembleVoteClassifier(clfs=[clf_1, clf_2, clf_3], weights=[1,1,1], voting='soft')



pipe_1 = Pipeline([['sc', StandardScaler()],
                  ['clf_1', clf_1]]
                  )

pipe_2 = Pipeline([['sc', StandardScaler()],
                   ['clf_2', clf_2]
                   ])


pipe_3 = Pipeline([['sc', StandardScaler()],
                   ['clf_3', clf_3]
                   ])


pipe_4 = Pipeline([['sc', StandardScaler()],
                   ['eclf', eclf]
                   ])


clf_labels = ['Logistic Regression', 'Decision Tree', 'KNN', 'Ensemble Majority Voting']

print('\n10-fold cross validation\n')


for clf, label in zip([pipe_1, pipe_2, pipe_3, eclf], clf_labels):
    scores = cross_val_score(estimator = clf,
                             X=X_train,
                             y=y_train,
                             cv=10,
                             scoring='roc_auc'
                             )
    
    print(f'roc_auc: {scores.mean():.2f} '
          f'(+/- {scores.std():.2f}) [{label}]')


10-fold cross validation

roc_auc: 0.92 (+/- 0.15) [Logistic Regression]
roc_auc: 0.87 (+/- 0.18) [Decision Tree]
roc_auc: 0.85 (+/- 0.13) [KNN]
roc_auc: 0.97 (+/- 0.10) [Ensemble Majority Voting]


## Evaluating and tuning the ensemble classifier

> Remember that the test data set is not to be used for model selection; its purpose is merely to report an unbiased estimate of the generalization performance of a classifier system:




In [3]:
# Get the best params for the ensemble method

eclf.get_params()

{'logisticregression': LogisticRegression(C=0.001, random_state=1),
 'decisiontreeclassifier': DecisionTreeClassifier(criterion='entropy', max_depth=1, random_state=0),
 'kneighborsclassifier': KNeighborsClassifier(n_neighbors=1),
 'logisticregression__C': 0.001,
 'logisticregression__class_weight': None,
 'logisticregression__dual': False,
 'logisticregression__fit_intercept': True,
 'logisticregression__intercept_scaling': 1,
 'logisticregression__l1_ratio': None,
 'logisticregression__max_iter': 100,
 'logisticregression__multi_class': 'auto',
 'logisticregression__n_jobs': None,
 'logisticregression__penalty': 'l2',
 'logisticregression__random_state': 1,
 'logisticregression__solver': 'lbfgs',
 'logisticregression__tol': 0.0001,
 'logisticregression__verbose': 0,
 'logisticregression__warm_start': False,
 'decisiontreeclassifier__ccp_alpha': 0.0,
 'decisiontreeclassifier__class_weight': None,
 'decisiontreeclassifier__criterion': 'entropy',
 'decisiontreeclassifier__max_depth': 1,

> Based on the values returned by the get_params method, we now know how to access the individual
classifier’s attributes. Let’s now tune the inverse regularization parameter, C, of the logistic regression
classifier and the decision tree depth via a grid search for demonstration purposes:

In [4]:
params = {
            'decisiontreeclassifier__max_depth': [1,2],
            'logisticregression__C': [0.0001, 0.001, 0.1, 100.0]
}

# Fit params to the Grid Search CV
grid = GridSearchCV(estimator=eclf,
                    param_grid=params,
                    cv=10,
                    scoring='roc_auc'
                    )

grid.fit(X_train, y_train)

# Print the best params
print(grid.best_params_)

# Print the Training Score
print(f'ROC AUC for training data:  {grid.best_score_:.2f}')


# Generalized Score on Test Data
print(f'ROC AUC for test data:  {grid.score(X_test, y_test):.2f}')

{'decisiontreeclassifier__max_depth': 1, 'logisticregression__C': 0.0001}
ROC AUC for training data:  0.97
ROC AUC for test data:  0.97


> As you can see, we get the best cross-validation results when we choose a lower regularization strength
(C=0.0001), whereas the tree depth does not seem to affect the performance at all, suggesting that a
decision stump is sufficient to separate the data. To remind ourselves that it is a bad practice to use
the test dataset more than once for model evaluation, we are not going to estimate the generalization
performance of the tuned hyperparameters in this section.

## Bagging – building an ensemble of classifiers from bootstrap samples

In [5]:
df_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data'

df_wine = pd.read_csv(df_url, header=None)

df_wine.columns = ['Class label', 'Alcohol',
                    'Malic acid', 'Ash',
                    'Alcalinity of ash',
                    'Magnesium', 'Total phenols',
                    'Flavanoids', 'Nonflavanoid phenols',
                    'Proanthocyanins',
                    'Color intensity', 'Hue',
                    'OD280/OD315 of diluted wines',
                    'Proline']


# Drop 1 class
df_wine = df_wine.loc[df_wine['Class label'] != 1]


y = df_wine['Class label'].values

X = df_wine[['Alcohol', 'OD280/OD315 of diluted wines']].values

In [6]:
# Next, encode the labels
le = LabelEncoder()

y = le.fit_transform(y)

# Check the class mappings
# le.classes_

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2, 
                                                    random_state=1, 
                                                    stratify=y)


tree = DecisionTreeClassifier(criterion='entropy', random_state=1, max_depth=None)


# Fit the Decision Tree

tree = tree.fit(X_train, y_train)

y_train_pred = tree.predict(X_train)

y_test_pred = tree.predict(X_test)

tree_train = accuracy_score(y_train, y_train_pred)

tree_test = accuracy_score(y_test, y_test_pred)


print(f'Decision tree train/test accuracies '
        f'{tree_train:.3f}/{tree_test:.3f}')

Decision tree train/test accuracies 1.000/0.833


In [7]:
bag = BaggingClassifier(base_estimator=tree, n_estimators=500,
                        max_samples=1.0, max_features=1.0, 
                        bootstrap=True, bootstrap_features=False,
                        n_jobs=-1, random_state=1
                        )

# Fit the Bag

bag = bag.fit(X_train, y_train)

y_train_pred = bag.predict(X_train)

y_test_pred = bag.predict(X_test)

tree_train = accuracy_score(y_train, y_train_pred)

tree_test = accuracy_score(y_test, y_test_pred)


print(f'Bag train/test accuracies '
        f'{tree_train:.3f}/{tree_test:.3f}')

Bag train/test accuracies 1.000/0.917


> Although the training accuracies of the decision tree and bagging classifier are similar on the training
dataset (both 100 percent), we can see that the bagging classifier has a slightly better generalization
performance, as estimated on the test dataset.

## Applying AdaBoost using scikit-learn



In [8]:
tree = DecisionTreeClassifier(criterion='entropy', 
                              random_state=1, 
                              max_depth=1)



ada = AdaBoostClassifier(base_estimator=tree,
                            n_estimators=500,
                            learning_rate=0.1,
                            random_state=1)


ada = ada.fit(X_train, y_train)

y_train_pred = ada.predict(X_train)

y_test_pred = ada.predict(X_test)

ada_train = accuracy_score(y_train, y_train_pred)

ada_test = accuracy_score(y_test, y_test_pred)

print(f'AdaBoost train/test accuracies '
        f'{ada_train:.3f}/{ada_test:.3f}')

AdaBoost train/test accuracies 1.000/0.917


## Implementing XGBoost

In [10]:
model = xgb.XGBClassifier(n_estimators=1500, learning_rate=0.01,
                          max_depth=4, random_state=1, use_label_encoder=False
                          )


gbm = model.fit(X_train, y_train)

y_train_pred = gbm.predict(X_train)

y_test_pred = gbm.predict(X_test)


# Evaluation Metrics
gbm_train = accuracy_score(y_train, y_train_pred)

gbm_test = accuracy_score(y_test, y_test_pred)


print(f'XGboost train/test accuracies '
        f'{gbm_train:.3f}/{gbm_test:.3f}')



XGboost train/test accuracies 0.968/0.917


> Here, I fit the gradient boosting classifier with 1,500 trees (rounds) and a learning rate of 0.01. Typically,
a learning rate between 0.01 and 0.1 is recommended. However, remember that the learning rate
is used for scaling the predictions from the individual rounds. So, intuitively, the lower the learning
rate, the more estimators are required to achieve accurate predictions.