# Bootstrap Aggregation algorithm

In [1]:
import random
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### Load data

In [2]:
# Read the raw CSV (it has no header row) into a pandas dataframe
dataset = pd.read_csv('sonar.all-data', header=None)

# Features: every column except the last one, as a numpy array
X = dataset.iloc[:, :-1].to_numpy()

# Labels: the last column, encoded as int (1 where the label is 'M', 0 otherwise)
y = (dataset.iloc[:, -1].to_numpy() == 'M').astype(int)

n_rows, n_cols = X.shape[0], X.shape[-1]
print("Number of samples: ", n_rows)
print('Number of features: ', n_cols)

Number of samples:  208
Number of features:  60


In [3]:
def KFold_split(X, Y, k=5, test_ratio=0.2, seed=264):
    """
    Hold out a test set, then cut the remaining samples into k train/validation folds.

    X, Y       : observations and labels, indexable by an array of row positions
    k          : number of folds
    test_ratio : value forwarded as `test_size` to train_test_split
    seed       : random_state used for both the test split and the fold shuffling

    Returns (X_train_folds, Y_train_folds, X_val_folds, Y_val_folds, X_test, Y_test);
    the first four are lists with one entry per fold.
    Also prints the sizes of the first fold and of the test set.
    """
    # Step 1: carve the test set out of the full dataset
    X_rest, X_test, Y_rest, Y_test = train_test_split(
        X, Y, test_size=test_ratio, shuffle=True, random_state=seed
    )

    # Step 2: compute the row indices of every train/validation fold once
    splitter = KFold(n_splits=k, shuffle=True, random_state=seed)
    fold_indices = list(splitter.split(X_rest, Y_rest))

    # Step 3: materialise the folds from those indices
    X_train_folds = [X_rest[train_idx] for train_idx, _ in fold_indices]
    X_val_folds = [X_rest[val_idx] for _, val_idx in fold_indices]
    Y_train_folds = [Y_rest[train_idx] for train_idx, _ in fold_indices]
    Y_val_folds = [Y_rest[val_idx] for _, val_idx in fold_indices]

    print("Training dataset size:   ", len(X_train_folds[0]))
    print("Validation dataset size: ", len(X_val_folds[0]))
    print("Test dataset size:       ", len(X_test))
    return X_train_folds, Y_train_folds, X_val_folds, Y_val_folds, X_test, Y_test

### Subsample function


You can use the function [random.choices()](https://docs.python.org/3/library/random.html#random.choices) to draw a subsample with replacement.

In [4]:
def bootstrap_sample(
    X,
    y,
    n_samples=None,   # number of samples in the subsampling
    rng=None          # optional random.Random instance for reproducible draws
):
    """
    Create a random subsample from the dataset with replacement

    Row positions are drawn uniformly with replacement, and the same positions
    are applied to both X and y so every sampled observation keeps its label.

    Parameters
    ----------
    X : observations; must support indexing by a list of integer positions
        (e.g. a numpy array)
    y : labels, same length as X and indexable the same way
    n_samples : int or None
        Size of the subsample. None means "as many as there are rows in X".
    rng : random.Random or None
        Source of randomness. None uses the module-level `random` generator;
        pass `random.Random(seed)` for a draw that neither depends on nor
        modifies the global random state.

    Returns
    -------
    (X_sample, y_sample)

    Raises
    ------
    ValueError
        If X and y do not have the same length.
    """
    # Mismatched lengths would otherwise either fail with an obscure IndexError
    # or silently pair observations with the wrong subset of labels.
    if len(X) != len(y):
        raise ValueError(
            "X and y must have the same length, got %d and %d" % (len(X), len(y))
        )
    if n_samples is None:
        n_samples = len(X)
    chooser = random if rng is None else rng
    idx = chooser.choices(range(len(X)), k=n_samples)
    return X[idx], y[idx]

def count_ratio_unique_bootstrap_sample(X_sample):
    """
    Return the fraction of rows in X_sample that are distinct
    (number of unique rows divided by the total number of rows).
    """
    distinct_rows = np.unique(X_sample, axis=0)
    n_distinct = distinct_rows.shape[0]
    n_total = len(X_sample)
    return n_distinct / n_total

# Seed both global generators before the first stochastic step so that
# "Restart & Run All" reproduces the same numbers: the bootstrap draws come
# from Python's `random` module, and scikit-learn estimators created without a
# random_state fall back to numpy's global generator.
SEED = 264   # same value as KFold_split's default seed
random.seed(SEED)
np.random.seed(SEED)

# Build the test set and the train/validation folds used by the rest of the notebook
X_train_folds, Y_train_folds, X_val_folds, Y_val_folds, X_test, Y_test = KFold_split(X, y)

# Sanity check: draw one bootstrap sample from the first training fold and
# report which fraction of its rows are distinct
X_sample, y_sample = bootstrap_sample(X_train_folds[0],Y_train_folds[0])
print("Ratio of unique element in the subsample: ", count_ratio_unique_bootstrap_sample(X_sample))

Training dataset size:    132
Validation dataset size:  34
Test dataset size:        42
Ratio of unique element in the subsample:  0.6212121212121212


### Bagging train and predict functions


**Note**: in scikit-learn, all supervised estimators implement a ``fit(X, y)`` method and a ``predict(X)`` method with ``X`` being unlabeled observations and  ``y`` being labels. 

Therefore, the ``Classifier`` parameter can be any sklearn class implementing a supervised classifier.

(See the *The problem solved in supervised learning* section of the supervised learning tutorial in the [sklearn documentation](https://scikit-learn.org/stable/tutorial/statistical_inference/supervised_learning.html).)

In [5]:
def bagging_train(
    X_train, 
    y_train, 
    n_clfs,                                  # number of classifier
    Classifier = DecisionTreeClassifier,     # Python class of classifier
    clfs_args = None,                        # Specific python class of classifier's arguments
):
    """
    Bootstrap Aggregation training algorithm

    Trains `n_clfs` independent classifiers, each one on its own bootstrap
    sample (drawn with replacement) of the training data.

    Parameters
    ----------
    X_train, y_train : training observations and labels
    n_clfs : int
        Number of classifiers to train.
    Classifier : class
        Any class whose instances expose `fit(X, y)`; instantiated once per
        ensemble member as `Classifier(**clfs_args)`.
    clfs_args : dict or None
        Keyword arguments passed to the `Classifier` constructor.
        None (the default) means "no extra arguments".

    Returns
    -------
    list of the `n_clfs` fitted classifiers composing the bagging classifier
    """
    # Use a None sentinel instead of a `{}` default: a dict default is created
    # once and shared by every call of the function.
    if clfs_args is None:
        clfs_args = {}

    clfs = []
    for _ in range(n_clfs):
        # Each ensemble member sees a different resampling of the training set
        sample_X, sample_y = bootstrap_sample(X_train, y_train)
        # Fresh, unfitted classifier for this member
        clf = Classifier(**clfs_args)
        clf.fit(sample_X, sample_y)
        clfs.append(clf)
    # Return the list of trained classifiers composing the bagging classifier
    return clfs


def bagging_predict(
    clfs,     # list of classifiers composing the bagging classifier
    X
):
    """
    Bootstrap Aggregation predict algorithm

    Every classifier votes for each row of X, and the most common vote is the
    prediction for that row. On a tie, the tied label that appears first in
    classifier order wins.

    Parameters
    ----------
    clfs : list of fitted classifiers, each exposing `predict(X)`
    X : observations to classify, one per row

    Returns
    -------
    numpy array with one predicted label per row of X (empty if X is empty)

    Raises
    ------
    ValueError
        If X is non-empty and `clfs` is empty (there is nobody to vote).
    """
    # Nothing to predict: return an empty array without calling any classifier
    if len(X) == 0:
        return np.array([])
    if len(clfs) == 0:
        raise ValueError("bagging_predict needs at least one classifier")

    # One batched predict call per classifier instead of one call per
    # (row, classifier) pair: same votes, far fewer calls.
    votes_per_clf = [np.asarray(clf.predict(X)) for clf in clfs]

    y_pred = []
    # zip(*...) regroups the votes row by row, keeping classifier order
    for row_votes in zip(*votes_per_clf):
        predictions = list(row_votes)
        # Get the most common prediction and append it to 'y_pred'
        y_pred.append(max(predictions, key=predictions.count))
    return np.array(y_pred)


### Bagging pipeline using cross validation

In [6]:
def bagging_CV_pipeline(
    n_clfs,             # number of classifiers
    Classifier,         # Python class of classifier
    clfs_args = None,   # Classifier's hparams
):
    """
    Cross validation step of the machine learning pipeline for a bagging algorithm

    For every train/validation fold, trains a bagging classifier made of
    `n_clfs` instances of `Classifier` and scores it on both the fold's
    training part and its validation part.

    NOTE: the folds are not passed in; they are read from the module-level
    variables X_train_folds, X_val_folds, Y_train_folds and Y_val_folds, which
    must therefore exist before this function is called.

    Parameters
    ----------
    n_clfs : int
        Number of classifiers in each bagging classifier.
    Classifier : class
        Classifier class forwarded to `bagging_train`.
    clfs_args : dict or None
        Keyword arguments for the `Classifier` constructor.
        None (the default) means "no extra arguments".

    Returns
    -------
    (mean training accuracy, mean validation accuracy) over all folds
    """
    # Use a None sentinel instead of a shared `{}` default, and always hand a
    # real dict to bagging_train.
    if clfs_args is None:
        clfs_args = {}

    train_accs = []
    val_accs = []
    
    # For each set of k-folds get the bagging classifier and its accuracy
    for X_train_fold, X_val_fold, y_train_fold, y_val_fold in zip(
        X_train_folds, X_val_folds, Y_train_folds, Y_val_folds
        ):
        
        # 'clfs' are the classifiers associated with the current bagging classifier
        clfs = bagging_train(X_train_fold, y_train_fold, n_clfs, Classifier, clfs_args=clfs_args)
        
        # Training scores of the current bagging classifier
        y_pred = bagging_predict(clfs, X_train_fold)
        train_accs.append(accuracy_score(y_train_fold, y_pred))
        
        # Validation scores of the current bagging classifier
        y_pred = bagging_predict(clfs, X_val_fold)
        val_accs.append(accuracy_score(y_val_fold, y_pred))
        
    # Return the mean scores 
    return np.mean(train_accs), np.mean(val_accs)

### Training different bagging models on the sonar dataset


In [7]:
# Ensemble sizes to compare, and the mean CV scores collected for each of them
list_n_trees = [1, 5, 10, 15, 20, 25, 30]
mean_train_accs, mean_val_accs = [], []

# Run one full K-fold cross validation per candidate ensemble size
for n_trees in list_n_trees:
    train_acc, val_acc = bagging_CV_pipeline(
        n_clfs=n_trees,
        Classifier=DecisionTreeClassifier,
        clfs_args={},
    )

    # Keep the scores so the best ensemble size can be selected afterwards
    mean_train_accs.append(train_acc)
    mean_val_accs.append(val_acc)

    print('\nNumber of trees: %d' % n_trees)
    print('Mean training accuracy:     %.4f' % train_acc)
    print('Mean validation accuracy:   %.4f' % val_acc)


Number of trees: 1
Mean training accuracy:     0.8810
Mean validation accuracy:   0.6567

Number of trees: 5
Mean training accuracy:     0.9669
Mean validation accuracy:   0.7595

Number of trees: 10
Mean training accuracy:     0.9864
Mean validation accuracy:   0.7414

Number of trees: 15
Mean training accuracy:     0.9970
Mean validation accuracy:   0.7893

Number of trees: 20
Mean training accuracy:     0.9985
Mean validation accuracy:   0.7412

Number of trees: 25
Mean training accuracy:     1.0000
Mean validation accuracy:   0.7704

Number of trees: 30
Mean training accuracy:     1.0000
Mean validation accuracy:   0.7531


### Selecting and evaluating the best bagging model

In [8]:
# Model selection: keep the ensemble size with the highest mean validation accuracy
i_best = np.argmax(mean_val_accs)
best_n_trees = list_n_trees[i_best]

# Retrain on all non-test data: the training and validation parts of fold 0
# together make up the whole train/validation set
X_train_val = np.concatenate([X_train_folds[0], X_val_folds[0]])
Y_train_val = np.concatenate([Y_train_folds[0], Y_val_folds[0]])
best_model = bagging_train(X_train_val, Y_train_val, best_n_trees)

print("\nBest model selected with n_trees=", best_n_trees)

# Accuracy on the data the final model was trained on
train_val_acc = accuracy_score(Y_train_val, bagging_predict(best_model, X_train_val))
print("\nTraining accuracy: (incl. validation dataset)   ", train_val_acc)

# Accuracy on the held-out test set
test_acc = accuracy_score(Y_test, bagging_predict(best_model, X_test))
print("Test accuracy:                                  ", test_acc)


Best model selected with n_trees= 15

Training accuracy: (incl. validation dataset)    1.0
Test accuracy:                                   0.7380952380952381
