# Ensemble

## 1. Voting
### 1.1 Load Iris data for comparison

In [1]:
import numpy as np

In [28]:
from sklearn.datasets import load_iris

def prep_train_test(rate=0.9, seed=None):
    """Split the Iris dataset into per-class (stratified) train and test sets.

    For every class label, ``int(n_class * rate)`` samples are drawn without
    replacement for training; the samples that were not drawn form the test
    set.

    Parameters
    ----------
    rate : float, default 0.9
        Fraction of each class assigned to the training set, in ``[0, 1]``.
    seed : int or None, default None
        When given, a private ``np.random.RandomState(seed)`` is used so the
        split is reproducible. When ``None``, the global NumPy random state
        is used (the previous behaviour).

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)``.

    Raises
    ------
    ValueError
        If ``rate`` is outside ``[0, 1]``.
    """
    if not 0 <= rate <= 1:
        raise ValueError("rate must be in [0, 1], got {!r}".format(rate))

    iris = load_iris()
    X = iris.data
    y = iris.target

    # Both the np.random module and a RandomState instance expose .choice.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Sample each class separately so class proportions are preserved,
    # reading class membership from the labels instead of assuming
    # three contiguous groups of 50 rows.
    ind = []
    for label in np.unique(y):
        members = np.flatnonzero(y == label)
        ind.append(rng.choice(members, int(len(members) * rate), replace=False))

    train_ind = np.concatenate(ind)
    # Everything that was not selected for training is held out for testing.
    test_ind = np.setdiff1d(np.arange(len(y)), train_ind)

    return X[train_ind], y[train_ind], X[test_ind], y[test_ind]

# Hold out 10% of every Iris class for testing; train on the remaining 90%.
(iris_train_data,
 iris_train_labels,
 iris_test_data,
 iris_test_labels) = prep_train_test(rate=0.9)

## 2. Train individual classifiers vs. a voting classifier

In [29]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score

# Four k-NN classifiers with k = 1..4, named "1".."4".
nbr_clfs = [(str(k), KNeighborsClassifier(n_neighbors=k)) for k in range(1, 5)]

# Five decision trees with max_depth = 1..5, named "5".."9" so that every
# estimator name in the ensemble is distinct.
dtr_clfs = [(str(depth + 4), DecisionTreeClassifier(max_depth=depth))
            for depth in range(1, 6)]

# Voting classifier that takes the majority vote among the 4 k-NNs and
# the 5 decision trees.
voting_clf = VotingClassifier(estimators=nbr_clfs + dtr_clfs)

# Every base classifier followed by the ensemble, as (name, estimator) pairs.
clfs = nbr_clfs + dtr_clfs + [('voting', voting_clf)]

# Fit each classifier on the training split and report its test accuracy.
for name, model in clfs:
    model.fit(iris_train_data, iris_train_labels)
    y_pred = model.predict(iris_test_data)
    print(type(model).__name__, name, accuracy_score(iris_test_labels, y_pred))




KNeighborsClassifier 1 1.0
KNeighborsClassifier 2 0.9333333333333333
KNeighborsClassifier 3 1.0
KNeighborsClassifier 4 1.0
DecisionTreeClassifier 5 0.6666666666666666
DecisionTreeClassifier 6 1.0
DecisionTreeClassifier 7 1.0
DecisionTreeClassifier 8 1.0
DecisionTreeClassifier 9 1.0
VotingClassifier voting 1.0


## 3. Bagging (and Pasting, i.e. sampling without replacement)

### 3.1 Load CIFAR-10 data

In [2]:
# Load the data
# The data must first be downloaded into DATA_PATH from https://www.cs.toronto.edu/~kriz/cifar.html
import os

# Directory the CIFAR files are read from.
DATA_PATH = os.path.join('data', 'cifar')

# Load order: the metadata file, the five training batches, then the test batch.
FILE_NAMES = ['batches.meta']
FILE_NAMES += ['data_batch_{}'.format(batch_no) for batch_no in range(1, 6)]
FILE_NAMES.append('test_batch')

def unpickle(file):
    """Load and return the single pickled object stored in ``file``.

    The file is opened in binary mode and read with ``encoding='bytes'``,
    which is why keys such as ``b'data'`` are byte strings elsewhere in this
    notebook.

    NOTE: unpickling can execute arbitrary code; only call this on files
    from a trusted source.
    """
    import pickle

    with open(file, 'rb') as handle:
        return pickle.load(handle, encoding='bytes')

def load_cifar(data_path=DATA_PATH, file_names=FILE_NAMES):
    """Unpickle the CIFAR files and return their contents as a list.

    Parameters
    ----------
    data_path : str
        Directory that contains the files.
    file_names : iterable of str
        Names of the files to load, relative to ``data_path``.

    Returns
    -------
    list
        One unpickled object per entry of ``file_names``, in the same order.
    """
    # Iterate over the ``file_names`` argument (not the module-level
    # FILE_NAMES) so that a caller-supplied list is actually honoured.
    return [unpickle(os.path.join(data_path, file_name))
            for file_name in file_names]

# Entries follow FILE_NAMES order: data[0] is the metadata, data[1]..data[5]
# are the training batches, and data[-1] is the test batch.
data = load_cifar(data_path=DATA_PATH, file_names=FILE_NAMES)

### 3.2 Bagging Classifier

In [3]:
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier




def fit_bagging(data_size=60000):
    """Fit an ensemble of 500 k-NN classifiers on rows of ``data[1]``.

    Each estimator is trained on 100 samples drawn without replacement
    (``bootstrap=False``, i.e. pasting), using all available cores.

    Parameters
    ----------
    data_size : int, default 60000
        Upper bound on the number of leading rows of ``data[1]`` used for
        training.

    Returns
    -------
    BaggingClassifier
        The fitted ensemble.
    """
    # NOTE(review): only data[1] is read here, so a data_size larger than
    # that batch's row count just selects the whole batch. Confirm whether
    # the other training batches were meant to be included.
    train_batch = data[1]
    features = train_batch[b'data'][:data_size, :]

    # Labels are cast to the dtype of the pixel array of data[-2].
    labels = np.array(train_batch[b'labels'][:data_size],
                      dtype=data[-2][b'data'].dtype)

    ensemble = BaggingClassifier(
        KNeighborsClassifier(),
        n_estimators=500,
        max_samples=100,
        bootstrap=False,
        n_jobs=-1,
    )
    ensemble.fit(features, labels)
    return ensemble

# Train the pasting ensemble with the default row cap spelled out.
bag_clf = fit_bagging(data_size=60000)


In [4]:
from sklearn.metrics import accuracy_score
# Score the ensemble on the first 1000 rows of the last loaded file
# (the test batch, per FILE_NAMES order).
test_batch = data[-1]
test_data = test_batch[b'data'][:1000, :]
test_label = test_batch[b'labels'][:1000]

y_pred = bag_clf.predict(test_data)

# Left as the final expression so the notebook displays the accuracy.
accuracy_score(test_label, y_pred)

0.237

Compare this with the benchmark accuracy of 0.1796 obtained with just a single kNN classifier.