# Selective Bagging
In this notebook we will test an instance of "bagging" where we train multiple models to predict a task. But where instead of taking the average results as a decision, we train another model to choose the weights for each model for the given sample. We will see if this kind of selective bagging compared to a regular bagging method performs better.

## Define the two models to compare

We will compare a random forest classifier with a sklearn pipeline that takes 

In [45]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.datasets import load_iris
import numpy as np

class SelectionTree(BaseEstimator, TransformerMixin):
    def __init__(self,num_trees=100, max_depth=None):
        self.trees=[]
        self.n=num_trees
        for i in range(num_trees):
            self.trees.append(DecisionTreeClassifier(max_depth=max_depth, random_state=i*10))
        self.sel_tree = DecisionTreeClassifier(max_depth=max_depth)
    def fit(self, X, y):
        i=0
        X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=0)
        yt=np.zeros(np.shape(y_val))
        for t in self.trees:
            t.fit(X_train[i*len(y_train)//self.n:(i+1)*len(y_train)//self.n],y_train[i*len(y_train)//self.n:(i+1)*len(y_train)//self.n])
            xt = t.predict(X_val)==y_val
            yt = yt*(1-xt)+xt*i
            print(yt)
            i+=1
            self.sel_tree.fit(np.concatenate([X_val,X_val[0:self.n-1]]), np.concatenate([yt, np.array([i for i in range(self.n-1)])]))
        return self
    def new_transform(self, x):
        tree_mat=np.array([tree.predict_proba(x) for tree in self.trees])
        result=self.sel_tree.predict_proba(X_test).T*np.reshape(tree_mat, (np.shape(tree_mat)[2],np.shape(tree_mat)[0],np.shape(tree_mat)[1]))
        result=np.sum(np.reshape(result, (np.shape(tree_mat)[0],np.shape(tree_mat)[1],np.shape(tree_mat)[2]) ), axis=0)
        return np.argmax(result, axis=1)
    def transform(self, x):
        return [self.trees[i].predict(x)[n] for i, n in zip(np.argmax(self.sel_tree.predict_proba(x), axis=1), range(len(x)))]
    def proba(self, x):
        return self.sel_tree.predict_proba(x)

forest=RandomForestClassifier(n_estimators=10, max_depth=3)

In [46]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
X, y = make_classification(n_samples=10000, n_features=40,
                           n_informative=30, n_redundant=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [47]:
np.shape(X)

(10000, 40)

In [48]:
forest.fit(X_train,y_train)

RandomForestClassifier(max_depth=3, n_estimators=10)

In [49]:
sel_bag = SelectionTree(num_trees=10, max_depth=3)

In [50]:
sel_bag.fit(X_train, y_train)

[0. 0. 0. ... 0. 0. 0.]
[1. 1. 1. ... 0. 0. 0.]
[2. 1. 2. ... 0. 0. 0.]
[3. 3. 3. ... 0. 3. 0.]
[3. 4. 4. ... 0. 3. 0.]
[5. 5. 5. ... 0. 3. 0.]
[6. 5. 5. ... 0. 3. 0.]
[7. 5. 5. ... 7. 7. 0.]
[8. 8. 8. ... 8. 7. 0.]
[9. 8. 9. ... 8. 7. 0.]


SelectionTree(num_trees=None)

In [51]:
print((forest.predict(X_test)==y_test).sum())

2003


In [52]:
print((sel_bag.transform(X_test)==y_test).sum())

1723


In [53]:
print((sel_bag.new_transform(X_test)==y_test).sum())

1813


In [54]:
sel_bag.new_transform(X_test)

array([1, 0, 0, ..., 0, 0, 1], dtype=int64)

In [55]:
sel_bag.proba(X_test)

array([[0.00611621, 0.00611621, 0.0030581 , ..., 0.08562691, 0.26299694,
        0.59021407],
       [0.01032448, 0.00294985, 0.00442478, ..., 0.10471976, 0.20058997,
        0.61061947],
       [0.01032448, 0.00294985, 0.00442478, ..., 0.10471976, 0.20058997,
        0.61061947],
       ...,
       [0.0031746 , 0.        , 0.0031746 , ..., 0.04761905, 0.17142857,
        0.72698413],
       [0.0031746 , 0.        , 0.0031746 , ..., 0.04761905, 0.17142857,
        0.72698413],
       [0.        , 0.        , 0.00480769, ..., 0.03365385, 0.14423077,
        0.75961538]])