Recall:  
Bagging - randomly pulling instances from the "bag" of training data, **repeats allowed** (aka with replacement) for each model  
Pasting - randomly pulling instances from the "bag" of training data, **no repeats** (aka without replacement) for each model  
Bagging typically does better than pasting  
Bagging regressor is available but not demo'd

In [45]:
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split
import numpy as np

X, y = make_moons(n_samples=1000, noise=0.4)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

### Bagging Classifier

In [46]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

bag_clf = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators=500, # 500 decision trees with 100 instances each
    max_samples=100, bootstrap=True, n_jobs=-1 # bootstrap = True makes it bagging classifier
)
bag_clf.fit(X_train, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
         bootstrap=True, bootstrap_features=False, max_features=1.0,
         max_samples=100, n_estimators=500, n_jobs=-1, oob_score=False,
         random_state=None, verbose=0, warm_start=False)

In [47]:
y_pred = bag_clf.predict(X_test)

from sklearn.metrics import accuracy_score
accuracy_score(y_pred, y_test)

0.87

### Pasting Classifier

In [48]:
paste_clf = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators=500, 
    max_samples=100, bootstrap=False, n_jobs=-1 # bootstrap = False makes it pasting classifier
)
paste_clf.fit(X_train, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
         bootstrap=False, bootstrap_features=False, max_features=1.0,
         max_samples=100, n_estimators=500, n_jobs=-1, oob_score=False,
         random_state=None, verbose=0, warm_start=False)

In [49]:
y_pred = paste_clf.predict(X_test)
accuracy_score(y_pred, y_test)

0.87

### Out of bag evaluation
(Make better use of the non-sampled data for CV), since only ~63% of data gets sampled out of the bag

In [50]:
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators=500,
    bootstrap=True, n_jobs=-1, oob_score=True # use out of bag samples to estimate generalization error
)
bag_clf.fit(X_train, y_train)
bag_clf.oob_score_

0.8225

In [51]:
y_pred = paste_clf.predict(X_test)
accuracy_score(y_pred, y_test)

0.87

The real accuracy score is relatively close to the out of bag accuracy score

In [52]:
bag_clf.oob_decision_function_ # gives the probabilities of each instance belonging to class 0 or class 1

array([[0.00537634, 0.99462366],
       [0.17874396, 0.82125604],
       [0.02453988, 0.97546012],
       ...,
       [1.        , 0.        ],
       [0.        , 1.        ],
       [0.87027027, 0.12972973]])

## Random Patches and Random Subspaces
Random patches - sampling both training instances AND features  
Random subspaces - sampling ONLY random features (aka **only certain dimensions** or subspaces)

### Random Patches

In [54]:
rnd_patch_clf = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators=500,
    bootstrap=True, n_jobs=-1, max_samples=100,
    max_features=0.5, bootstrap_features=True, # bagging with features enabled and only choose up to half of the features
    oob_score=True
)

rnd_patch_clf.fit(X_train, y_train)
rnd_patch_clf.oob_score_

0.82625

In [55]:
y_pred = rnd_patch_clf.predict(X_test)
accuracy_score(y_pred, y_test)

0.845

### Random Subspaces

In [57]:
rnd_subspace_clf = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators=500,
    bootstrap=False, n_jobs=-1, # bootstrap=False and max_samples=1.0 trains across all the data
    max_features=0.5, bootstrap_features=True, # bagging with features enabled and only choose up to half of the features
)
# oob score not available for feature sampling
rnd_subspace_clf.fit(X_train, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
         bootstrap=False, bootstrap_features=True, max_features=0.5,
         max_samples=1.0, n_estimators=500, n_jobs=-1, oob_score=False,
         random_state=None, verbose=0, warm_start=False)

In [58]:
y_pred = rnd_subspace_clf.predict(X_test)
accuracy_score(y_pred, y_test)

0.71