In [5]:
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [2]:
# Synthetic classification dataset: 10,000 rows, 10 features, 3 of them informative.
# The generator is seeded so the data (and every score computed from it) is
# identical on each Restart & Run All; outputs recorded before this seed was
# added came from an unseeded dataset and will differ after a re-run.
X, y = make_classification(
    n_samples=10000,
    n_features=10,
    n_informative=3,
    random_state=42,
)
# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Single decision tree as the baseline the ensembles are compared against.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)

print("Decision Tree accuracy", accuracy_score(y_test, y_pred))

Decision Tree accuracy 0.856


In [3]:
# Bagging with the library's default base estimator: 500 models, each fit on a
# bootstrap sample (rows drawn with replacement) of half the training data.
bagging_params = {
    "n_estimators": 500,
    "max_samples": 0.5,
    "bootstrap": True,
    "n_jobs": -1,
    "random_state": 42,
}
bag = BaggingClassifier(**bagging_params)

- `n_estimators`: the number of models in the ensemble
- `max_samples`: the fraction of the training data drawn for each individual model
- `bootstrap`: whether rows are sampled with replacement (a selected row is put back into the dataset and can be picked again)


In [4]:
# Train the ensemble, predict the held-out rows, and report test accuracy.
y_pred = bag.fit(X_train, y_train).predict(X_test)
bagging_accuracy = accuracy_score(y_test, y_pred)
print("Bagging classifier accuracy", bagging_accuracy)

Bagging classifier accuracy 0.8985


### BAGGING WITH SVM

In [17]:
# Bagging with support-vector classifiers as the base model: 500 SVCs, each fit
# on a bootstrap sample of 25% of the training rows.
# n_jobs=-1 runs the fits in parallel, matching the other ensembles in this
# notebook; without it all 500 SVC fits run one after another.
bag = BaggingClassifier(
    estimator=SVC(),
    n_estimators=500,
    max_samples=0.25,
    bootstrap=True,
    n_jobs=-1,
    random_state=42
)

In [18]:
# Fit the SVM ensemble and measure its accuracy on the test split.
fitted_bag = bag.fit(X_train, y_train)
y_pred = fitted_bag.predict(X_test)
svm_accuracy = accuracy_score(y_test, y_pred)
print("Bagging SVM accuracy", svm_accuracy)

Bagging SVM accuracy 0.878


### PASTING

Each model is trained on a different random subset of the data, sampled without replacement, so no row is repeated within a model's subset.

In [14]:
# Pasting: 500 decision trees, each fit on 25% of the training rows drawn
# without replacement (bootstrap=False).
base_tree = DecisionTreeClassifier()
bag = BaggingClassifier(
    estimator=base_tree,
    n_estimators=500,
    max_samples=0.25, bootstrap=False,
    n_jobs=-1, random_state=42,
)

In [15]:
# Train the pasting ensemble, then score its predictions on the test split.
y_pred = bag.fit(X_train, y_train).predict(X_test)
pasting_accuracy = accuracy_score(y_test, y_pred)
print("Pasting classifier accuracy", pasting_accuracy)

Pasting classifier accuracy 0.8925


### RANDOM SUBSPACES

Every model is trained on all the rows, but on a random subset of the features.

In [21]:
# Random subspaces: every tree gets all the rows (max_samples=1.0 with no row
# bootstrapping) but only half as many feature columns, which are drawn with
# replacement (bootstrap_features=True).
subspace_params = dict(
    n_estimators=500,
    max_samples=1.0, bootstrap=False,
    max_features=0.5, bootstrap_features=True,
    n_jobs=-1, random_state=42,
)
bag = BaggingClassifier(estimator=DecisionTreeClassifier(), **subspace_params)

In [22]:
# Fit on the training split and evaluate the random-subspaces ensemble.
fitted_bag = bag.fit(X_train, y_train)
y_pred = fitted_bag.predict(X_test)
subspaces_accuracy = accuracy_score(y_test, y_pred)
print("Random Subspaces classifier accuracy", subspaces_accuracy)

Random Subspaces classifier accuracy 0.899


### RANDOM PATCHES

A mix of bagging and random subspaces: each model is trained on a random subset of both the rows and the features.

In [23]:
# Random patches: sample both axes. Each tree is fit on a bootstrap sample of
# 25% of the rows and on half as many feature columns, also drawn with
# replacement. verbose=1 makes fit/predict log their parallel progress.
row_sampling = {"max_samples": 0.25, "bootstrap": True}
feature_sampling = {"max_features": 0.5, "bootstrap_features": True}
bag = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=500,
    **row_sampling,
    **feature_sampling,
    n_jobs=-1,
    random_state=42,
    verbose=1,
)

In [25]:
# Train the random-patches ensemble and report accuracy on the test split.
y_pred = bag.fit(X_train, y_train).predict(X_test)
patches_accuracy = accuracy_score(y_test, y_pred)
print("Random Patches classifier accuracy", patches_accuracy)

[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    9.2s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


Random Patches classifier accuracy 0.892


[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    0.4s finished


### BAGGING WITH OOB

In [26]:
# Bagging with out-of-bag scoring enabled: 500 trees, each fit on a bootstrap
# sample of 25% of the training rows; oob_score=True requests the OOB estimate.
# NOTE(review): random_state is 4 here while the other ensembles in this
# notebook use 42 — confirm this is intentional.
oob_params = {
    "n_estimators": 500,
    "max_samples": 0.25,
    "bootstrap": True,
    "oob_score": True,
    "n_jobs": -1,
    "random_state": 4,
}
bag = BaggingClassifier(estimator=DecisionTreeClassifier(), **oob_params)

In [28]:
# Fit the OOB-enabled ensemble and report its accuracy on the test split,
# for comparison with the out-of-bag estimate.
fitted_bag = bag.fit(X_train, y_train)
y_pred = fitted_bag.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print("accuracy", test_accuracy)

accuracy 0.8935


In [29]:
# Out-of-bag score of the fitted ensemble (available because oob_score=True was set).
bag.oob_score_

0.8945

The OOB score gives a rough estimate of how the model will perform on unseen data. Because it is computed from rows that each model did not see during training, we do not have to split off a separate test set, which leaves more data available for training.

OOB scoring can only be used when `bootstrap=True`, as there is no 'out of bag' data when `bootstrap=False`.