In [1]:
import numpy as np
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# 1. Generate synthetic classification data

In [4]:
# Build a reproducible synthetic binary-classification dataset.
# n_informative + n_redundant equals n_features here, so (assuming the
# default n_repeated=0) no pure-noise feature columns are generated.
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=15,
    n_redundant=5,
    n_classes=2,
    random_state=42,
)

In [6]:
# Inspect the generated feature matrix (the repr of this large array is abbreviated with "...").
X

array([[ -4.90644173,   3.44278902,   0.55896373, ...,   3.01285895,
         -4.4970026 ,  -2.52006572],
       [  2.16261046,  -5.2866508 ,   2.60984566, ...,  -4.56217784,
          3.69866457,  -1.92328575],
       [ -4.78484428,  -3.74482699,   4.65759163, ...,  -7.8446457 ,
          2.80379841,  -2.96318945],
       ...,
       [-10.4123717 ,  -5.32462138,  -1.03805762, ...,   8.97804725,
          3.57786925,   4.28159633],
       [  0.28820005,   2.83863363,   2.79969055, ...,  -6.31392968,
         -1.85129503,  -4.92026386],
       [  2.83104091,   4.03525551,   0.25816518, ..., -12.18842291,
          2.16378616,   4.64310326]])

In [8]:
# Inspect the generated class labels (two classes were requested, encoded as 0 and 1).
y

array([0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1,
       1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1,
       0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0,
       1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0,
       1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,

# 2. Split data into training and testing sets

In [11]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# 3. Define the base estimator (optional; the default is DecisionTreeClassifier)
#    To demonstrate a non-default base estimator you could instead use, e.g.:
#        from sklearn.linear_model import LogisticRegression
#        base_estimator = LogisticRegression()
#    To keep the default (a decision tree), either omit the 'estimator' argument
#    or define it explicitly, as done here:

In [14]:
# Seeded decision tree that is handed to the bagging ensemble as its `estimator`.
base_estimator = DecisionTreeClassifier(
    random_state=42,
)

# 4. Create the Bagging Classifier

In [None]:
#    Key Parameters:
#    - estimator: The base learning algorithm. (Note: this parameter was named 'base_estimator' before scikit-learn 1.2; use 'estimator' from 1.2 onward.)
#    - n_estimators: The number of base estimators (trees) in the ensemble.
#    - max_samples: The number/fraction of samples to draw for training each base estimator.
#    - max_features: The number/fraction of features to draw for training each base estimator.
#    - bootstrap: Whether samples are drawn with replacement (True for Bagging).
#    - bootstrap_features: Whether features are drawn with replacement.
#    - oob_score: Whether to use out-of-bag samples to estimate generalization error.
#    - random_state: Controls the randomness for reproducibility.
#    - n_jobs: Number of CPU cores to use (-1 means use all available).

In [17]:
# Bagged ensemble of decision trees. Each member is trained on rows drawn
# with replacement (80% of the training-set size) and on 80% of the feature
# columns drawn without replacement.
bagging_clf = BaggingClassifier(
    estimator=base_estimator,
    n_estimators=50,            # ensemble size: 50 trees
    # --- row sampling ---
    bootstrap=True,             # draw rows with replacement
    max_samples=0.8,            # rows per tree, as a fraction of the training set
    # --- feature sampling ---
    bootstrap_features=False,   # draw feature columns without replacement
    max_features=0.8,           # columns per tree, as a fraction of all features
    # --- evaluation / runtime ---
    oob_score=True,             # also compute the out-of-bag score during fit
    n_jobs=-1,                  # use all available CPU cores
    random_state=42,            # fixed seed for repeatable sampling
)

# 5. Train the Bagging Classifier

In [20]:
# Fit the ensemble on the training split, bracketing the call with status messages.
print("Training Bagging Classifier...")
bagging_clf.fit(X=X_train, y=y_train)
print("Training complete.")

Training Bagging Classifier...
Training complete.


# 6. Make predictions

In [23]:
# Predict class labels for the held-out test rows with the fitted ensemble.
y_pred = bagging_clf.predict(X=X_test)

In [25]:
# Inspect the ensemble's predicted labels for the test set.
y_pred

array([0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0,
       1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1,
       1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0,
       1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1,
       0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0,
       0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0])

# 7. Evaluate the model

In [28]:
# Fraction of held-out labels that the ensemble predicted correctly.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"\nBagging Classifier Accuracy: {accuracy:.4f}")


Bagging Classifier Accuracy: 0.8967


# 8. Check the Out-Of-Bag (OOB) Score
# The OOB score is an estimate of the model's performance on unseen data,
# calculated using the samples not included in the bootstrap sample for each tree.

In [30]:
# Report the out-of-bag estimate when one was computed.
# Guard on the attribute's presence rather than on the score's truthiness:
# `oob_score_` is only set when the ensemble was fitted with oob_score=True
# (so reading it unconditionally can raise AttributeError), and a valid
# score of 0.0 is falsy and would otherwise be skipped silently.
if hasattr(bagging_clf, "oob_score_"):
    print(f"Bagging Classifier OOB Score: {bagging_clf.oob_score_:.4f}")

Bagging Classifier OOB Score: 0.8771


# --- Optional: Compare with a single base estimator ---

In [35]:
# Baseline for comparison: one stand-alone decision tree, seeded the same
# way as the tree used as the ensemble's base estimator.
single_tree_clf = DecisionTreeClassifier(
    random_state=42,
)
print("\n--- Comparison with a Single Decision Tree ---")


--- Comparison with a Single Decision Tree ---


In [37]:
# Train the baseline tree on the same training split used for the ensemble.
single_tree_clf.fit(X=X_train, y=y_train)

In [39]:
# Predict the held-out test rows with the baseline tree.
y_pred_single = single_tree_clf.predict(X=X_test)

In [41]:
# Test-set accuracy of the baseline tree, for comparison with the ensemble's accuracy.
accuracy_single = accuracy_score(y_true=y_test, y_pred=y_pred_single)
print(f"Single Decision Tree Accuracy: {accuracy_single:.4f}")

Single Decision Tree Accuracy: 0.7967
