# Ensemble Learning and Random Forests

In [1]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [2]:
# Build a small, noisy two-moons dataset and hold out 30% of it for testing.
# The split and the random forest are seeded so that the accuracies reported
# further down are reproducible after Restart Kernel -> Run All.
X, y = datasets.make_moons(n_samples=100, noise=0.15, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Three different kinds of base learner; only the random forest is stochastic
# with its default settings, so it is the only one that needs a seed.
log_clf = LogisticRegression()
rnd_clf = RandomForestClassifier(random_state=42)
svm_clf = SVC()

# Hard voting: the ensemble predicts the class that gets the most votes from
# the three base classifiers.
voting_clf = VotingClassifier(
    estimators=[("lr", log_clf), ("rf", rnd_clf), ("sv", svm_clf)],
    voting="hard",
)

In [21]:
# Train every base classifier and the voting ensemble on the same training
# split, then report each one's accuracy on the held-out test split.
for clf in [log_clf, rnd_clf, svm_clf, voting_clf]:
    # fit() returns the fitted estimator itself, so predict() can be chained.
    y_pred = clf.fit(X_train, y_train).predict(X_test)
    print(f"{clf.__class__.__name__}: {accuracy_score(y_test, y_pred):.2f}")

LogisticRegression: 0.87
RandomForestClassifier: 0.97
SVC: 0.93
VotingClassifier: 0.93


## Bagging and Pasting

In [28]:
# The Bagging Classifier automatically performs soft voting if the predictor
# has a predict_proba method
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,  # number of trees in the ensemble
    max_samples=0.4,  # each tree is trained on a sample 40% the size of X_train
    bootstrap=True,  # sample with replacement (bagging); False would be pasting
    n_jobs=-1,  # use all available CPUs
    oob_score=True,  # compute bag_clf.oob_score_ during fit
    random_state=42,  # seed the resampling so the scores are reproducible
)
bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_test)
print(f"Bagging Classifier Accuracy: {100*accuracy_score(y_test, y_pred):.2f}%")

Bagging Classifier Accuracy: 93.33%


In [30]:
# With bagging, some instances may be sampled several times for any given
# predictor, while others may not be sampled at all. By default a
# BaggingClassifier samples m training instances with replacement
# (bootstrap=True), where m is the size of the training set. This means that
# only about 63% of the training instances are sampled on average for each
# predictor. The remaining 37% of the training instances that are not sampled
# are called out-of-bag (oob) instances. Note that they are not the same 37%
# for all predictors.
#
# bag_clf above uses max_samples=0.4 rather than the default, so each predictor
# draws fewer instances: roughly 1 - exp(-0.4) ~= 33% of the training instances
# are sampled per predictor and about 67% are out-of-bag.
#
# Since a predictor never sees the oob instances during training, it can be
# evaluated on these instances, without the need for a separate validation set.
# You can evaluate the ensemble itself by averaging out the oob evaluations of
# each predictor.
print(f"Out-of-bag Evaluation: {100*bag_clf.oob_score_:.2f}%")

Out-of-bag Evaluation: 87.14


In [3]:
# Random forest on the moons training split: 500 trees, each limited to at
# most 16 leaf nodes. Seeded so the predictions shown below are reproducible.
rnd_clf = RandomForestClassifier(
    n_estimators=500,
    max_leaf_nodes=16,
    n_jobs=-1,  # Use all available CPUs
    random_state=42,
)
rnd_clf.fit(X_train, y_train)

# Last expression of the cell: the predicted labels are displayed as output.
rnd_clf.predict(X_test)

array([1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0,
       1, 0, 0, 1, 0, 1, 1, 1])

In [4]:
# Feature importance on the iris dataset: fit a forest on all samples and
# print the importance score of each feature.
# NOTE(review): this rebinds rnd_clf, replacing the forest fitted on the moons
# data; re-run the earlier cell before using rnd_clf on X_test again.
iris = datasets.load_iris()
rnd_clf = RandomForestClassifier(
    n_estimators=500,
    n_jobs=-1,
    random_state=42,  # seed so the printed importances are reproducible
)
rnd_clf.fit(iris["data"], iris["target"]) # type: ignore
for name, score in zip(
    iris["feature_names"], # type: ignore
    rnd_clf.feature_importances_
):
    print(f"Feature: {name}, Importance Score: {score}")


Feature: sepal length (cm), Importance Score: 0.11703668920410754
Feature: sepal width (cm), Importance Score: 0.024954252447191672
Feature: petal length (cm), Importance Score: 0.4167878329693357
Feature: petal width (cm), Importance Score: 0.441221225379365
