# Ensemble Learning and Random Forests

## 1. Voting Classifiers

Create and train two voting classifiers (one hard, one soft), each built from three different models, and compare the accuracy of every individual model with that of the voting ensemble.

In [28]:
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split

# Noisy two-class "moons" toy dataset. The generator is seeded as well as the
# split so that a fresh kernel reproduces exactly the same data and scores.
x, y = make_moons(n_samples=5000, noise=0.5, random_state=42)
# Hold out 20% of the samples for testing.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.2, random_state=42)

In [29]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC


# Three different base models. The random forest is seeded so that repeated
# runs of this cell report the same accuracies.
log_clf = LogisticRegression()
rnd_clf = RandomForestClassifier(random_state=42)
svm_clf = SVC()

# Hard voting: the ensemble predicts the class that receives the most votes
# from the individual classifiers.
hard_voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='hard'
)

# VotingClassifier fits clones of its estimators, so each base model is also
# fitted on its own here in order to score it individually.
for clf in (log_clf, rnd_clf, svm_clf, hard_voting_clf):
    clf.fit(x_train, y_train)
    print(clf.__class__.__name__, accuracy_score(y_test, clf.predict(x_test)))

LogisticRegression 0.817
RandomForestClassifier 0.816
SVC 0.828
VotingClassifier 0.83


In [30]:
# Fresh base models for the soft-voting ensemble. Soft voting needs class
# probabilities from every estimator, hence SVC(probability=True).
# The random forest and the SVC's probability calibration both involve
# randomness, so they are seeded to keep the reported accuracies reproducible.
log_clf = LogisticRegression()
rnd_clf = RandomForestClassifier(random_state=42)
svm_clf = SVC(probability=True, random_state=42)

# Soft voting: the ensemble predicts the class with the highest predicted
# probability averaged over the individual classifiers.
soft_voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='soft'
)

# VotingClassifier fits clones of its estimators, so each base model is also
# fitted on its own here in order to score it individually.
for clf in (log_clf, rnd_clf, svm_clf, soft_voting_clf):
    clf.fit(x_train, y_train)
    print(clf.__class__.__name__, accuracy_score(y_test, clf.predict(x_test)))

LogisticRegression 0.817
RandomForestClassifier 0.812
SVC 0.828
VotingClassifier 0.833


## 2. Bagging and Pasting

If the base model can estimate class probabilities (i.e. it has a `predict_proba` method), `BaggingClassifier` performs soft voting; otherwise it falls back to hard voting.

In [32]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# Bagging: 500 decision trees, each trained on 100 training instances drawn
# with replacement. The ensemble is seeded so the sampled subsets (and hence
# the score below) are the same on every run.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=100,
    bootstrap=True,  # True - bagging, False - pasting
    n_jobs=-1,       # use all available CPU cores
    random_state=42
)
bag_clf.fit(x_train, y_train)
accuracy_score(y_test, bag_clf.predict(x_test))

0.832

In [33]:
# Pasting: same ensemble as above, but each tree's 100 training instances are
# drawn without replacement. The ensemble is seeded so the sampled subsets
# (and hence the score below) are the same on every run.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=100,
    bootstrap=False,  # True - bagging, False - pasting
    n_jobs=-1,        # use all available CPU cores
    random_state=42
)
bag_clf.fit(x_train, y_train)
accuracy_score(y_test, bag_clf.predict(x_test))

0.832

**Out-of-Bag Evaluation**

In [34]:
# Bagging with out-of-bag (OOB) evaluation: each tree is scored on the
# training instances that were not drawn into its bootstrap sample, which
# gives a validation estimate without a separate hold-out set.
# The ensemble is seeded so the OOB score is the same on every run.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=100,
    bootstrap=True,   # OOB evaluation requires sampling with replacement
    n_jobs=-1,        # use all available CPU cores
    oob_score=True,   # <--
    random_state=42
)
bag_clf.fit(x_train, y_train)
bag_clf.oob_score_

0.8245

In [35]:
# Accuracy of the OOB-evaluated ensemble on the held-out test set,
# for comparison with the out-of-bag estimate.
accuracy_score(y_true=y_test, y_pred=bag_clf.predict(x_test))

0.829

In [36]:
# One row per training instance; the two columns are the out-of-bag
# estimated probabilities of class 0 and class 1 respectively.
bag_clf.oob_decision_function_  # probabilities of 0 and 1

array([[0.52263374, 0.47736626],
       [0.34291581, 0.65708419],
       [0.96747967, 0.03252033],
       ...,
       [0.89876033, 0.10123967],
       [0.32106339, 0.67893661],
       [0.04536082, 0.95463918]])