## Ensemble Learning and Random Forest

In [8]:
from __future__ import division,print_function,unicode_literals

import numpy as np
import pandas as pd

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12

In [10]:
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split

# Generate 500 noisy samples with make_moons, then hold out 20% of them
# as the test set. Both steps use a fixed seed.
X, y = make_moons(
    n_samples=500,
    noise=0.30,
    random_state=42,
)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

#### Hard Voting

In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression

# Three different base learners, each created with a fixed seed
lr_clf = LogisticRegression(random_state=42)
rf_clf = RandomForestClassifier(random_state=42)
svc_clf = SVC(random_state=42)

# Wrap them in a voting ensemble configured for hard voting and train it.
# The fit call is the cell's last expression, so its result is displayed.
vc_clf = VotingClassifier(
    estimators=[
        ('lr', lr_clf),
        ('rf', rf_clf),
        ('svc', svc_clf),
    ],
    voting='hard',
)
vc_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('lr', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)), ('rf', RandomFor...f',
  max_iter=-1, probability=False, random_state=42, shrinking=True,
  tol=0.001, verbose=False))],
         flatten_transform=None, n_jobs=1, voting='hard', weights=None)

In [26]:
from sklearn.metrics import accuracy_score

# Fit the ensemble and each individual classifier on the training split,
# then print the class name followed by its accuracy on the test split.
for clf in [vc_clf, lr_clf, rf_clf, svc_clf]:
    pred = clf.fit(X_train, y_train).predict(X_test)
    print(type(clf).__name__, ': {}'.format(accuracy_score(y_test, pred)))

VotingClassifier : 0.87
LogisticRegression : 0.85
RandomForestClassifier : 0.88
SVC : 0.87


#### Soft Voting

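Soft voting averages the class probabilities returned by each estimator's `predict_proba` and predicts the class with the highest average probability, so every estimator in the ensemble must support `predict_proba`.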

In [30]:
# Fresh base learners for the soft-voting ensemble
lr_clf = LogisticRegression(random_state=42)
rf_clf = RandomForestClassifier(random_state=42)
# SVC does not compute class probabilities by default, so enable them here
svc_clf = SVC(random_state=42, probability=True)

# Same three members as before, now combined with soft voting.
# The fit call is the cell's last expression, so its result is displayed.
vc_clf = VotingClassifier(
    estimators=[
        ('lr', lr_clf),
        ('rf', rf_clf),
        ('svc', svc_clf),
    ],
    voting='soft',
)
vc_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('lr', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)), ('rf', RandomFor...bf',
  max_iter=-1, probability=True, random_state=42, shrinking=True,
  tol=0.001, verbose=False))],
         flatten_transform=None, n_jobs=1, voting='soft', weights=None)

In [31]:
from sklearn.metrics import accuracy_score

# Fit the soft-voting ensemble and each individual classifier on the
# training split, then print the class name and its test-set accuracy.
for clf in [vc_clf, lr_clf, rf_clf, svc_clf]:
    pred = clf.fit(X_train, y_train).predict(X_test)
    print(type(clf).__name__, ': {}'.format(accuracy_score(y_test, pred)))

VotingClassifier : 0.9
LogisticRegression : 0.85
RandomForestClassifier : 0.88
SVC : 0.87


#### Bagging ensembles

In [38]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# Bagging ensemble of 500 decision trees; max_samples=100 and bootstrap=True
# control how the training instances for each tree are drawn.
#
# The base estimator is passed positionally on purpose: its keyword was
# renamed from `base_estimator` to `estimator` in newer scikit-learn
# releases (the old keyword was later removed), while the first positional
# argument works with both old and new versions.
bc_clf = BaggingClassifier(
    DecisionTreeClassifier(random_state=42),
    n_estimators=500,
    max_samples=100,
    bootstrap=True,
    n_jobs=-1,
    random_state=42,
)
bc_clf.fit(X_train, y_train)

# Report accuracy on both the held-out test split and the training split,
# so the two numbers can be compared.
pred = bc_clf.predict(X_test)
train_pred = bc_clf.predict(X_train)
print(bc_clf.__class__.__name__,'(Test): {:.3f}'.format(accuracy_score(y_test,pred)))
print(bc_clf.__class__.__name__,'(Train): {:.3f}'.format(accuracy_score(y_train,train_pred)))

BaggingClassifier (Test): 0.900
BaggingClassifier (Train): 0.938


In [42]:
# Out-of-bag evaluation
#
# Same bagging configuration as the previous cell, with oob_score=True so
# that the fitted model exposes `oob_score_`.
#
# The base estimator is passed positionally on purpose: its keyword was
# renamed from `base_estimator` to `estimator` in newer scikit-learn
# releases (the old keyword was later removed), while the first positional
# argument works with both old and new versions.
bc_clf = BaggingClassifier(
    DecisionTreeClassifier(random_state=42),
    n_estimators=500,
    max_samples=100,
    bootstrap=True,
    n_jobs=-1,
    random_state=42,
    oob_score=True,  # use out-of-bag samples to estimate the generalization error
)
bc_clf.fit(X_train, y_train)
print('Out of Bag Score {}'.format(bc_clf.oob_score_))

Out of Bag Score 0.9175


According to this OOB evaluation, this BaggingClassifier is likely to achieve about 91.75% accuracy on the test set. The test accuracy we calculated above is 0.900, which is slightly lower than the out-of-bag score.