# Ensemble Learning and Random Forests

## 1. Voting Classifiers

Create and train 2 voting classifiers hard and soft from 3 different models each and compare their accuracies with the voting model

In [1]:
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split

x, y = make_moons(n_samples=5000, noise=0.5)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.2, random_state=42)

In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC


log_clf = LogisticRegression()
rnd_clf = RandomForestClassifier()
svm_clf = SVC()

hard_voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='hard'
)

for clf in (log_clf, rnd_clf, svm_clf, hard_voting_clf):
    clf.fit(x_train, y_train)
    print(clf.__class__.__name__, accuracy_score(y_test, clf.predict(x_test)))

LogisticRegression 0.801
RandomForestClassifier 0.799
SVC 0.817
VotingClassifier 0.816


In [3]:
log_clf = LogisticRegression()
rnd_clf = RandomForestClassifier()
svm_clf = SVC(probability=True)

soft_voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='soft'
)

for clf in (log_clf, rnd_clf, svm_clf, soft_voting_clf):
    clf.fit(x_train, y_train)
    print(clf.__class__.__name__, accuracy_score(y_test, clf.predict(x_test)))

LogisticRegression 0.801
RandomForestClassifier 0.794
SVC 0.817
VotingClassifier 0.814


## 2. Bagging and Pasting

If a base model can estimape probabilities then the `BaggingClassifier` is soft voting, else hard voting

In [4]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),  
    n_estimators=500,
    max_samples=100,
    bootstrap=True,  # True - bagging, False - pasting
    n_jobs=-1
)
bag_clf.fit(x_train, y_train)
accuracy_score(y_test, bag_clf.predict(x_test))

0.819

In [5]:
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=100,
    bootstrap=False,  # True - bagging, False - pasting
    n_jobs=-1
)
bag_clf.fit(x_train, y_train)
accuracy_score(y_test, bag_clf.predict(x_test))

0.813

**Out-of-Bag Evaluation**

In [6]:
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),  
    n_estimators=500,
    max_samples=100,
    bootstrap=True,
    n_jobs=-1,
    oob_score=True  # <--
)
bag_clf.fit(x_train, y_train)
bag_clf.oob_score_

0.82075

In [7]:
accuracy_score(y_test, bag_clf.predict(x_test))

0.813

In [8]:
bag_clf.oob_decision_function_  # probabilities of 0 and 1

array([[0.89795918, 0.10204082],
       [0.86815416, 0.13184584],
       [0.79674797, 0.20325203],
       ...,
       [0.11020408, 0.88979592],
       [0.0781893 , 0.9218107 ],
       [0.10330579, 0.89669421]])

## 3. Random Forests

A *Random Forest* is an ensemble of Decision Trees, generally trained via the bagging method. The Random Forest introduces extra randomness when growing trees; instead of searching for the very best feature when splitting a node, it searches for the best feature among a random subset of features. The Random Forest makes it easy to measure the relative importance of each feature.

In [9]:
rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1)
rnd_clf.fit(x_train, y_train)

y_pred_rf = rnd_clf.predict(x_test)

Which is roughly equal to

In [10]:
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(splitter='random', max_leaf_nodes=16),
    n_estimators=500, 
    max_samples=1.0,
    bootstrap=True,
    n_jobs=-1
)

**Feature Importance**

In [11]:
from sklearn.datasets import load_iris

iris = load_iris()
rnd_clf = RandomForestClassifier(n_estimators=500, n_jobs=-1)
rnd_clf.fit(iris['data'], iris['target'])

for name, score in zip(iris['feature_names'], rnd_clf.feature_importances_):
    print(name, score)

sepal length (cm) 0.1030848996817452
sepal width (cm) 0.02460721441654559
petal length (cm) 0.4478197347687168
petal width (cm) 0.42448815113299243


# 4. Boosting

**Ada Boosting**

In [12]:
from sklearn.ensemble import AdaBoostClassifier

ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1),
    n_estimators=200,
    algorithm='SAMME.R',  # or SAMME
    learning_rate=0.5
)

ada_clf.fit(x_train, y_train)
accuracy_score(y_test, ada_clf.predict(x_test))

0.818

**Gradient Boosting**

Step by step

In [13]:
import numpy as np

np.random.seed(42)
x = np.random.rand(100, 1) - 0.5
y = 3*x[:, 0]**2 + 0.05 * np.random.randn(100)

In [14]:
from sklearn.tree import DecisionTreeRegressor

tree_reg1 = DecisionTreeRegressor(max_depth=2)
tree_reg1.fit(x, y)

In [15]:
y2 = y - tree_reg1.predict(x)  # residual error

tree_reg2 = DecisionTreeRegressor(max_depth=2)
tree_reg2.fit(x, y2)

In [16]:
y3 = y - tree_reg2.predict(x)  # residual error

tree_reg3 = DecisionTreeRegressor(max_depth=2)
tree_reg3.fit(x, y3)

In [17]:
x_new = np.array([[0.8]])

In [18]:
y_pred = sum(tree.predict(x_new) for tree in (tree_reg1, tree_reg2, tree_reg3))
y_pred

array([1.08692075])

Using `Scikit-learn`

In [19]:
from sklearn.ensemble import GradientBoostingRegressor

gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=3, learning_rate=1.0)
gbrt.fit(x, y)
gbrt.predict(x_new)

array([0.75026781])

In order to find the optimal number of trees, we can use early stopping

In [20]:
from sklearn.metrics import mean_squared_error

x_train, x_val, y_train, y_val = train_test_split(x, y, random_state=49)

gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=120, random_state=42)
gbrt.fit(x_train, y_train)

errors = [mean_squared_error(y_val, y_pred) for y_pred in gbrt.staged_predict(x_val)]
bst_n_estimators = np.argmin(errors)
bst_n_estimators

55

**XGBoost**

In [21]:
import xgboost

In [22]:
xgb_reg = xgboost.XGBRegressor()
xgb_reg.fit(x_train, y_train)
y_pred = xgb_reg.predict(x_val)

With early stopping

In [23]:
xgb_reg.fit(x_train, y_train, eval_set=[(x_val, y_val)], early_stopping_rounds=2)
y_pred = xgb_reg.predict(x_val)

[0]	validation_0-rmse:0.22834
[1]	validation_0-rmse:0.16224
[2]	validation_0-rmse:0.11843
[3]	validation_0-rmse:0.08760
[4]	validation_0-rmse:0.06848
[5]	validation_0-rmse:0.05709
[6]	validation_0-rmse:0.05297
[7]	validation_0-rmse:0.05129
[8]	validation_0-rmse:0.05155
[9]	validation_0-rmse:0.05211




# 5. Stacking

`Scikit-learn` doesn't support stacking directly. Use your own implementations or open sourse implementations such as  `DESlib`