In [1]:
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_moons

X, y = make_moons(n_samples=500, noise=0.30, random_state= 42)
X_train, X_test, y_train, y_test = train_test_split(X, y)

Ensemble Learning: Voting Classifiers

In [24]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

log_clf = LogisticRegression(solver = 'liblinear')
rnd_clf = RandomForestClassifier(n_estimators= 10)
svm_clf = SVC(gamma = 'auto')

voting_clf = VotingClassifier(
    estimators = [('lr', log_clf),
                 ('rf', rnd_clf),
                 ('svc', svm_clf)],
    voting = 'hard')

voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('lr', LogisticRegression(solver='liblinear')),
                             ('rf', RandomForestClassifier(n_estimators=10)),
                             ('svc', SVC(gamma='auto'))])

In [25]:
from sklearn.metrics import accuracy_score

for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.792
RandomForestClassifier 0.896
SVC 0.92
VotingClassifier 0.896


In [22]:
#soft prediction
log_clf = LogisticRegression(solver="liblinear", random_state=42)
rnd_clf = RandomForestClassifier(n_estimators=10, random_state=42)
svm_clf = SVC(gamma="auto", probability=True, random_state=42)

voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='soft')
voting_clf.fit(X_train, y_train)

from sklearn.metrics import accuracy_score

for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.792
RandomForestClassifier 0.88
SVC 0.92
VotingClassifier 0.872


Bagging and Pasting

In [27]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

bag_clf = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators= 500,
    max_samples= 100, bootstrap= True, n_jobs = -1,
    oob_score= True)

bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_test)

bag_clf.oob_score_

0.9146666666666666

In [28]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.896

In [29]:
from sklearn.ensemble import RandomForestClassifier

rnd_clf = RandomForestClassifier(n_estimators=500,
                                max_leaf_nodes= 16,
                                n_jobs= -1)

rnd_clf.fit(X_train, y_train)

y_pred_rf = rnd_clf.predict(X_test)

Feature Importance

In [30]:
from sklearn.datasets import load_iris

iris = load_iris()

rnd_clf = RandomForestClassifier(n_estimators= 500, n_jobs = -1)
rnd_clf.fit(iris['data'], iris['target'])

for name, score in zip(iris['feature_names'], rnd_clf.feature_importances_):
    print(name, score)

sepal length (cm) 0.10428094855987859
sepal width (cm) 0.026415879708222866
petal length (cm) 0.445503882177516
petal width (cm) 0.42379928955438245


### Boosting

AdaBoost

In [2]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1), n_estimators=200,
    algorithm= "SAMME.R", learning_rate=0.5)
ada_clf.fit(X_train, y_train)

AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1),
                   learning_rate=0.5, n_estimators=200)

Gradient Boosting

In [4]:
# Random Qudratic training set
import numpy as np
np.random.seed(42)
X = np.random.rand(100, 1) - 0.5
y = 3*X[:, 0]**2 + 0.05 * np.random.randn(100)

In [5]:
#first decision tree
from sklearn.tree import DecisionTreeRegressor

tree_reg1 = DecisionTreeRegressor(max_depth = 2)
tree_reg1.fit(X,y)

#second tree on residuals
y2 = y - tree_reg1.predict(X)
tree_reg2 = DecisionTreeRegressor(max_depth=2)
tree_reg2.fit(X,y2)

#second tree on the residuals
y3 = y2 - tree_reg2.predict(X)
tree_reg3 = DecisionTreeRegressor(max_depth=2)
tree_reg3.fit(X,y3)



X_new = np.array([[0.8]])
y_pred = sum(tree.predict(X_new) for tree in (tree_reg1, tree_reg2, tree_reg3))
y_pred

array([0.75026781])

same in sklearn

In [6]:
from sklearn.ensemble import GradientBoostingRegressor

gbrt = GradientBoostingRegressor(max_depth=2,
                                n_estimators= 3,
                                learning_rate= 1.0)
gbrt.fit(X,y)
gbrt.predict(X_new)

array([0.75026781])

Running an iterated prediction over at each level and returning optimal number of estimator based validation set

In [22]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

X_train, X_val, y_train, y_val = train_test_split(X,y)

gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=120)
gbrt.fit(X_train, y_train)

errors = [mean_squared_error(y_val, ypred)
         for ypred in gbrt.staged_predict(X_val)]
best_n_estimators = np.argmin(errors) + 1

gbrt_best = GradientBoostingRegressor(max_depth=2,
                                     n_estimators= best_n_estimators)
gbrt_best = (X_train, y_train)

Early stopping when validation erron doesnot imporve for five iteratins in a row

In [23]:
gbrt = GradientBoostingRegressor(max_depth= 2,
                                warm_start= True)
min_val_error = float("inf")
error_going_up = 0
for n_estimators in range(1, 120):
    gbrt.n_estimators = n_estimators
    gbrt.fit(X_train, y_train)
    y_pred = gbrt.predict(X_val)
    val_error = mean_squared_error(y_val, y_pred)
    if val_error < min_val_error:
        min_val_error = val_error
        error_going_up = 0
    else:
        error_going_up += 1
        if error_going_up == 5:
            break # early stopping

Stochastic Gradient Boosting

at each instance tree is fitted on a random subset of training set

speeds up

special package xtreme gradient boosting

In [25]:
pip install xgboost

Collecting xgboost
  Downloading xgboost-1.5.2-py3-none-win_amd64.whl (106.6 MB)
Installing collected packages: xgboost
Successfully installed xgboost-1.5.2
Note: you may need to restart the kernel to use updated packages.


In [27]:
import xgboost as xgb
xgb_reg = xgb.XGBRegressor()
xgb_reg.fit(X_train, y_train)

y_pred = xgb_reg.predict(X_val)

In [30]:
#with early stopping
xgb_reg = xgb.XGBRegressor()
xgb_reg.fit(X_train, y_train,
           eval_set = [(X_val, y_val)], 
           early_stopping_rounds = 2)

y_pred =xgb_reg.predict(X_val)
y_pred

[0]	validation_0-rmse:0.27669
[1]	validation_0-rmse:0.20147
[2]	validation_0-rmse:0.14687
[3]	validation_0-rmse:0.11017
[4]	validation_0-rmse:0.08607
[5]	validation_0-rmse:0.07183
[6]	validation_0-rmse:0.06386
[7]	validation_0-rmse:0.06020
[8]	validation_0-rmse:0.05896
[9]	validation_0-rmse:0.05861
[10]	validation_0-rmse:0.05889
[11]	validation_0-rmse:0.05848
[12]	validation_0-rmse:0.05867


array([ 0.6429717 ,  0.03401398,  0.03401398,  0.03401398, -0.01044697,
        0.38612452,  0.28688428,  0.58283085, -0.01044697, -0.01044697,
        0.15922701, -0.00103129,  0.6037855 , -0.00103129,  0.03401398,
        0.12037362,  0.3806873 ,  0.10048291,  0.18836705, -0.01044697,
       -0.01044697,  0.4506942 ,  0.3666826 , -0.01044697,  0.556862  ],
      dtype=float32)