In [16]:
# Ensemble libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import numpy as np
# Seed NumPy's global RNG so the np.random.rand draws further down repeat on a
# fresh Restart & Run All.
# NOTE(review): scikit-learn estimators left at random_state=None are documented
# to fall back to this global RNG -- confirm for the installed version.
np.random.seed(42)

In [12]:
# Import moon data
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split

# 500 noisy points from the two-moons generator, seeded for repeatability.
x_moon, y_moon = make_moons(
    n_samples=500,
    noise=0.3,
    random_state=42,
)
# Hold out a quarter of the points as the test set.
x_train, x_test, y_train, y_test = train_test_split(
    x_moon,
    y_moon,
    test_size=0.25,
    random_state=42,
)

In [21]:
# Ensemble
# Three different base learners. SVC is created with probability=True because
# the ensemble below is configured for soft voting.
log_clf = LogisticRegression(solver='lbfgs')
rnd_clf = RandomForestClassifier(n_estimators=100)
svm_clf = SVC(gamma='scale', probability=True)

# Combine the three learners under one voting wrapper.
voting_clf = VotingClassifier(
    estimators=[
        ('lr', log_clf),
        ('rf', rnd_clf),
        ('svc', svm_clf),
    ],
    voting='soft',
)
# Left as the final bare expression so the cell displays the fitted estimator.
voting_clf.fit(x_train, y_train)

VotingClassifier(estimators=[('lr',
                              LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='warn',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='lbfgs', tol=0.0001,
                                                 verbose=0, warm_start=False)),
                             ('rf',
                              RandomForestClassifier(bootstrap=True,
                                                     class_weight=None,
                                                     criterion='gini'...
                                        

In [22]:
# Evaluation
from sklearn.metrics import accuracy_score
# Fit each individual model and the voting ensemble on the training split,
# then print the class name next to its test-set accuracy.
for clf in [log_clf, rnd_clf, svm_clf, voting_clf]:
    y_pred = clf.fit(x_train, y_train).predict(x_test)
    print(type(clf).__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.864
RandomForestClassifier 0.896
SVC 0.896
VotingClassifier 0.92


In [23]:
# Bagging in sklearn
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# Bagging ensemble: 500 decision trees, max_samples=100 per tree, bootstrap
# sampling enabled, all CPU cores (n_jobs=-1).
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=100,
    bootstrap=True,
    n_jobs=-1,
)
bag_clf.fit(x_train, y_train)
y_pred = bag_clf.predict(x_test)

In [25]:
# Ground truth first, predictions second, matching accuracy_score(y_true, y_pred)
# and the other evaluation cells. The value is unchanged for accuracy, but the
# order matters as soon as the metric is swapped for an asymmetric one.
accuracy_score(y_test, y_pred)

0.928

In [27]:
# Out of bag evaluation
# Same bagging setup as before, with oob_score=True so the fitted model
# exposes oob_score_.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=100,
    bootstrap=True,
    oob_score=True,
    n_jobs=-1,
)
bag_clf.fit(x_train, y_train)
# Final bare expression: display the out-of-bag score.
bag_clf.oob_score_

0.928

In [30]:
# Peek at the first 10 rows of the out-of-bag decision function. In the output
# shown below, each row holds two values that sum to 1.
bag_clf.oob_decision_function_[:10]

array([[0.34146341, 0.65853659],
       [0.40789474, 0.59210526],
       [0.9974026 , 0.0025974 ],
       [0.01507538, 0.98492462],
       [0.02544529, 0.97455471],
       [0.1031746 , 0.8968254 ],
       [0.39572193, 0.60427807],
       [0.05943152, 0.94056848],
       [0.94683544, 0.05316456],
       [0.84210526, 0.15789474]])

In [33]:
# Random forest in sklearn
from sklearn.ensemble import RandomForestClassifier

# Random forest: 500 trees, each capped at 16 leaf nodes, trained on all cores.
rnd_clf = RandomForestClassifier(
    n_estimators=500,
    max_leaf_nodes=16,
    n_jobs=-1,
)
# fit() returns the estimator itself, so training and prediction can be chained.
y_pred_rf = rnd_clf.fit(x_train, y_train).predict(x_test)

In [34]:
# Ground truth first, predictions second, matching accuracy_score(y_true, y_pred)
# and the other evaluation cells; the accuracy value itself is unchanged.
accuracy_score(y_test, y_pred_rf)

0.92

In [35]:
# Extremely Randomized Trees in sklearn
from sklearn.ensemble import ExtraTreesClassifier

# Extra-Trees ensemble with the same size limits as the random forest cell:
# 500 trees, at most 16 leaf nodes each, all cores.
xrnd_clf = ExtraTreesClassifier(
    n_estimators=500,
    max_leaf_nodes=16,
    n_jobs=-1,
)
# fit() returns the estimator itself, so training and prediction can be chained.
y_pred_xrf = xrnd_clf.fit(x_train, y_train).predict(x_test)

In [36]:
# Ground truth first, predictions second, matching accuracy_score(y_true, y_pred)
# and the other evaluation cells; the accuracy value itself is unchanged.
accuracy_score(y_test, y_pred_xrf)

0.92

In [37]:
# Feature importance

In [46]:
# Import iris dataset
from sklearn.datasets import load_iris
# Fit a 500-tree forest on the full iris data, then print each feature name
# next to its importance score.
iris = load_iris()
rnd_clf = RandomForestClassifier(n_estimators=500, n_jobs=-1).fit(
    iris.data, iris.target
)
for name, score in zip(iris.feature_names, rnd_clf.feature_importances_):
    print(name, score)

sepal length (cm) 0.10727923617166575
sepal width (cm) 0.02523032807704551
petal length (cm) 0.4199219968963414
petal width (cm) 0.4475684388549473


In [47]:
# Boosting

In [58]:
# AdaBoost in sklearn
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost over depth-1 decision trees: 1200 boosting rounds, learning rate 0.1.
ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1),
    n_estimators=1200,
    learning_rate=0.1,
    algorithm='SAMME.R',
)
# Left as the final bare expression so the cell displays the fitted estimator.
ada_clf.fit(x_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R',
                   base_estimator=DecisionTreeClassifier(class_weight=None,
                                                         criterion='gini',
                                                         max_depth=1,
                                                         max_features=None,
                                                         max_leaf_nodes=None,
                                                         min_impurity_decrease=0.0,
                                                         min_impurity_split=None,
                                                         min_samples_leaf=1,
                                                         min_samples_split=2,
                                                         min_weight_fraction_leaf=0.0,
                                                         presort=False,
                                                         random_state=None,
                             

In [59]:
# Test-set accuracy of the boosted model; the final expression is the cell output.
ada_clf_pred = ada_clf.predict(x_test)
accuracy_score(y_true=y_test, y_pred=ada_clf_pred)

0.912

In [60]:
# Example dataset generation
# Synthetic regression data: 100 rows with a single feature in [-0.5, 0.5).
x = np.subtract(np.random.rand(100, 1), 0.5)
# Target is 3 * feature**2 plus uniform noise drawn from [0, 0.05).
y = 3 * np.square(x[:, 0]) + 0.05 * np.random.rand(100)

In [61]:
# Gradient Boosting in sklearn
from sklearn.ensemble import GradientBoostingRegressor

# Tiny gradient-boosted ensemble: 3 trees of depth 2, learning rate 1.
gbrt = GradientBoostingRegressor(
    n_estimators=3,
    max_depth=2,
    learning_rate=1,
)
# Left as the final bare expression so the cell displays the fitted estimator.
gbrt.fit(x, y)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                          learning_rate=1, loss='ls', max_depth=2,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=3,
                          n_iter_no_change=None, presort='auto',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [63]:
# GBRT with hacky 'early' stopping in sklearn
from sklearn.metrics import mean_squared_error

# Hold out a validation set (no explicit test_size or random_state is passed).
x_train, x_val, y_train, y_val = train_test_split(x, y)

# Train the full 120-tree ensemble once.
gbrt = GradientBoostingRegressor(n_estimators=120, max_depth=2)
gbrt.fit(x_train, y_train)

# Validation MSE after each boosting stage, one entry per stage.
errors = [
    mean_squared_error(y_val, stage_pred)
    for stage_pred in gbrt.staged_predict(x_val)
]

# argmin is a 0-based position; tree counts are 1-based, hence the + 1.
best_n_estimators = np.argmin(errors) + 1

# Retrain from scratch using only the best number of trees.
gbrt_best = GradientBoostingRegressor(n_estimators=best_n_estimators, max_depth=2)
gbrt_best.fit(x_train, y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                          learning_rate=0.1, loss='ls', max_depth=2,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=68,
                          n_iter_no_change=None, presort='auto',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [65]:
# GBRT with true early stopping in sklearn (trees added incrementally via warm_start)
# warm_start=True lets each fit() call reuse the trees already trained and add
# only what is needed to reach the current n_estimators.
gbrt = GradientBoostingRegressor(max_depth=2, warm_start=True,
                                 subsample=0.25)  # Stochastic GBT

min_val_error = float('inf')
error_going_up = 0  # consecutive iterations without a new best validation error

# Grow the ensemble one tree at a time, up to 120 trees. range() excludes its
# stop value, so the bound must be 121 for the 120th tree to be trained.
for n_estimators in range(1, 121):
    gbrt.n_estimators = n_estimators
    gbrt.fit(x_train, y_train)
    y_pred = gbrt.predict(x_val)
    val_error = mean_squared_error(y_val, y_pred)
    if val_error < min_val_error:
        # New best score: remember it and reset the patience counter.
        min_val_error = val_error
        error_going_up = 0
    else:
        error_going_up += 1
        # Stop after 5 consecutive iterations without improvement. The model
        # then still holds the trees added after the best score was seen.
        if error_going_up == 5:
            break

In [66]:
# GBRT with xgboost
import xgboost

# The scikit-learn style regressor in xgboost is XGBRegressor; the module has
# no SGBRegressor attribute.
xgb_reg = xgboost.XGBRegressor()
# Monitor the validation split and stop after 5 rounds without improvement.
# NOTE(review): newer xgboost releases take early_stopping_rounds in the
# constructor rather than in fit() -- confirm against the installed version.
xgb_reg.fit(x_train, y_train, eval_set=[(x_val, y_val)], early_stopping_rounds=5)
# The prediction method is predict(), not pred().
y_pred = xgb_reg.predict(x_val)

ModuleNotFoundError: No module named 'xgboost'