In [32]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [33]:
X, y = make_moons(noise=0.30, random_state=42, n_samples=500)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

## Using various algos and let them vote:

In [34]:
log_clf = LogisticRegression()
rnd_clf = RandomForestClassifier()
svm_clf = SVC()

The voting classifier will predict the class that gets the most votes ('hard' voting):

In [35]:
voting_clf = VotingClassifier(estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)], voting='hard')

In [36]:
for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.864
RandomForestClassifier 0.888
SVC 0.896
VotingClassifier 0.896


Predicting class with highest proba averaged over all individual classifiers ('soft' voting):

In [37]:
log_clf = LogisticRegression()
rnd_clf = RandomForestClassifier()
svm_clf = SVC(probability=True)

In [38]:
voting_clf = VotingClassifier(estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)], voting='soft')

In [39]:
for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.864
RandomForestClassifier 0.88
SVC 0.896
VotingClassifier 0.912


## Using the same algo on different subsets of data: bagging

In [40]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

In [42]:
# 'n_jobs=-1' to use all availble CPUs in parallel:
bag_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=500, max_samples=100, bootstrap=True, n_jobs=-1)

In [43]:
bag_clf.fit(X_train, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(), max_samples=100,
                  n_estimators=500, n_jobs=-1)

In [45]:
y_pred = bag_clf.predict(X_test)

In [46]:
accuracy_score(y_test, y_pred)

0.912

## Random Forests

Instead of creating a bagging classifier and feeding it DecisionTrees, let's create a RandomForestClassifier:

In [48]:
from sklearn.ensemble import RandomForestClassifier

In [49]:
rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1)
rnd_clf.fit(X_train, y_train)

RandomForestClassifier(max_leaf_nodes=16, n_estimators=500, n_jobs=-1)

In [50]:
y_pred_rf = rnd_clf.predict(X_test)

In [51]:
accuracy_score(y_test, y_pred)

0.912

## Boosting

Train predictors sequentially, each trying to correct its predecessor.

AdaBoost: first train a predictor (such as DecisionTree) and then increase the weights of misclassified examplesa at every iteration, and so on.

GradientBoosting: first train a predictor (DT) and then fit the next classifier to the residual errors made by the previous one.

In [52]:
from sklearn.ensemble import GradientBoostingRegressor

In [53]:
X_train, X_val, y_train, y_val = train_test_split(X, y)

In [55]:
gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=120)
gbrt.fit(X_train, y_train)

GradientBoostingRegressor(max_depth=2, n_estimators=120)

The famous XGBoost:

In [56]:
import xgboost

In [57]:
xgb_reg = xgboost.XGBRegressor()

In [58]:
xgb_reg.fit(X_train, y_train)
y_pred = xgb_reg.predict(X_val)

Adjust optimal number of trees automatically by using Early Stopping with XGBoost:

In [61]:
xgb_reg.fit(X_train, y_train, eval_set=[(X_val, y_val)], early_stopping_rounds=2)
y_pred = xgb_reg.predict(X_val)

[0]	validation_0-rmse:0.41424
Will train until validation_0-rmse hasn't improved in 2 rounds.
[1]	validation_0-rmse:0.37118
[2]	validation_0-rmse:0.34995
[3]	validation_0-rmse:0.34547
[4]	validation_0-rmse:0.34522
[5]	validation_0-rmse:0.34573
[6]	validation_0-rmse:0.34921
Stopping. Best iteration:
[4]	validation_0-rmse:0.34522



## Stacking

Firstly, split the training set in 2 subsets.
Then use first subset to train the predictor (first layer) and use the second subset to evaluate predictions.
Then use all predictions to generate a new training set (blending trainset) and keep the same targets.
Then train a predictor on the blending trainset: it will make predictions based on first layer predictions.