# Ensemble Learning and Random Forests
ensemble learning is very powerful provided the types of models used to create the predictions make different type of error the resulting ensemble result will be more accurate. Therefore it is very important to use very different models. 

In [23]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier 
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split

X, y = make_moons(n_samples=10000, noise=0.3)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

log_clf = LogisticRegression()
Forest_clf = RandomForestClassifier()
svc_clf = SVC()

voting_clf = VotingClassifier(
    estimators=[('log', log_clf), ('forest', Forest_clf), ('svc', svc_clf)],
    voting='hard'
)

In [24]:
voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('log', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)), ('forest', Ran...f', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False))],
         flatten_transform=None, n_jobs=None, voting='hard', weights=None)

In [25]:
from sklearn.metrics import accuracy_score
for clf in (log_clf, Forest_clf, svc_clf, voting_clf):
    clf.fit(X_train, y_train)
    print(clf.__class__.__name__, accuracy_score(y_test, clf.predict(X_test)))

('LogisticRegression', 0.8515)
('RandomForestClassifier', 0.9)
('SVC', 0.9075)
('VotingClassifier', 0.9045)


if all of the models in the VotingClassifier contain predict_proba methods than you can change the voting to 'soft' and can access the predict_proba for the ensemble. To do this with the current code above you would need to add a hyperparameter to the SVC class as it does not offer probabilities by default.  
  
## Bagging and Pasting
one way to get a set of diverse classifiers is to use very different algorithms. Another is to use very different training algorithms, Alternatively you could train each of them on a different subset of the training data. When each of the classifiers is trained on a subset of the training data but that training data is replaced the process is known as ***bagging***. when it occurs without replacement it is called ***pasting***.
<br/>
Each individual predictor has higher bias when trained on a subset of the training set but aggregation reduces both the bias and varience. You can expect the bias of an ensemble of classifiers to be roughly the same as a single predictor but less varience. 

as each different model is trained on a different subset and use different models there is potential for parallel computation. This is why bagging and pasting are popular methods.  
  
Sklearn offers a simple API for bagging and apsting with aggingclassifier and BaggingRegression for regression. Should you watn to use pasting instead of backing you set bootstrap=False. The n_jobs parameter tells sklearn the number of CPU cores to use. -1  tells sklearn to use all available cores.

In [33]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

bag_clf = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators=500, 
    max_samples=100, bootstrap=True, n_jobs=-1)

bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_test)
y_pred

array([0, 0, 1, ..., 1, 1, 0])

the BaggingClassifer automatically performs soft voting instead of hard voting if the base classifier has a predict_proba() method.  
  
Bootstrapping introduces a bit more diversity in the subsets that each predictor is trained on, so bagging ends up with a slightly higher bias than pasting but also varience reduced. Bagging usually produces a better model.

## Out of bag Evaluation
When using Bagging (with replacement) each individual predictor will not see all of the trainin instances. therefore to evaluate the model you can evaluate each individual predictor on the out of bag (oob) instances and average the scores of all the predictors. to do this in sklearn you can set oob_score=True.

In [35]:
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators=500, 
    bootstrap=True, n_jobs=-1, oob_score=True)

In [36]:
bag_clf.fit(X_train, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
         bootstrap=True, bootstrap_features=False, max_features=1.0,
         max_samples=1.0, n_estimators=500, n_jobs=-1, oob_score=True,
         random_state=None, verbose=0, warm_start=False)

In [37]:
bag_clf.oob_score_ #so the bag classifier is expected to produce 89% accuracy on the test set. Lets see.

0.89475

In [38]:
from sklearn.metrics import accuracy_score
pred = bag_clf.predict(X_test)
accuracy_score(y_test, pred) #hmmm actually better.

0.8985

# Random Patches and Random Subspaces
instead of using the above bagging example for decision trees you can just use the RandomForestClassifier as follows

In [40]:
from sklearn.ensemble import RandomForestClassifier

rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1)
rnd_clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=16,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

With Random Forest Classifiers the Features chosen to split the nodes will be of more importance and the surface than deeper down. It is therefore possible to get feature importance. 

In [42]:
from sklearn.datasets import load_iris
iris = load_iris()
rnd_clf = RandomForestClassifier(n_estimators=500, n_jobs=-1)
rnd_clf.fit(iris["data"], iris["target"])
for name, score in zip(iris["feature_names"], rnd_clf.feature_importances_):
    print(name, score)

('sepal length (cm)', 0.09209511170083773)
('sepal width (cm)', 0.024340191663867028)
('petal length (cm)', 0.44474828346685946)
('petal width (cm)', 0.43881641316843595)


# Boosting
Boosting referes to a method where you train the models sequentially each one trying to reduce the error of the previous. There are many boosting metods available but AdaBoost and Gradient Boosting are the best.  
  
## AdaBoost
for the prdictor to correct the errors of its predecessor it must pay attention to the errors the predecessor made focusing more on them. after each validation the relative errors of the missclassified training instances is increased and then a second classifier is trained using the updated weights.  

To make predictions the model takes the predictions of all of the individual predictors and multiplies them by their relative weights. 

Sklearn supports and uses a multicalss version of AdaBoost called SAMME . when just using two calsses is the equivalent of Adaboost

In [46]:
from sklearn.ensemble import AdaBoostClassifier

ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1), n_estimators=200, algorithm="SAMME.R", learning_rate=0.5)

ada_clf.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=1,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
          learning_rate=0.5, n_estimators=200, random_state=None)

# Gradient Boosting
Gradient Boosting works by sequentially adding predictors to an ensemble, each one correcting its predecessor. Unlike Ada however Gradient doesnt tweak the instance weights it trys to fit the new predictor to the residual errors made by the previous predictor.

In [51]:
X_train.shape, y_train.shape

((8000, 2), (8000,))

In [53]:
from sklearn.tree import DecisionTreeRegressor

tree_reg1 = DecisionTreeRegressor(max_depth=2)
tree_reg1.fit(X_train, y_train)

y2 = y_train - tree_reg1.predict(X_train)
tree_reg2 = DecisionTreeRegressor(max_depth=2)
tree_reg2.fit(X_train, y2)

y3 = y2 - tree_reg2.predict(X_train)
tree_reg3 = DecisionTreeRegressor(max_depth=2)
tree_reg3.fit(X_train, y3)

DecisionTreeRegressor(criterion='mse', max_depth=2, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [54]:
from sklearn.ensemble import GradientBoostingRegressor

gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=3, learning_rate=1.0)
gbrt.fit(X_train, y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=1.0, loss='ls', max_depth=2, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=3, n_iter_no_change=None, presort='auto',
             random_state=None, subsample=1.0, tol=0.0001,
             validation_fraction=0.1, verbose=0, warm_start=False)

***To find the perfect number of trees it is optimal to use early stopping. A simple simple way to implement is with the staged_predict() metod:***

In [57]:
import numpy as np 
from sklearn.metrics import mean_squared_error
X_train, X_val, y_train, y_val = train_test_split(X,y)

gbrt= GradientBoostingRegressor(max_depth=2, n_estimators=120)
gbrt.fit(X_train, y_train)

errors = [mean_squared_error(y_val, y_pred)
         for y_pred in gbrt.staged_predict(X_val)]
bst_n_estimators = np.argmin(errors)

gbrt_best = GradientBoostingRegressor(max_depth=2, n_estimators=bst_n_estimators)
gbrt_best.fit(X_train, y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=2, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=119, n_iter_no_change=None, presort='auto',
             random_state=None, subsample=1.0, tol=0.0001,
             validation_fraction=0.1, verbose=0, warm_start=False)

the above code trains the whole model and then picks the most accurate stage you can train the model in stages and stop training when it starts to overfit. 

In [66]:
gbrt = GradientBoostingRegressor(max_depth=2, warm_start=True)

min_val_error = float("inf")
error_going_up = 0
for n_estimators in range(1, 500):
    gbrt.n_estimators = n_estimators 
    gbrt.fit(X_train, y_train)
    y_pred = gbrt.predict(X_val)
    val_error = mean_squared_error(y_val, y_pred)
    if val_error < min_val_error:
        min_val_error = val_error
        error_going_up = 0
    else: 
        error_going_up += 1 
        if error_going_up == 5:
            break

In [67]:
n_estimators

215

# Stacking
short from stacked generalization. all of the models in the ensembles makes tpredictions and then a final predictor called a blender takes these predictions and makes the final prediction. 
if you are just using one layer of stacking you split your training data in two. 
you train your first layer of predictors with one set. then use the predictors to make predictions of the other set. these predictions act as inputs to your blender model. 

Sci-kit learn does not support stacking but it would be simple enough to implement your own version. 