In [11]:
# create and train a voiting classifier in Sikit-Learn composed of randomForsest, VOtingClassifier, and Logisitic Regression

from sklearn import datasets
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

X, y = datasets.make_moons(n_samples=500, noise=.3, random_state=42);
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

log_clf = LogisticRegression(solver="lbfgs", random_state=42)
rnd_clf = RandomForestClassifier(n_estimators=100, random_state=42)
svm_clf = SVC(gamma="scale", random_state=42)

voting_clf = VotingClassifier(
    estimators=[
        ("lr", log_clf),
        ("rd", rnd_clf),
        ("sv", svm_clf)
    ],
    voting="hard"
);

voting_clf.fit(X_train, y_train);

In [12]:
# view the accuracy of each model
from sklearn.metrics import accuracy_score;

for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
  clf.fit(X_train, y_train);
  y_pred = clf.predict(X_test)
  print(clf.__class__.__name__, accuracy_score(y_test, y_pred));

LogisticRegression 0.8666666666666667
RandomForestClassifier 0.9090909090909091
SVC 0.9090909090909091
VotingClassifier 0.9151515151515152


In [15]:
#Train an ensemble of 500 Decision Tree classifiers: each is trained on 100 training instances randomly sampled from the training set with replacement.

from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=100,
    bootstrap=True,
    n_jobs=-1
);

bag_clf.fit(X_train, y_train);
y_pred = bag_clf.predict(X_test);
print("Accuracy:", accuracy_score(y_test, y_pred));

Accuracy: 0.9272727272727272


In [16]:
# Train a bagging classifer with out-of-bag evaluation

bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=100,
    bootstrap=True,
    n_jobs=-1,
    oob_score=True
);
bag_clf.fit(X_train, y_train);
print("OOB Score:", bag_clf.oob_score_)

OOB Score: 0.9164179104477612


In [17]:
y_pred = bag_clf.predict(X_test);
print("Prediction:", accuracy_score(y_test, y_pred))

Prediction: 0.9212121212121213


In [21]:
# Train a random forest classifier (Baggling classifier with a decision tree classifier)
from sklearn.ensemble import RandomForestClassifier

rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1);
rnd_clf.fit(X_train, y_train);
y_pred_rf = rnd_clf.predict(X_test);
print("Accuracy:", accuracy_score(y_test, y_pred_rf));

'''
similar bagging classifier as the RandomForestClassifier above

bag_clf = BaggingClassifier(
    DecisionTreeClassifier(
        splitter="random",
        max_leaf_nodes=16
    ),
    n_estimators=500,
    max_samples=1.0,
    bootstrap=True,
    n_jobs=-1
);

'''

Accuracy: 0.9212121212121213


'\nsimilar bagging classifier as the RandomForestClassifier above\n\nbag_clf = BaggingClassifier(\n    DecisionTreeClassifier(\n        splitter="random",\n        max_leaf_nodes=16\n    ),\n    n_estimators=500,\n    max_samples=1.0,\n    bootstrap=True,\n    n_jobs=-1\n);\n\n'

In [23]:
# define the importance of each feature using a RandomForestClassifier

from sklearn.datasets import load_iris

iris = load_iris();
rnd_clf = RandomForestClassifier(n_estimators=500, n_jobs=-1);
rnd_clf.fit(iris["data"], iris["target"]);
for name, score in zip(iris["feature_names"], rnd_clf.feature_importances_):
  print(name, score)

sepal length (cm) 0.09571711034942516
sepal width (cm) 0.023808353529291677
petal length (cm) 0.45641312029477576
petal width (cm) 0.42406141582650747


In [25]:
# train an AdaBoost classifier based on 200 decision stumps.

from sklearn.ensemble import AdaBoostClassifier

ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1),
    n_estimators=200,
    algorithm="SAMME.R",
    learning_rate=0.5

);
ada_clf.fit(X_train, y_train);
y_pred = ada_clf.predict(X_test);
print("Accuracy:", accuracy_score(y_test, y_pred));

Accuracy: 0.896969696969697


In [31]:
#generate some random data
import numpy as np
np.random.seed(42)
X = np.random.rand(100, 1) - 0.5
y = 3*X[:, 0]**2 + 0.05 * np.random.randn(100)

# Building 3 decision tree regressors that improve based on each others predictions (similar to Gradient descent)
from sklearn.tree import DecisionTreeRegressor

tree_reg1 = DecisionTreeRegressor(max_depth=2);
tree_reg1.fit(X, y);

y2 = y - tree_reg1.predict(X);
tree_reg2 = DecisionTreeRegressor(max_depth=2);
tree_reg2.fit(X, y2);

y3 = y2 - tree_reg1.predict(X);
tree_reg3 = DecisionTreeRegressor(max_depth=2);
tree_reg3.fit(X, y3);

X_new = np.array([[0.8]])
y_pred = sum(tree.predict(X_new) for tree in (tree_reg1, tree_reg2, tree_reg3))
#print("Accuracy:", accuracy_score(y_pred, y))

In [33]:
# build the same thing, but using the GradientBoostRegressor ensemble
from sklearn.ensemble import GradientBoostingRegressor

gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=3, learning_rate=1.0);
gbrt.fit(X, y);

In [37]:
# Train a GBRT ensemble with 120 trees, then measure the validatin error at each stage of training to find the optimal number of trees, and finally train another GBRT ensemble using the optimal number of trees

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

X_train, X_val, y_train, y_val = train_test_split(X, y);

gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=120);
gbrt.fit(X_train, y_train);

errors = [mean_squared_error(y_val, y_pred) for y_pred in gbrt.staged_predict(X_val)]
bst_n_esimtators = np.argmin(errors) + 1
print("Best:", bst_n_esimtators);

gbrt_best = GradientBoostingRegressor(max_depth=2, n_estimators=bst_n_esimtators);
gbrt_best.fit(X_train, y_train);

Best: 87


In [39]:
# do the same thin by stop training early when the validation error does not improve for five iterations in a row

gbrt = GradientBoostingRegressor(max_depth=2, warm_start=True);

min_val_error = float("inf");
error_going_up = 0;
for n_estimators in range(1, 120):
  gbrt.n_estimators = n_estimators;
  gbrt.fit(X_train, y_train)
  y_pred = gbrt.predict(X_val);
  val_error = mean_squared_error(y_val, y_pred);
  if val_error < min_val_error:
    min_val_error = val_error;
    error_going_up = 0;
  else:
    error_going_up += 1
    if error_going_up == 5:
      print("Prediction:", y_pred)
      print("Val error:", val_error)
      break # early stopping


Prediction: [ 0.58019483 -0.0211741  -0.0211741   0.12181154  0.43810588  0.02957955
  0.57717787  0.26424075  0.39928288  0.36148811 -0.0211741   0.34701674
  0.18350373  0.15170877  0.63186196  0.06452096 -0.0211741   0.477411
  0.68067178  0.477411    0.5994077   0.71977728  0.57991325 -0.0211741
  0.26424075]
Val error: 0.002367954226391026


In [41]:
# Implement the XG Boost regressor

import xgboost

xgb_reg = xgboost.XGBRegressor();
xgb_reg.fit(X_train, y_train);

y_pred = xgb_reg.predict(X_val);

In [43]:
# Implement XG Boost regressor, but stop early

xgb_reg.fit(
    X_train, 
    y_train, 
    eval_set=[(X_val, y_val)],
    early_stopping_rounds=2
);
y_pred = xgb_reg.predict(X_val);



[0]	validation_0-rmse:0.23584
[1]	validation_0-rmse:0.17932
[2]	validation_0-rmse:0.13658
[3]	validation_0-rmse:0.10759
[4]	validation_0-rmse:0.08705
[5]	validation_0-rmse:0.07359
[6]	validation_0-rmse:0.06434
[7]	validation_0-rmse:0.05872
[8]	validation_0-rmse:0.05460
[9]	validation_0-rmse:0.05312
[10]	validation_0-rmse:0.05204
[11]	validation_0-rmse:0.05148
[12]	validation_0-rmse:0.05121
[13]	validation_0-rmse:0.05152
[14]	validation_0-rmse:0.05147
