# 投票分類器

In [41]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split

X, y = make_moons()[0], make_moons()[1]
print("X.shape", X.shape)
print("y.shape", y.shape)

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)

log_clf = LogisticRegression()
rnd_clf = RandomForestClassifier()
svc_clf = SVC(probability=True)

hard_voting_clf = VotingClassifier(
    estimators=[("lr", log_clf), ("rc", rnd_clf), ("svc", svc_clf)], voting="hard")
hard_voting_clf.fit(X_train, y_train)

soft_voting_clf = VotingClassifier(
    estimators=[("lr", log_clf), ("rc", rnd_clf), ("svc", svc_clf)], voting="soft")
soft_voting_clf.fit(X_train, y_train)

X.shape (100, 2)
y.shape (100,)


VotingClassifier(estimators=[('lr', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)), ('rc', RandomF...',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False))],
         flatten_transform=None, n_jobs=1, voting='soft', weights=None)

In [18]:
from sklearn.metrics import accuracy_score

for clf in (log_clf, rnd_clf, svc_clf, hard_voting_clf, soft_voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_val)
    print(clf.__class__.__name__, accuracy_score(y_val, y_pred))

LogisticRegression 0.4
RandomForestClassifier 0.45
SVC 0.4
VotingClassifier 0.4
VotingClassifier 0.4


  if diff:
  if diff:


# バギングとペースティング

In [34]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

bag_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=500, max_samples=0.9, bootstrap=True, n_jobs=-1, oob_score=True)
bag_clf.fit(X_train, y_train)
bag_clf.predict(X_val)


array([0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0])

In [35]:
bag_clf.predict_proba(X_val)

array([[0.922, 0.078],
       [0.84 , 0.16 ],
       [0.868, 0.132],
       [0.31 , 0.69 ],
       [0.418, 0.582],
       [0.032, 0.968],
       [0.574, 0.426],
       [0.216, 0.784],
       [0.446, 0.554],
       [0.542, 0.458],
       [0.51 , 0.49 ],
       [0.256, 0.744],
       [0.292, 0.708],
       [0.48 , 0.52 ],
       [0.064, 0.936],
       [0.328, 0.672],
       [0.266, 0.734],
       [0.034, 0.966],
       [0.106, 0.894],
       [0.948, 0.052]])

In [36]:
bag_clf.oob_score_

0.525

In [37]:
from sklearn.metrics import accuracy_score

y_pred = bag_clf.predict(X_val)
accuracy_score(y_pred, y_val)

0.4

In [38]:
bag_clf.oob_decision_function_

array([[0.66829268, 0.33170732],
       [0.55188679, 0.44811321],
       [0.5879397 , 0.4120603 ],
       [0.73023256, 0.26976744],
       [0.56923077, 0.43076923],
       [0.43661972, 0.56338028],
       [0.19672131, 0.80327869],
       [0.67357513, 0.32642487],
       [0.15492958, 0.84507042],
       [0.63888889, 0.36111111],
       [0.07614213, 0.92385787],
       [0.49302326, 0.50697674],
       [0.71634615, 0.28365385],
       [0.24056604, 0.75943396],
       [0.75274725, 0.24725275],
       [0.52995392, 0.47004608],
       [0.53365385, 0.46634615],
       [0.90731707, 0.09268293],
       [0.76165803, 0.23834197],
       [0.29896907, 0.70103093],
       [0.39790576, 0.60209424],
       [0.82901554, 0.17098446],
       [0.83253589, 0.16746411],
       [0.55882353, 0.44117647],
       [0.73796791, 0.26203209],
       [0.33035714, 0.66964286],
       [0.73809524, 0.26190476],
       [0.09734513, 0.90265487],
       [0.79792746, 0.20207254],
       [0.76168224, 0.23831776],
       [0.

# ランダムフォレスト

In [48]:
from sklearn.ensemble import RandomForestClassifier

rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1)
rnd_clf.fit(X, y)
rnd_clf.predict(X_val)

array([1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1])

In [49]:
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(splitter="random", max_leaf_nodes=16),
    n_estimators=500, max_samples=1.0, bootstrap=True, n_jobs=-1)
bag_clf.fit(X, y)
bag_clf.predict(X_val)

array([1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1])

## 特徴量の重要度

In [51]:
from sklearn.datasets import load_iris

iris = load_iris()
rnd_clf = RandomForestClassifier(n_estimators=500, n_jobs=-1)
rnd_clf.fit(iris["data"], iris["target"])

for name, score in zip(iris["feature_names"], rnd_clf.feature_importances_):
    print(name, score)
    

sepal length (cm) 0.10250168588778182
sepal width (cm) 0.02654776376963677
petal length (cm) 0.42821605417580216
petal width (cm) 0.4427344961667791


# ブースティング

## アダブースト(AdaBoost)

In [57]:
from sklearn.ensemble import AdaBoostClassifier

ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1),n_estimators=200, algorithm="SAMME.R", learning_rate=0.5)
ada_clf.fit(X_train, y_train)
ada_clf.predict(X_val)

array([0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1])

## 勾配ブースティング

In [64]:
from sklearn.tree import DecisionTreeRegressor

tree_reg1 = DecisionTreeRegressor(max_depth=2)
tree_reg1.fit(X, y)

y2 = y - tree_reg1.predict(X)
tree_reg2 = DecisionTreeRegressor(max_depth=2)
tree_reg2.fit(X, y2)

y3 = y - tree_reg2.predict(X)
tree_reg3 = DecisionTreeRegressor(max_depth=2)
tree_reg3.fit(X, y3)

y_pred = sum(tree.predict(X) for tree in (tree_reg1, tree_reg2, tree_reg3))
y_pred

array([0.6794938 , 0.93403926, 1.12051944, 1.12051944, 0.6794938 ,
       1.12051944, 1.12051944, 1.12051944, 1.12051944, 0.90108471,
       1.12051944, 1.12051944, 0.6794938 , 1.12051944, 0.6794938 ,
       0.90108471, 0.90108471, 0.93403926, 1.12051944, 0.66666667,
       1.12051944, 0.93403926, 1.12051944, 1.12051944, 0.93403926,
       1.12051944, 0.        , 1.12051944, 0.90108471, 0.90108471,
       1.12051944, 0.        , 2.        , 2.        , 1.12051944,
       1.12051944, 1.12051944, 0.6794938 , 0.90108471, 0.93403926,
       1.12051944, 0.93403926, 1.12051944, 1.12051944, 1.12051944,
       0.6794938 , 2.        , 0.90108471, 0.90108471, 0.6794938 ,
       1.12051944, 1.12051944, 0.90108471, 0.90108471, 0.90108471,
       0.90108471, 1.12051944, 1.12051944, 0.90108471, 0.90108471,
       0.90108471, 0.90108471, 0.93403926, 0.90108471, 0.90108471,
       1.12051944, 1.12051944, 2.        , 2.        , 0.90108471,
       0.90108471, 1.12051944, 1.12051944, 0.6794938 , 1.12051

In [65]:
from sklearn.ensemble import GradientBoostingRegressor

gbrt_reg = GradientBoostingRegressor(max_depth=2, n_estimators=3, learning_rate=1.0)
gbrt_reg.fit(X, y)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=1.0, loss='ls', max_depth=2, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=3, presort='auto', random_state=None,
             subsample=1.0, verbose=0, warm_start=False)

In [67]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

X_train, X_val, y_train, y_val = train_test_split(X, y)

gbrt_reg = GradientBoostingRegressor(max_depth=2, n_estimators=120)
gbrt_reg.fit(X_train, y_train)

errors = [mean_squared_error(y_val, y_pred) for y_pred in gbrt_reg.staged_predict(X_val)]
best_n_estimator = np.argmin(errors)

gbrt_reg = GradientBoostingRegressor(max_depth=2, n_estimators=best_n_estimator)
gbrt_reg.fit(X_train, y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=2, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=1, presort='auto', random_state=None,
             subsample=1.0, verbose=0, warm_start=False)

In [69]:
gbrt_reg = GradientBoostingRegressor(max_depth=2, warm_start=True)

min_val_error = float("inf")
error_going_up = 0
for n_estimator in range(1, 120):
    gbrt_reg.n_estimators = n_estimator
    gbrt_reg.fit(X_train, y_train)
    y_pred = gbrt_reg.predict(X_val)
    val_error = mean_squared_error(y_val, y_pred)
    if val_error < min_val_error:
        min_val_error = val_error
        error_going_up = 0
    else:
        error_going_up += 1
        
    if error_going_up == 5:
        break

gbrt_reg

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=2, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=7, presort='auto', random_state=None,
             subsample=1.0, verbose=0, warm_start=True)