# 集成学习
集成学习合并一组分类器的预测，从而得到比单一分类器更好的预测结果。这组分类器就叫做集成。

## 投票分类
整合每一个分类器的预测，然后通过多数投票决定最终的预测类别。这种分类器就叫做硬投票分类器。

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_moons
# Generate a noisy two-moons toy dataset for binary classification.
X, y = make_moons(noise=0.2)

# Hold out 30% of the data for testing; stratify on y so that both splits
# keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
)


In [15]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Three different base learners, each left at its library defaults.
log_clf = LogisticRegression()
rnd_clf = RandomForestClassifier()
svm_clf = SVC()

# Hard voting: the ensemble combines the base learners' predicted labels.
voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='hard',
)
voting_clf.fit(X_train, y_train)



VotingClassifier(estimators=[('lr',
                              LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='warn',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='warn', tol=0.0001,
                                                 verbose=0, warm_start=False)),
                             ('rf',
                              RandomForestClassifier(bootstrap=True,
                                                     class_weight=None,
                                                     criterion='gini',...
                                        

查看准确率：

In [16]:
from sklearn.metrics import accuracy_score
# Fit every model on the training split and print its test-set accuracy,
# ending with the voting ensemble so it can be compared to its members.
for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    y_pred = clf.fit(X_train, y_train).predict(X_test)
    print(type(clf).__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.8333333333333334
RandomForestClassifier 0.9333333333333333
SVC 0.9333333333333333
VotingClassifier 0.9666666666666667




## Bagging和Pasting
对每一个分类器都使用相同的训练算法，但是在不同的训练集上去训练它们。有放回采样被称为装袋。无放回采样称为粘贴。

In [18]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
# Bagging ensemble: 500 decision trees, each trained on 50 instances sampled
# with replacement (bootstrap=True).
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=50,
    bootstrap=True,
    n_jobs=-1,
)
bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_test)

## Out-of-Bag评价
- 对于Bagging来说，一些实例被一些分类器重复采样，但其他的有可能不会被采样。
- 因为在训练中分类器从来没有看到过oob实例，所以它可以在这些实例上进行评估，而不需要单独的验证集或交叉验证。

In [20]:
# Bagging ensemble of 500 trees with oob_score=True, so that the fitted model
# exposes an out-of-bag score without needing a separate validation set.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    bootstrap=True,
    n_jobs=-1,
    oob_score=True,
)
bag_clf.fit(X_train, y_train)
bag_clf.oob_score_

0.9285714285714286

In [21]:
# Evaluate the bagging ensemble on the held-out test set so the result can be
# compared with the out-of-bag score.
from sklearn.metrics import accuracy_score
y_pred = bag_clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.9666666666666667

In [22]:
# Display the out-of-bag decision function of the fitted ensemble; in the
# output each row holds one value per class.
bag_clf.oob_decision_function_

array([[0.08426966, 0.91573034],
       [0.85245902, 0.14754098],
       [0.90640394, 0.09359606],
       [0.35869565, 0.64130435],
       [0.79444444, 0.20555556],
       [0.44252874, 0.55747126],
       [0.05376344, 0.94623656],
       [0.02094241, 0.97905759],
       [0.01111111, 0.98888889],
       [0.09895833, 0.90104167],
       [0.00595238, 0.99404762],
       [0.04255319, 0.95744681],
       [0.97740113, 0.02259887],
       [0.98870056, 0.01129944],
       [0.9673913 , 0.0326087 ],
       [0.80924855, 0.19075145],
       [0.00564972, 0.99435028],
       [0.03529412, 0.96470588],
       [0.18617021, 0.81382979],
       [0.98351648, 0.01648352],
       [0.67403315, 0.32596685],
       [0.89156627, 0.10843373],
       [0.13265306, 0.86734694],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [0.01081081, 0.98918919],
       [0.2122905 , 0.7877095 ],
       [0.96774194, 0.03225806],
       [0.93157895, 0.06842105],
       [0.09137056, 0.90862944],
       [0.

## 随机贴片与随机子空间
 
## 随机森林
随机森林是决策树的一种集成，通常通过bagging方法进行训练，并且通常将**max_samples**设置为训练集的大小。

In [23]:
from sklearn.ensemble import RandomForestClassifier
# Random forest of 500 trees, each limited to at most 16 leaf nodes.
rnd_clf = RandomForestClassifier(
    n_estimators=500,
    max_leaf_nodes=16,
    n_jobs=-1,
)
rnd_clf.fit(X_train, y_train)
y_pred_rf = rnd_clf.predict(X_test)

In [24]:
# Display the random forest's predicted labels for the test set.
y_pred_rf

array([1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 0, 1, 0, 1])

In [25]:
# Bagging counterpart to the RandomForestClassifier configured above:
# random-split trees with the same leaf limit and ensemble size.
# The ensemble is only constructed here, not fitted.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(splitter="random", max_leaf_nodes=16),
    n_estimators=500,
    max_samples=1.0,
    bootstrap=True,
    n_jobs=-1,
)

## 极端随机树

## 特征重要度

In [26]:
from sklearn.datasets import load_iris
# Train a 500-tree random forest on the iris dataset.
iris = load_iris()
rnd_clf = RandomForestClassifier(
    n_estimators=500,
    n_jobs=-1,
)
rnd_clf.fit(iris["data"], iris["target"])

# Print each feature name next to the importance the forest assigned to it.
for name, score in zip(iris["feature_names"], rnd_clf.feature_importances_):
    print(name, score)

sepal length (cm) 0.10709267679137877
sepal width (cm) 0.02566606261663462
petal length (cm) 0.4236824020150443
petal width (cm) 0.44355885857694227


## Adaboost

In [27]:
from sklearn.ensemble import AdaBoostClassifier
# AdaBoost over 200 decision stumps (max_depth=1) with a learning rate of 0.5.
# NOTE(review): algorithm="SAMME.R" is deprecated in recent scikit-learn
# releases — confirm against the installed version before re-running.
ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1),
    n_estimators=200,
    algorithm="SAMME.R",
    learning_rate=0.5,
)
ada_clf.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R',
                   base_estimator=DecisionTreeClassifier(class_weight=None,
                                                         criterion='gini',
                                                         max_depth=1,
                                                         max_features=None,
                                                         max_leaf_nodes=None,
                                                         min_impurity_decrease=0.0,
                                                         min_impurity_split=None,
                                                         min_samples_leaf=1,
                                                         min_samples_split=2,
                                                         min_weight_fraction_leaf=0.0,
                                                         presort=False,
                                                         random_state=None,
                             

## 梯度提升
梯度提升也是通过向集成中逐步增加分类器来运行的，每一个分类器都修正之前的分类结果。这个方法是使用新的分类器去拟合前面分类器预测的残差。

In [28]:
from sklearn.tree import DecisionTreeRegressor
# Stage 1: fit a depth-2 regression tree directly to the targets y.
tree_reg1 = DecisionTreeRegressor(max_depth=2)
tree_reg1.fit(X, y)

DecisionTreeRegressor(criterion='mse', max_depth=2, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')

In [29]:
# Stage 2: y2 is the residual left by the first tree; fit a second depth-2
# tree to that residual.
y2 = y - tree_reg1.predict(X)
tree_reg2 = DecisionTreeRegressor(max_depth=2)
tree_reg2.fit(X, y2)

DecisionTreeRegressor(criterion='mse', max_depth=2, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')

In [31]:
# Stage 3: the residual still unexplained after the second tree is y2 minus
# the *second* tree's predictions. The first tree's predictions were already
# subtracted when y2 was computed, so they must not be subtracted again.
y3 = y2 - tree_reg2.predict(X)
tree_reg3 = DecisionTreeRegressor(max_depth=2)
tree_reg3.fit(X, y3)

DecisionTreeRegressor(criterion='mse', max_depth=2, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')

In [None]:
# Ensemble prediction: add up the predictions of the three trees for X_new.
# NOTE(review): X_new is not defined in any cell shown in this notebook and
# this cell has no execution count — define X_new (with the same feature
# columns as X) before running it.
y_pred = sum( tree.predict(X_new) for tree in (tree_reg1, tree_reg2, tree_reg3))