<a href="https://colab.research.google.com/github/mango766/ml_learning/blob/main/%E5%86%B3%E7%AD%96%E6%A0%91%E4%BD%9C%E4%B8%9A1_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 使用卫星训练集网格搜索训练随机树、使用随机森林进行预测

In [17]:
import numpy as np

In [2]:
from sklearn.datasets import make_moons
make_moons

<function sklearn.datasets._samples_generator.make_moons(n_samples=100, *, shuffle=True, noise=None, random_state=None)>

In [3]:
X, y = make_moons(n_samples=10000, noise=0.4, random_state=42)

In [4]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
from sklearn.tree import DecisionTreeClassifier

In [7]:
from sklearn.model_selection import GridSearchCV

params = {'max_leaf_nodes': list(range(2, 100)), 'min_samples_split': [2, 3, 4]}
grid_search_cv = GridSearchCV(DecisionTreeClassifier(random_state=42), params, verbose=1, cv=3)

grid_search_cv.fit(X_train, y_train)

Fitting 3 folds for each of 294 candidates, totalling 882 fits


GridSearchCV(cv=3, estimator=DecisionTreeClassifier(random_state=42),
             param_grid={'max_leaf_nodes': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                            13, 14, 15, 16, 17, 18, 19, 20, 21,
                                            22, 23, 24, 25, 26, 27, 28, 29, 30,
                                            31, ...],
                         'min_samples_split': [2, 3, 4]},
             verbose=1)

In [8]:
grid_search_cv.best_estimator_

DecisionTreeClassifier(max_leaf_nodes=17, random_state=42)

In [9]:
y_pred = grid_search_cv.predict(X_test)

使用交叉验证，发现在训练集上拟合情况良好

In [12]:
from sklearn.model_selection import cross_val_score
cross_val_score(grid_search_cv, X_train, y_train, cv=3, scoring='accuracy')

Fitting 3 folds for each of 294 candidates, totalling 882 fits
Fitting 3 folds for each of 294 candidates, totalling 882 fits
Fitting 3 folds for each of 294 candidates, totalling 882 fits


array([0.85189351, 0.85751781, 0.85708927])

In [13]:
from sklearn.metrics import accuracy_score

y_pred = grid_search_cv.predict(X_test)
accuracy_score(y_test, y_pred)

0.8695

准确率为87%左右

在1000个测试集上使用之前得出的最佳参数进行评估。

In [14]:
from sklearn.model_selection import ShuffleSplit

n_trees = 1000
n_instances = 100#实例数
mini_sets = []

rs = ShuffleSplit(n_splits=n_trees, test_size=len(X_train) - n_instances, random_state=42)

In [15]:
for mini_train_index, mini_test_index in rs.split(X_train):
    X_mini_train = X_train[mini_train_index]
    y_mini_train = y_train[mini_train_index]
    mini_sets.append((X_mini_train, y_mini_train))

In [18]:
from sklearn.base import clone

forest = [clone(grid_search_cv.best_estimator_) for _ in range(n_trees)]#使用之前最优参数，克隆到新模型中

accuracy_scores = []

for tree, (X_mini_train, y_mini_train) in zip(forest, mini_sets):
    tree.fit(X_mini_train, y_mini_train)
    
    y_pred = tree.predict(X_test)
    accuracy_scores.append(accuracy_score(y_test, y_pred))

print(accuracy_scores)
print(np.mean(accuracy_scores))

[0.7985, 0.836, 0.8065, 0.842, 0.7935, 0.8315, 0.7825, 0.7935, 0.8085, 0.829, 0.794, 0.78, 0.7995, 0.8175, 0.8035, 0.846, 0.8235, 0.811, 0.825, 0.834, 0.8345, 0.82, 0.792, 0.8395, 0.8155, 0.7735, 0.833, 0.8055, 0.823, 0.8165, 0.777, 0.831, 0.815, 0.769, 0.793, 0.805, 0.7905, 0.772, 0.816, 0.8375, 0.772, 0.7455, 0.7985, 0.7915, 0.807, 0.803, 0.82, 0.785, 0.79, 0.7975, 0.8185, 0.809, 0.7875, 0.809, 0.781, 0.8145, 0.8255, 0.8215, 0.7685, 0.7555, 0.774, 0.8035, 0.767, 0.8085, 0.8205, 0.786, 0.824, 0.7815, 0.838, 0.7895, 0.8215, 0.803, 0.8305, 0.833, 0.8245, 0.769, 0.831, 0.801, 0.831, 0.789, 0.8085, 0.788, 0.848, 0.824, 0.7385, 0.743, 0.808, 0.8055, 0.837, 0.745, 0.7995, 0.8105, 0.7955, 0.793, 0.8075, 0.792, 0.816, 0.809, 0.821, 0.79, 0.803, 0.8, 0.8115, 0.816, 0.838, 0.801, 0.8, 0.803, 0.806, 0.8235, 0.8095, 0.854, 0.807, 0.84, 0.809, 0.796, 0.81, 0.8015, 0.7955, 0.7915, 0.803, 0.7495, 0.8015, 0.814, 0.819, 0.82, 0.844, 0.804, 0.758, 0.8005, 0.779, 0.788, 0.7925, 0.78, 0.794, 0.797, 0.784

使用随机森林，1000个决策树

In [20]:
Y_pred = np.empty([n_trees, len(X_test)], dtype=np.uint8)

for tree_index, tree in enumerate(forest):
    Y_pred[tree_index] = tree.predict(X_test)

保留次数最频繁的预测

In [21]:
from scipy.stats import mode

y_pred_majority_votes, n_votes = mode(Y_pred, axis=0)

In [22]:
accuracy_score(y_test, y_pred_majority_votes.reshape([-1]))

0.872