# owenliang/introduction-to-machine-learning-with-python

Fetching contributors…
Cannot retrieve contributors at this time
524 lines (377 sloc) 14.7 KB

# 交叉验证

``````from sklearn.datasets import make_blobs
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# 创建一个模拟数据集
X, y = make_blobs(random_state=0)

# 将数据和标签划分为训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) # 将模型实例化，并用它来拟合训练集
logreg = LogisticRegression().fit(X_train, y_train)

# 在测试集上评估该模型
print("Test set score: {:.2f}".format(logreg.score(X_test, y_test)))
``````
``````Test set score: 0.88
``````

## K折交叉

``````from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# 数据集

# 模型
logreg = LogisticRegression()

# K折交叉验证
scores = cross_val_score(logreg, iris.data, iris.target, cv=3)
print("Cross-validation scores: {}".format(scores))
print("Average cross-validation score: {:.2f}".format(scores.mean()))
``````
``````Cross-validation scores: [0.96078431 0.92156863 0.95833333]
Average cross-validation score: 0.95
``````

## 分层K折交叉

K折交叉划分数据的方式是从头开始均分成K份，如果样本数据的分类分布不均匀，那么就会导致K折交叉策略失效，比如：

``````from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# 数据集
print("Iris labels:\n{}".format(iris.target))
``````
``````Iris labels:
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]
``````

sklearn会根据模型是回归还是分类决定使用标准K折还是分层K折，不需我们关心，只需要了解。

## 其他策略

K折还有其他策略，通过cv参数控制即可，比如：打乱划分（ShuffleSplit）、留一法（LeaveOneOut）等。下面演示打乱划分。

``````from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import ShuffleSplit,cross_val_score

# 数据集

# 模型
logreg = LogisticRegression()

# 打乱划分交叉
shuffle_split = ShuffleSplit(test_size=.5, train_size=.5, n_splits=10)
scores = cross_val_score(logreg, iris.data, iris.target, cv=shuffle_split)
print("Cross-validation scores: {}".format(scores))
print("Average cross-validation score: {:.2f}".format(scores.mean()))
``````
``````Cross-validation scores: [0.90666667 0.92       0.96       0.98666667 0.98666667 0.96
0.92       0.89333333 0.81333333 0.89333333]
Average cross-validation score: 0.92
``````

# 网格搜索

## 带交叉验证的网格搜索

``````from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.svm import SVC

# 数据集

# 切分数据
X_train, X_test, y_train, y_test = train_test_split(
iris.data, iris.target, random_state=0)

# 2种网格
param_grid = [
# 第1个网格
{'kernel': ['rbf'], 'C': [0.001, 0.01, 0.1, 1, 10, 100], 'gamma': [0.001, 0.01, 0.1, 1, 10, 100]},
# 第2个网格
{'kernel': ['linear'],'C': [0.001, 0.01, 0.1, 1, 10, 100]}
]
# 在2个网格中, 找到SVC模型的最佳参数, 这里cv=5表示每一种参数组合进行5折交叉验证计算得分
grid_search = GridSearchCV(SVC(), param_grid, cv=5)

# fit找到最佳泛化的参数
grid_search.fit(X_train, y_train)

# 查看精度
print("泛化精度:", grid_search.score(X_test, y_test))

# 打印最佳参数
print("Best parameters: {}".format(grid_search.best_params_))
``````
``````泛化精度: 0.9736842105263158
Best parameters: {'C': 100, 'gamma': 0.01, 'kernel': 'rbf'}
``````

## 交叉验证+网格搜索的嵌套

``````from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.svm import SVC

# 数据集

# 2种网格
param_grid = [
# 第1个网格
{'kernel': ['rbf'], 'C': [0.001, 0.01, 0.1, 1, 10, 100], 'gamma': [0.001, 0.01, 0.1, 1, 10, 100]},
# 第2个网格
{'kernel': ['linear'],'C': [0.001, 0.01, 0.1, 1, 10, 100]}
]
# 在2个网格中, 找到SVC模型的最佳参数, 每一组参数进行5折评估
grid_search = GridSearchCV(SVC(), param_grid, cv=5)

# 外层K折
scores = cross_val_score(grid_search, iris.data, iris.target, cv=5)

# 打印精度
print(scores)
``````

``````from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.svm import SVC

# 数据集

# 2种网格
param_grid = [
# 第1个网格
{'kernel': ['rbf'], 'C': [0.001, 0.01, 0.1, 1, 10, 100], 'gamma': [0.001, 0.01, 0.1, 1, 10, 100]},
# 第2个网格
{'kernel': ['linear'],'C': [0.001, 0.01, 0.1, 1, 10, 100]}
]
# 在2个网格中, 找到SVC模型的最佳参数, 每一组参数进行5折评估
grid_search = GridSearchCV(SVC(), param_grid, cv=5)

# 外层K折
scores = cross_val_score(grid_search, iris.data, iris.target, cv=5)

# 打印精度
print(scores)
``````
``````[0.96666667 1.         0.9        0.96666667 1.        ]
``````

# 评估指标与评分

## 二分类指标

2分类一共有2种类型，一种称为正类(positive)，一种称为反类(negative)。

• TP：预测是正类，样本是正类
• FP：预测是正类，样本是反类
• TN：预测是反类，样本是反类
• FN：预测是反类，样本是正类
``````from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

# 数据集
# 转换成2分类, 即目标数字是否等于9
y = digits.target == 9

# 切分数据集
X_train, X_test, y_train, y_test = train_test_split(
digits.data, y, random_state=0)

# 模型
lr = LogisticRegression()
# 训练
lr.fit(X_train, y_train)
# 预测
y_pred = lr.predict(X_test)

# 混淆矩阵
print(confusion_matrix(y_test, y_pred))
``````
``````[[399   4]
[  7  40]]
``````

``````              预测为反类    预测为正类
实际为反类        TN            FP
实际为正类        FN            TP
``````

score()精度就是预测的正确率：

``````Accuracy = (TN+TP) / (TN+TP+FN+FP)
``````

``````Precision = TP / (TP + FP)
``````

``````Recall = TP / (TP + FN)
``````

``````F = 2 * precision * recall / (precision + recall)
``````
``````from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,f1_score

# 数据集
# 转换成2分类, 即目标数字是否等于9
y = digits.target == 9

# 切分数据集
X_train, X_test, y_train, y_test = train_test_split(
digits.data, y, random_state=0)

# 模型
lr = LogisticRegression()
# 训练
lr.fit(X_train, y_train)
# 预测
y_pred = lr.predict(X_test)

# 混淆矩阵
print(confusion_matrix(y_test, y_pred))

# 打印f1-score
print(f1_score(y_test, y_pred))
``````
``````[[399   4]
[  7  40]]
0.8791208791208791
``````

``````from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,f1_score
from sklearn.metrics import classification_report

# 数据集
# 转换成2分类, 即目标数字是否等于9
y = digits.target == 9

# 切分数据集
X_train, X_test, y_train, y_test = train_test_split(
digits.data, y, random_state=0)

# 模型
lr = LogisticRegression()
# 训练
lr.fit(X_train, y_train)
# 预测
y_pred = lr.predict(X_test)

# 混淆矩阵
print(confusion_matrix(y_test, y_pred))

# 打印f1-score
print(classification_report(y_test, y_pred))
``````
``````[[399   4]
[  7  40]]
precision    recall  f1-score   support

False       0.98      0.99      0.99       403
True       0.91      0.85      0.88        47

micro avg       0.98      0.98      0.98       450
macro avg       0.95      0.92      0.93       450
weighted avg       0.98      0.98      0.98       450

``````

## 多分类指标

``````from sklearn.metrics import accuracy_score
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# 数据集

# 切分
X_train, X_test, y_train, y_test = train_test_split(
digits.data, digits.target, random_state=0)

# 训练
lr = LogisticRegression().fit(X_train, y_train)
# 预测
pred = lr.predict(X_test)

# 精度
print("Accuracy: {:.3f}".format(lr.score(X_test, y_test)))
# 精确度、召回率、f1 指标
print(classification_report(pred, y_test))

``````
``````Accuracy: 0.953
precision    recall  f1-score   support

0       1.00      1.00      1.00        37
1       0.91      0.89      0.90        44
2       0.93      0.95      0.94        43
3       0.96      0.90      0.92        48
4       1.00      0.97      0.99        39
5       0.98      0.98      0.98        48
6       1.00      0.96      0.98        54
7       0.94      1.00      0.97        45
8       0.90      0.93      0.91        46
9       0.94      0.96      0.95        46

micro avg       0.95      0.95      0.95       450
macro avg       0.95      0.95      0.95       450
weighted avg       0.95      0.95      0.95       450
``````

## 回归指标

score底层使用的是R^2，它是评估回归模型的很好的指标。

## 模型选择中使用其他评估指标

``````from sklearn.datasets import load_digits
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# 数据集

# 切分成2分类问题, 数字是否等于9
X_train, X_test, y_train, y_test = train_test_split(
digits.data, digits.target == 9, random_state=0)

# 2种网格
param_grid = {'gamma': [0.001, 0.01, 0.1, 1, 10, 100]}
# 在2个网格中, 找到SVC模型的最佳参数, 每一组参数进行3折评估, 使用f1-score作为评估依据
grid_search = GridSearchCV(SVC(), param_grid, cv=3, scoring='f1')

# 搜索最佳参数
grid_search.fit(X_train, y_train)

# 打印最佳参数
print(grid_search.best_params_)
# 打印最佳参数的f1-score
print(grid_search.best_score_)
# 打印在测试集上的各种指标
print(classification_report(grid_search.predict(X_test), y_test))

``````
``````{'gamma': 0.001}
0.9771729298313027
precision    recall  f1-score   support

False       1.00      0.99      1.00       405
True       0.94      0.98      0.96        45

micro avg       0.99      0.99      0.99       450
macro avg       0.97      0.99      0.98       450
weighted avg       0.99      0.99      0.99       450
``````

K折交叉验证也是一样的，可以指定K折输出的评估指标，默认是精度。

# 总结

• 交叉验证
• 网格搜索
• 评估指标

• 训练集：生成模型
• 验证集：搜索参数
• 测试集：模型评估

You can’t perform that action at this time.