In [41]:
# 交叉验证常见的方法汇总

import numpy as np
from sklearn.model_selection import KFold,RepeatedKFold,LeaveOneOut,LeavePOut,ShuffleSplit
from sklearn.model_selection import cross_val_score,cross_validate
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

In [26]:
# Toy dataset: four samples with two features each, and one target value per sample.
X = np.arange(1, 9).reshape(4, 2)
y = np.arange(1, 5)

In [32]:
# 1. Hold-out method: a single train/test partition of the data, which is exactly what train_test_split produces

In [23]:
# 2: a single pass of K-fold cross-validation
kfold = KFold(n_splits=2)

# split() yields one (train indices, test indices) pair per fold.
for train_index, test_index in kfold.split(X):
    # Training rows are selected automatically; the held-out test rows follow on the next line.
    print(X[train_index], X[test_index], sep="\n")

[[5 6]
 [7 8]]
[[1 2]
 [3 4]]
[[1 2]
 [3 4]]
[[5 6]
 [7 8]]


In [27]:
# 3: K-fold cross-validation repeated N times
repeatKfold = RepeatedKFold(n_repeats=2, n_splits=2, random_state=0)

for train_index, test_index in repeatKfold.split(X):
    # Show the training rows first, then the test rows of the same split.
    for subset in (X[train_index], X[test_index]):
        print(subset)

[[1 2]
 [3 4]]
[[5 6]
 [7 8]]
[[5 6]
 [7 8]]
[[1 2]
 [3 4]]
[[3 4]
 [7 8]]
[[1 2]
 [5 6]]
[[1 2]
 [5 6]]
[[3 4]
 [7 8]]


In [29]:
# 4: leave-one-out -- every split keeps exactly one sample as the test set
lo = LeaveOneOut()

for train_index, test_index in lo.split(X):
    # Only the held-out sample of each split is displayed.
    held_out = X[test_index]
    print(held_out)

[[1 2]]
[[3 4]]
[[5 6]]
[[7 8]]


In [30]:
# 5: leave-p-out -- every split keeps exactly p samples as the test set
lp = LeavePOut(p=3)

for train_index, test_index in lp.split(X):
    # Only the p held-out samples of each split are displayed.
    test_rows = X[test_index]
    print(test_rows)

[[1 2]
 [3 4]
 [5 6]]
[[1 2]
 [3 4]
 [7 8]]
[[1 2]
 [5 6]
 [7 8]]
[[3 4]
 [5 6]
 [7 8]]


In [35]:
# 6: random permutation splits -- the samples are shuffled before being split;
# when random_state is not given, every run produces different splits
sf = ShuffleSplit(n_splits=2, test_size=0.25, random_state=0)

for train_index, test_index in sf.split(X):
    # Only the training rows of each split are displayed.
    train_rows = X[train_index]
    print(train_rows)

[[7 8]
 [3 4]
 [1 2]]
[[5 6]
 [3 4]
 [7 8]]


In [51]:
# Use cross_val_score to check a model's cross-validated prediction accuracy on a dataset

iris = load_iris()

# Hold-out split so that train_x/train_y (and the test_x/test_y used by the
# optional example below) are defined when the notebook runs on a fresh kernel.
train_x, test_x, train_y, test_y = train_test_split(
    iris.data, iris.target, test_size=0.25, random_state=0
)

lr = LogisticRegression()
lr_model = lr.fit(train_x, train_y)  # fit() returns the estimator itself

# When cv is an integer k, (Stratified)KFold with k folds is used and the
# test-fold score of each of the k rounds is returned.
# cross_val_score fits a fresh clone of the estimator on every fold, so the
# fit above does not influence these scores.
#
# cv can also be a custom cross-validation generator or iterator, e.g.:
# my_cv = ShuffleSplit(n_splits=2, random_state=0, test_size=0.25)
# cross_val_score(lr_model, test_x, test_y, cv = my_cv)

cross_val_score(lr_model, iris.data, iris.target, cv=5)

array([ 1.        ,  0.96666667,  0.93333333,  0.9       ,  1.        ])

In [58]:
# Use cross_validate to measure model performance.
# It returns a dict; with return_train_score=True its keys are
# dict_keys(['fit_time', 'score_time', 'test_score', 'train_score']):
# the fit time, scoring time, test score and training score of each fold.

scoring = ['precision_macro', 'recall_macro']

# scoring can be passed to choose which metrics are reported:
# scores = cross_validate(lr_model, iris.data, iris.target, scoring=scoring,cv=5, return_train_score=False)

# return_train_score defaults to False in current scikit-learn releases, so it
# is requested explicitly to obtain the 'train_score' entry described above.
scores = cross_validate(lr_model, iris.data, iris.target, cv=5, return_train_score=True)
scores

{'fit_time': array([ 0.00100017,  0.00100017,  0.00099993,  0.00099993,  0.00099993]),
 'score_time': array([ 0.00099993,  0.00099993,  0.00100017,  0.        ,  0.00099993]),
 'test_score': array([ 1.        ,  0.96666667,  0.93333333,  0.9       ,  1.        ]),
 'train_score': array([ 0.95      ,  0.96666667,  0.96666667,  0.975     ,  0.95833333])}

In [54]:
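# cross_val_predict is called the same way as cross_val_score, but instead of scores it returns the prediction made for each sample while that sample was in a test fold (class labels such as 0/1/2 for iris, not only 0 or 1)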