## 交叉验证迭代器

In [1]:
# K折交叉验证（KFold）：将样例划分为K份，每次取其中K-1份作为训练集，剩余1份作为测试集；若K=len(样例)，即为留一交叉验证。
# 在有4个样本的数据集上进行2折交叉验证。
# 首先导入numpy模块，别名为np，导入sklearn.model_selection中的KFold类。
import numpy as np
from sklearn.model_selection import KFold

In [2]:
# Build the list X from "a", "b", "c", "d" and a 2-fold splitter kf with
# KFold(n_splits=2), then iterate over kf.split(X).
# Each fold is a pair of index arrays: the first holds the training indices,
# the second the test indices.
X = ["a", "b", "c", "d"]
kf = KFold(n_splits=2)
for train_index, test_index in kf.split(X):
    # print() joins its arguments with a single space, same as "%s %s".
    print(train_index, test_index)

[2 3] [0 1]
[0 1] [2 3]


## 重复K-折交叉验证

In [3]:
# 重复K-折交叉验证（RepeatedKFold）：重复 K-折交叉验证n次，每次重复产生不同的分裂，
# KFold 只产生一组互不重叠的划分；RepeatedKFold 在每次重复时用不同的随机化重新做一次K折划分，因此同一个样本会在不同的重复中多次进入测试集。
# 在有4个样本的数据集上进行重复2次的2折交叉验证。
# 首先，导入numpy模块，别名为np，导入sklearn.model_selection中的RepeatedKFold。
import numpy as np
from sklearn.model_selection import RepeatedKFold

In [4]:
# Sample data: X is a 4x2 array of features, y the matching 1-D label array.
X = np.asarray([[1, 2], [3, 4], [1, 2], [3, 4]])
y = np.asarray([0, 0, 1, 1])

In [5]:
# Create rkf, a 2-fold splitter repeated twice, with
# RepeatedKFold(n_splits=2, n_repeats=2, random_state=2652124), then iterate
# over rkf.split(X). Each fold is a pair of index arrays: training indices
# first, test indices second.
rkf = RepeatedKFold(n_splits=2, n_repeats=2, random_state=2652124)
for train_index, test_index in rkf.split(X):
    print(f"TRAIN: {train_index} TEST: {test_index}")
    # Index the arrays with the fold's indices to get the actual subsets.
    X_train = X[train_index]
    X_test = X[test_index]
    y_train = y[train_index]
    y_test = y[test_index]

TRAIN: [0 1] TEST: [2 3]
TRAIN: [2 3] TEST: [0 1]
TRAIN: [1 2] TEST: [0 3]
TRAIN: [0 3] TEST: [1 2]


## 留一交叉验证

In [6]:
# Leave-one-out cross-validation (LeaveOneOut, LOO): equivalent to
# KFold(n_splits=n) or LeavePOut(p=1), where n is the number of samples.
# It is the special case of KFold whose k equals the number of samples.
# Its advantage is that every training set contains all samples except one,
# which keeps the training set as large as possible.
# Example: leave-one-out on a data set with 2 samples.
import numpy as np
from sklearn.model_selection import LeaveOneOut

X = np.array([[1, 2], [3, 4]])
y = np.array([1, 2])
loo = LeaveOneOut()
# Print the number of splits explicitly: a bare expression in the middle of a
# script or cell is evaluated and then discarded, so it would show nothing.
print(loo.get_n_splits(X))

print(loo)

for train_index, test_index in loo.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    print(X_train, X_test, y_train, y_test)

2

LeaveOneOut()
TRAIN: [1] TEST: [0]
[[3 4]] [[1 2]] [2] [1]
TRAIN: [0] TEST: [1]
[[1 2]] [[3 4]] [1] [2]


## 留P交叉验证

In [7]:
# Leave-P-out cross-validation (LeavePOut): p samples are held out as the test
# set and the remaining samples form the training set; this is repeated until
# every possible train/test pair has been produced.
# Unlike LeaveOneOut and KFold, the test sets overlap when p > 1.
# Example: leave-P-out with p=1 on 4 samples (with p=1 the splits are the same
# as leave-one-out).
import numpy as np
from sklearn.model_selection import LeavePOut

X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
y = np.array([1, 2, 3, 4])
lpo = LeavePOut(p=1)
# Print the number of splits explicitly: a bare expression in the middle of a
# script or cell is evaluated and then discarded, so it would show nothing.
print(lpo.get_n_splits(X))
print(lpo)
for train_index, test_index in lpo.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

4

LeavePOut(p=1)
TRAIN: [1 2 3] TEST: [0]
TRAIN: [0 2 3] TEST: [1]
TRAIN: [0 1 3] TEST: [2]
TRAIN: [0 1 2] TEST: [3]


## 随机排列交叉抽样

In [8]:
# Random permutation cross-validation (ShuffleSplit): the samples are shuffled
# first and then divided into a training set and a test set according to the
# parameters:
#   n_splits     - number of re-shuffling and splitting iterations
#   test_size    - proportion of the data placed in the test set (0.25 here)
#   random_state - seed controlling the random sampling
# Example of random permutation cross-validation.
import numpy as np
from sklearn.model_selection import ShuffleSplit

X = np.arange(5)
ss = ShuffleSplit(n_splits=3, test_size=0.25, random_state=0)
for train_index, test_index in ss.split(X):
    # print() joins its arguments with a single space, same as "%s %s".
    print(train_index, test_index)

[1 3 4] [2 0]
[1 4 3] [0 2]
[4 0 2] [1 3]


## 分层k折

In [9]:
# Stratified k-fold (StratifiedKFold): a variation of k-fold that returns
# stratified folds, i.e. each fold keeps roughly the same proportion of every
# class as the complete data set. The class labels passed to split() drive the
# stratification.
# Example of stratified k-fold splitting.
import numpy as np
from sklearn.model_selection import StratifiedKFold

X = np.ones(10)
labels = [0, 0, 0, 0, 1, 1, 1, 1, 1, 1]
skf = StratifiedKFold(n_splits=3)
for train_index, test_index in skf.split(X, labels):
    print(f"TRAIN: {train_index} TEST: {test_index}")

TRAIN: [2 3 6 7 8 9] TEST: [0 1 4 5]
TRAIN: [0 1 3 4 5 8 9] TEST: [2 6 7]
TRAIN: [0 1 2 4 5 6 7] TEST: [3 8 9]


## 分组K折交叉验证

In [10]:
# Group k-fold (GroupKFold): the samples are grouped first and the groups are
# then divided into K parts; each round trains on K-1 parts and tests on the
# remaining one. K may equal the number of distinct groups, as in the example
# below (2 folds for 2 groups).
# Example of group k-fold splitting.
import numpy as np
from sklearn.model_selection import GroupKFold

X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
y = np.array([1, 2, 3, 4])
groups = np.array([0, 0, 2, 2])
group_kfold = GroupKFold(n_splits=2)
print(group_kfold)
for train_index, test_index in group_kfold.split(X, y, groups):
    print(f"TRAIN: {train_index} TEST: {test_index}")
    # Index the arrays with the fold's indices to get the actual subsets.
    X_train = X[train_index]
    X_test = X[test_index]
    y_train = y[train_index]
    y_test = y[test_index]
    print(X_train, X_test, y_train, y_test)

GroupKFold(n_splits=2)
TRAIN: [0 1] TEST: [2 3]
[[1 2]
 [3 4]] [[5 6]
 [7 8]] [1 2] [3 4]
TRAIN: [2 3] TEST: [0 1]
[[5 6]
 [7 8]] [[1 2]
 [3 4]] [3 4] [1 2]


## 分组留一交叉验证

In [11]:
# Leave-one-group-out cross-validation (LeaveOneGroupOut): the samples are
# grouped first, then one group is held out as the test set and the remaining
# groups form the training set.
# NOTE(review): the original note described this as a less random variant of
# GroupKFold - confirm the intended meaning.
# Example of leave-one-group-out splitting.
import numpy as np
from sklearn.model_selection import LeaveOneGroupOut

X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
y = np.array([1, 2, 1, 2])
groups = np.array([1, 1, 2, 2])
logo = LeaveOneGroupOut()
# Print the number of splits explicitly: a bare expression in the middle of a
# script or cell is evaluated and then discarded, so it would show nothing.
print(logo.get_n_splits(X, y, groups))
print(logo)
for train_index, test_index in logo.split(X, y, groups):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    print(X_train, X_test, y_train, y_test)

2

LeaveOneGroupOut()
TRAIN: [2 3] TEST: [0 1]
[[5 6]
 [7 8]] [[1 2]
 [3 4]] [1 2] [1 2]
TRAIN: [0 1] TEST: [2 3]
[[1 2]
 [3 4]] [[5 6]
 [7 8]] [1 2] [1 2]


## 分组留P交叉验证

In [12]:
# Leave-P-groups-out cross-validation (LeavePGroupsOut): the samples are
# grouped first, then P groups are held out as the test set and the remaining
# groups form the training set.
# Example of leave-P-groups-out splitting.
import numpy as np
from sklearn.model_selection import LeavePGroupsOut

X = np.array([[1, 2], [3, 4], [5, 6]])
y = np.array([1, 2, 1])
groups = np.array([1, 2, 3])
lpgo = LeavePGroupsOut(n_groups=2)
# Print the number of splits explicitly: a bare expression in the middle of a
# script or cell is evaluated and then discarded, so it would show nothing.
print(lpgo.get_n_splits(X, y, groups))

print(lpgo.get_n_splits(groups=groups))  # 'groups' is always required

print(lpgo)

for train_index, test_index in lpgo.split(X, y, groups):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    print(X_train, X_test, y_train, y_test)

3

3

LeavePGroupsOut(n_groups=2)
TRAIN: [2] TEST: [0 1]
[[5 6]] [[1 2]
 [3 4]] [1] [1 2]
TRAIN: [1] TEST: [0 2]
[[3 4]] [[1 2]
 [5 6]] [2] [1 1]
TRAIN: [0] TEST: [1 2]
[[1 2]] [[3 4]
 [5 6]] [1] [2 1]


## 分组随机排序交叉验证

In [13]:
# Group shuffle split (GroupShuffleSplit): the samples are grouped first, the
# order of the groups is then shuffled, and finally the data is divided into a
# training set and a test set according to the parameters.
# Example of group shuffle split.
import numpy as np
from sklearn.model_selection import GroupShuffleSplit

X = [0.1, 0.2, 2.2, 2.4, 2.3, 4.55, 5.8, 0.001]
y = ["a", "b", "b", "b", "c", "c", "c", "a"]
groups = [1, 1, 2, 2, 3, 3, 4, 4]
gss = GroupShuffleSplit(n_splits=4, test_size=0.5, random_state=0)
for train_index, test_index in gss.split(X, y, groups=groups):
    print(f"TRAIN: {train_index} TEST: {test_index}")

TRAIN: [0 1 2 3] TEST: [4 5 6 7]
TRAIN: [2 3 6 7] TEST: [0 1 4 5]
TRAIN: [2 3 4 5] TEST: [0 1 6 7]
TRAIN: [4 5 6 7] TEST: [0 1 2 3]


# 以上！