In [1]:
from sklearn import datasets, linear_model, cross_validation, grid_search
import numpy as np
# Load scikit-learn's digits dataset and keep only the first N_SAMPLES rows.
# The sample count lives in one named constant so it can be tuned in one place.
N_SAMPLES = 1000
digits = datasets.load_digits()
x = digits.data[:N_SAMPLES]    # feature matrix, one row per sample
y = digits.target[:N_SAMPLES]  # label for each row of x

In [4]:
# Sanity-check the shapes of the feature matrix and the label vector.
# A single pre-formatted string prints the same text on Python 2 and Python 3,
# whereas the bare `print a, b` statement is a SyntaxError on Python 3.
print("%s %s" % (x.shape, y.shape))

(1000, 64) (1000,)


K-fold cross-validation validates your model by generating different train/test combinations from the data you already have.

For example, if you have 100 samples, you can train your model on the first 90 and test on the last 10. Then you could train on samples 1-80 and 91-100, and test on samples 81-90. Repeat until every block of 10 samples has been used as the test set once. This way, you get different combinations of train/test data, essentially giving you ‘more’ data for validation from your original data.

The number of times you ‘switch around’ the train/test data is the number of folds. Therefore, 3-Fold Cross Validation will yield 3 sets of train/test data, 5-Fold Cross Validation will yield 5 sets, and so forth.

In [14]:
# Build a 10-fold splitter over all samples. Shuffling with a fixed
# random_state keeps the fold assignment the same from run to run.
kf_total = cross_validation.KFold(len(x), n_folds=10, shuffle=True, random_state=4)

# Each iteration yields one (train, test) pair of index arrays -- one pair per fold.
print("The generated pairs of index sets used for training and testing:\n")
for train, test in kf_total:
    # Show only the first 10 indices of each set, plus the full set sizes.
    # A single pre-formatted string prints the same text on Python 2 and 3.
    print("%s ... len= %d \n%s ... len= %d \n\n"
          % (train[0:10], len(train), test[0:10], len(test)))

The two generated sets of indices used for testing and validation:

[0 1 2 3 4 5 6 7 8 9] ... len= 900 
[17 40 47 63 66 72 76 88 89 92] ... len= 100 


[0 1 2 3 4 5 6 7 8 9] ... len= 900 
[ 11  35  55  65  67  70  74  77  98 102] ... len= 100 


[0 1 2 3 4 5 6 7 8 9] ... len= 900 
[ 13  33  34  36  42  50  53  62  90 106] ... len= 100 


[ 0  1  2  3  4  5  6  7  9 10] ... len= 900 
[  8  31  45  80  83  97 101 108 117 135] ... len= 100 


[ 0  1  2  4  5  6  7  8  9 10] ... len= 900 
[ 3 14 15 18 24 25 61 68 71 75] ... len= 100 


[ 0  2  3  5  7  8  9 10 11 12] ... len= 900 
[ 1  4  6 16 20 43 46 57 64 79] ... len= 100 


[0 1 2 3 4 5 6 7 8 9] ... len= 900 
[12 21 28 38 39 48 60 69 82 84] ... len= 100 


[0 1 2 3 4 5 6 7 8 9] ... len= 900 
[ 19  23  26  37  41  81 111 116 118 121] ... len= 100 


[ 0  1  3  4  6  8 11 12 13 14] ... len= 900 
[ 2  5  7  9 10 22 27 29 32 49] ... len= 100 


[ 1  2  3  4  5  6  7  8  9 10] ... len= 900 
[  0  30  44  51  56  58  59  73  95 107] ... len=

In [None]:
# Create a model: a logistic-regression estimator built with the library's
# default settings (no arguments are passed). The same `lr` object is fitted
# and scored by the cross-validation cells that follow.
lr = linear_model.LogisticRegression()

In [15]:
# Way 1: walk the ten folds by hand -- fit on each fold's training rows,
# score on its held-out rows, and collect the per-fold scores in a list.
fold_scores = []
for train_indices, test_indices in kf_total:
    fitted = lr.fit(x[train_indices], y[train_indices])
    fold_scores.append(fitted.score(x[test_indices], y[test_indices]))
fold_scores

[0.95999999999999996,
 0.95999999999999996,
 0.98999999999999999,
 0.96999999999999997,
 0.97999999999999998,
 0.96999999999999997,
 0.93999999999999995,
 0.94999999999999996,
 0.94999999999999996,
 0.96999999999999997]

In [16]:
# Way 2: let the scikit-learn helper fit and score the model on each fold
cross_validation.cross_val_score(
    lr,
    x,
    y,
    cv=kf_total,
    n_jobs=1
)


array([ 0.96,  0.96,  0.99,  0.97,  0.98,  0.97,  0.94,  0.95,  0.95,  0.97])

In [18]:
# Other metrics can be selected through the `scoring` argument.
# Note: scikit-learn flips the sign and returns negative values for MSE,
# so that a bigger score is always better.
# See https://github.com/scikit-learn/scikit-learn/issues/2439
cross_validation.cross_val_score(
    lr, x, y,
    cv=kf_total,
    n_jobs=1,
    scoring='mean_squared_error'
)

array([-1.19, -1.03, -0.36, -0.53, -0.53, -0.57, -1.54, -0.71, -1.26, -1.14])