# Cross-validation

# method 1
![Grid search cross-validation diagram (scikit-learn)](https://scikit-learn.org/stable/_images/grid_search_cross_validation.png)

# method 2
![K-fold cross-validation diagram (Wikimedia Commons)](https://upload.wikimedia.org/wikipedia/commons/thumb/b/b5/K-fold_cross_validation_EN.svg/1200px-K-fold_cross_validation_EN.svg.png)

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn import svm

# Load the iris data as a (features, labels) pair instead of a Bunch object.
X, y = datasets.load_iris(return_X_y=True)

# Display both shapes as the cell result.
(X.shape, y.shape)

((150, 4), (150,))

In [None]:
# Hold out 40% of the samples as a test set; random_state is fixed so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=0
)

# Report the shapes of the training split first, then the test split.
for split_X, split_y in ((X_train, y_train), (X_test, y_test)):
    print(split_X.shape, split_y.shape)

(90, 4) (90,)
(60, 4) (60,)


In [None]:
# Build a linear-kernel SVM, fit it on the training split, and show its
# score on the held-out test split as the cell result.
clf = svm.SVC(kernel='linear', C=1)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.9666666666666667

## cross-validation

In [None]:
from sklearn.model_selection import cross_val_score

# Fresh, unfitted linear SVM; the full data set is handed to cross_val_score.
clf = svm.SVC(kernel='linear', C=1)

# Evaluate with cv=5 and print the per-split scores.
scores = cross_val_score(clf, X, y, cv=5)
print('score', scores)

# Summary: mean score with a two-standard-deviation spread.
print('Accuracy: %0.2f (+/- %0.2f)' % (np.mean(scores), 2 * np.std(scores)))

score [0.96666667 1.         0.96666667 0.96666667 1.        ]
Accuracy: 0.98 (+/- 0.03)


In [None]:
from sklearn.model_selection import ShuffleSplit

# Five random train/test splits, each holding out 30% of the samples;
# random_state is fixed so the splits are repeatable.
cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=0)

# Re-use the SVM `clf` defined in the previous cell with this splitter.
scores = cross_val_score(clf, X, y, cv=cv)
print('score', scores)

# Summary: mean score with a two-standard-deviation spread.
print('Accuracy: %0.2f (+/- %0.2f)' % (np.mean(scores), 2 * np.std(scores)))

score [0.97777778 0.97777778 1.         0.95555556 1.        ]
Accuracy: 0.98 (+/- 0.03)


# Example

In [None]:
from sklearn import datasets, linear_model
from sklearn.model_selection import cross_validate

# Load the diabetes data set and keep only its first 150 samples.
# NOTE: X and y are rebound here, replacing the iris arrays used earlier.
diabetes = datasets.load_diabetes()
X, y = diabetes.data[:150], diabetes.target[:150]

In [None]:
# Show the feature matrix's shape, then its (abbreviated) contents.
print(X.shape, X, sep='\n')

(150, 10)
[[ 0.03807591  0.05068012  0.06169621 ... -0.00259226  0.01990842
  -0.01764613]
 [-0.00188202 -0.04464164 -0.05147406 ... -0.03949338 -0.06832974
  -0.09220405]
 [ 0.08529891  0.05068012  0.04445121 ... -0.00259226  0.00286377
  -0.02593034]
 ...
 [-0.05637009 -0.04464164  0.09295276 ...  0.02545259  0.02605609
   0.04034337]
 [-0.06000263  0.05068012  0.01535029 ... -0.00259226 -0.03075121
  -0.0010777 ]
 [-0.04910502  0.05068012 -0.00512814 ...  0.07120998  0.06123791
  -0.03835666]]


In [None]:
# Show the target vector's shape, then its contents.
print(y.shape, y, sep='\n')

(150,)
[151.  75. 141. 206. 135.  97. 138.  63. 110. 310. 101.  69. 179. 185.
 118. 171. 166. 144.  97. 168.  68.  49.  68. 245. 184. 202. 137.  85.
 131. 283. 129.  59. 341.  87.  65. 102. 265. 276. 252.  90. 100.  55.
  61.  92. 259.  53. 190. 142.  75. 142. 155. 225.  59. 104. 182. 128.
  52.  37. 170. 170.  61. 144.  52. 128.  71. 163. 150.  97. 160. 178.
  48. 270. 202. 111.  85.  42. 170. 200. 252. 113. 143.  51.  52. 210.
  65. 141.  55. 134.  42. 111.  98. 164.  48.  96.  90. 162. 150. 279.
  92.  83. 128. 102. 302. 198.  95.  53. 134. 144. 232.  81. 104.  59.
 246. 297. 258. 229. 275. 281. 179. 200. 200. 173. 180.  84. 121. 161.
  99. 109. 115. 268. 274. 158. 107.  83. 103. 272.  85. 280. 336. 281.
 118. 317. 235.  60. 174. 259. 178. 128.  96. 126.]


In [None]:
from sklearn import linear_model

# Lasso regression with default hyper-parameters.
lasso = linear_model.Lasso()

# Single-metric evaluation using cross_validate with cv=3. No `scoring` is
# passed, so the estimator's own score is used; for a regressor such as Lasso
# that is R^2 (coefficient of determination), not classification accuracy.
cv_results = cross_validate(lasso, X, y, cv=3)
test_scores = cv_results['test_score']
print(test_scores)

# Mean score with a two-standard-deviation spread. The label says R^2 because
# calling a regression score "Accuracy" is misleading.
print('R^2: %0.2f (+/- %0.2f)' % (test_scores.mean(), test_scores.std() * 2))

[0.33150734 0.08022311 0.03531764]
Accuracy: 0.15 (+/- 0.26)


In [None]:
from sklearn.model_selection import ShuffleSplit
# cross_val_score is the function this cell actually calls; import it here so
# the cell does not rely on an import made in an earlier cell.
from sklearn.model_selection import cross_val_score, cross_validate

# Five random train/test splits, each holding out 30% of the samples;
# random_state is fixed so the splits are repeatable.
cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=0)

# Score the Lasso model (`lasso`, defined in the previous cell) on each split.
# No `scoring` is passed, so the regressor's default score (R^2) is reported.
scores = cross_val_score(lasso, X, y, cv=cv)
print('score', scores)

# Mean score with a two-standard-deviation spread. The label says R^2 because
# calling a regression score "Accuracy" is misleading.
print('R^2: %0.2f (+/- %0.2f)' % (scores.mean(), scores.std() * 2))

score [0.30553672 0.22064577 0.17962466 0.25542505 0.33841705]
Accuracy: 0.26 (+/- 0.11)
