<!--BOOK_INFORMATION-->
<a href="https://www.packtpub.com/big-data-and-business-intelligence/machine-learning-opencv" target="_blank"><img align="left" src="data/cover.jpg" style="width: 76px; height: 100px; background: white; padding: 1px; border: 1px solid black; margin-right:10px;"></a>
*This notebook contains an excerpt from the upcoming book [Machine Learning for OpenCV](https://www.packtpub.com/big-data-and-business-intelligence/machine-learning-opencv) by Michael Beyeler (expected Aug 2017).
The code is released under the [MIT license](https://opensource.org/licenses/MIT),
and is available on [GitHub](https://github.com/mbeyeler/opencv-machine-learning).*

*Note that this excerpt contains only the raw code - the book is rich with additional explanations and illustrations.
If you find this content useful, please consider supporting the work by
[buying the book](https://www.packtpub.com/big-data-and-business-intelligence/machine-learning-opencv)!*

<!--NAVIGATION-->
< [Combining Different Algorithms Into an Ensemble](10.00-Combining-Different-Algorithms-Into-an-Ensemble.ipynb) | [Contents](../README.md) | [Conclusion](12.00-Conclusion.ipynb) >

# Understanding Cross-Validation

## Manually implementing cross-validation in OpenCV

In [1]:
from sklearn.datasets import load_iris
import numpy as np
# Load the Iris dataset that ships with scikit-learn.
iris = load_iris()
# Features are cast to float32 before being passed to the OpenCV
# (cv2.ml) models in the cells below; labels are used as loaded.
X = iris.data.astype(np.float32)
y = iris.target

In [2]:
from sklearn.model_selection import train_test_split
# Split the data into two folds, half of the samples in each
# (train_size=0.5); the fixed random_state makes the split repeatable.
X_fold1, X_fold2, y_fold1, y_fold2 = train_test_split(
    X, y, random_state=37, train_size=0.5
)

In [3]:
import cv2
# Create OpenCV's k-nearest-neighbour model and set its default k to 1.
knn = cv2.ml.KNearest_create()
knn.setDefaultK(1)

In [4]:
# Round 1: train on fold 1, then predict the labels of fold 2.
# predict() returns a pair; only its second item (the predictions) is kept.
knn.train(X_fold1, cv2.ml.ROW_SAMPLE, y_fold1)
_, y_hat_fold2 = knn.predict(X_fold2)

In [5]:
# Round 2: swap the roles - train on fold 2, then predict fold 1.
knn.train(X_fold2, cv2.ml.ROW_SAMPLE, y_fold2)
_, y_hat_fold1 = knn.predict(X_fold1)

In [6]:
from sklearn.metrics import accuracy_score
# Accuracy on fold 1; its predictions were made right after training on fold 2.
accuracy_score(y_fold1, y_hat_fold1)

0.92000000000000004

In [7]:
# Accuracy on fold 2; its predictions were made right after training on fold 1.
accuracy_score(y_fold2, y_hat_fold2)

0.88

## Automating cross-validation using scikit-learn

In [8]:
from sklearn.neighbors import KNeighborsClassifier
# scikit-learn's k-NN classifier with a single neighbour (k=1).
model = KNeighborsClassifier(n_neighbors=1)

In [9]:
from sklearn.model_selection import cross_val_score
# Evaluate the model with 5-fold cross-validation (cv=5); the result
# holds one score per fold.
scores = cross_val_score(model, X, y, cv=5)
scores

array([ 0.96666667,  0.96666667,  0.93333333,  0.93333333,  1.        ])

In [10]:
# Mean and standard deviation of the fold scores computed above.
scores.mean(), scores.std()

(0.95999999999999996, 0.024944382578492935)

## Implementing leave-one-out cross-validation

In [11]:
from sklearn.model_selection import LeaveOneOut

In [12]:
# Same evaluation, but with a LeaveOneOut splitter passed as `cv`:
# every split holds out a single sample as the test set.
scores = cross_val_score(model, X, y, cv=LeaveOneOut())

In [13]:
# One entry per split (see the recorded output: each is either 1. or 0.).
scores

array([ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  0.,  1.,  0.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  0.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  0.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  0.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  0.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.])

In [14]:
# Mean and standard deviation of the leave-one-out scores.
scores.mean(), scores.std()

(0.95999999999999996, 0.19595917942265423)

## Bootstrapping

In [15]:
# Start the bootstrapping section with a fresh OpenCV k-NN model (k=1);
# this rebinds the notebook-global name `knn`.
knn = cv2.ml.KNearest_create()
knn.setDefaultK(1)

In [16]:
# Bootstrap sample: draw len(X) row indices with replacement, then pick
# the matching rows of X and entries of y.
# NOTE(review): this draw happens before np.random.seed(42) is called in a
# later cell, so it is not repeatable across kernel restarts.
idx_boot = np.random.choice(len(X), size=len(X), replace=True)
X_boot = X[idx_boot, :]
y_boot = y[idx_boot]

In [17]:
# Out-of-bag (OOB) samples are the rows that were never drawn into the
# bootstrap sample. Start from an all-True mask and clear every index that
# occurs in idx_boot. The builtin `bool` is used as dtype because the
# `np.bool` alias was deprecated in NumPy 1.20 and removed in NumPy 1.24.
idx_oob = np.ones(len(X), dtype=bool)
idx_oob[idx_boot] = False
X_oob = X[idx_oob, :]
y_oob = y[idx_oob]

In [18]:
# Train the k-NN model on the bootstrap sample (one sample per row).
knn.train(X_boot, cv2.ml.ROW_SAMPLE, y_boot)

True

In [19]:
# Score the bootstrap-trained model on the out-of-bag rows, i.e. the rows
# whose indices do not occur in idx_boot.
_, y_hat = knn.predict(X_oob)
accuracy_score(y_oob, y_hat)

0.94736842105263153

In [20]:
def yield_bootstrap(model, X, y, n_iter=10000):
    """Yield one out-of-bag accuracy per bootstrap round.

    Each round draws ``len(X)`` row indices with replacement, trains
    ``model`` on the drawn rows and scores it on the rows that were never
    drawn (the out-of-bag rows).

    Parameters
    ----------
    model : object
        Classifier exposing the OpenCV ``cv2.ml`` calls used here:
        ``train(samples, cv2.ml.ROW_SAMPLE, responses)`` and
        ``predict(samples)`` returning a pair whose second item holds the
        predictions.
    X : ndarray
        Samples, one per row.
    y : ndarray
        Labels, indexed like the rows of ``X``.
    n_iter : int, optional
        Number of bootstrap rounds (default 10000).

    Yields
    ------
    The value of ``accuracy_score`` on the out-of-bag rows of each round.
    """
    n_samples = len(X)
    for _ in range(n_iter):
        # train the classifier on bootstrap
        idx_boot = np.random.choice(n_samples, size=n_samples,
                                    replace=True)
        X_boot = X[idx_boot, :]
        y_boot = y[idx_boot]
        # Use the model that was passed in rather than a notebook global.
        model.train(X_boot, cv2.ml.ROW_SAMPLE, y_boot)

        # test classifier on out-of-bag examples: every index that was
        # never drawn keeps its True entry in the mask. Builtin `bool` is
        # used because the `np.bool` alias no longer exists in NumPy >= 1.24.
        idx_oob = np.ones(n_samples, dtype=bool)
        idx_oob[idx_boot] = False
        X_oob = X[idx_oob, :]
        y_oob = y[idx_oob]
        _, y_hat = model.predict(X_oob)

        # return accuracy
        yield accuracy_score(y_oob, y_hat)

In [21]:
# Seed NumPy's global random generator so the np.random.choice draws made
# by the bootstrap runs below are repeatable when the cells run in order.
np.random.seed(42)

In [22]:
# Ten bootstrap rounds; the list holds one out-of-bag accuracy per round.
list(yield_bootstrap(knn, X, y, n_iter=10))

[0.98333333333333328,
 0.93650793650793651,
 0.92452830188679247,
 0.92307692307692313,
 0.94545454545454544,
 0.94736842105263153,
 0.98148148148148151,
 0.96078431372549022,
 0.93220338983050843,
 0.96610169491525422]

In [23]:
# 1,000 bootstrap rounds, summarised by mean and standard deviation.
acc = list(yield_bootstrap(knn, X, y, n_iter=1000))
np.mean(acc), np.std(acc)

(0.95524155136419198, 0.022040380995646654)

In [24]:
# 10,000 bootstrap rounds, summarised the same way as the 1,000-round run.
acc = list(yield_bootstrap(knn, X, y, n_iter=10000))
np.mean(acc), np.std(acc)

(0.95501528733009422, 0.021778543317079499)

In [25]:
# NOTE(review): `meow` is not defined in any visible cell, so this line
# raises NameError (see the recorded output below). Presumably a deliberate
# tripwire that stops "Run All" before the unfinished cells that follow -
# confirm, and consider removing those cells instead.
meow + 2

NameError: name 'meow' is not defined

In [None]:
from sklearn.utils import resample
# Draw a bootstrap sample of (X, y) with replacement using scikit-learn's
# helper; random_state fixes the draw.
X_boot, y_boot = resample(X, y, random_state=42, replace=True)

In [None]:
# NOTE(review): same call, with the same random_state, as the resample in
# the previous cell - looks like a leftover duplicate; confirm before removing.
X_boot, y_boot = resample(X, y, random_state=42, replace=True)

In [None]:
# Fresh OpenCV k-NN model (k=1), trained on the resampled data.
knn = cv2.ml.KNearest_create()
knn.setDefaultK(1)
knn.train(X_boot, cv2.ml.ROW_SAMPLE, y_boot)

In [None]:
# Without an `axis` argument np.unique flattens its input, so this lists
# the distinct feature values in X_boot rather than the distinct rows.
np.unique(X_boot)

In [None]:
# NOTE(review): incomplete expression. `cross_validation` is not imported
# in any visible cell, so running this raises NameError - presumably an
# abandoned tab-completion while looking for a bootstrap helper; confirm.
cross_validation.Boo

In [None]:
# NOTE(review): `meow` is not defined in any visible cell, so this raises
# NameError - presumably a deliberate tripwire to stop "Run All"; confirm.
meow + 2

In [None]:
# from sklearn.model_selection import LeaveOneOut
# scores = cross_val_score(model, X, y, cv=LeaveOneOut(len(X)))
# scores

In [None]:
# scores.mean()

## Tuning hyperparameters with grid search

In [None]:
# Hold out a test set; the remaining samples form the combined
# training + validation portion (hence the `_trainval` suffix).
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y, random_state=37
)

In [None]:
# Best accuracy seen so far and the k that produced it; both are updated
# by the search loop in the next cell.
best_acc = 0.0
best_k = 0

In [None]:
# Try every k from 1 to 19 and remember the one with the highest accuracy.
# The model is fitted on X_trainval / y_trainval, the non-test portion of
# the split made above; X_train / y_train are only created in a later
# section, so referencing them here fails on a fresh kernel.
# NOTE: k is chosen by its accuracy on the test set itself, so best_acc is
# an optimistic estimate of how well the model generalises.
for k in range(1, 20):
    knn = cv2.ml.KNearest_create()
    knn.setDefaultK(k)
    knn.train(X_trainval, cv2.ml.ROW_SAMPLE, y_trainval)
    _, y_test_hat = knn.predict(X_test)
    acc = accuracy_score(y_test, y_test_hat)
    # strict '>' keeps the smallest k among equally good candidates
    if acc > best_acc:
        best_acc = acc
        best_k = k

In [None]:
# Best accuracy found by the loop above and the k that achieved it.
best_acc, best_k

## Doing it again

In [None]:
# Redo the split, this time keeping 80% of the samples for training +
# validation and leaving the rest as the test set.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y, train_size=0.8, random_state=37
)

In [None]:
# Shape of the training + validation portion.
X_trainval.shape

In [None]:
# Split the training + validation portion once more, into the part used
# for fitting (train) and the part used for choosing k (valid).
X_train, X_valid, y_train, y_valid = train_test_split(
    X_trainval, y_trainval, random_state=37
)

In [None]:
# Shape of the training portion after the second split.
X_train.shape

In [None]:
# Model selection on the validation set: for each candidate k (1..19) fit
# a fresh k-NN model on the training portion, score it on the validation
# portion, and keep the k with the highest validation accuracy.
best_acc, best_k = 0.0, 0
for k in range(1, 20):
    knn = cv2.ml.KNearest_create()
    knn.setDefaultK(k)
    knn.train(X_train, cv2.ml.ROW_SAMPLE, y_train)

    _, y_valid_hat = knn.predict(X_valid)
    acc = accuracy_score(y_valid, y_valid_hat)

    # strict '>' means ties are resolved in favour of the smaller k
    if acc > best_acc:
        best_acc, best_k = acc, k
best_acc, best_k

In [None]:
# Retrain with the selected k on the full training + validation data and
# report the accuracy on the test set, which was not used by the search
# loop above.
knn = cv2.ml.KNearest_create()
knn.setDefaultK(best_k)
knn.train(X_trainval, cv2.ml.ROW_SAMPLE, y_trainval)
_, y_test_hat = knn.predict(X_test)
accuracy_score(y_test, y_test_hat), best_k

## Grid search with cross-validation

In [None]:
# Candidate values for the classifier's n_neighbors parameter: 1 through 19.
param_grid = {'n_neighbors': range(1, 20)}

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
# Search over every entry of param_grid, evaluating each candidate with
# 5-fold cross-validation (cv=5).
grid_search = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5)

In [None]:
# Run the search on the training + validation data only; X_test / y_test
# are not passed in.
grid_search.fit(X_trainval, y_trainval)

In [None]:
# Best cross-validated score found by the search and the parameter
# setting that achieved it.
grid_search.best_score_, grid_search.best_params_

In [None]:
# Score of the fitted search object on the held-out test set.
grid_search.score(X_test, y_test)

## Trying to wrap the OpenCV classifier so scikit-learn can understand it

In [None]:
# NOTE(review): `meow` is not defined in any visible cell, so this raises
# NameError - presumably a deliberate tripwire that keeps "Run All" from
# reaching the experimental wrapper below; confirm.
meow + 2

In [None]:
class KnnWrapper(object):
    """Minimal scikit-learn-style adapter around OpenCV's k-NN model.

    Exposes ``fit`` / ``predict`` / ``score`` / ``get_params`` /
    ``set_params`` so the OpenCV classifier can be handed to helpers such
    as ``cross_val_score``, which call ``fit(X, y)`` and ``score(X, y)``
    on the estimator.

    Parameters
    ----------
    k : int
        Number of neighbours; forwarded to ``setDefaultK``.
    """

    def __init__(self, k):
        self.knn = cv2.ml.KNearest_create()
        self.k = k
        self.knn.setDefaultK(k)

    def fit(self, X, y):
        """Train the wrapped model on row-wise samples ``X`` with labels ``y``.

        Returns the underlying OpenCV model object.
        NOTE(review): scikit-learn estimators conventionally return ``self``
        from ``fit``; the OpenCV model is returned here to keep the existing
        behaviour.
        """
        self.knn.train(X, cv2.ml.ROW_SAMPLE, y)
        return self.knn

    def predict(self, X):
        """Return the wrapped model's predictions for ``X`` as float32.

        The array keeps whatever shape the OpenCV ``predict`` call produces.
        """
        _, y_hat = self.knn.predict(X)
        return y_hat.astype(np.float32)

    def score(self, X, y):
        """Return the accuracy of ``predict(X)`` measured against ``y``.

        Takes samples and true labels (the ``score(X, y)`` form that
        scikit-learn's evaluation helpers call) and computes the
        predictions itself.

        Raises
        ------
        ValueError
            If the number of predictions differs from the number of labels.
        """
        y_true = np.asarray(y).ravel()
        y_hat = np.asarray(self.predict(X)).ravel()
        if y_true.shape != y_hat.shape:
            raise ValueError(
                'Got %d predictions for %d labels' % (y_hat.size, y_true.size))
        # Fraction of exact matches between predictions and labels.
        return float(np.mean(y_hat == y_true))

    def get_params(self, deep=True):
        """Return the constructor parameters as a dict (``deep`` is ignored)."""
        return {'k': self.k}

    def set_params(self, **params):
        """Update ``k`` on the wrapper and on the wrapped model; return ``self``.

        Raises
        ------
        ValueError
            If a parameter other than ``k`` is given.
        """
        for name, value in params.items():
            if name != 'k':
                raise ValueError('Invalid parameter %r for KnnWrapper' % (name,))
            self.k = value
            self.knn.setDefaultK(value)
        return self
        

In [None]:
# Wrap an OpenCV k-NN model with k=1.
myknn = KnnWrapper(1)

In [None]:
from sklearn.model_selection import cross_val_score
# Hand the wrapper to scikit-learn's 5-fold cross-validation helper.
cross_val_score(myknn, X, y, cv=5)

<!--NAVIGATION-->
< [Combining Different Algorithms Into an Ensemble](10.00-Combining-Different-Algorithms-Into-an-Ensemble.ipynb) | [Contents](../README.md) | [Conclusion](12.00-Conclusion.ipynb) >