<!--BOOK_INFORMATION-->
<a href="https://www.packtpub.com/big-data-and-business-intelligence/machine-learning-opencv" target="_blank"><img align="left" src="data/cover.jpg" style="width: 76px; height: 100px; background: white; padding: 1px; border: 1px solid black; margin-right:10px;"></a>
*This notebook contains an excerpt from the book [Machine Learning for OpenCV](https://www.packtpub.com/big-data-and-business-intelligence/machine-learning-opencv) by Michael Beyeler.
The code is released under the [MIT license](https://opensource.org/licenses/MIT),
and is available on [GitHub](https://github.com/mbeyeler/opencv-machine-learning).*

*Note that this excerpt contains only the raw code - the book is rich with additional explanations and illustrations.
If you find this content useful, please consider supporting the work by
[buying the book](https://www.packtpub.com/big-data-and-business-intelligence/machine-learning-opencv)!*

<!--NAVIGATION-->
< [Understanding Cross-Validation](11.01-Understanding-Cross-Validation-Bootstrapping-and-McNemar's-Test.ipynb) | [Contents](../README.md) | [Chaining Algorithms Together to Form a Pipeline](11.03-Chaining-Algorithms-Together-to-Form-a-Pipeline.ipynb) >

# Tuning Hyperparameters with Grid Search

In [1]:
from sklearn.datasets import load_iris
import numpy as np
# Load the iris dataset. Features are cast to 32-bit floats (presumably for the
# OpenCV models trained further down); labels are kept exactly as provided.
iris = load_iris()
X, y = iris.data.astype(np.float32), iris.target

In [2]:
from sklearn.model_selection import train_test_split
# Hold out part of the data as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=37)

In [3]:
# Running record of the best accuracy seen so far and the k that produced it.
best_acc = best_k = 0

In [4]:
import cv2
from sklearn.metrics import accuracy_score
# Reset the running best inside this cell so that re-executing it always starts
# from a clean slate instead of inheriting values left behind by another cell.
best_acc = 0
best_k = 0
for k in range(1, 20):
    # Train a fresh OpenCV k-NN model for every candidate k.
    knn = cv2.ml.KNearest_create()
    knn.setDefaultK(k)
    knn.train(X_train, cv2.ml.ROW_SAMPLE, y_train)
    # Note: candidates are scored on the test set, so the test set itself
    # drives the choice of k here.
    _, y_test_hat = knn.predict(X_test)
    acc = accuracy_score(y_test, y_test_hat)
    # Strict `>` keeps the smallest k among candidates with equal accuracy.
    if acc > best_acc:
        best_acc = acc
        best_k = k

In [5]:
# Best accuracy found by the loop above and the k that achieved it. This
# accuracy was measured on the same test set that was used to pick k.
best_acc, best_k

(0.97368421052631582, 1)

## Avoiding overfitting to the validation set

In [6]:
# First split: set the test set aside; everything else becomes "train + validation".
X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, random_state=37)

In [7]:
# Size of the combined train+validation portion (samples, features).
X_trainval.shape

(112, 4)

In [8]:
# Second split: carve a validation set out of the train+validation portion.
X_train, X_valid, y_train, y_valid = train_test_split(X_trainval, y_trainval, random_state=37)

In [9]:
# Size of the training portion that remains after the validation split.
X_train.shape

(84, 4)

In [10]:
# Repeat the search for k, this time scoring every candidate on the validation
# split so the test set stays untouched during model selection.
best_acc, best_k = 0.0, 0
for k in range(1, 20):
    knn = cv2.ml.KNearest_create()
    knn.setDefaultK(k)
    knn.train(X_train, cv2.ml.ROW_SAMPLE, y_train)
    _, y_valid_hat = knn.predict(X_valid)
    acc = accuracy_score(y_valid, y_valid_hat)
    # `>=` (not `>`): among equally accurate candidates the largest k wins.
    if acc >= best_acc:
        best_acc, best_k = acc, k
best_acc, best_k

(1.0, 7)

In [11]:
# Retrain with the selected k on the full train+validation data, then evaluate
# once on the test set, which was not used while choosing k in this section.
knn = cv2.ml.KNearest_create()
knn.setDefaultK(best_k)
knn.train(X_trainval, cv2.ml.ROW_SAMPLE, y_trainval)
_, y_test_hat = knn.predict(X_test)
# Display the test accuracy alongside the k that was used.
accuracy_score(y_test, y_test_hat), best_k

(0.94736842105263153, 7)

## Grid search with cross-validation

In [12]:
# Candidate values for the number of neighbours: 1 through 19.
param_grid = dict(n_neighbors=range(1, 20))

In [13]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
# Grid search over `param_grid` for scikit-learn's k-NN classifier, using 5-fold cross-validation.
grid_search = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=param_grid,
    cv=5,
)

In [14]:
# Run the cross-validated grid search on the train+validation data only.
grid_search.fit(X_trainval, y_trainval)

GridSearchCV(cv=5, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_neighbors': range(1, 20)}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score=True, scoring=None, verbose=0)

In [15]:
# Best cross-validated score and the parameter setting that produced it.
grid_search.best_score_, grid_search.best_params_

(0.9642857142857143, {'n_neighbors': 3})

In [16]:
# Score on the held-out test set, which played no part in the grid search.
grid_search.score(X_test, y_test)

0.97368421052631582

In [18]:
def mcnemar(b, c):
    """Compute McNemar's test using the "mid-p" variant suggested by:

    M.W. Fagerland, S. Lydersen, P. Laake. 2013. The McNemar test for
    binary matched-pairs data: Mid-p and asymptotic are better than exact
    conditional. BMC Medical Research Methodology 13: 91.

    Parameters
    ----------
    b : int
        Number of observations correctly labeled by the first, but not
        the second system
    c : int
        Number of observations correctly labeled by the second, but not
        the first system

    Returns
    -------
    float
        The mid-p value: twice the binomial CDF at ``min(b, c)`` minus the
        binomial PMF at ``min(b, c)``, for ``n = b + c`` trials with p = 0.5.
        When there are no discordant pairs (``b + c == 0``) the result is 1.0.

    Raises
    ------
    ValueError
        If ``b`` or ``c`` is negative.

    Notes
    -----
    https://gist.github.com/kylebgorman/c8b3fb31c1552ecbaafb
    """
    # Imported locally because no cell in the notebook brings `binom` into
    # scope; without this the function fails with a NameError when called.
    from scipy.stats import binom

    if b < 0 or c < 0:
        raise ValueError("b and c must be non-negative counts")

    n = b + c
    if n == 0:
        # No discordant pairs: the formula below also evaluates to 1
        # (2 * 1 - 1), so return it directly without building a distribution.
        return 1.0

    x = min(b, c)
    dist = binom(n, 0.5)
    # Two-sided exact p-value, then subtract the point probability of the
    # observed value to obtain the mid-p variant.
    p = 2.0 * dist.cdf(x)
    midp = p - dist.pmf(x)
    return float(midp)

## Trying to wrap the OpenCV classifier so scikit-learn can understand it

In [17]:
# NOTE(review): `meow` is not defined anywhere in the visible notebook, so this
# cell raises NameError (see the recorded output). Presumably a deliberate
# stopper so that "Run All" halts before the experimental cells below -
# confirm, or remove the cell.
meow + 2

NameError: name 'meow' is not defined

In [None]:
class KnnWrapper(object):
    """Thin scikit-learn-style wrapper around OpenCV's k-nearest-neighbour model.

    Exposes ``fit``, ``predict``, ``score``, ``get_params`` and ``set_params``
    with the call signatures scikit-learn's model-selection helpers use
    (``fit(X, y)``, ``score(X, y)``), delegating the actual work to a
    ``cv2.ml.KNearest`` instance.

    Parameters
    ----------
    k : int
        Number of neighbours; passed to the OpenCV model via ``setDefaultK``.
    """

    def __init__(self, k):
        self.knn = cv2.ml.KNearest_create()
        self.k = k
        self.knn.setDefaultK(k)

    def fit(self, X, y):
        """Train the underlying OpenCV model on row-major samples ``X`` and labels ``y``.

        Returns ``self`` so calls can be chained; the trained OpenCV model
        stays available as ``self.knn``.
        """
        # Samples are converted to float32, mirroring the cast applied to the
        # iris features when they are loaded at the top of the notebook.
        self.knn.train(np.asarray(X, dtype=np.float32), cv2.ml.ROW_SAMPLE, y)
        return self

    def predict(self, X):
        """Return the OpenCV predictions for ``X`` as a float32 array.

        The array keeps the shape OpenCV returns (no flattening is done here).
        """
        _, y_hat = self.knn.predict(np.asarray(X, dtype=np.float32))
        return y_hat.astype(np.float32)

    def score(self, X, y):
        """Return the accuracy of ``predict(X)`` against the true labels ``y``.

        Takes ``(X, y)`` because that is what scikit-learn passes to an
        estimator's ``score``: test samples first, then their true labels.
        """
        # Flatten both sides so a column of predictions compares element-wise
        # with a 1-D label vector instead of broadcasting.
        y_hat = np.ravel(self.predict(X))
        y_true = np.ravel(y)
        return float(np.mean(y_hat == y_true))

    def get_params(self, deep=True):
        """Return the constructor parameters as a dict (``deep`` is accepted but unused)."""
        return {'k': self.k}

    def set_params(self, **params):
        """Update ``k`` and keep the underlying OpenCV model in sync.

        Raises
        ------
        ValueError
            If a parameter other than ``k`` is given.
        """
        for name, value in params.items():
            if name != 'k':
                raise ValueError("Invalid parameter %r for KnnWrapper" % (name,))
            self.k = value
            self.knn.setDefaultK(value)
        return self
        

In [None]:
# Instantiate the OpenCV wrapper with k=1.
myknn = KnnWrapper(1)

In [None]:
from sklearn.model_selection import cross_val_score
# 5-fold cross-validation of the wrapped OpenCV classifier on the full iris data.
cross_val_score(myknn, X, y, cv=5)

<!--NAVIGATION-->
< [Understanding Cross-Validation](11.01-Understanding-Cross-Validation-Bootstrapping-and-McNemar's-Test.ipynb) | [Contents](../README.md) | [Chaining Algorithms Together to Form a Pipeline](11.03-Chaining-Algorithms-Together-to-Form-a-Pipeline.ipynb) >