# Iris - Cross-validation

### Import Libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

### Import Dataset

In [None]:
# Load the iris dataset and unpack its feature matrix (X) and class labels (y).
iris = load_iris()
X, y = iris.data, iris.target

## Train-Test Validation

### Data Splitting

In [None]:
# Hold out a test split; the fixed random_state keeps the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

### Create Model

In [None]:
# 1-nearest-neighbour classifier: each prediction takes the label of the
# single closest training sample.
model = KNeighborsClassifier(n_neighbors=1)

### Train Model

In [None]:
# Fit the classifier on the training split only.
model.fit(X_train, y_train)

### Validate Model

In [None]:
# Mean accuracy on the held-out test split (displayed as the cell output).
model.score(X_test, y_test)

## Threefold Split Validation (Train / Validation / Test)

### Data Splitting

In [None]:
# Two-stage split: first set the test data aside, then divide the remainder
# into a training part and a validation part used for choosing n_neighbors.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y, random_state=0)
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, random_state=0)

### Create & Train Model

In [None]:
# Candidate neighbourhood sizes: the odd values 1, 3, ..., 13.
neighbors = np.arange(1, 15, 2)
val_scores = []

# Fit one model per candidate on the training part and record its accuracy
# on the validation part.
for k in neighbors:
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    val_scores.append(knn.score(X_val, y_val))

# Keep the candidate with the highest validation accuracy
# (np.argmax returns the first one on ties).
print("best validation score: ", np.max(val_scores))
best_n_neighbors = neighbors[np.argmax(val_scores)]
print("best number of neighbors: ", best_n_neighbors)

### Retrain & Evaluate Model

In [None]:
# Refit with the selected n_neighbors on training + validation data combined,
# then report accuracy on the test split, which was not used for selection.
knn = KNeighborsClassifier(n_neighbors=best_n_neighbors).fit(
    X_trainval, y_trainval)
print("test-set score: ", knn.score(X_test, y_test))

## Cross-Validation

### Data Splitting

In [None]:
# Hold out a test split (train_test_split's default is 25% of the samples);
# cross-validation below runs on the training split only.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

### Cross Validation Score

In [None]:
# Score every candidate n_neighbors by 10-fold cross-validation on the
# training split; the test split is not touched during model selection.
cross_val_scores = []
neighbors = np.arange(1, 15, 2)

for i in neighbors:
    knn = KNeighborsClassifier(n_neighbors=i)
    scores = cross_val_score(knn, X_train, y_train, cv=10)
    # Summarise the 10 fold accuracies by their mean.
    cross_val_scores.append(np.mean(scores))

# Select on the cross-validation means collected in this cell. The previous
# code read `val_scores`, a leftover from the single validation split, so the
# cross-validation results were never used.
print("best validation score: ", np.max(cross_val_scores))
best_n_neighbors = neighbors[np.argmax(cross_val_scores)]
print("best number of neighbors: ", best_n_neighbors)

### Retrain & Evaluate Model

In [None]:
# Refit with the selected n_neighbors on this section's full training split.
# X_train/y_train are used rather than X_trainval/y_trainval, which only exist
# if the earlier threefold-split section has been run.
knn = KNeighborsClassifier(n_neighbors=best_n_neighbors)
knn.fit(X_train, y_train)
# Accuracy on the held-out test split.
print("test-set score: ", knn.score(X_test, y_test))

## Grid Searches with Cross-Validation for K-Neighbors

### Define Parameter Grid

In [None]:
# Search space for the grid search: odd n_neighbors values 1, 3, ..., 13.
param_grid = dict(n_neighbors=np.arange(1, 15, 2))

### Create & Train GridSearchCV Model

In [None]:
# 10-fold cross-validated grid search over n_neighbors. Training-fold scores
# are recorded too, so cv_results_ contains 'mean_train_score'.
grid = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=param_grid,
    cv=10,
    return_train_score=True,
)

# Run the search on the training split.
grid.fit(X_train, y_train)

### Get Best Results

In [None]:
# Mean cross-validated score of the best parameter setting.
grid.best_score_

In [None]:
# Parameter setting that achieved the best mean cross-validated score.
grid.best_params_

In [None]:
# Estimator refit on the whole training split with the best parameters
# (GridSearchCV's refit=True default).
grid.best_estimator_

In [None]:
# Score of the refit best estimator on the data it was trained on.
grid.score(X_train, y_train)

In [None]:
# Score of the refit best estimator on the held-out test split.
grid.score(X_test, y_test)

### Visualize Results

In [None]:
# Per-candidate cross-validation results; the plot below reads the
# 'mean_train_score' and 'mean_test_score' entries.
results = grid.cv_results_

In [None]:
# Draw mean training-fold and held-out-fold ("test") scores against
# n_neighbors on the same axes.
for score_key, curve_label in (('mean_train_score', "train"),
                               ('mean_test_score', "test")):
    plt.plot(param_grid['n_neighbors'], results[score_key], label=curve_label)
plt.legend()

## Grid Searches with Cross Validation for SVC

### Define Parameter Grid

In [None]:
# Print plain decimals instead of scientific notation for the small values.
np.set_printoptions(suppress=True)

# Logarithmic search grid: C in 1e-3 .. 1e2 and gamma in 1e-5 .. 1e-1.
param_grid = dict(
    C=np.power(10., np.arange(-3, 3)),
    gamma=np.power(10., np.arange(-5, 0)),
)
print(param_grid)

### Create & Fit Model GridSearchCV

In [None]:
# 5-fold cross-validated grid search over the C/gamma grid for an SVC;
# verbose=3 makes fitting print progress messages.
grid_search = GridSearchCV(SVC(), param_grid, verbose=3, cv=5)

In [None]:
# Cross-validate every (C, gamma) combination on the training split.
grid_search.fit(X_train, y_train)

In [None]:
# Predict test-split labels with the refit best estimator.
grid_search.predict(X_test)

In [None]:
# Score of the refit best estimator on the held-out test split.
grid_search.score(X_test, y_test)

In [None]:
# (C, gamma) combination with the best mean cross-validated score.
grid_search.best_params_

In [None]:
# Mean cross-validated score of the best parameter combination.
grid_search.best_score_

In [None]:
# Estimator refit on the whole training split with the best parameters
# (GridSearchCV's refit=True default).
grid_search.best_estimator_

In [None]:
# Extract only the mean cross-validated score of each parameter combination.
scores = grid_search.cv_results_['mean_test_score']

# Derive the heat-map shape from the grid itself rather than hard-coding 6 x 5,
# so the cell keeps working when the grid is changed.
# The candidates are enumerated with the parameter names in sorted order and
# the last name varying fastest, so rows correspond to 'C' and columns to
# 'gamma'.
c_values = param_grid['C']
gamma_values = param_grid['gamma']
scores = np.array(scores).reshape(len(c_values), len(gamma_values))

plt.matshow(scores)
plt.xlabel('gamma')
plt.ylabel('C')
plt.colorbar()
# Label each column / row with the parameter value it represents.
plt.xticks(np.arange(len(gamma_values)), gamma_values)
plt.yticks(np.arange(len(c_values)), c_values);