# Model selection and overfitting

## Problem 1: classification

In this exercise, we will explore model selection for classifying SANS patterns using the k-nearest neighbor algorithm.


### Import Python modules

In [None]:
import os  
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import classification_report, f1_score, accuracy_score

import pickle

%matplotlib inline

### Load data

Load SANS patterns.

In [None]:
# Location of the pickled data set, relative to the current working directory.
datapath = "data/test3_short.pickle"

# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
with open(datapath, mode="rb") as pickle_file:
    data = pickle.load(pickle_file)

### Set up labels for classification and visualization

In [None]:
# Build a reproducible mapping from class label to integer index.
# Iterating a bare set yields an arbitrary order (string hashes are randomized
# per interpreter session), so the class indices could change from one run to
# the next; sorting the unique labels pins the mapping down.
labels = sorted(set(data["y"]))
labdict = {label: idx for idx, label in enumerate(labels)}

# Integer-encoded targets, in the same order as data["y"].
Yvals_num = [labdict[label] for label in data["y"]]

print(labdict)

### Preprocess data

In [None]:
%%time
# Preprocessing, step 1: collapse every pattern to a 1-D feature vector,
# overwriting the entries of data["sas"] in place.
for position, pattern in enumerate(data["sas"]):
    data["sas"][position] = pattern.flatten()

# Preprocessing, step 2: stack the vectors into one array and rescale it with
# MinMaxScaler.
# NOTE(review): the scaler is fitted on all samples, i.e. before the
# train/test split performed further down -- confirm this is intended.
scaler = MinMaxScaler()
X_sas = scaler.fit_transform(
    np.array(data["sas"])
)

### Visualize SANS patterns

In [None]:
# Preview grid of 6 rows x 7 columns showing every 143rd pattern, starting at
# index 1. Each pattern is reshaped to 128 x 128 for display.
fig, ax = plt.subplots(6, 7, figsize=(12, 7))
for panel_no, sample_id in enumerate(range(1, len(X_sas), 143)):
    # Fill the grid row by row; the row index wraps around after the sixth row.
    row, col = divmod(panel_no, 7)
    row %= 6
    panel = ax[row, col]
    panel.imshow(X_sas[sample_id].reshape((128, 128)))
    panel.set_xticks([])
    if col != 0:
        panel.set_yticks([])
        continue
    # Leftmost panel of a row: a single tick at mid-height, labelled with the
    # class label of the sample shown in that panel.
    panel.set_yticks([64])
    panel.set_yticklabels([labels[Yvals_num[sample_id]]])
    

### Select the learning algorithms and the corresponding hyperparameters

In [None]:
# Candidate models written as (display name, estimator) pairs so that each name
# sits next to its estimator; the pairs are then split into the two parallel
# lists `names` and `classifiers`.
names, classifiers = map(list, zip(
    ("Nearest Neighbors", KNeighborsClassifier(n_neighbors=5)),
    ("Linear SVM", SVC(kernel="linear", C=0.5)),  # alternative setting: C=0.025
    ("RBF SVM", SVC(gamma="scale", C=100.0)),  # alternative setting: C=1.0
    ("Decision Tree", DecisionTreeClassifier(max_depth=20, max_features="sqrt")),
    (
        "Random Forest",
        RandomForestClassifier(
            max_depth=20, n_estimators=100, max_features="sqrt", n_jobs=4
        ),
    ),
    (
        "MLP",
        MLPClassifier(
            hidden_layer_sizes=(20, 20),
            alpha=0.01,
            learning_rate="adaptive",
            max_iter=1000,
        ),
    ),
    ("Naive Bayes", GaussianNB()),
    ("QDA", QuadraticDiscriminantAnalysis()),
))

### Train the classifier(s)

In [None]:
%%time
print('Predict full SAS data.\n')

# 60 % of the samples are used for training; the remaining 40 % is split
# evenly into a test set and a validation set. Fixed seeds keep the splits
# reproducible.
X_train, X_ntrain, y_train, y_ntrain = train_test_split(
    X_sas, Yvals_num, test_size=0.4, random_state=1
)
X_test, X_val, y_test, y_val = train_test_split(
    X_ntrain, y_ntrain, test_size=0.5, random_state=3
)

# Number of entries of `names` / `classifiers` to train, counted from the top.
ncl = 1
for name, clf in zip(names[:ncl], classifiers[:ncl]):
    print(name)
    clf.fit(X_train, y_train)

    # Predictions for the complete data set; not used further in this cell,
    # kept in case later cells read `y_pred`.
    y_pred = clf.predict(X_sas)

    # Predict the test set once and reuse the result for both the accuracy
    # and the classification report.
    y_test_pred = clf.predict(X_test)
    train_score = accuracy_score(y_train, clf.predict(X_train))
    test_score = accuracy_score(y_test, y_test_pred)
    val_score = accuracy_score(y_val, clf.predict(X_val))

    print('Train accuracy: %g'%(train_score))
    print('Test accuracy: %g'%(test_score))
    print('Validation accuracy: %g'%(val_score))
    # classification_report expects (y_true, y_pred). Passing the predictions
    # first would swap precision and recall for every class and compute the
    # support from the predictions instead of the true labels.
    print(classification_report(y_test, y_test_pred))


**Your task here**. Explore how the training and test error depend on the parameter k (the number of neighbors, `n_neighbors` in `KNeighborsClassifier`) of the k-nearest neighbor algorithm.