# Block 6 Exercise 1: Non-Linear Classification

## MNIST Data
We return to the MNIST data set on handwritten digits to compare non-linear classification algorithms ...   

In [1]:
#imports 
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import fetch_openml

In [2]:
# Load data from https://www.openml.org/d/554
X, y = fetch_openml('mnist_784', version=1, return_X_y=True)


In [3]:
#the full MNIST data set contains 70k samples of digits 0-9 as 28*28 gray scale images (represented as 784 dim vectors)
np.shape(X)

(70000, 784)

In [4]:
X.min()

0.0

In [5]:
#look at max/min value in the data
X.max()

255.0

### E1.1: Cross-Validation and Support Vector Machines
Train and optimize  C-SVM classifier on MNIST (https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC)
* use a RBF kernel
* use *random search* with cross-validation to find the best settings for *gamma* and *C* (https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html#sklearn.model_selection.RandomizedSearchCV)

In [11]:
X_small = X[0:5000]
y_small = y[0:5000]

In [12]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from scipy.stats import uniform

svc = SVC(kernel='rbf')
svc.fit(X_small,y_small)
pred = svc.predict(X_small)
accuracy_score(y_small, pred)

0.9872

In [13]:
from sklearn.model_selection import RandomizedSearchCV
svc_opt = SVC(kernel='rbf')

distributions = dict(C=uniform(loc=0, scale=4),
                    gamma=['auto', 'scale'])
clf = RandomizedSearchCV(svc_opt, distributions, random_state=0)
search = clf.fit(X_small, y_small)
search.best_params_


{'C': 3.3770629943240693, 'gamma': 'scale'}

### E1.2: Pipelines and simple Neural Networks
Split the MNIST data into  train- and test-sets and then train and evaluate a simple Multi Layer Perceptron (MLP) network. Since the non-linear activation functions of MLPs are sensitive to the scaling on the input (recall the *sigmoid* function), we need to scale all input values to [0,1] 

* combine all steps of your training in a SKL pipeline (https://scikit-learn.org/stable/modules/compose.html#pipeline)
* use a SKL-scaler to scale the data (https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html)
* MLP Parameters: https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html#sklearn.neural_network.MLPClassifier
    * use a *SGD* solver
    * use *tanh* as activation function
    * compare networks with 1, 2 and 3 layers, use different numbers of neurons per layer
    * adjust training parameters *alpha* (regularization) and *learning rate* - how sensitive is the model to these parameters?
    * Hint: do not change all parameters at the same time, split into several experiments
* How hard is it to find the best parameters? How many experiments would you need to find the best parameters?
    


In [52]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
X_train.shape + X_test.shape

(8000, 784, 2000, 784)

In [41]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

mlp = make_pipeline(StandardScaler(), MLPClassifier(hidden_layer_sizes=(16,), solver='sgd', activation='tanh'))

mlp.fit(X_train, y_train)



Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('mlpclassifier',
                 MLPClassifier(activation='tanh', alpha=0.0001,
                               batch_size='auto', beta_1=0.9, beta_2=0.999,
                               early_stopping=False, epsilon=1e-08,
                               hidden_layer_sizes=(16,),
                               learning_rate='constant',
                               learning_rate_init=0.001, max_fun=15000,
                               max_iter=200, momentum=0.9, n_iter_no_change=10,
                               nesterovs_momentum=True, power_t=0.5,
                               random_state=None, shuffle=True, solver='sgd',
                               tol=0.0001, validation_fraction=0.1,
                               verbose=False, warm_start=False))],
         verbose=False)

In [42]:
pred = mlp.predict(X_test)
accuracy_score(y_test, pred)


0.9195

In [44]:
mlp_1 = make_pipeline(StandardScaler(), MLPClassifier(hidden_layer_sizes=(16,32,16), solver='sgd', activation='tanh'))
mlp_1.fit(X_train, y_train)
mlp_2 = make_pipeline(StandardScaler(), MLPClassifier(hidden_layer_sizes=(16,32,16), solver='sgd', activation='tanh'))
mlp_2.fit(X_train, y_train)
mlp_3 = make_pipeline(StandardScaler(), MLPClassifier(hidden_layer_sizes=(16,32,16), solver='sgd', activation='tanh'))
mlp_3.fit(X_train, y_train)



Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('mlpclassifier',
                 MLPClassifier(activation='tanh', alpha=0.0001,
                               batch_size='auto', beta_1=0.9, beta_2=0.999,
                               early_stopping=False, epsilon=1e-08,
                               hidden_layer_sizes=(16, 16, 16),
                               learning_rate='constant',
                               learning_rate_init=0.001, max_fun=15000,
                               max_iter=200, momentum=0.9, n_iter_no_change=10,
                               nesterovs_momentum=True, power_t=0.5,
                               random_state=None, shuffle=True, solver='sgd',
                               tol=0.0001, validation_fraction=0.1,
                               verbose=False, warm_start=False))],
         verbose=False)

In [47]:
pred_1 = mlp_1.predict(X_test)
pred_2 = mlp_2.predict(X_test)
pred_3 = mlp_3.predict(X_test)

acc_1 = accuracy_score(y_test, pred_1)
acc_2 = accuracy_score(y_test, pred_2)
acc_3 = accuracy_score(y_test, pred_3)

print(acc_1,acc_2,acc_3)

0.908 0.9045 0.893


In [50]:
mlp_alpha = make_pipeline(StandardScaler(), MLPClassifier(hidden_layer_sizes=(16,32,16), solver='sgd', activation='tanh', alpha=0.001))
mlp_alpha.fit(X_train, y_train)
pred_alpha = mlp_alpha.predict(X_test)
accuracy_score(y_test, pred_alpha)



0.9175

In [51]:
mlp_lr = make_pipeline(StandardScaler(), MLPClassifier(hidden_layer_sizes=(16,32,16), solver='sgd', activation='tanh', learning_rate='invscaling'))
mlp_lr.fit(X_train, y_train)
pred_lr = mlp_lr.predict(X_test)
accuracy_score(y_test, pred_lr)



0.3805