# Block 6 Exercise 1: Non-Linear Classification

## MNIST Data
We return to the MNIST data set on handwritten digits to compare non-linear classification algorithms ...   

In [1]:
#imports 
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import fetch_openml

In [2]:
# Load data from https://www.openml.org/d/554
X, y = fetch_openml('mnist_784', version=1, return_X_y=True)


In [3]:
#the full MNIST data set contains 70k samples of digits 0-9 as 28*28 gray scale images (represented as 784 dim vectors)
np.shape(X)

(70000, 784)

In [4]:
X.min()

0.0

In [5]:
#look at max/min value in the data
X.max()

255.0

In [6]:
#reduce the data for the first test the the computing ist faster
X= X[:3000,:]
y= y[:3000]

In [7]:
X.shape

(3000, 784)

In [8]:
y.shape

(3000,)

### E1.1: Cross-Validation and Support Vector Machines
Train and optimize  C-SVM classifier on MNIST (https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC)
* use a RBF kernel
* use *random search* with cross-validation to find the best settings for *gamma* and *C* (https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html#sklearn.model_selection.RandomizedSearchCV)

In [9]:
#first split the data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)

In [10]:
%%time
#RBF Kernel
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

clf_svm = SVC(C = 3.0, kernel='rbf', gamma = 'auto', max_iter=40, random_state=42)
clf_svm.fit(X_train, y_train)

clf_svm.score(X_test,y_test)



Wall time: 3.01 s


0.16

In [11]:
%%time
# random search with cross validation for C and gamma
from sklearn.model_selection import RandomizedSearchCV

# build a classifier
clf_svm2 = SVC()

# specify parameters and distributions to sample from
param_dist = {'C': np.linspace(0,50 ,num=10),
              'gamma':np.linspace(0.1,10,num=10)}

param_2= dict(C=[0.1,1,2,7,20,50],gamma=[0.1,0.5,1,10,'scale']) #create search space

# run randomized search, cv= crossvalidation
random_search = RandomizedSearchCV(estimator=clf_svm2, param_distributions=param_2,
                                   n_iter=40 , cv=5, n_jobs=4)
random_search.fit(X_train,y_train)





Wall time: 7min 36s


RandomizedSearchCV(cv=5, estimator=SVC(), n_iter=40, n_jobs=4,
                   param_distributions={'C': [0.1, 1, 2, 7, 20, 50],
                                        'gamma': [0.1, 0.5, 1, 10, 'scale']})

In [12]:
import pandas as pd
random_search_results = pd.DataFrame(random_search.cv_results_,dtype='float', columns = ['param_C','param_gamma',
                                                                      'split0_test_score','split1_test_score','split2_test_score','split3_test_score',
                                                                      'mean_test_score','rank_test_score'])
random_search_results

Unnamed: 0,param_C,param_gamma,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,rank_test_score
0,0.1,0.1,0.1125,0.110417,0.1125,0.1125,0.112083,7.0
1,0.1,0.5,0.1125,0.110417,0.1125,0.1125,0.112083,7.0
2,0.1,1,0.1125,0.110417,0.1125,0.1125,0.112083,7.0
3,0.1,10,0.1125,0.110417,0.1125,0.1125,0.112083,7.0
4,0.1,scale,0.852083,0.870833,0.86875,0.866667,0.868333,6.0
5,1.0,0.1,0.1125,0.110417,0.1125,0.1125,0.112083,7.0
6,1.0,0.5,0.1125,0.110417,0.1125,0.1125,0.112083,7.0
7,1.0,1,0.1125,0.110417,0.1125,0.1125,0.112083,7.0
8,1.0,10,0.1125,0.110417,0.1125,0.1125,0.112083,7.0
9,1.0,scale,0.914583,0.929167,0.929167,0.910417,0.9275,5.0


In [13]:
#get best parameters
random_search.best_score_

0.9375000000000002

In [14]:
#get best parameters
random_search.best_params_

{'gamma': 'scale', 'C': 7}

### E1.2: Pipelines and simple Neural Networks
Split the MNIST data into  train- and test-sets and then train and evaluate a simple Multi Layer Perceptron (MLP) network. Since the non-linear activation functions of MLPs are sensitive to the scaling on the input (recall the *sigmoid* function), we need to scale all input values to [0,1] 

* combine all steps of your training in a SKL pipeline (https://scikit-learn.org/stable/modules/compose.html#pipeline)
* use a SKL-scaler to scale the data (https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html)
* MLP Parameters: https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html#sklearn.neural_network.MLPClassifier
    * use a *SGD* solver
    * use *tanh* as activation function
    * compare networks with 1, 2 and 3 layers, use different numbers of neurons per layer
    * adjust training parameters *alpha* (regularization) and *learning rate* - how sensitive is the model to these parameters?
    * Hint: do not change all parameters at the same time, split into several experiments
* How hard is it to find the best parameters? How many experiments would you need to find the best parameters?
    


In [15]:
# Imports
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier

In [16]:
#first split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state=42)

In [17]:
# Create a Pipeline with Scaler and MLP
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier

#create the pipeline
#Change the number of Neurons, layers 3
clf_pipe10 = make_pipeline(StandardScaler(),MLPClassifier(hidden_layer_sizes=(3,16),alpha=0.0001,learning_rate='constant', activation ='tanh',solver='sgd'))
clf_pipe11 = make_pipeline(StandardScaler(),MLPClassifier(hidden_layer_sizes=(3,32),alpha=0.0001,learning_rate='constant', activation ='tanh',solver='sgd'))
clf_pipe12 = make_pipeline(StandardScaler(),MLPClassifier(hidden_layer_sizes=(3,64),alpha=0.0001,learning_rate='constant', activation ='tanh',solver='sgd'))
clf_pipe13 = make_pipeline(StandardScaler(),MLPClassifier(hidden_layer_sizes=(3,128),alpha=0.0001,learning_rate='constant', activation ='tanh',solver='sgd'))
clf_pipe14 = make_pipeline(StandardScaler(),MLPClassifier(hidden_layer_sizes=(3,256),alpha=0.0001,learning_rate='constant', activation ='tanh',solver='sgd'))
clf_pipe15 = make_pipeline(StandardScaler(),MLPClassifier(hidden_layer_sizes=(3,1024),alpha=0.0001,learning_rate='constant', activation ='tanh',solver='sgd'))
clf_pipe16 = make_pipeline(StandardScaler(),MLPClassifier(hidden_layer_sizes=(3,2048),alpha=0.0001,learning_rate='constant', activation ='tanh',solver='sgd'))

#Change the Layers
clf_pipe20 = make_pipeline(StandardScaler(),MLPClassifier(hidden_layer_sizes=(1,64),alpha=0.0001,learning_rate='constant', activation ='tanh',solver='sgd'))
clf_pipe21 = make_pipeline(StandardScaler(),MLPClassifier(hidden_layer_sizes=(2,64),alpha=0.0001,learning_rate='constant', activation ='tanh',solver='sgd'))
clf_pipe22 = make_pipeline(StandardScaler(),MLPClassifier(hidden_layer_sizes=(3,64),alpha=0.0001,learning_rate='constant', activation ='tanh',solver='sgd'))
clf_pipe23 = make_pipeline(StandardScaler(),MLPClassifier(hidden_layer_sizes=(10,64),alpha=0.0001,learning_rate='constant', activation ='tanh',solver='sgd'))
clf_pipe24 = make_pipeline(StandardScaler(),MLPClassifier(hidden_layer_sizes=(100,64),alpha=0.0001,learning_rate='constant', activation ='tanh',solver='sgd'))

#Change the learning_rate
clf_pipe30 = make_pipeline(StandardScaler(),MLPClassifier(hidden_layer_sizes=(3,64),alpha=0.0001,learning_rate='constant', activation ='tanh',solver='sgd'))
clf_pipe31 = make_pipeline(StandardScaler(),MLPClassifier(hidden_layer_sizes=(3,64),alpha=0.0001,learning_rate='invscaling', activation ='tanh',solver='sgd'))
clf_pipe32 = make_pipeline(StandardScaler(),MLPClassifier(hidden_layer_sizes=(3,64),alpha=0.0001,learning_rate='adaptive', activation ='tanh',solver='sgd'))

#changing alpha
clf_pipe40 = make_pipeline(StandardScaler(),MLPClassifier(hidden_layer_sizes=(3,64),alpha=0.0001,learning_rate='constant', activation ='tanh',solver='sgd'))
clf_pipe41 = make_pipeline(StandardScaler(),MLPClassifier(hidden_layer_sizes=(3,64),alpha=0.0005,learning_rate='constant', activation ='tanh',solver='sgd'))
clf_pipe42 = make_pipeline(StandardScaler(),MLPClassifier(hidden_layer_sizes=(3,64),alpha=0.001,learning_rate='constant', activation ='tanh',solver='sgd'))
clf_pipe43 = make_pipeline(StandardScaler(),MLPClassifier(hidden_layer_sizes=(3,64),alpha=0.01,learning_rate='constant', activation ='tanh',solver='sgd'))

In [18]:
%%time
# Change Neurons
clf_pipe10.fit(X_train,y_train)
clf_pipe11.fit(X_train,y_train)
clf_pipe12.fit(X_train,y_train)
clf_pipe13.fit(X_train,y_train)
clf_pipe14.fit(X_train,y_train)
clf_pipe15.fit(X_train,y_train)
clf_pipe16.fit(X_train,y_train)

# Change Layers
clf_pipe20.fit(X_train,y_train)
clf_pipe21.fit(X_train,y_train)
clf_pipe22.fit(X_train,y_train)
clf_pipe23.fit(X_train,y_train)
clf_pipe24.fit(X_train,y_train)

#change learning_rate
clf_pipe30.fit(X_train,y_train)
clf_pipe31.fit(X_train,y_train)
clf_pipe32.fit(X_train,y_train)

# Change alpha
clf_pipe40.fit(X_train,y_train)
clf_pipe41.fit(X_train,y_train)
clf_pipe42.fit(X_train,y_train)
clf_pipe43.fit(X_train,y_train)




Wall time: 3min 59s




Pipeline(steps=[('standardscaler', StandardScaler()),
                ('mlpclassifier',
                 MLPClassifier(activation='tanh', alpha=0.01,
                               hidden_layer_sizes=(3, 64), solver='sgd'))])

In [19]:
print("3 Layer, Changing Neurons:")
print(" 16 Neurons:   ",clf_pipe10.score(X_test,y_test))
print(" 32 Neurons:   ",clf_pipe11.score(X_test,y_test))
print(" 64 Neurons:   ",clf_pipe12.score(X_test,y_test))
print(" 128 Neurons:  ",clf_pipe13.score(X_test,y_test))
print(" 256 Neurons:  ",clf_pipe14.score(X_test,y_test))
print(" 1024 Neurons: ",clf_pipe15.score(X_test,y_test))
print(" 2048 Neurons: ",clf_pipe16.score(X_test,y_test))

3 Layer, Changing Neurons:
 16 Neurons:    0.66
 32 Neurons:    0.6433333333333333
 64 Neurons:    0.68
 128 Neurons:   0.7133333333333334
 256 Neurons:   0.71
 1024 Neurons:  0.6933333333333334
 2048 Neurons:  0.67


In [21]:
print("64 Neurons changing Layers:")
print(" 1 Layer:   ",clf_pipe20.score(X_test,y_test))
print(" 2 Layer:   ",clf_pipe21.score(X_test,y_test))
print(" 3 Layer:   ",clf_pipe22.score(X_test,y_test))
print(" 10 Layer:  ",clf_pipe23.score(X_test,y_test))
print(" 100 Layer: ",clf_pipe24.score(X_test,y_test))

64 Neurons changing Layers:
 1 Layer:    0.34
 2 Layer:    0.4633333333333333
 3 Layer:    0.69
 10 Layer:   0.86
 100 Layer:  0.8966666666666666


In [22]:
print("3 Layer, 64 Neurons, Changing learning_rate:")
print(" constant:     ",clf_pipe30.score(X_test,y_test))
print(" invscaling:   ",clf_pipe31.score(X_test,y_test))
print(" adaptive:     ",clf_pipe32.score(X_test,y_test))


3 Layer, 64 Neurons, Changing learning_rate:
 constant:      0.6933333333333334
 invscaling:    0.11333333333333333
 adaptive:      0.7066666666666667


In [23]:
print("3 Layer, 64 Neurons, Changing alpha:")
print("alpha= 0.0001: ",clf_pipe20.score(X_test,y_test))
print("alpha= 0.0005: ",clf_pipe21.score(X_test,y_test))
print("alpha= 0.001:  ",clf_pipe22.score(X_test,y_test))
print("alpha= 0.01:    ",clf_pipe23.score(X_test,y_test))

3 Layer, 64 Neurons, Changing alpha:
alpha= 0.0001:  0.34
alpha= 0.0005:  0.4633333333333333
alpha= 0.001:   0.69
alpha= 0.01:     0.86


In [24]:
%%time
#Maybe good combination
clf_pipe = make_pipeline(StandardScaler(),MLPClassifier(hidden_layer_sizes=(100,64),alpha=0.01,learning_rate='adaptive', activation ='tanh',solver='sgd'))
clf_pipe.fit(X_train,y_train)
clf_pipe.score(X_test,y_test)

Wall time: 19.4 s




0.9033333333333333