In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from IPython.display import clear_output
from mpl_toolkits.mplot3d import Axes3D
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import classification_report

# Import and format data

In [3]:
# load images
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()

# vectorize all of the images: every image becomes one 784-long row and the
# labels become single-column arrays
x_train = x_train.reshape(-1, 784)
y_train = y_train.reshape(-1, 1)
x_test = x_test.reshape(-1, 784)
y_test = y_test.reshape(-1, 1)

# convert to black/white: scale by 1/256 and round, applied to the whole array
# at once (same values as rounding image by image, without the Python loop)
x_train_int = np.round(x_train / 256)
x_test_int = np.round(x_test / 256)

# Create 10 "this digit or not" label vectors: +1 where the training label
# equals the digit, -1 everywhere else
y_train_flat = y_train.reshape(-1)
digit_labels = [np.where(y_train_flat == digit, 1, -1) for digit in range(10)]
(y_zeros, y_ones, y_twos, y_threes, y_fours,
 y_fives, y_sixes, y_sevens, y_eights, y_nines) = digit_labels

# one big array: row d holds the +1/-1 labels for digit d
y_nums = np.array(digit_labels)

# Singular value decomposition (SVD)

In [None]:
# Reduced SVD of the flattened training images, so that x_train = U @ diag(s) @ VT.
# full_matrices=False returns only the reduced factors; s holds the singular values.
U, s, VT = np.linalg.svd(x_train, full_matrices=False)

## Reconstruct the training data using the most significant singular values, then project the data onto only the first 2 and 3 principal components

In [None]:
def low_rank_approx(U, s, VT, keep):
    """Rebuild a matrix from its `keep` leading singular triplets.

    Parameters
    ----------
    U, s, VT : arrays as returned by ``np.linalg.svd(..., full_matrices=False)``.
    keep : int
        Number of leading singular values/vectors to retain.

    Returns
    -------
    ndarray
        The product ``U[:, :keep] @ diag(s[:keep]) @ VT[:keep, :]``.
    """
    # Scaling the columns of U by s gives the same product as U @ diag(s)
    # without allocating a dense keep-by-keep diagonal matrix.
    return (U[:, :keep] * s[:keep]) @ VT[:keep, :]


# Truncated reconstructions of the training images.
# NOTE(review): 711 and 345 are hard-coded; judging by the variable names they
# presumably come from a singular-value threshold and a 90% explained-variance
# cut-off respectively -- confirm and derive them from `s` if so.
x_train_sv1 = low_rank_approx(U, s, VT, keep=711)
x_train_expl90 = low_rank_approx(U, s, VT, keep=345)

# Coordinates of every image along the first 2 / 3 right singular vectors
x_train2d = x_train @ VT[:2, :].T
x_train3d = x_train @ VT[:3, :].T

# The test images are projected with the directions found on the training data
x_test2d = x_test @ VT[:2, :].T
x_test3d = x_test @ VT[:3, :].T

# Cross-validation for number of SVs to keep

In [9]:
# Cross-validate the number of singular values kept in the training data.
model = RidgeClassifier(solver='lsqr')

num_subsets = 5
# candidate ranks: 1, every multiple of 100 below 784, and the full 784
test_keeps = np.hstack([1, np.arange(100, 28**2, 100), 28**2])
# errors[i, subset] = hold-out error for test_keeps[i] on that subset;
# initialised to -1 so an unfilled entry is easy to spot
errors = np.full((len(test_keeps), num_subsets), -1.0)

# contiguous fold boundaries derived from the data size rather than a literal 60000
n_samples = len(x_train)
fold_edges = [n_samples * k // num_subsets for k in range(num_subsets + 1)]

# use each of the num_subsets subsets for testing in turn
for subset in range(num_subsets):

    # hold-out one subset for testing
    te_s, te_e = fold_edges[subset], fold_edges[subset + 1]
    test_idxs = np.r_[te_s:te_e]
    # other subsets are for training
    train_idxs = np.r_[0:te_s, te_e:n_samples]

    # building training and testing subsets
    x_tr = x_train[train_idxs]
    y_tr = y_train[train_idxs]
    x_te = x_train[test_idxs]
    y_te = y_train[test_idxs]

    # flatten the label columns once per fold instead of once per candidate
    y_tr_flat = y_tr.reshape(-1)
    y_te_flat = y_te.reshape(-1)

    # one SVD per fold, computed on the training part only
    U, s, VT = np.linalg.svd(x_tr, full_matrices=False)

    # try keeping different numbers of singular values
    for i, keep in enumerate(test_keeps):

        print('Subset ' + str(subset) + ' and ' + str(keep) + ' singular values.')

        # rank-`keep` reconstruction; column scaling replaces U @ diag(s)
        # so no dense keep-by-keep matrix is built
        x_tr_red = (U[:, :keep] * s[:keep]) @ VT[:keep, :]

        # NOTE(review): the model is fitted on the reconstructed training data
        # but evaluated on the unmodified hold-out images.
        model.fit(x_tr_red, y_tr_flat)
        y_pred = model.predict(x_te)

        # fraction of misclassified hold-out samples
        errors[i, subset] = np.mean(y_te_flat != y_pred.reshape(-1))

        clear_output()

In [10]:
# Summary table: hold-out error for each candidate rank, averaged over the
# folds (the columns of `errors`)
df = pd.DataFrame(
    data={'SVs kept': test_keeps, 'Average error rate': errors.mean(axis=1)},
)
df

Unnamed: 0,SVs kept,Average error rate
0,1,0.823767
1,100,0.15055
2,200,0.150617
3,300,0.151217
4,400,0.151367
5,500,0.150533
6,600,0.1506
7,700,0.150683
8,784,0.150417


# Cross-validation for lambda

In [12]:
# Cross-validate the ridge penalty (lambda, passed to RidgeClassifier as alpha).
num_subsets = 5
# ten evenly spaced candidates from 0.1 to 1.0; linspace fixes the number of
# points, whereas arange with a float step can gain or lose the endpoint
test_lambs = np.linspace(0.1, 1.0, 10)
# errors[i, subset] = hold-out error for test_lambs[i] on that subset;
# initialised to -1 so an unfilled entry is easy to spot
errors = np.full((len(test_lambs), num_subsets), -1.0)

# contiguous fold boundaries derived from the data size rather than a literal 60000
n_samples = len(x_train)
fold_edges = [n_samples * k // num_subsets for k in range(num_subsets + 1)]

# use each of the num_subsets subsets for testing in turn
for subset in range(num_subsets):

    # hold-out one subset for testing
    te_s, te_e = fold_edges[subset], fold_edges[subset + 1]
    test_idxs = np.r_[te_s:te_e]
    # other subsets are for training
    train_idxs = np.r_[0:te_s, te_e:n_samples]

    # building training and testing subsets
    x_tr = x_train[train_idxs]
    y_tr = y_train[train_idxs]
    x_te = x_train[test_idxs]
    y_te = y_train[test_idxs]

    # flatten the label columns once per fold instead of once per candidate
    y_tr_flat = y_tr.reshape(-1)
    y_te_flat = y_te.reshape(-1)

    # try different regularisation strengths
    # NOTE(review): the recorded results show the same error for every lambda
    # in this grid; a wider (e.g. log-spaced) range may be worth trying.
    for i, lamb in enumerate(test_lambs):

        print('Subset ' + str(subset) + ' and ' + str(lamb) + ' for lambda.')

        # a fresh classifier for every candidate penalty
        model = RidgeClassifier(alpha=lamb, solver='lsqr')

        model.fit(x_tr, y_tr_flat)
        y_pred = model.predict(x_te)

        # fraction of misclassified hold-out samples
        errors[i, subset] = np.mean(y_te_flat != y_pred.reshape(-1))

        clear_output()

In [13]:
# Summary table: hold-out error for each candidate lambda, averaged over the
# folds (the columns of `errors`)
df = pd.DataFrame(
    data={'Lambda': test_lambs, 'Average error rate': errors.mean(axis=1)},
)
df

Unnamed: 0,Lambda,Average error rate
0,0.1,0.150417
1,0.2,0.150417
2,0.3,0.150417
3,0.4,0.150417
4,0.5,0.150417
5,0.6,0.150417
6,0.7,0.150417
7,0.8,0.150417
8,0.9,0.150417
9,1.0,0.150417


In [14]:
# Final model: fit on a rank-100 reconstruction of the full training set, then
# report per-class metrics on the unmodified training and test images.

# U, s, VT were overwritten by the per-fold SVDs during cross-validation, so
# the decomposition of the full training set is recomputed here
U, s, VT = np.linalg.svd(x_train, full_matrices=False)
keep = 100
U_red = U[:, :keep]
s_red = s[:keep]
VT_red = VT[:keep, :]
# column scaling gives the same product as U_red @ diag(s_red) without
# building a dense keep-by-keep matrix
x_train_red = (U_red * s_red) @ VT_red

model = RidgeClassifier(alpha=1, solver='lsqr')
model.fit(x_train_red, y_train.reshape(-1,))

y_pred = model.predict(x_train)
print('Train accuracy:\n',
      classification_report(y_train, y_pred))

# this report is computed on the held-out test set, so it is labelled as such
# (it was previously printed under 'Train accuracy')
y_pred = model.predict(x_test)
print('Test accuracy:\n',
      classification_report(y_test, y_pred))

Train accuracy:
               precision    recall  f1-score   support

           0       0.90      0.95      0.92      5923
           1       0.81      0.97      0.88      6742
           2       0.90      0.79      0.84      5958
           3       0.83      0.83      0.83      6131
           4       0.84      0.88      0.86      5842
           5       0.87      0.68      0.76      5421
           6       0.88      0.93      0.90      5918
           7       0.85      0.88      0.87      6265
           8       0.84      0.76      0.80      5851
           9       0.83      0.81      0.82      5949

    accuracy                           0.85     60000
   macro avg       0.85      0.85      0.85     60000
weighted avg       0.85      0.85      0.85     60000

Train accuracy:
               precision    recall  f1-score   support

           0       0.90      0.95      0.93       980
           1       0.82      0.97      0.89      1135
           2       0.93      0.80      0.86 