In [7]:
import sys
sys.path.append("src")

import numpy as np
import torchvision
import matplotlib.pyplot as plt
import math
from tqdm.notebook import tqdm

%matplotlib inline

# Download the EMNIST "digits" split: one dataset object for training,
# one for evaluation.
emnist_train = torchvision.datasets.EMNIST(
    './dataset/',
    split='digits',
    train=True,
    download=True,
)
emnist_eval = torchvision.datasets.EMNIST(
    './dataset',
    split='digits',
    train=False,
    download=True,
)

In [8]:
# Flatten every image into a row vector so each set becomes a matrix of
# shape [n_images, dim], and collect the ground-truth labels of the train
# and eval sets.

dim = 28*28


def _dataset_to_matrix(dataset, dim):
    """
    Convert an indexable dataset of (image, label) pairs into a data matrix.

    Parameters
    ----------
    dataset : sequence
        Object supporting ``len()`` and integer indexing, where each item is
        an ``(image, label)`` pair and ``image`` is convertible to a numpy
        array holding exactly ``dim`` values.
    dim : int
        Number of values per flattened image.

    Returns
    -------
    (np.ndarray, list)
        A float matrix of shape ``[len(dataset), dim]`` with one flattened
        image per row, and the list of labels in the same order.
    """
    n_images = len(dataset)
    data = np.empty([n_images, dim])
    labels = []
    for i in range(n_images):
        # Index the dataset once per sample: every `dataset[i]` access builds
        # the (image, label) pair again, so indexing separately for the image
        # and for the label would do that work twice.
        image, label = dataset[i]
        data[i] = np.asarray(image).reshape(dim)
        labels.append(label)
    return data, labels


n_images_train = len(emnist_train)
n_images_eval = len(emnist_eval)

train_data, train_correct_labels = _dataset_to_matrix(emnist_train, dim)
eval_data, eval_correct_labels = _dataset_to_matrix(emnist_eval, dim)

In [9]:
from src.VectorSpace import VectorSpace
from src.VectorSet import VectorSet

def cossine_similarity(vector:np.ndarray, subspace:VectorSpace) -> np.ndarray:
    r"""
    Score how strongly each input vector aligns with the basis of a subspace.

    For every row x of ``vector`` this returns

        S = \sum_{i=0}^{r-1} \frac{(x,\phi_i)^2}{\|x\|\|\phi_i\|}

    where the \phi_i are the rows of ``subspace.A``.

    NOTE(review): the denominator uses the plain (not squared) norms, so S is
    not a sum of squared cosines unless the vectors are unit length. This
    matches the formula above and is kept as is.

    Parameters
    ----------
    vector : np.ndarray
        A single vector of shape (dim,) or a batch of shape (batch, dim).
        Assumed real-valued.
    subspace : VectorSpace
        Provides ``dim`` (expected vector length), ``dtype`` (the input is
        cast to it) and ``A`` (2-D basis matrix with one basis vector per row).

    Returns
    -------
    np.ndarray
        1-D array with one score per input row (length 1 for a 1-D input).
        Rows with zero norm yield NaN.

    Raises
    ------
    AssertionError
        If ``vector`` has more than two dimensions or its length does not
        match ``subspace.dim``.
    """
    if vector.ndim > 2:
        raise AssertionError("Cannot input tensor of ndim > 2")
    if vector.ndim == 1:
        vector = vector[np.newaxis, :]
    if vector.shape[1] != subspace.dim:
        raise AssertionError("Vector dimension must be the same as VectorSpace dimension")

    vector = vector.astype(subspace.dtype)

    # Squared inner products between every input row and every basis vector.
    squared_projections = np.matmul(vector, subspace.A.transpose())**2

    # Row-wise norms. Computing them directly avoids materialising the
    # (batch x batch) Gram matrix just to read its diagonal, which is
    # quadratic in memory and time for large batches.
    vector_norms = np.linalg.norm(vector, axis=1)
    basis_norms = np.linalg.norm(subspace.A, axis=1)

    # denominators[n, i] = ||x_n|| * ||phi_i||
    denominators = np.outer(vector_norms, basis_norms)

    return np.sum(np.divide(squared_projections, denominators), axis=1)

def scaled_cossine_similarity(vector:np.ndarray, subspace:VectorSpace) -> np.ndarray:
    r"""
    Singular-value-weighted alignment score between vectors and a subspace.

    For every row x of ``vector`` this returns

        S = \sum_{i=0}^{r-1} \frac{\sigma_i}{\max_j \sigma_j}
            \frac{(x,\phi_i)^2}{\|x\|\|\phi_i\|}

    where the \phi_i are the rows of ``subspace.A`` and the \sigma_i are
    ``subspace.singular_values``. Each basis vector's contribution is weighted
    by its singular value relative to the largest one.

    Parameters
    ----------
    vector : np.ndarray
        A single vector of shape (dim,) or a batch of shape (batch, dim).
        Assumed real-valued.
    subspace : VectorSpace
        Provides ``dim`` (expected vector length), ``dtype`` (the input is
        cast to it), ``A`` (2-D basis matrix with one basis vector per row)
        and ``singular_values`` (one value per row of ``A``).

    Returns
    -------
    np.ndarray
        1-D array with one score per input row (length 1 for a 1-D input).
        Rows with zero norm yield NaN.

    Raises
    ------
    AssertionError
        If ``vector`` has more than two dimensions or its length does not
        match ``subspace.dim``.
    """
    if vector.ndim > 2:
        raise AssertionError("Cannot input tensor of ndim > 2")
    if vector.ndim == 1:
        vector = vector[np.newaxis, :]
    if vector.shape[1] != subspace.dim:
        raise AssertionError("Vector dimension must be the same as VectorSpace dimension")

    vector = vector.astype(subspace.dtype)

    # Squared inner products between every input row and every basis vector.
    squared_projections = np.matmul(vector, subspace.A.transpose())**2

    # Row-wise norms. Computing them directly avoids materialising the
    # (batch x batch) Gram matrix just to read its diagonal, which is
    # quadratic in memory and time for large batches.
    vector_norms = np.linalg.norm(vector, axis=1)
    basis_norms = np.linalg.norm(subspace.A, axis=1)

    # ratios[n, i] = (x_n, phi_i)^2 / (||x_n|| * ||phi_i||)
    ratios = np.divide(squared_projections, np.outer(vector_norms, basis_norms))

    # Weights are the singular values normalised by the largest one.
    weights = (
        np.array(subspace.singular_values, dtype=subspace.dtype)
        / np.max(subspace.singular_values)
    )

    return np.inner(ratios, weights)

In [10]:
# List of min energy values for parameter tuning
min_energy_list = np.linspace(0.05, 1, 21)

# Create a VectorSet for all VectorSpaces
# NOTE(review): the name `set` shadows the Python builtin for the rest of the
# notebook. It is kept here because other cells may refer to it; consider
# renaming it everywhere at once.
set = VectorSet(dim=dim)
set.populate(train_data, train_correct_labels)

n_eval = eval_data.shape[0]

# One (min_energy, accuracy_cs, accuracy_scs) entry per swept value, so the
# sweep can be inspected or plotted afterwards instead of only being printed.
accuracy_by_energy = []

for min_energy in tqdm(min_energy_list):
    # Generate Subspaces using pca (svd) and maintain the N biggest eigenvectors, energy(N) > energy(min_energy)
    subset = set.pca(min_energy=min_energy)

    # Running best score and winning label per eval sample, for the traditional
    # cossine similarity (cs) and the scaled cossine similarity (scs).
    # Scores start at 0 and labels at None, so a sample whose scores never
    # exceed 0 (or are NaN) stays unclassified.
    best_cs = np.zeros(n_eval)
    best_scs = np.zeros(n_eval)
    labels_cs = np.full(n_eval, None, dtype=object)
    labels_scs = np.full(n_eval, None, dtype=object)

    # Classify the eval_data
    for subspace in subset:
        cs = cossine_similarity(eval_data, subspace)
        scs = scaled_cossine_similarity(eval_data, subspace)

        # Strict '>' keeps the first subspace on ties; NaN never wins.
        cs_improved = cs > best_cs
        best_cs[cs_improved] = cs[cs_improved]
        labels_cs[cs_improved] = subspace.label

        scs_improved = scs > best_scs
        best_scs[scs_improved] = scs[scs_improved]
        labels_scs[scs_improved] = subspace.label

    # Expose the results as plain lists under the original names.
    cs_list = best_cs.tolist()
    max_likelihood_cs = labels_cs.tolist()
    scs_list = best_scs.tolist()
    max_likelihood_scs = labels_scs.tolist()

    correct_class_cs = [
        bool(predicted == expected)
        for predicted, expected in zip(max_likelihood_cs, eval_correct_labels)
    ]
    correct_class_scs = [
        bool(predicted == expected)
        for predicted, expected in zip(max_likelihood_scs, eval_correct_labels)
    ]

    prediction_ratio_cs = correct_class_cs.count(True) / len(correct_class_cs)
    prediction_ratio_scs = correct_class_scs.count(True) / len(correct_class_scs)

    accuracy_by_energy.append((min_energy, prediction_ratio_cs, prediction_ratio_scs))

    print(prediction_ratio_cs, prediction_ratio_scs)

  0%|          | 0/21 [00:00<?, ?it/s]

0.806325 0.806325
0.82285 0.83385
0.889325 0.863925
0.920825 0.87425
0.928575 0.880725
0.9417 0.883025
0.94915 0.883625
0.9515 0.88265
0.949725 0.881125
0.9421 0.8802
0.934675 0.87905
0.927825 0.878325
0.91785 0.877675
0.911475 0.877225
0.90085 0.877
0.89225 0.87685
0.884125 0.876825
0.869975 0.876725
0.8422 0.876725
0.78065 0.876725
0.11635 0.876725


In [11]:
# Singular values of the first subspace in `subset` (the value left over from
# the last iteration of the min-energy sweep).
first_subspace = subset[0]
print(first_subspace.singular_values)

[2.985724605015113e-11, 1.659733545780664e-10, 0.10389688948104255, 32.27688344423432, 30.646259009790107, 336.8241528628562, 33.640053678731384, 43.19710449663339, 56.89030714307953, 322.2337463118559, 62.01349094964881, 43.78411494925226, 59.621353123764706, 58.98628029011815, 40.7183466325167, 49.87761362998157, 53.93097999932226, 90.77329916286823, 86.6379019262519, 427.98751006124115, 320.5588093639334, 51.79467489188838, 485.09036373784784, 413.04745015309663, 378.72119071875386, 332.18240227579736, 226.36562815827196, 472.39735713717494, 159.21786607638657, 52.06221383150503, 88.11130854593632, 100.7550850509084, 376.9577703376742, 480.3205663501302, 40.30320799794206, 550.2252978952947, 336.2928947462113, 39.23425512252119, 580.869799863782, 756.9762976679107, 60.75626591116693, 1087.790100663176, 947.4885991014354, 49.51979930064905, 740.2156219371108, 276.3922982322385, 648.082263114549, 848.9321596744004, 96.02378971777745, 161.77787956272923, 808.4037185528069, 842.61976531