In [31]:
import sys
sys.path.append("src")

import numpy as np
import torchvision
import matplotlib.pyplot as plt
from typing import Tuple
from tqdm.notebook import tqdm
np.random.seed(0)

%matplotlib inline

# Download the EMNIST dataset ('digits' split) under ./dataset:
# one object for the training split and one for the evaluation split (train=False).
emnist_train = torchvision.datasets.EMNIST('./dataset/', download=True, train=True, split='digits')
emnist_eval = torchvision.datasets.EMNIST('./dataset', download=True, train=False, split='digits')

In [32]:
# Flatten every image into one row of a [batch, dim] matrix and collect the
# matching ground-truth labels for the train and eval sets.

dim = 28*28

n_images_train = len(emnist_train)
n_images_eval = len(emnist_eval)

train_data = np.empty([n_images_train, dim])
train_correct_labels = []

eval_data = np.empty([n_images_eval, dim])
eval_correct_labels = []

# Index each dataset exactly once per sample and unpack the (image, label)
# pair; indexing twice (once for the image, once for the label) fetches the
# same sample two times.
for i in range(n_images_train):
    image, label = emnist_train[i]
    train_data[i] = np.array(image).reshape(dim)
    train_correct_labels.append(label)

for i in range(n_images_eval):
    image, label = emnist_eval[i]
    eval_data[i] = np.array(image).reshape(dim)
    eval_correct_labels.append(label)

In [33]:
from src.VectorSpace import VectorSpace
from src.VectorSet import VectorSet

def cossine_similarity(vector:np.ndarray, subspace:"VectorSpace") -> np.ndarray:
    r"""
    Sum, over the rows \phi_i of ``subspace.A``, of the squared inner product
    with each input vector divided by the product of the two norms:

        S = \sum_{i=0}^{r-1} \frac{(x,\phi_i)^2}{\|x\|\|\phi_i\|}

    Args:
        vector: A single vector of shape (dim,) or a batch of shape (n, dim).
        subspace: Object exposing ``dim``, ``dtype`` and ``A``; every row of
            ``A`` is one vector \phi_i of length ``dim``.

    Returns:
        1-D array holding one score per input row (length 1 when a single
        vector is given).

    Raises:
        AssertionError: If ``vector`` has more than two dimensions, or if its
            dimension differs from ``subspace.dim``.
    """
    if vector.ndim > 2:
        raise AssertionError("Cannot input tensor of ndim > 2")
    if vector.ndim == 1:
        # Promote a single vector to a batch of one row.
        vector = vector[np.newaxis, :]
    if vector.shape[1] != subspace.dim:
        raise AssertionError("Vector dimension must be the same as VectorSpace dimension")

    vector = vector.astype(subspace.dtype)
    basis = subspace.A

    # (n, r): squared inner product of every input row with every basis row.
    squared_projections = np.matmul(vector, basis.transpose())**2

    # Row norms computed directly; building the full Gram matrix X @ X.T only
    # to read its diagonal needs O(n^2) memory and time.
    vector_norms = np.linalg.norm(vector, axis=1)[:, np.newaxis]   # (n, 1)
    basis_norms = np.linalg.norm(basis, axis=1)[np.newaxis, :]     # (1, r)

    # Broadcasting (n, 1) * (1, r) gives the (n, r) table of norm products.
    return np.sum(squared_projections / (vector_norms * basis_norms), axis=1)

def scaled_cossine_similarity(vector:np.ndarray, subspace:"VectorSpace") -> np.ndarray:
    r"""
    Weighted variant of the similarity: each term is scaled by the matching
    singular value divided by the largest singular value of the subspace:

        S = \sum_{i=0}^{r-1} \frac{\sigma_i}{\max_j \sigma_j} \frac{(x,\phi_i)^2}{\|x\|\|\phi_i\|}

    NOTE(review): an earlier description of this function normalized the
    weights by the sum of the singular values; the implementation divides by
    their maximum. Confirm which normalization is intended.

    Args:
        vector: A single vector of shape (dim,) or a batch of shape (n, dim).
        subspace: Object exposing ``dim``, ``dtype``, ``A`` and
            ``singular_values``; every row of ``A`` is one vector \phi_i and
            ``singular_values[i]`` is the weight \sigma_i paired with it.

    Returns:
        1-D array holding one score per input row (length 1 when a single
        vector is given).

    Raises:
        AssertionError: If ``vector`` has more than two dimensions, or if its
            dimension differs from ``subspace.dim``.
    """
    if vector.ndim > 2:
        raise AssertionError("Cannot input tensor of ndim > 2")
    if vector.ndim == 1:
        # Promote a single vector to a batch of one row.
        vector = vector[np.newaxis, :]
    if vector.shape[1] != subspace.dim:
        raise AssertionError("Vector dimension must be the same as VectorSpace dimension")

    vector = vector.astype(subspace.dtype)
    basis = subspace.A

    # (n, r): squared inner product of every input row with every basis row.
    squared_projections = np.matmul(vector, basis.transpose())**2

    # Row norms computed directly; building the full Gram matrix X @ X.T only
    # to read its diagonal needs O(n^2) memory and time.
    vector_norms = np.linalg.norm(vector, axis=1)[:, np.newaxis]   # (n, 1)
    basis_norms = np.linalg.norm(basis, axis=1)[np.newaxis, :]     # (1, r)

    # One weight per basis row, relative to the largest singular value.
    weights = np.array(subspace.singular_values, dtype=subspace.dtype) / np.max(subspace.singular_values)

    # Inner product along the basis axis: weighted sum of the (n, r) terms.
    return np.inner(squared_projections / (vector_norms * basis_norms), weights)

In [34]:
def shuffle(a:np.ndarray, l:list, n:int) -> Tuple[np.ndarray, list]:
    """
    Draws a random sample of paired elements from ``a`` and ``l``.

    The pairs ``(a[i], l[i])`` are put in a random order (using the global
    ``np.random`` state) and the first ``n`` of them are returned, so the
    result is a sample without replacement in which every row of the array
    keeps its matching entry of the list.

    Args:
        a (np.ndarray): Array-like whose first axis indexes the samples.
        l (list): One entry per sample of ``a`` (for example its label).
        n (int): Number of pairs to keep; when it exceeds the number of
            available pairs, all of them are returned.

    Returns:
        Tuple[np.ndarray, list]: The selected rows of ``a`` and the matching
        entries of ``l``, in the same random order. Both are empty when
        either input is empty.
    """
    rows = np.asarray(a)
    # Only positions present in both inputs can form a pair.
    pair_count = min(len(rows), len(l))
    # Permute indices instead of materializing and shuffling every pair:
    # only the n selected rows are ever copied.
    picked = np.random.permutation(pair_count)[:n]
    return rows[picked], [l[i] for i in picked]

In [35]:
# Pilot study: repeat the classification experiment on small random subsamples
# and report how much the accuracy of each similarity varies between runs.

n_train = int(len(train_data) / 100)
n_eval = int(len(eval_data) / 100)
repetitions = 10
min_energy = 0.5

# One accuracy value per repetition; the standard deviations are taken at the end.
cs_std_list = []
scs_std_list = []

for _ in tqdm(range(repetitions)):
    train_data_bootstrap, train_correct_labels_bootstrap = shuffle(train_data, train_correct_labels, n_train)
    eval_data_bootstrap, eval_correct_labels_bootstrap = shuffle(eval_data, eval_correct_labels, n_eval)

    # Create a VectorSet for all VectorSpaces.
    # Named `vector_set` so the builtin `set` is not shadowed in the kernel namespace.
    vector_set = VectorSet(dim=dim)
    vector_set.populate(train_data_bootstrap, train_correct_labels_bootstrap)

    # Generate Subspaces using pca (svd) and maintain the N biggest eigenvectors, energy(N) > energy(min_energy)
    subset = vector_set.pca(min_energy=min_energy)

    # Best score seen so far and the label of the subspace that produced it,
    # for the traditional cosine similarity and for the scaled one.
    max_likelihood_cs = [None]*eval_data_bootstrap.shape[0]
    cs_list = [0]*eval_data_bootstrap.shape[0]

    max_likelihood_scs = [None]*eval_data_bootstrap.shape[0]
    scs_list = [0]*eval_data_bootstrap.shape[0]

    # Classify the eval_data_bootstrap: every sample gets the label of the
    # subspace with the strictly highest score (label stays None if no score is > 0).
    for subspace in subset:
        cs = cossine_similarity(eval_data_bootstrap, subspace)
        scs = scaled_cossine_similarity(eval_data_bootstrap, subspace)
        for i in range(len(cs)):
            if cs[i] > cs_list[i]:
                cs_list[i] = cs[i]
                max_likelihood_cs[i] = subspace.label
            if scs[i] > scs_list[i]:
                scs_list[i] = scs[i]
                max_likelihood_scs[i] = subspace.label

    # Per-sample hit/miss flags against the ground-truth labels.
    correct_class_cs = [l1 == l2 for l1, l2 in zip(max_likelihood_cs, eval_correct_labels_bootstrap)]
    correct_class_scs = [l1 == l2 for l1, l2 in zip(max_likelihood_scs, eval_correct_labels_bootstrap)]

    # Fraction of correctly classified samples in this repetition.
    prediction_ratio_cs = correct_class_cs.count(True) / len(correct_class_cs)
    prediction_ratio_scs = correct_class_scs.count(True) / len(correct_class_scs)
    cs_std_list.append(prediction_ratio_cs)
    scs_std_list.append(prediction_ratio_scs)

# np.std uses ddof=0 by default (population standard deviation).
print(f"cosine_similarity std: {np.std(cs_std_list)}")
print(f"scaled_cosine_similarity std: {np.std(scs_std_list)}")
print(f"sd <- {np.std(np.subtract(scs_std_list,cs_std_list))}")
print(f"sd.ratio <- {np.std(scs_std_list)/np.std(cs_std_list)}")

  0%|          | 0/10 [00:00<?, ?it/s]

cosine_similarity std: 0.013344006145082498
scaled_cosine_similarity std: 0.017792203348658085
sd <- 0.015459624833740322
sd.ratio <- 1.3333479582677519


In [36]:
# List of min energy values for parameter tuning
min_energy_list = np.linspace(0.1, 1, 10)
n_train = int(len(train_data) / 100)
n_eval = int(len(eval_data) / 100)
repetitions = 10

# For every min_energy value, repeat the experiment on fresh random subsamples
# and print one "<min_energy>, <method>, <accuracy>" line per method and run.
for min_energy in tqdm(min_energy_list):
    for _ in range(repetitions):
        train_data_bootstrap, train_correct_labels_bootstrap = shuffle(train_data, train_correct_labels, n_train)
        eval_data_bootstrap, eval_correct_labels_bootstrap = shuffle(eval_data, eval_correct_labels, n_eval)

        # Create a VectorSet for all VectorSpaces.
        # Named `vector_set` so the builtin `set` is not shadowed in the kernel namespace.
        vector_set = VectorSet(dim=dim)
        vector_set.populate(train_data_bootstrap, train_correct_labels_bootstrap)

        # Generate Subspaces using pca (svd) and maintain the N biggest eigenvectors, energy(N) > energy(min_energy)
        subset = vector_set.pca(min_energy=min_energy)

        # Best score seen so far and the label of the subspace that produced it,
        # for the traditional cosine similarity and for the scaled one.
        max_likelihood_cs = [None]*eval_data_bootstrap.shape[0]
        cs_list = [0]*eval_data_bootstrap.shape[0]

        max_likelihood_scs = [None]*eval_data_bootstrap.shape[0]
        scs_list = [0]*eval_data_bootstrap.shape[0]

        # Classify the eval_data_bootstrap: every sample gets the label of the
        # subspace with the strictly highest score (label stays None if no score is > 0).
        for subspace in subset:
            cs = cossine_similarity(eval_data_bootstrap, subspace)
            scs = scaled_cossine_similarity(eval_data_bootstrap, subspace)
            for i in range(len(cs)):
                if cs[i] > cs_list[i]:
                    cs_list[i] = cs[i]
                    max_likelihood_cs[i] = subspace.label
                if scs[i] > scs_list[i]:
                    scs_list[i] = scs[i]
                    max_likelihood_scs[i] = subspace.label

        # Per-sample hit/miss flags against the ground-truth labels.
        correct_class_cs = [l1 == l2 for l1, l2 in zip(max_likelihood_cs, eval_correct_labels_bootstrap)]
        correct_class_scs = [l1 == l2 for l1, l2 in zip(max_likelihood_scs, eval_correct_labels_bootstrap)]

        # Fraction of correctly classified samples in this repetition.
        prediction_ratio_cs = correct_class_cs.count(True) / len(correct_class_cs)
        prediction_ratio_scs = correct_class_scs.count(True) / len(correct_class_scs)

        print(f"{min_energy:.2f}, cosine_similarity, {prediction_ratio_cs}")
        print(f"{min_energy:.2f}, scaled_cosine_similarity, {prediction_ratio_scs}")

  0%|          | 0/10 [00:00<?, ?it/s]

0.10, cosine_similarity, 0.805
0.10, scaled_cosine_similarity, 0.805
0.10, cosine_similarity, 0.7675
0.10, scaled_cosine_similarity, 0.7675
0.10, cosine_similarity, 0.8025
0.10, scaled_cosine_similarity, 0.8025
0.10, cosine_similarity, 0.835
0.10, scaled_cosine_similarity, 0.835
0.10, cosine_similarity, 0.8025
0.10, scaled_cosine_similarity, 0.8025
0.10, cosine_similarity, 0.8075
0.10, scaled_cosine_similarity, 0.8075
0.10, cosine_similarity, 0.855
0.10, scaled_cosine_similarity, 0.83
0.10, cosine_similarity, 0.7725
0.10, scaled_cosine_similarity, 0.7725
0.10, cosine_similarity, 0.805
0.10, scaled_cosine_similarity, 0.805
0.10, cosine_similarity, 0.785
0.10, scaled_cosine_similarity, 0.785
0.20, cosine_similarity, 0.91
0.20, scaled_cosine_similarity, 0.8775
0.20, cosine_similarity, 0.91
0.20, scaled_cosine_similarity, 0.86
0.20, cosine_similarity, 0.9025
0.20, scaled_cosine_similarity, 0.8775
0.20, cosine_similarity, 0.8975
0.20, scaled_cosine_similarity, 0.8675
0.20, cosine_similarity