In [25]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

# First make sure to install timm
# !pip install timm

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch
from tqdm import tqdm
import os
import numpy as np
import timm
import torchvision
from sklearn.metrics import classification_report
from pprint import pprint

## Device Selection

We will opt for gpu, if it's available

In [26]:
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print(f'Using device: {device}')

Using device: cuda


## Download Dataset

We can use CIFAR10 or CIFAR100 as our dataset. Since both are really common datasets we will use the `torchvision.datasets` class to load them.

To select the dataset, modify the `use_CIFAR10` boolean variable accordingly.

We also apply some basic __preprocessing__:

1. Normalize the dataset by subtracting the _mean_ and dividing with the _std_

In [27]:

def compute_mean_std(dataset):
    """Compute mean and standard deviation of a dataset dynamically."""
    loader = torch.utils.data.DataLoader(dataset, batch_size=64, shuffle=False)
    
    mean = torch.zeros(3)
    std = torch.zeros(3)
    
    for images, _ in loader:
        mean += images.mean(dim=[0, 2, 3])
        std += images.std(dim=[0, 2, 3])
    
    mean /= len(loader)
    std /= len(loader)
    
    return mean, std



In [28]:
dataset_directory = "../../../cifar-10-batches-py-for-pytorch"

transform = torchvision.transforms.Compose([
    torchvision.transforms.ToTensor(),
    torchvision.transforms.Resize((224, 224))
])

cifar10_dataset = torchvision.datasets.CIFAR10(root=dataset_directory, train=True, download=True, transform=transform)
mean_cifar100, std_cifar100 = compute_mean_std(cifar10_dataset)

In [29]:
print(f'Mean: {mean_cifar100}')
print(f'Std: {std_cifar100}')

Mean: tensor([0.4915, 0.4822, 0.4466])
Std: tensor([0.2405, 0.2371, 0.2555])


In [30]:
import torch
import torchvision
from torch.utils.data import DataLoader, random_split

# Set random seed for reproducibility
torch.manual_seed(42)
import random
random.seed(42)

use_CIFAR10 = True

# Get the precomputed mean and std
if use_CIFAR10:
    mean = (0.4914, 0.4822, 0.4465)
    std = (0.2023, 0.1994, 0.2010)
else:
    mean = (0.5070, 0.4865, 0.4408)
    std = (0.2613, 0.2503, 0.2703)

# Dataset directory
dataset_directory = "src/development/output_data"

# Transformations
transform = torchvision.transforms.Compose([
    torchvision.transforms.ToTensor(),
    torchvision.transforms.Resize((224, 224)),
    torchvision.transforms.Normalize(mean, std)
])

# Download the dataset
if use_CIFAR10:
    cifar_dataset = torchvision.datasets.CIFAR10(root=dataset_directory, train=True, download=True, transform=transform)
    test_dataset = torchvision.datasets.CIFAR10(root=dataset_directory, train=False, download=True, transform=transform)
else:
    cifar_dataset = torchvision.datasets.CIFAR100(root=dataset_directory, train=True, download=True, transform=transform)
    test_dataset = torchvision.datasets.CIFAR100(root=dataset_directory, train=False, download=True, transform=transform)

print(f'Dataset downloaded. Total images: {len(cifar_dataset)}')

# Split the dataset into train / validation sets
train_size = int(0.9 * len(cifar_dataset))
val_size = len(cifar_dataset) - train_size

# Use a fixed seed for reproducibility of random split
generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset = random_split(cifar_dataset, [train_size, val_size], generator=generator)

# Set the batch size to 64
batch_size = 64

# Create DataLoaders with shuffle enabled for train and val, and fixed seed for test
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, generator=generator)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True, generator=generator)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True, generator=generator)

# Sanity check: print first few indices of train, val, and test datasets to ensure reproducibility
print(f"First 5 indices of train dataset: {train_dataset.indices[:5]}")
print(f"First 5 indices of validation dataset: {val_dataset.indices[:5]}")
print(f"First 5 indices of test dataset: {list(test_dataset.targets[:5])}")

# Print the sizes
print(f'Train images: {train_size}')
print(f'Validation images: {val_size}')
print(f'Test images: {len(test_dataset)}')


Dataset downloaded. Total images: 50000
First 5 indices of train dataset: [37542, 44491, 216, 43688, 41558]
First 5 indices of validation dataset: [17408, 40845, 3378, 22188, 44921]
First 5 indices of test dataset: [3, 8, 8, 0, 6]
Train images: 45000
Validation images: 5000
Test images: 10000


In [7]:
# If we have .pkl files use those
import os.path
import pickle

if os.path.isfile('./train_loader.pkl'):

    print('Will load existing loaders')

    with open('./train_loader.pkl', 'rb') as fp:
        train_loader = pickle.load(fp)

    with open('./test_loader.pkl', 'rb') as fp:
        test_loader = pickle.load(fp)

    with open('./val_loader.pkl', 'rb') as fp:
        val_loader = pickle.load(fp)


## Useful Methods

Bellow we have created some methods to make the code simpler.

### Most Common List Item

In [31]:
from collections import Counter

def most_common(lst):
    data = Counter(lst)
    return max(lst, key=data.get)

### Get CLS Token

By taking a look at the [documentation](https://huggingface.co/docs/timm/en/feature_extraction) for the timm library and specifically the __Feature Extraction__ section we can see that in order to get the __last hidden state__ of the model we have to use the `forward_features` method.

Specifically, this method returns the patch embeddings at the last hidden state, __before pooling is applied__. The return vector is of shape

```
(batch_size, num_patches + 1, hidden_size)
```

The __CLS Token__ is by design the __first of the patch embeddings__

For example to get the CLS Token of the first image in the batch we would have to do:

```py
model_output[0, 0, :]
```

In [32]:
def get_cls_token(model: timm.models.vision_transformer.VisionTransformer, images: torch.Tensor) -> np.array:
    # get the last hidden state
    output = model.forward_features(images)

    # for each image get the cls token
    # make sure to convert each tensor to numpy
    cls_tokens = output[:, 0, :].cpu().numpy()
    
    return cls_tokens

### Get Patch Embeddings

By taking a look at the [documentation](https://huggingface.co/docs/timm/en/feature_extraction) for the timm library and specifically the __Feature Extraction__ section we can see that in order to get the __last hidden state__ of the model we have to use the `forward_features` method.

Specifically, this method returns the patch embeddings at the last hidden state, __before pooling is applied__. The return vector is of shape

```
(batch_size, num_patches + 1, hidden_size)
```

We want to get all the patch embeddings with an option to also include the cls.

As such, to get everything except the cls we will have to do
```py
cls_tokens = output[:, 1:, :].cpu().numpy()
```

In [33]:
def get_patch_embeddings(model: timm.models.vision_transformer.VisionTransformer, images: torch.Tensor, include_cls: bool = False) -> np.array:
    # get the last hidden state
    output = model.forward_features(images)

    # If we dont want the cls we have to skip the first line
    if not include_cls:
        return output[:, 1:, :].cpu().numpy()
    # else return the whole hidden state
    else:
        return output[:, :, :]

### Extract CLS Tokens for all images in a DataLoader

This method uses the `get_cls_token` method above to extract all cls tokens from a given dataloader.

For each image we will also need:
    
    1. it's original position (?)
    2. the label

The method returns __a dictionary__ with:
1. __key__: The original position of the image
2. __value__: A dictionary with `cls_token` and `label` keys

In [34]:
def get_dataset_cls_tokens(model: timm.models.vision_transformer.VisionTransformer, loader: torch.utils.data.dataloader.DataLoader
):

    cls_tokens = []
    cls_labels = []

    for idx, (images, labels) in tqdm(enumerate(loader), desc="Calculating CLS Tokens", total=len(loader)):
        # move to device
        images = images.to(device)
        labels = labels.to(device)
        # this returns a numpy array with shape
        # (batch_size, hidden_size)
        tokens = get_cls_token(model=model, images=images)

        # For each image in the batch
        for idx in range(tokens.shape[0]):
            cls_tokens.append(tokens[idx, :])
            cls_labels.append(labels[idx])

    return cls_tokens, cls_labels

### Extract Patch Embeddings for all images in DataLoader

This method uses the `get_patch_embeddings` method above to extract all patch embeddings from a given dataloder.

__NOTES__:
- Each image has multiple patch embeddings.
- Each patch embedding will have the shame label as it's source image.

In [35]:
def get_dataset_patch_embeddings(model: timm.models.vision_transformer.VisionTransformer, loader: torch.utils.data.dataloader.DataLoader, include_cls: bool = False, is_test: bool = False
):

    patch_embeddings = []
    patch_labels = []

    for cnt, (images, labels) in tqdm(enumerate(loader), desc="Calculating Patch Embeddings", total=len(loader)):

        # move to device
        images = images.to(device)
        labels = labels.to(device)

        # this returns a numpy array with shape
        # (batch_size, num_patches, hidden_size)
        embs = get_patch_embeddings(model=model, images=images, include_cls=include_cls)

        # For each image in the batch
        for idx in range(embs.shape[0]):
            # If we are not calculating for the test, we simply need a label for each patch
            if not is_test:
                # For each patch
                for p_idx in range(embs.shape[1]):
                    patch_embeddings.append(embs[idx, p_idx, :])
                    patch_labels.append(labels[idx])
            # Else we want to keep track of patches that belong in a single image
            else:
                image_embs = []
                image_labels = []
                for p_idx in range(embs.shape[1]):
                    image_embs.append(embs[idx, p_idx, :])
                    image_labels.append(labels[idx])

                # Now add to the original lists
                patch_embeddings.append(image_embs)
                patch_labels.append(image_labels)

        if not is_test and cnt == 70:
            break
    
    return patch_embeddings, patch_labels

## Load the VIT model

To load the model we will use the `timm` library.

We could also use the `transformers` library and more specific the `ViTForImageClassification`.

In [36]:
model = timm.create_model(
    "vit_tiny_patch16_224",  # Pre-trained ViT-Tiny on ImageNet-1k
    pretrained=True,        # Load pre-trained weights
    num_classes=10          # Adapt classifier head to CIFAR-10 (10 classes)
)

model = model.to(device)

# We dont want to train here so we can freeze all the layers
for param in model.parameters():
    param.requires_grad = False

## Generate CLS Tokens for the Train Set

In [14]:
train_cls_tokens, train_labels = get_dataset_cls_tokens(model, train_loader)

Calculating CLS Tokens: 100%|██████████| 704/704 [02:57<00:00,  3.96it/s]


## Generate CLS Tokens for the Test Set

In [15]:
test_cls_tokens, test_labels = get_dataset_cls_tokens(model, test_loader)

Calculating CLS Tokens: 100%|██████████| 157/157 [00:47<00:00,  3.33it/s]


## Generate Patch Embeddings for the Train Set

In [37]:
patch_embeddings, patch_labels = get_dataset_patch_embeddings(model=model, loader=train_loader)

Calculating Patch Embeddings:   0%|          | 0/704 [00:00<?, ?it/s]

Calculating Patch Embeddings:  10%|▉         | 70/704 [00:41<06:15,  1.69it/s]


In [38]:
print(f'Total patch embeddings in memory: {len(patch_embeddings)}')
print(f'Total images in memory: {len(patch_embeddings) / 196}')


Total patch embeddings in memory: 890624
Total images in memory: 4544.0


## Generate Patch Embeddings for the Test Set

In [39]:
test_patch_embeddings, test_patch_labels = get_dataset_patch_embeddings(model=model, loader=test_loader, is_test=True)

Calculating Patch Embeddings: 100%|██████████| 157/157 [00:57<00:00,  2.71it/s]


## Helper Method: Get Neighbors and Labels

This method will be used by other parts of the project. It returns two items:

1. `cls_tokens`: An array of shape (num_test, k): Each row contains the cls_token that corresponds the the neighbor.
2. `labels`: An array of shape (num_test, k): Each row contains the labels of those top-k images.

In [40]:
from sklearn.neighbors import NearestNeighbors

def get_neighbors_and_labels(distance: str, train_data: np.array, train_labels: list, test_data: list, top_k: int = 5):
    # Initialize the knn
    knn = NearestNeighbors(n_neighbors=top_k, algorithm="brute", metric=distance)
    
    print("Will set labels to cpu")
    clean_labels = []
    for label in tqdm(
        train_labels, total=len(train_labels), desc="Converting labels to cpu"
    ):
        clean_labels.append(label.to("cpu"))

    # Create a numpy array from the list
    print(f"Total train images: {len(train_data)}")
    train_data = np.stack(train_data, axis=0)
    clean_labels = np.stack(clean_labels, axis=0)
    print(f"Train shape: {train_data.shape}")
    print(f"Train labels shape: {clean_labels.shape}")
    
    # Fit with the train set
    knn.fit(train_data)
    
    # Create a numpy array for the test images
    print(f'Total test images: {len(test_data)}')
    test_data = np.stack(test_data, axis=0)
    print(f'Test shape: {test_data.shape}')
    
    # Apply the knn
    distances, indexes = knn.kneighbors(test_data, return_distance=True)
    
    print(f'Distances: {distances.shape}')
    print(f'Indexes: {distances.shape}')
    
    neighbor_labels = []
    neighbor_cls_tokens = []
    # Gather the final labels
    # And the cls_tokens
    # For each test image
    for i in tqdm(range(indexes.shape[0]), desc='Gathering results'):
        # Get the classes of the top_k
        classes = clean_labels[indexes[i]]
        neighbor_labels.append(classes)
        
        # Get the cls_tokens of the top_k
        cls_tokens = train_data[indexes[i]]
        neighbor_cls_tokens.append(cls_tokens)
    
    # finally convert the the labels to array
    neighbor_labels = np.stack(neighbor_labels, axis=0)
    # Do the same for the cls
    neighbor_cls_tokens = np.stack(neighbor_cls_tokens, axis=0)
    
    return neighbor_cls_tokens, neighbor_labels
        

## Helper Method: Get Neighbor Distances and Labels

This method will be used by other parts of the project. It returns two items:

1. `distances`: An array of shape (num_test, k): Each row contains the distances of the top-k similar images from the train set.
2. `labels`: An array of shape (num_test, k): Each row contains the labels of those top-k images.



In [41]:
from sklearn.neighbors import NearestNeighbors

def get_neighbors_distances_and_labels(distance: str, train_data: list, train_labels: np.array, test_data: np.array, top_k: int = 5):
    # Initialize the knn
    knn = NearestNeighbors(n_neighbors=top_k, algorithm="brute", metric=distance)
    
    # Create a numpy array from the list
    print(f'Total train images: {len(train_data)}')
    train_data = np.stack(train_data, axis=0)
    print(f'Train shape: {train_data.shape}')
    
    # Fit with the train set
    knn.fit(train_data)
    
    # Create a numpy array for the test images
    print(f'Total test images: {len(test_data)}')
    test_data = np.stack(test_data, axis=0)
    print(f'Test shape: {test_data.shape}')
    
    # Apply the knn
    distances, indexes = knn.kneighbors(test_data, return_distance=True)
    
    print(f'Distances: {distances.shape}')
    print(f'Indexes: {distances.shape}')
    
    neighbor_labels = []
    # Gather the final labels
    # For each test image
    for i in tqdm(range(indexes.shape[0]), desc='Gathering results'):
        # Get the classes of the top_k
        classes = train_labels[indexes[i]]
        neighbor_labels.append(classes)
    
    # finally convert the the labels to array
    neighbor_labels = np.stack(neighbor_labels, axis=0)

    return distances, neighbor_labels
        

## Helper Method: Get Neighbors and Labels Patch

Generate and return __for each patch of a test image__:

1. The top_k similar patches from the memory
2. The class of each top_k similar patch

In [42]:
from sklearn.neighbors import NearestNeighbors


def get_neigbors_and_labels_patch(
    distance: str,
    train_data: list,
    train_labels: np.array,
    test_data: list,
    top_k: int = 5,
):
    # Initialize the knn
    knn = NearestNeighbors(n_neighbors=top_k, algorithm="brute", metric=distance)

    # Move train_data back to cpu
    # train_data = [img.to('cpu') for img in train_data]
    print("Will set labels to cpu")
    clean_labels = []
    for label in tqdm(
        train_labels, total=len(train_labels), desc="Converting labels to cpu"
    ):
        clean_labels.append(label.to("cpu"))

    # Create a numpy array from the list
    print(f"Total train images: {len(train_data)}")
    train_data = np.stack(train_data, axis=0)
    train_labels = np.stack(clean_labels, axis=0)
    del clean_labels
    print(f"Train shape: {train_data.shape}")
    print(f"Train labels shape: {train_labels.shape}")

    # Fit with the train set
    print("Will fit train data to KNN")
    knn.fit(train_data)

    # Create a numpy array for the test images
    print(f"Total test images: {len(test_data)}")
    test_data = np.stack(test_data, axis=0)
    print(f"Test shape: {test_data.shape}")

    neighbor_patches = []
    neighbor_classes = []
    # For each image
    for i in tqdm(range(test_data.shape[0]), desc="Gathering results"):
        # Apply the knn for each patch
        # Note that the initial test array has a shape of
        # (num_test, num_image_patches, hidden_dim)
        b_patch_distances, b_patch_indexes = knn.kneighbors(
            test_data[i, :, :], return_distance=True
        )

        # These have a shape of (num_patches, k)

        # We now want to find the k classes for each test patch neighbors
        b_patch_classes = []
        b_patches = []

        # Now for each patch we have to get a class
        for p in range(b_patch_indexes.shape[0]):
            # Get the classes of the top_k
            classes = train_labels[b_patch_indexes[p, :]]
            # Get the actual patches of the top_k
            patches = train_data[b_patch_indexes[p, :]]

            b_patch_classes.append(classes)
            b_patches.append(patches)

        # Now we can add all the neighbor distances and their classes to our final lists
        neighbor_patches.append(b_patches)
        neighbor_classes.append(b_patch_classes)

    # finally convert the lists to arrays
    # Again, each list element has a shape of (k,)
    # In patch distances it's the distance of each neighbor
    # In patch classes it's the class of each neighbor
    # Both are with respect to a SINGLE test patch
    # neighbor_patches = np.stack(neighbor_patches, axis=0)
    # neighbor_classes = np.stack(neighbor_classes, axis=0)

    return neighbor_patches, neighbor_classes

## Helper Method: Create .npz files with the test images and the neighbors

In [43]:
def save_patch_memory(
    output_folder: str,
    file_name: str,
    test_patch,
    test_patch_labels,
    test_patch_neighbors,
    test_patch_neighbor_labels,
):
    # Ensure the output folder exists
    os.makedirs(output_folder, exist_ok=True)

    # Define the save file path
    save_path = os.path.join(output_folder, file_name + ".npz")

    # Save the arrays
    np.savez(
        save_path,
        test_patch=test_patch,
        test_labels=test_patch_labels,
        neighbor_path=test_patch_neighbors,
        neighbor_labels=test_patch_neighbor_labels,
    )
    print(f"Data saved successfully to {save_path}")

def save_cls_memory(
    output_folder: str,
    file_name: str,
    test_cls,
    test_cls_labels,
    test_cls_neighbors,
    test_cls_neighbor_labels,
):
    # Ensure the output folder exists
    os.makedirs(output_folder, exist_ok=True)

    # Define the save file path
    save_path = os.path.join(output_folder, file_name + ".npz")

    # Save the arrays
    np.savez(
        save_path,
        test_cls=test_cls,
        test_labels=test_cls_labels,
        neighbor_cls=test_cls_neighbors,
        neighbor_labels=test_cls_neighbor_labels,
    )
    print(f"Data saved successfully to {save_path}")

## Brute Force KNN - WITHOUT Test Prediction



### Using CLS Embeddings

In the following cells we will try to find the labels from the test images by doing the following:

1) Use the `ViT` model to extract the cls_token for the test image
2) Find the `topK` similar cls_tokens from the train dataset
3) Assign the `y_pred` to the majority class from the KNN

__This method will act as our baseline__ as it does not require a finetuned Visual Transformer.

The method bellow implements the brute force KNN, using the CLS embeddings for each image.

In [23]:
from sklearn.neighbors import NearestNeighbors

def brute_force_knn(distance: str, train_data: np.array, train_labels: np.array, test_data: np.array, top_k: int = 5):
    # Initialize the knn
    knn = NearestNeighbors(n_neighbors=top_k, algorithm="brute", metric=distance)
    
    # Create a numpy array from the list
    print(f'Total train images: {len(train_data)}')
    train_data = np.stack(train_data, axis=0)
    print(f'Train shape: {train_data.shape}')

    # Fit with the train set
    knn.fit(train_data)
    
    # Create a numpy array for the test images
    print(f'Total test images: {len(test_data)}')
    test_data = np.stack(test_data, axis=0)
    print(f'Test shape: {test_data.shape}')
    
    # Apply the knn
    distances, indexes = knn.kneighbors(test_data, return_distance=True)
    
    print(f'Distances: {distances.shape}')
    print(f'Indexes: {distances.shape}')
    
    y_pred = []
    # Gather the final y_pred
    # For each test image
    for i in tqdm(range(indexes.shape[0]), desc='Gathering results'):
        # Get the classes of the top_k
        classes = train_labels[indexes[i]]
        # Select the majority as y_pred
        y_pred.append(most_common(classes.tolist()))
    
    # finally convert the y_pred to array
    y_pred = np.stack(y_pred, axis=0)

    return y_pred
        

#### Using Cosine Distance

In [24]:
# convert the labels to numpy first
clean_train_labels = np.stack([label.to('cpu') for label in train_labels], axis=0)
clean_test_labels = np.stack([label.to('cpu') for label in test_labels], axis=0)

for k in [3, 5, 7, 9, 11, 13, 15, 17, 19]:
    y_pred = brute_force_knn(distance="cosine", train_data=train_cls_tokens, train_labels=clean_train_labels, test_data=test_cls_tokens, top_k=k)
    print('----------------------------')
    print(f'COSINE - TOP_K: {k}')
    print(classification_report(clean_test_labels, y_pred))

Total train images: 45000
Train shape: (45000, 192)
Total test images: 10000
Test shape: (10000, 192)
Distances: (10000, 3)
Indexes: (10000, 3)


Gathering results: 100%|██████████| 10000/10000 [00:00<00:00, 185482.80it/s]

----------------------------
COSINE - TOP_K: 3
              precision    recall  f1-score   support

           0       0.76      0.73      0.74      1000
           1       0.71      0.73      0.72      1000
           2       0.77      0.55      0.64      1000
           3       0.55      0.49      0.52      1000
           4       0.68      0.65      0.66      1000
           5       0.60      0.60      0.60      1000
           6       0.67      0.83      0.75      1000
           7       0.70      0.77      0.74      1000
           8       0.79      0.77      0.78      1000
           9       0.66      0.76      0.71      1000

    accuracy                           0.69     10000
   macro avg       0.69      0.69      0.69     10000
weighted avg       0.69      0.69      0.69     10000

Total train images: 45000
Train shape: (45000, 192)
Total test images: 10000
Test shape: (10000, 192)





Distances: (10000, 5)
Indexes: (10000, 5)


Gathering results: 100%|██████████| 10000/10000 [00:00<00:00, 284336.46it/s]

----------------------------
COSINE - TOP_K: 5
              precision    recall  f1-score   support

           0       0.78      0.75      0.76      1000
           1       0.73      0.75      0.74      1000
           2       0.83      0.55      0.66      1000
           3       0.58      0.52      0.55      1000
           4       0.72      0.67      0.69      1000
           5       0.62      0.62      0.62      1000
           6       0.67      0.86      0.75      1000
           7       0.73      0.78      0.75      1000
           8       0.80      0.78      0.79      1000
           9       0.67      0.80      0.73      1000

    accuracy                           0.71     10000
   macro avg       0.71      0.71      0.70     10000
weighted avg       0.71      0.71      0.70     10000

Total train images: 45000
Train shape: (45000, 192)
Total test images: 10000
Test shape: (10000, 192)





Distances: (10000, 7)
Indexes: (10000, 7)


Gathering results: 100%|██████████| 10000/10000 [00:00<00:00, 84807.94it/s]

----------------------------
COSINE - TOP_K: 7





              precision    recall  f1-score   support

           0       0.79      0.75      0.77      1000
           1       0.73      0.76      0.74      1000
           2       0.85      0.54      0.66      1000
           3       0.59      0.54      0.56      1000
           4       0.73      0.67      0.70      1000
           5       0.64      0.64      0.64      1000
           6       0.66      0.87      0.75      1000
           7       0.74      0.78      0.76      1000
           8       0.82      0.79      0.81      1000
           9       0.67      0.81      0.73      1000

    accuracy                           0.71     10000
   macro avg       0.72      0.71      0.71     10000
weighted avg       0.72      0.71      0.71     10000

Total train images: 45000
Train shape: (45000, 192)
Total test images: 10000
Test shape: (10000, 192)
Distances: (10000, 9)
Indexes: (10000, 9)


Gathering results: 100%|██████████| 10000/10000 [00:00<00:00, 217921.02it/s]

----------------------------
COSINE - TOP_K: 9
              precision    recall  f1-score   support

           0       0.79      0.75      0.77      1000
           1       0.74      0.76      0.75      1000
           2       0.86      0.51      0.64      1000
           3       0.59      0.54      0.56      1000
           4       0.73      0.68      0.71      1000
           5       0.64      0.64      0.64      1000
           6       0.65      0.88      0.75      1000
           7       0.74      0.78      0.76      1000
           8       0.82      0.79      0.80      1000
           9       0.66      0.81      0.73      1000

    accuracy                           0.71     10000
   macro avg       0.72      0.71      0.71     10000
weighted avg       0.72      0.71      0.71     10000

Total train images: 45000
Train shape: (45000, 192)
Total test images: 10000
Test shape: (10000, 192)





Distances: (10000, 11)
Indexes: (10000, 11)


Gathering results: 100%|██████████| 10000/10000 [00:00<00:00, 131847.01it/s]

----------------------------
COSINE - TOP_K: 11
              precision    recall  f1-score   support

           0       0.79      0.74      0.77      1000
           1       0.75      0.76      0.75      1000
           2       0.88      0.52      0.65      1000
           3       0.59      0.54      0.57      1000
           4       0.74      0.68      0.70      1000
           5       0.65      0.65      0.65      1000
           6       0.65      0.88      0.75      1000
           7       0.74      0.79      0.76      1000
           8       0.82      0.79      0.80      1000
           9       0.66      0.82      0.73      1000

    accuracy                           0.72     10000
   macro avg       0.73      0.72      0.71     10000
weighted avg       0.73      0.72      0.71     10000

Total train images: 45000





Train shape: (45000, 192)
Total test images: 10000
Test shape: (10000, 192)
Distances: (10000, 13)
Indexes: (10000, 13)


Gathering results: 100%|██████████| 10000/10000 [00:00<00:00, 205559.81it/s]

----------------------------
COSINE - TOP_K: 13
              precision    recall  f1-score   support

           0       0.80      0.75      0.77      1000
           1       0.75      0.76      0.76      1000
           2       0.88      0.51      0.65      1000
           3       0.60      0.55      0.57      1000
           4       0.74      0.68      0.71      1000
           5       0.64      0.64      0.64      1000
           6       0.65      0.89      0.75      1000
           7       0.74      0.78      0.76      1000
           8       0.82      0.79      0.80      1000
           9       0.66      0.82      0.73      1000

    accuracy                           0.72     10000
   macro avg       0.73      0.72      0.71     10000
weighted avg       0.73      0.72      0.71     10000

Total train images: 45000





Train shape: (45000, 192)
Total test images: 10000
Test shape: (10000, 192)
Distances: (10000, 15)
Indexes: (10000, 15)


Gathering results: 100%|██████████| 10000/10000 [00:00<00:00, 85766.95it/s]

----------------------------
COSINE - TOP_K: 15
              precision    recall  f1-score   support

           0       0.80      0.75      0.77      1000
           1       0.75      0.77      0.76      1000
           2       0.91      0.50      0.64      1000
           3       0.59      0.54      0.56      1000
           4       0.73      0.67      0.70      1000
           5       0.64      0.65      0.64      1000
           6       0.64      0.90      0.75      1000
           7       0.74      0.78      0.76      1000
           8       0.82      0.79      0.81      1000
           9       0.66      0.83      0.74      1000

    accuracy                           0.72     10000
   macro avg       0.73      0.72      0.71     10000
weighted avg       0.73      0.72      0.71     10000

Total train images: 45000





Train shape: (45000, 192)
Total test images: 10000
Test shape: (10000, 192)
Distances: (10000, 17)
Indexes: (10000, 17)


Gathering results: 100%|██████████| 10000/10000 [00:00<00:00, 155206.06it/s]

----------------------------
COSINE - TOP_K: 17
              precision    recall  f1-score   support

           0       0.79      0.75      0.77      1000
           1       0.75      0.76      0.76      1000
           2       0.91      0.49      0.64      1000
           3       0.59      0.55      0.57      1000
           4       0.73      0.67      0.70      1000
           5       0.66      0.64      0.65      1000
           6       0.64      0.90      0.75      1000
           7       0.74      0.78      0.76      1000
           8       0.82      0.79      0.80      1000
           9       0.67      0.83      0.74      1000

    accuracy                           0.72     10000
   macro avg       0.73      0.72      0.71     10000
weighted avg       0.73      0.72      0.71     10000

Total train images: 45000
Train shape: (45000, 192)
Total test images: 10000





Test shape: (10000, 192)
Distances: (10000, 19)
Indexes: (10000, 19)


Gathering results: 100%|██████████| 10000/10000 [00:00<00:00, 89804.94it/s]

----------------------------
COSINE - TOP_K: 19
              precision    recall  f1-score   support

           0       0.79      0.74      0.77      1000
           1       0.74      0.76      0.75      1000
           2       0.91      0.49      0.64      1000
           3       0.60      0.55      0.57      1000
           4       0.74      0.67      0.70      1000
           5       0.64      0.64      0.64      1000
           6       0.64      0.89      0.74      1000
           7       0.74      0.78      0.76      1000
           8       0.81      0.79      0.80      1000
           9       0.66      0.83      0.73      1000

    accuracy                           0.71     10000
   macro avg       0.73      0.71      0.71     10000
weighted avg       0.73      0.71      0.71     10000






#### Using Euclidian Distance

In [25]:
# convert the labels to numpy first
clean_train_labels = np.stack([label.to('cpu') for label in train_labels], axis=0)
clean_test_labels = np.stack([label.to('cpu') for label in test_labels], axis=0)

for k in [3, 5, 7, 9, 11, 13, 15, 17, 19]:
    y_pred = brute_force_knn(distance="euclidean", train_data=train_cls_tokens, train_labels=clean_train_labels, test_data=test_cls_tokens, top_k=k)
    print('----------------------------')
    print(f'EUCLIDEAN - TOP_K: {k}')
    print(classification_report(clean_test_labels, y_pred))

Total train images: 45000
Train shape: (45000, 192)
Total test images: 10000
Test shape: (10000, 192)
Distances: (10000, 3)
Indexes: (10000, 3)


Gathering results: 100%|██████████| 10000/10000 [00:00<00:00, 183103.01it/s]

----------------------------
EUCLIDEAN - TOP_K: 3
              precision    recall  f1-score   support

           0       0.74      0.71      0.73      1000
           1       0.71      0.70      0.70      1000
           2       0.75      0.54      0.63      1000
           3       0.53      0.48      0.50      1000
           4       0.66      0.66      0.66      1000
           5       0.58      0.58      0.58      1000
           6       0.68      0.82      0.74      1000
           7       0.71      0.75      0.73      1000
           8       0.75      0.77      0.76      1000
           9       0.65      0.74      0.69      1000

    accuracy                           0.67     10000
   macro avg       0.68      0.67      0.67     10000
weighted avg       0.68      0.67      0.67     10000

Total train images: 45000
Train shape: (45000, 192)
Total test images: 10000
Test shape: (10000, 192)





Distances: (10000, 5)
Indexes: (10000, 5)


Gathering results: 100%|██████████| 10000/10000 [00:00<00:00, 371466.63it/s]

----------------------------
EUCLIDEAN - TOP_K: 5
              precision    recall  f1-score   support

           0       0.75      0.73      0.74      1000
           1       0.74      0.71      0.73      1000
           2       0.79      0.52      0.63      1000
           3       0.57      0.51      0.54      1000
           4       0.69      0.68      0.69      1000
           5       0.61      0.62      0.62      1000
           6       0.68      0.86      0.76      1000
           7       0.73      0.76      0.74      1000
           8       0.76      0.78      0.77      1000
           9       0.67      0.79      0.72      1000

    accuracy                           0.70     10000
   macro avg       0.70      0.70      0.69     10000
weighted avg       0.70      0.70      0.69     10000

Total train images: 45000
Train shape: (45000, 192)
Total test images: 10000
Test shape: (10000, 192)





Distances: (10000, 7)
Indexes: (10000, 7)


Gathering results: 100%|██████████| 10000/10000 [00:00<00:00, 357129.21it/s]

----------------------------
EUCLIDEAN - TOP_K: 7
              precision    recall  f1-score   support

           0       0.78      0.73      0.75      1000
           1       0.77      0.73      0.75      1000
           2       0.83      0.51      0.63      1000
           3       0.58      0.53      0.55      1000
           4       0.69      0.69      0.69      1000
           5       0.63      0.62      0.63      1000
           6       0.68      0.87      0.76      1000
           7       0.73      0.76      0.74      1000
           8       0.77      0.81      0.79      1000
           9       0.67      0.81      0.73      1000

    accuracy                           0.71     10000
   macro avg       0.71      0.71      0.70     10000
weighted avg       0.71      0.71      0.70     10000

Total train images: 45000
Train shape: (45000, 192)
Total test images: 10000
Test shape: (10000, 192)





Distances: (10000, 9)
Indexes: (10000, 9)


Gathering results: 100%|██████████| 10000/10000 [00:00<00:00, 317396.84it/s]

----------------------------
EUCLIDEAN - TOP_K: 9
              precision    recall  f1-score   support

           0       0.78      0.73      0.76      1000
           1       0.76      0.73      0.74      1000
           2       0.84      0.50      0.63      1000
           3       0.59      0.54      0.56      1000
           4       0.68      0.69      0.68      1000
           5       0.63      0.63      0.63      1000
           6       0.67      0.87      0.76      1000
           7       0.73      0.76      0.75      1000
           8       0.78      0.80      0.79      1000
           9       0.67      0.82      0.73      1000

    accuracy                           0.71     10000
   macro avg       0.71      0.71      0.70     10000
weighted avg       0.71      0.71      0.70     10000

Total train images: 45000
Train shape: (45000, 192)
Total test images: 10000
Test shape: (10000, 192)





Distances: (10000, 11)
Indexes: (10000, 11)


Gathering results: 100%|██████████| 10000/10000 [00:00<00:00, 246390.41it/s]

----------------------------
EUCLIDEAN - TOP_K: 11
              precision    recall  f1-score   support

           0       0.78      0.74      0.76      1000
           1       0.76      0.73      0.75      1000
           2       0.85      0.49      0.62      1000
           3       0.59      0.53      0.56      1000
           4       0.68      0.69      0.69      1000
           5       0.63      0.63      0.63      1000
           6       0.67      0.87      0.75      1000
           7       0.74      0.77      0.76      1000
           8       0.77      0.80      0.78      1000
           9       0.67      0.81      0.73      1000

    accuracy                           0.71     10000
   macro avg       0.71      0.71      0.70     10000
weighted avg       0.71      0.71      0.70     10000

Total train images: 45000
Train shape: (45000, 192)
Total test images: 10000
Test shape: (10000, 192)





Distances: (10000, 13)
Indexes: (10000, 13)


Gathering results: 100%|██████████| 10000/10000 [00:00<00:00, 171473.24it/s]

----------------------------
EUCLIDEAN - TOP_K: 13
              precision    recall  f1-score   support

           0       0.78      0.74      0.76      1000
           1       0.77      0.73      0.75      1000
           2       0.85      0.49      0.62      1000
           3       0.59      0.54      0.56      1000
           4       0.71      0.69      0.70      1000
           5       0.63      0.65      0.64      1000
           6       0.65      0.87      0.75      1000
           7       0.75      0.77      0.76      1000
           8       0.77      0.80      0.78      1000
           9       0.66      0.82      0.73      1000

    accuracy                           0.71     10000
   macro avg       0.72      0.71      0.70     10000
weighted avg       0.72      0.71      0.70     10000

Total train images: 45000





Train shape: (45000, 192)
Total test images: 10000
Test shape: (10000, 192)
Distances: (10000, 15)
Indexes: (10000, 15)


Gathering results: 100%|██████████| 10000/10000 [00:00<00:00, 297495.80it/s]

----------------------------
EUCLIDEAN - TOP_K: 15
              precision    recall  f1-score   support

           0       0.78      0.74      0.76      1000
           1       0.76      0.72      0.74      1000
           2       0.85      0.48      0.61      1000
           3       0.58      0.52      0.55      1000
           4       0.70      0.68      0.69      1000
           5       0.63      0.64      0.63      1000
           6       0.65      0.87      0.75      1000
           7       0.74      0.77      0.75      1000
           8       0.77      0.82      0.79      1000
           9       0.66      0.82      0.73      1000

    accuracy                           0.70     10000
   macro avg       0.71      0.70      0.70     10000
weighted avg       0.71      0.70      0.70     10000

Total train images: 45000
Train shape: (45000, 192)
Total test images: 10000
Test shape: (10000, 192)





Distances: (10000, 17)
Indexes: (10000, 17)


Gathering results: 100%|██████████| 10000/10000 [00:00<00:00, 108799.12it/s]

----------------------------
EUCLIDEAN - TOP_K: 17
              precision    recall  f1-score   support

           0       0.78      0.74      0.76      1000
           1       0.77      0.71      0.74      1000
           2       0.86      0.48      0.62      1000
           3       0.59      0.53      0.55      1000
           4       0.70      0.68      0.69      1000
           5       0.62      0.64      0.63      1000
           6       0.65      0.87      0.74      1000
           7       0.74      0.76      0.75      1000
           8       0.77      0.81      0.79      1000
           9       0.66      0.83      0.74      1000

    accuracy                           0.71     10000
   macro avg       0.71      0.71      0.70     10000
weighted avg       0.71      0.71      0.70     10000

Total train images: 45000





Train shape: (45000, 192)
Total test images: 10000
Test shape: (10000, 192)
Distances: (10000, 19)
Indexes: (10000, 19)


Gathering results: 100%|██████████| 10000/10000 [00:00<00:00, 221297.93it/s]

----------------------------
EUCLIDEAN - TOP_K: 19
              precision    recall  f1-score   support

           0       0.77      0.74      0.76      1000
           1       0.77      0.72      0.74      1000
           2       0.87      0.46      0.60      1000
           3       0.59      0.53      0.56      1000
           4       0.69      0.67      0.68      1000
           5       0.63      0.66      0.64      1000
           6       0.64      0.87      0.74      1000
           7       0.75      0.76      0.76      1000
           8       0.77      0.81      0.79      1000
           9       0.67      0.84      0.74      1000

    accuracy                           0.70     10000
   macro avg       0.71      0.70      0.70     10000
weighted avg       0.71      0.70      0.70     10000






### Using Patch Embeddings

In the following cells we will try to find the labels from the test images by doing the following:

1) Use the `ViT` model to extract the patch_embeddings for the test image
2) Find the `topK` similar patch embeddings for __each patch of the test image__
3) Find a `y_pred` for __each patch of the test image__
4) Assign the final y_pred to the majority
5) 
__This method will act as our baseline__ as it does not require a finetuned Visual Transformer.

The method bellow implements the brute force KNN, using the CLS embeddings for each image.

In [44]:
from sklearn.neighbors import NearestNeighbors

def patch_brute_force_knn(distance: str, train_data: list, train_labels: np.array, test_data: list, top_k: int = 5):
    # Initialize the knn
    knn = NearestNeighbors(n_neighbors=top_k, algorithm="brute", metric=distance)
    
    # Move train_data back to cpu
    # train_data = [img.to('cpu') for img in train_data]
    print('Will set labels to cpu')
    clean_labels = []
    for label in tqdm(train_labels, total=len(train_labels), desc="Converting labels to cpu"):
        clean_labels.append(label.to('cpu'))

    # Create a numpy array from the list
    print(f'Total train images: {len(train_data)}')
    train_data = np.stack(train_data, axis=0)
    clean_labels = np.stack(clean_labels, axis=0)
    print(f'Train shape: {train_data.shape}')
    print(f'Train labels shape: {clean_labels.shape}')
    
    # Fit with the train set
    print('Will fit train data to KNN')
    knn.fit(train_data)
    
    # Create a numpy array for the test images
    print(f'Total test images: {len(test_data)}')
    test_data = np.stack(test_data, axis=0)
    print(f'Test shape: {test_data.shape}')

    y_pred = []
    # For each image
    for i in tqdm(range(test_data.shape[0]), desc='Gathering results'):
        # Apply the knn for each patch
        # Note that the initial test array has a shape of
        # (num_test, num_image_patches, hidden_dim)
        patch_distances, patch_indexes = knn.kneighbors(test_data[i, :, :], return_distance=True)
        # print(patch_distances.shape)
        # print(patch_indexes.shape)
        patch_y_pred = []
        # Now for each patch we have to get a class
        for p in range(patch_indexes.shape[0]):
            # Get the classes of the top_k
            # print(patch_indexes[p])
            # print(clean_labels[patch_indexes[p]])
            classes = clean_labels[patch_indexes[p, :]]
            # Select the majority and keep it in the list
            # print(most_common(classes.tolist()))
            patch_y_pred.append(most_common(classes.tolist()))

        # print(patch_y_pred)
        # print(len(patch_y_pred))
        # Now from the patch_y_pred get the majority as the final class label
        y_pred.append(most_common(patch_y_pred))

    # finally convert the y_pred to array
    y_pred = np.stack(y_pred, axis=0)

    return y_pred
        

#### Using Cosine

In [56]:
results = patch_brute_force_knn("cosine", patch_embeddings, patch_labels, test_patch_embeddings, 5)

Will set labels to cpu


Converting labels to cpu:   0%|          | 964/890624 [00:00<05:34, 2660.37it/s]

Converting labels to cpu: 100%|██████████| 890624/890624 [02:22<00:00, 6247.83it/s]


Total train images: 890624
Train shape: (890624, 192)
Train labels shape: (890624,)
Will fit train data to KNN
Total test images: 10000
Test shape: (10000, 196, 192)


Gathering results: 100%|██████████| 10000/10000 [8:34:05<00:00,  3.08s/it]  


In [None]:
test_patch_labels = [ label[0].to('cpu') for label in test_patch_labels ]

print(classification_report(results, test_patch_labels))

              precision    recall  f1-score   support

           0       0.72      0.71      0.72      1010
           1       0.80      0.79      0.79      1010
           2       0.42      0.85      0.56       491
           3       0.46      0.64      0.53       724
           4       0.64      0.68      0.66       941
           5       0.67      0.64      0.65      1041
           6       0.84      0.66      0.74      1286
           7       0.78      0.69      0.74      1128
           8       0.77      0.81      0.79       949
           9       0.88      0.62      0.73      1420

    accuracy                           0.70     10000
   macro avg       0.70      0.71      0.69     10000
weighted avg       0.73      0.70      0.70     10000



#### Using Euclidean

In [17]:
results = patch_brute_force_knn("euclidean", patch_embeddings, patch_labels, test_patch_embeddings, 5)

Will set labels to cpu


Converting labels to cpu: 100%|██████████| 890624/890624 [02:27<00:00, 6051.33it/s]


Total train images: 890624
Train shape: (890624, 192)
Train labels shape: (890624,)
Will fit train data to KNN
Total test images: 10000
Test shape: (10000, 196, 192)


Gathering results: 100%|██████████| 10000/10000 [4:00:38<00:00,  1.44s/it] 


In [18]:
test_patch_labels = [ label[0].to('cpu') for label in test_patch_labels ]

print(classification_report(results, test_patch_labels))

              precision    recall  f1-score   support

           0       0.66      0.74      0.70       885
           1       0.78      0.77      0.78      1017
           2       0.48      0.80      0.60       606
           3       0.43      0.64      0.51       676
           4       0.56      0.74      0.63       754
           5       0.74      0.62      0.67      1197
           6       0.81      0.64      0.71      1271
           7       0.78      0.66      0.72      1180
           8       0.78      0.78      0.78       991
           9       0.90      0.63      0.74      1423

    accuracy                           0.69     10000
   macro avg       0.69      0.70      0.68     10000
weighted avg       0.73      0.69      0.70     10000



## Use Helper Methods

### Calculate Neighbors and Labels using CLS

The following cell performs multiple runs with a different `K` and does the following:

1. Calculates the neighbors and labels for each test image, __using the cls representation__

2. Saves the neighbor cls tokens, the neighbor labels, the test cls tokens and the test labels into a `.npz` file

THose `.npz` files are used in another notebook, to enchance the prediction of a trained Visual Transformer, and to finally calculate the predicted class for each of the test images

In [27]:
# We want to extract neighbors and classes for the following K
K = [3, 5, 7, 9, 11, 13, 15, 17, 19]
# We want each K to be tested with cosine and euclidean
distances = ["cosine"] * len(K) + ["euclidean"] * len(K)
# So we have to double the Ks
K = K * 2

output_folder = "./"

# Convert the test labels to cpu
cpu_test_labels = []
for label in tqdm(test_labels, desc='Converting test labels to cpu'):
    cpu_test_labels.append(label.to('cpu'))

# Now also convert to numpy array
cpu_test_labels = np.stack(cpu_test_labels, axis=0)

for distance, k in zip(distances, K):
    
    print(f'Processing k: {k} - Distances: {distance}')

    # calculate
    neighbor_cls, neighbor_labels = get_neighbors_and_labels(
        distance=distance,
        train_data=train_cls_tokens,
        train_labels=train_labels,
        test_data=test_cls_tokens,
        top_k=k,
    )

    file_name = f"cls_neighbors_{distance}_{k}"

    test_cls_tokens_arr = np.stack(test_cls_tokens, axis=0)

    # now save to npz file
    save_cls_memory(
        output_folder=output_folder,
        file_name=file_name,
        test_cls=test_cls_tokens_arr,
        test_cls_labels=cpu_test_labels,
        test_cls_neighbors=neighbor_cls,
        test_cls_neighbor_labels=neighbor_labels,
    )

    # if distance == 'euclidean':
        # break


# print(f'Neighbor CLS: {neighbor_cls.shape}')
# print(f'Neighbor Labels: {neighbor_labels.shape}')

Converting test labels to cpu:   7%|▋         | 723/10000 [00:00<00:02, 3714.77it/s]

Converting test labels to cpu: 100%|██████████| 10000/10000 [00:02<00:00, 3741.11it/s]


Processing k: 3 - Distances: cosine
Will set labels to cpu


Converting labels to cpu: 100%|██████████| 45000/45000 [00:06<00:00, 6823.83it/s]


Total train images: 45000
Train shape: (45000, 192)
Train labels shape: (45000,)
Total test images: 10000
Test shape: (10000, 192)
Distances: (10000, 3)
Indexes: (10000, 3)


Gathering results: 100%|██████████| 10000/10000 [00:00<00:00, 227508.65it/s]


Data saved successfully to ./cls_neighbors_cosine_3.npz
Processing k: 5 - Distances: cosine
Will set labels to cpu


Converting labels to cpu: 100%|██████████| 45000/45000 [00:15<00:00, 2968.29it/s]


Total train images: 45000
Train shape: (45000, 192)
Train labels shape: (45000,)
Total test images: 10000
Test shape: (10000, 192)
Distances: (10000, 5)
Indexes: (10000, 5)


Gathering results: 100%|██████████| 10000/10000 [00:00<00:00, 83647.18it/s]


Data saved successfully to ./cls_neighbors_cosine_5.npz
Processing k: 7 - Distances: cosine
Will set labels to cpu


Converting labels to cpu: 100%|██████████| 45000/45000 [00:12<00:00, 3537.36it/s]


Total train images: 45000
Train shape: (45000, 192)
Train labels shape: (45000,)
Total test images: 10000
Test shape: (10000, 192)
Distances: (10000, 7)
Indexes: (10000, 7)


Gathering results: 100%|██████████| 10000/10000 [00:00<00:00, 62275.12it/s]


Data saved successfully to ./cls_neighbors_cosine_7.npz
Processing k: 9 - Distances: cosine
Will set labels to cpu


Converting labels to cpu: 100%|██████████| 45000/45000 [00:14<00:00, 3135.20it/s]


Total train images: 45000
Train shape: (45000, 192)
Train labels shape: (45000,)
Total test images: 10000
Test shape: (10000, 192)
Distances: (10000, 9)
Indexes: (10000, 9)


Gathering results: 100%|██████████| 10000/10000 [00:00<00:00, 61076.46it/s]


Data saved successfully to ./cls_neighbors_cosine_9.npz
Processing k: 11 - Distances: cosine
Will set labels to cpu


Converting labels to cpu: 100%|██████████| 45000/45000 [00:12<00:00, 3723.46it/s]


Total train images: 45000
Train shape: (45000, 192)
Train labels shape: (45000,)
Total test images: 10000
Test shape: (10000, 192)
Distances: (10000, 11)
Indexes: (10000, 11)


Gathering results: 100%|██████████| 10000/10000 [00:00<00:00, 63163.61it/s]


Data saved successfully to ./cls_neighbors_cosine_11.npz
Processing k: 13 - Distances: cosine
Will set labels to cpu


Converting labels to cpu: 100%|██████████| 45000/45000 [00:12<00:00, 3581.15it/s]


Total train images: 45000
Train shape: (45000, 192)
Train labels shape: (45000,)
Total test images: 10000
Test shape: (10000, 192)
Distances: (10000, 13)
Indexes: (10000, 13)


Gathering results: 100%|██████████| 10000/10000 [00:00<00:00, 50723.66it/s]


Data saved successfully to ./cls_neighbors_cosine_13.npz
Processing k: 15 - Distances: cosine
Will set labels to cpu


Converting labels to cpu: 100%|██████████| 45000/45000 [00:13<00:00, 3228.23it/s]


Total train images: 45000
Train shape: (45000, 192)
Train labels shape: (45000,)
Total test images: 10000
Test shape: (10000, 192)
Distances: (10000, 15)
Indexes: (10000, 15)


Gathering results: 100%|██████████| 10000/10000 [00:00<00:00, 44411.38it/s]


Data saved successfully to ./cls_neighbors_cosine_15.npz
Processing k: 17 - Distances: cosine
Will set labels to cpu


Converting labels to cpu: 100%|██████████| 45000/45000 [00:10<00:00, 4138.43it/s]


Total train images: 45000
Train shape: (45000, 192)
Train labels shape: (45000,)
Total test images: 10000
Test shape: (10000, 192)
Distances: (10000, 17)
Indexes: (10000, 17)


Gathering results: 100%|██████████| 10000/10000 [00:00<00:00, 60593.73it/s]


Data saved successfully to ./cls_neighbors_cosine_17.npz
Processing k: 19 - Distances: cosine
Will set labels to cpu


Converting labels to cpu: 100%|██████████| 45000/45000 [00:11<00:00, 3803.41it/s]


Total train images: 45000
Train shape: (45000, 192)
Train labels shape: (45000,)
Total test images: 10000
Test shape: (10000, 192)
Distances: (10000, 19)
Indexes: (10000, 19)


Gathering results: 100%|██████████| 10000/10000 [00:00<00:00, 36863.67it/s]


Data saved successfully to ./cls_neighbors_cosine_19.npz
Processing k: 3 - Distances: euclidean
Will set labels to cpu


Converting labels to cpu: 100%|██████████| 45000/45000 [00:13<00:00, 3248.02it/s]


Total train images: 45000
Train shape: (45000, 192)
Train labels shape: (45000,)
Total test images: 10000
Test shape: (10000, 192)
Distances: (10000, 3)
Indexes: (10000, 3)


Gathering results: 100%|██████████| 10000/10000 [00:00<00:00, 153114.23it/s]


Data saved successfully to ./cls_neighbors_euclidean_3.npz
Processing k: 5 - Distances: euclidean
Will set labels to cpu


Converting labels to cpu: 100%|██████████| 45000/45000 [00:11<00:00, 3983.49it/s]


Total train images: 45000
Train shape: (45000, 192)
Train labels shape: (45000,)
Total test images: 10000
Test shape: (10000, 192)
Distances: (10000, 5)
Indexes: (10000, 5)


Gathering results: 100%|██████████| 10000/10000 [00:00<00:00, 105939.51it/s]


Data saved successfully to ./cls_neighbors_euclidean_5.npz
Processing k: 7 - Distances: euclidean
Will set labels to cpu


Converting labels to cpu: 100%|██████████| 45000/45000 [00:09<00:00, 4747.89it/s]


Total train images: 45000
Train shape: (45000, 192)
Train labels shape: (45000,)
Total test images: 10000
Test shape: (10000, 192)
Distances: (10000, 7)
Indexes: (10000, 7)


Gathering results: 100%|██████████| 10000/10000 [00:00<00:00, 117148.20it/s]


Data saved successfully to ./cls_neighbors_euclidean_7.npz
Processing k: 9 - Distances: euclidean
Will set labels to cpu


Converting labels to cpu: 100%|██████████| 45000/45000 [00:10<00:00, 4139.44it/s]


Total train images: 45000
Train shape: (45000, 192)
Train labels shape: (45000,)
Total test images: 10000
Test shape: (10000, 192)
Distances: (10000, 9)
Indexes: (10000, 9)


Gathering results: 100%|██████████| 10000/10000 [00:00<00:00, 101198.03it/s]


Data saved successfully to ./cls_neighbors_euclidean_9.npz
Processing k: 11 - Distances: euclidean
Will set labels to cpu


Converting labels to cpu: 100%|██████████| 45000/45000 [00:10<00:00, 4094.27it/s]


Total train images: 45000
Train shape: (45000, 192)
Train labels shape: (45000,)
Total test images: 10000
Test shape: (10000, 192)
Distances: (10000, 11)
Indexes: (10000, 11)


Gathering results: 100%|██████████| 10000/10000 [00:00<00:00, 53447.99it/s]


Data saved successfully to ./cls_neighbors_euclidean_11.npz
Processing k: 13 - Distances: euclidean
Will set labels to cpu


Converting labels to cpu: 100%|██████████| 45000/45000 [00:10<00:00, 4264.78it/s]


Total train images: 45000
Train shape: (45000, 192)
Train labels shape: (45000,)
Total test images: 10000
Test shape: (10000, 192)
Distances: (10000, 13)
Indexes: (10000, 13)


Gathering results: 100%|██████████| 10000/10000 [00:00<00:00, 54808.45it/s]


Data saved successfully to ./cls_neighbors_euclidean_13.npz
Processing k: 15 - Distances: euclidean
Will set labels to cpu


Converting labels to cpu: 100%|██████████| 45000/45000 [00:09<00:00, 4596.71it/s]


Total train images: 45000
Train shape: (45000, 192)
Train labels shape: (45000,)
Total test images: 10000
Test shape: (10000, 192)
Distances: (10000, 15)
Indexes: (10000, 15)


Gathering results: 100%|██████████| 10000/10000 [00:00<00:00, 60011.13it/s]


Data saved successfully to ./cls_neighbors_euclidean_15.npz
Processing k: 17 - Distances: euclidean
Will set labels to cpu


Converting labels to cpu: 100%|██████████| 45000/45000 [00:10<00:00, 4170.03it/s]


Total train images: 45000
Train shape: (45000, 192)
Train labels shape: (45000,)
Total test images: 10000
Test shape: (10000, 192)
Distances: (10000, 17)
Indexes: (10000, 17)


Gathering results: 100%|██████████| 10000/10000 [00:00<00:00, 62108.67it/s]


Data saved successfully to ./cls_neighbors_euclidean_17.npz
Processing k: 19 - Distances: euclidean
Will set labels to cpu


Converting labels to cpu: 100%|██████████| 45000/45000 [00:09<00:00, 4932.91it/s]


Total train images: 45000
Train shape: (45000, 192)
Train labels shape: (45000,)
Total test images: 10000
Test shape: (10000, 192)
Distances: (10000, 19)
Indexes: (10000, 19)


Gathering results: 100%|██████████| 10000/10000 [00:00<00:00, 61543.92it/s]


Data saved successfully to ./cls_neighbors_euclidean_19.npz


### Calculate Neighbors and Labels using Patch

#### Cosine

In [None]:
neighbors, classes = get_neigbors_and_labels_patch(
    "cosine", patch_embeddings, patch_labels, test_patch_embeddings, 5
)

print(f"Neighbors: {neighbors.shape}")
print(f"Classes: {classes.shape}")

In [None]:
import pickle

with open('./euclidian_patch_classes.pkl', 'rb') as f:
    classes = pickle.load(f)

with open('./euclidian_patch_neighbors.pkl', 'rb') as f:
    neighbors = pickle.load(f)

In [10]:
neighbors = np.stack(neighbors, axis=0)
classes = np.stack(classes, axis=0)

print(f"Neighbors: {neighbors.shape}")
print(f"Classes: {classes.shape}")

Neighbors: (10000, 196, 5, 192)
Classes: (10000, 196, 5)


In [None]:
# Convert the test labels to cpu and change them to (test_size,)
clean_test_labels = []
for image in tqdm(test_patch_labels, desc='Converting test labels to cpu'):
    # We can do this here as all the patches of a test image have the same label
    clean_test_labels.append(image[0].to('cpu'))

clean_test_labels = np.stack(clean_test_labels, axis=0)

# Convert the patch embeddings to array
test_patches = np.stack(test_patch_embeddings, axis=0)

print(f'Test patches: {test_patches.shape}')
print(f'Test labels: {clean_test_labels.shape}')
print(f"Neighbors: {neighbors.shape}")
print(f"Classes: {classes.shape}")

save_patch_memory(output_folder='./', file_name='patch_memory', test_patch=test_patches, test_patch_labels=clean_test_labels, test_patch_neighbors=neighbors, test_patch_neighbor_labels=classes)

#### Euclidian

In [22]:
neighbors, classes = get_neigbors_and_labels_patch(
    "euclidean", patch_embeddings, patch_labels, test_patch_embeddings, 5
)

Will set labels to cpu


Converting labels to cpu: 100%|██████████| 890624/890624 [02:05<00:00, 7086.48it/s]


Total train images: 890624
Train shape: (890624, 192)
Train labels shape: (890624,)
Will fit train data to KNN
Total test images: 10000
Test shape: (10000, 196, 192)


Gathering results: 100%|██████████| 10000/10000 [4:20:04<00:00,  1.56s/it] 


In [23]:
import pickle

with open('./euclidian_patch_neighbors.pkl', 'wb') as f:
    pickle.dump(neighbors, f)

with open('./euclidian_patch_classes.pkl', 'wb') as f:
    pickle.dump(classes, f)

In [2]:
import pickle

with open('./euclidian_patch_neighbors.pkl', 'rb') as f:
    neighbors = pickle.load(f)

with open('./euclidian_patch_classes.pkl', 'rb') as f:
    classes = pickle.load(f)

In [24]:
neighbors = np.stack(neighbors, axis=0)
classes = np.stack(classes, axis=0)

print(f"Neighbors: {neighbors.shape}")
print(f"Classes: {classes.shape}")

Neighbors: (10000, 196, 5, 192)
Classes: (10000, 196, 5)


save the patches

In [24]:
# Convert the test labels to cpu and change them to (test_size,)
clean_test_labels = []
for image in tqdm(test_patch_labels, desc='Converting test labels to cpu'):
    # We can do this here as all the patches of a test image have the same label
    clean_test_labels.append(image[0].to('cpu'))

clean_test_labels = np.stack(clean_test_labels, axis=0)

# Convert the patch embeddings to array
test_patches = np.stack(test_patch_embeddings, axis=0)

print(f'Test patches: {test_patches.shape}')
print(f'Test labels: {clean_test_labels.shape}')
# print(f"Neighbors: {neighbors.shape}")
# print(f"Classes: {classes.shape}")

save_patch_memory(output_folder='./', file_name='patch_memory_euclidian', test_patch=test_patches, test_patch_labels=clean_test_labels, test_patch_neighbors=neighbors, test_patch_neighbor_labels=classes)

Converting test labels to cpu: 100%|██████████| 10000/10000 [00:03<00:00, 2863.21it/s]


Test patches: (10000, 196, 192)
Test labels: (10000,)
Data saved successfully to ./patch_memory_euclidian.npz


Load the saved file to verify the sizes are the expected ones

In [None]:
with np.load('/home/panos/WSL_projects/rag-enhanced-image-classification/src/development/patch_memory.npz') as data:
    tp = data['test_patch']
    tl = data['test_labels']
    n=data['neighbor_path']
    c=data['neighbor_labels']

print(f'Test patches: {tp.shape}')
print(f'Test labels: {tl.shape}')
print(f"Neighbors: {n.shape}")
print(f"Classes: {c.shape}")