In [1]:
import os

import numpy as np
import torch

from data import get_dataloader
from utils import create_sampler
# Star import deliberately kept last so the names it exports shadow exactly as before.
from functions import *

In [2]:
# Data parameters
# Both locations can be overridden through environment variables so the notebook
# is not tied to one machine's directory layout; the defaults are the original paths.
# os.path.join(path, '') guarantees the trailing separator the original values carry.
# Directory of the NeuroVault collection 6618 images (handed to get_dataloader).
IBC_path = os.path.join(
    os.environ.get('IBC_PATH', '/bigdisk2/nilearn_data/neurovault/collection_6618/'), '')
# Directory handed to get_dataloader as `split_dir`
# (presumably holds the train/validation/test split files -- confirm in data.py).
split_dir = os.path.join(os.environ.get('IBC_SPLIT_DIR', '../dataset/split/'), '')

# Baseline: nearest class mean in the input space.

In [3]:
# Compute dissimilarity scores between two sets of vectors using the Euclidean distance.
def simi_score(X, Y):
    """
    Return a dissimilarity score between every vector in Y and every vector in X
    (0 for vectors pointing in the same direction, larger for less similar ones).

    Every row is first scaled to unit L2 norm, so only the direction of a vector
    matters. The Euclidean distance d between two normalised rows is then mapped
    to 1 - exp(-d). Since d <= 2 for unit vectors, the score lies in
    [0, 1 - exp(-2)].

    Parameters:
        X -- set of vectors (number of vectors, vector size).
        Y -- set of vectors (number of vectors, vector size).

    Returns:
        Tensor of shape (number of vectors in Y, number of vectors in X);
        entry [i, j] is the score between Y[i] and X[j].
    """
    # normalize() divides by max(norm, eps): an all-zero row stays zero instead of
    # turning into NaN (0 / 0), which would otherwise poison every score of that row.
    X = torch.nn.functional.normalize(X, p=2, dim=1)
    Y = torch.nn.functional.normalize(Y, p=2, dim=1)
    # Rows of the result follow Y, columns follow X.
    distances = torch.cdist(Y, X, p=2)
    similarities = 1 - torch.exp(-1*distances)
    return similarities

# Evaluate the nearest-class-mean rule on the validation / test set.
def episodic_evaluation(data_loader, sampler_infos, use_cuda):
    """
    Return the fraction of query samples labelled correctly over all the
    few-shot tasks (episodes) produced by `data_loader`.

    Every episode holds support (training) samples whose labels are known,
    followed by query samples. Each class is summarised by the mean of its
    support samples, and a query sample receives the label of the class
    representative with the lowest `simi_score`.

    Parameters:
        data_loader -- iterable yielding one (x, y) pair per episode. The first
            n_way * n_shot rows must be the support samples, grouped by class;
            the remaining rows are the query samples.
        sampler_infos -- sequence whose items 1 and 2 are n_way and n_shot.
        use_cuda -- when True, every episode is moved to the GPU first.

    Returns:
        Number of correct query predictions divided by the number of query
        samples seen (a 0-dim tensor once at least one episode was processed).
    """
    n_way, n_shot = sampler_infos[1], sampler_infos[2]
    n_support = n_way * n_shot
    correct = 0.
    seen = 0.

    with torch.no_grad():
        # One loop turn per episode.
        for samples, labels in data_loader:
            if use_cuda:
                samples = samples.cuda()
                labels = labels.cuda()
            # Flatten every sample into a single feature vector.
            samples = samples.view(samples.shape[0], -1)
            # Support rows come first, query rows afterwards.
            # Warning: this relies on the episode being ordered by split, then by class.
            support, query = samples[:n_support], samples[n_support:]
            support_labels, query_labels = labels[:n_support], labels[n_support:]
            del samples

            # One representative per class: the mean of its n_shot support vectors.
            prototypes = support.reshape(n_way, n_shot, -1).mean(dim=1)
            # Support rows are grouped by class, so every n_shot-th label names a class.
            class_labels = support_labels[::n_shot]

            # The lowest score marks the closest class representative.
            nearest = simi_score(prototypes, query).argmin(dim=1)
            predictions = torch.take(class_labels, nearest)
            del support, prototypes, query

            # Tally the hits of this episode.
            correct += (query_labels == predictions).float().sum()
            seen += query_labels.size(0)
    return correct / seen

In [4]:
# Evaluate the baseline on the test set of parcellated images.
# Episodes parameters
parcel = True
n_episode = 10000
n_way = 5
n_shot = 1
n_query = 15
# episodic_evaluation reads items 1 and 2 as n_way and n_shot; the leading 1 and
# n_query are consumed by the sampler (presumably episodes per pass and query
# samples per class -- confirm in create_sampler).
test_sampler_infos = [1, n_way, n_shot, n_query]
# Loader
test_loader = get_dataloader('test', IBC_path, parcel, split_dir, meta=True, sampler_infos=test_sampler_infos)

# Use the GPU only when one is present so the cell also runs on CPU-only machines.
use_cuda = torch.cuda.is_available()

# One accuracy per pass over the loader; the confidence interval is computed over these.
epoch_accs = []
for i in range(n_episode):
    print(i, end='\r')
    epoch_acc = episodic_evaluation(test_loader, test_sampler_infos, use_cuda=use_cuda)
    epoch_accs.append(epoch_acc.cpu().item())
mean, conf = compute_confidence_interval(epoch_accs)
print('The baseline has an average accuracy of {:.2f}% over {} tasks with 95% confidence interval {:.2f}.'.format(mean*100, n_episode, conf*100))

The baseline has an average accuracy of 57.33% over 10000 tasks with 95% confidence interval 0.20.
