# Evaluate test set

In [1]:
import torch
from d2l import torch as d2l
import pandas as pd
import numpy as np
from torchmetrics.classification import MultilabelAUROC, BinaryAUROC
from torch.utils.data import DataLoader
import sys
import os

# Make the repository root importable; the three imports that follow are
# presumably project-local modules living there (machine-specific absolute
# path — adjust per checkout).
sys.path.append("C:\\Users\\niels\\Documents\\Github\\Birds-NL-PAM\\")

import config as cfg
from Dataset import EvaluationDataset
# NOTE(review): Classifier is not referenced directly in the visible cells;
# presumably it must be importable so torch.load can unpickle the saved
# model object — confirm before removing.
from Classifier import Classifier

# Dataset root directory taken from the project config; handed to
# EvaluationDataset when the validation split is built.
DIR = cfg.DATASET_DIR

In [5]:
def get_sample_predictions(sample_scores: "np.ndarray | torch.Tensor", thresholds):
    """Binarize scores: 1 where a score reaches its threshold, otherwise 0.

    Args:
        sample_scores: Scores as a NumPy array or a torch tensor.
        thresholds: A scalar threshold, or an array/tensor that broadcasts
            against ``sample_scores`` (e.g. one threshold per class).

    Returns:
        An int64 tensor of 0/1 values with the broadcast shape of the inputs.
    """
    # The signature advertises NumPy input, but torch ops reject a NumPy
    # boolean mask, so normalise to a tensor first (no copy when possible).
    scores = torch.as_tensor(sample_scores)
    if isinstance(thresholds, np.ndarray):
        thresholds = torch.as_tensor(thresholds, device=scores.device)
    # Python scalars are left untouched so they are compared in the dtype of
    # the scores rather than being rounded to float32 first.
    return (scores >= thresholds).to(torch.int64)


def evaluate_samples(sample_scores, sample_targets, threshold=0.50):
    """Count true positives, false positives and false negatives.

    Scores are binarized with ``get_sample_predictions`` and compared with
    the targets; counts are reduced over axis 0 (the code treats axis 0 as
    the sample axis, leaving one count per remaining position).

    Args:
        sample_scores: Scores accepted by ``get_sample_predictions``.
        sample_targets: Tensor of 0/1 targets, same shape as the scores.
        threshold: Decision threshold applied to every score. Defaults to
            0.50, the value that used to be hard-coded.

    Returns:
        Tuple ``(tps, fps, fns)`` of tensors reduced over axis 0.

    Side effects:
        Prints the totals of each count.
    """
    predictions = get_sample_predictions(sample_scores, threshold)

    # A prediction is a true positive only where target and prediction are 1.
    tps = torch.sum(sample_targets * predictions, dim=0)
    # Every remaining predicted positive is a false positive ...
    fps = torch.sum(predictions, dim=0) - tps
    # ... and every remaining actual positive was missed.
    fns = torch.sum(sample_targets, dim=0) - tps

    # Tensor.sum() instead of builtin sum(): no Python-level iteration, and it
    # also works when the counts are 0-dimensional.
    print("tps", tps.sum())
    print("fps", fps.sum())
    print("fns", fns.sum())
    return tps, fps, fns


def store_predictions(
    net,
    device,
    species_list,
    input_dir="C:/Users/niels/Documents/Github/Birds-NL-PAM/dataset/test/embeddings/",
    output_dir="C:/Users/niels/Documents/Github/Birds-NL-PAM/dataset/test/own-predictions/",
):
    """Run ``net`` on every embedding file and save the pooled scores.

    Each file in ``input_dir`` is loaded with ``np.load``, passed through the
    network, reduced with a maximum over axis 0, and written with ``np.save``
    to ``output_dir`` under the same file name.

    Args:
        net: Model to evaluate; switched to eval mode.
        device: Device the inputs are moved to before the forward pass.
        species_list: Unused; kept so existing calls keep working.
        input_dir: Directory holding the embedding files. Defaults to the
            previously hard-coded location.
        output_dir: Directory receiving the predictions; created if missing.
            Defaults to the previously hard-coded location.

    Raises:
        FileNotFoundError: If ``input_dir`` does not exist.
    """
    net.eval()

    # os.scandir fails loudly on a missing directory, whereas
    # next(os.walk(...)) would surface as a bare StopIteration. Like os.walk's
    # file list, everything that is not a directory is kept.
    with os.scandir(input_dir) as entries:
        input_files = [entry.name for entry in entries if not entry.is_dir()]

    # np.save does not create parent directories.
    os.makedirs(output_dir, exist_ok=True)

    with torch.no_grad():
        # One file holds one example made of several embeddings.
        for file in input_files:
            embeddings = torch.from_numpy(np.load(os.path.join(input_dir, file)))
            y_hat = net(embeddings.to(device))
            # Keep the maximum along axis 0 of the network output.
            sample_scores = np.max(y_hat.cpu().numpy(), axis=0)
            np.save(os.path.join(output_dir, file), sample_scores)


def evaluate_val_data(net, val_data, device):
    """Print and return the mean per-class AUROC over ``val_data``.

    For every ``(X, y)`` item the network output is reduced with a maximum
    over axis 0, giving one score per class for that item. Targets are
    binarized at 0.5. AUROC is computed separately for each class and the
    values are averaged.

    Args:
        net: Model to evaluate; switched to eval mode.
        val_data: Iterable of ``(X, y)`` pairs.
        device: Device inputs and targets are moved to.

    Returns:
        The mean AUROC as a tensor, or ``None`` when no class could be
        evaluated.

    Raises:
        ValueError: If ``val_data`` yields no items.
    """
    net.eval()

    preds, target = [], []
    with torch.no_grad():
        # One item is a single example made of several embeddings.
        for X, y in val_data:
            X, y = X.to(device), y.to(device)
            # Highest score per class across the embeddings of this example.
            preds.append(torch.amax(net(X), dim=0))
            target.append(y)

    if not preds:
        # torch.stack on an empty list fails with an unhelpful message.
        raise ValueError("val_data yielded no samples to evaluate")

    preds = torch.stack(preds)
    target = torch.where(torch.stack(target) >= 0.5, 1, 0)

    auroc = BinaryAUROC()
    val_aucs = []
    for i in range(target.shape[-1]):
        class_target = target[:, i]
        # AUROC is only defined when a class has both positive and negative
        # examples; a constant column would contribute a meaningless value.
        if class_target.min() == class_target.max():
            continue
        val_aucs.append(auroc(preds[:, i], class_target).cpu())

    if not val_aucs:
        # Previously this ended in a ZeroDivisionError.
        print("no class has both positive and negative samples; AUROC undefined")
        return None

    mean_auc = sum(val_aucs) / len(val_aucs)
    print(mean_auc)
    return mean_auc

In [7]:
# Resolve the device once so the model and both passes below agree on it.
device = d2l.try_gpu()

model_path = 'C:/Users/niels/Documents/Github/Birds-NL-PAM/train/model_nosec.pt'
# NOTE(review): torch.load unpickles a full model object here — only load
# checkpoints from a trusted source.
# map_location lets a checkpoint that was saved on a GPU load on a machine
# without one instead of failing during deserialization.
model = torch.load(model_path, map_location=device).to(device)

# The class list is the latin-name column of the species file.
species_list = pd.read_csv(cfg.SPECIES_FILE_PATH, index_col=0)
classes = species_list['latin_name'].values.tolist()
val_data = EvaluationDataset(classes, DIR, 'val')

# Sanity check: show the input shape of the first validation item.
sample_x, _ = next(iter(val_data))
print(sample_x.shape)

evaluate_val_data(model, val_data, device)
store_predictions(model, device, species_list)


torch.Size([3, 1280])
tensor(0.9659)
