In [1]:
%load_ext autoreload
%autoreload 2

In [13]:
import torch
from tqdm import tqdm 
from torch.utils.data import DataLoader
from torchmetrics.functional import average_precision
# 
from network import Model 
from datasets.video import VideoDataset
from datasets.description import DescriptionDataset


In [14]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model = Model().to(device)


500 out of 500 videos accepted in /home/nero/Courses/CS412/datasets/TRECVid Data/testing_set/Frames.
1790 descriptions of 500 videos accepted in /home/nero/Courses/CS412/datasets/TRECVid Data/testing_set.


In [10]:
from datasets.description import DescriptionDataset
from datasets.video_with_desc import VideoWithDescDataset
videoDataset = VideoWithDescDataset(root = "../../../images/TRECVid Data/testing_set", num_imgs = 8, subset = 'test', preprocess = model.preprocess)
descDataset = DescriptionDataset(root = "../../../images/TRECVid Data/testing_set", subset = 'test')

videoDataloader = DataLoader(videoDataset, batch_size = 8, shuffle = False, num_workers = 8)
descDataloader = DataLoader(descDataset, batch_size =8, shuffle = False, num_workers = 8)

def getVideoFeatures(model, dataloader):
    all_features = []
    all_labels = []
    
    with torch.no_grad():
        for data in tqdm(dataloader):
            images = data['images'].to(device)           # B, T, C, H, W 
            vid_ids = data['video_id']   # B 

            # IMAGES AS VIDEO 
            num_imgs = data['num_imgs'][0]
            # print(images.shape)
            images = images.unsqueeze(2).repeat(1, 1, 3, 1, 1, 1)
            images[:, 2:, 0] = images[:, :-2, 2]
            images[:, 1:, 0] = images[:, :-1, 2]
            images = images.flatten(start_dim = 0, end_dim = 1) 
            # print(images.shape)
            features = model.encode_images(images)

            all_features.append(features)
            all_labels.append(vid_ids.unsqueeze(-1).repeat(1, num_imgs).view(-1)) 

    return torch.cat(all_features), torch.cat(all_labels)

def getTextFeatures(model, dataloader):
    all_features = []
    all_labels = []

    with torch.no_grad():
        for data in tqdm(dataloader):
            description = data['description'].to(device)
            vid_ids = data["video_id"]
            features = model.encode_text(description)
            
            all_features.append(features)
            all_labels.append(vid_ids) 
    return torch.cat(all_features), torch.cat(all_labels)

import time 
from tqdm.notebook import tqdm
from torchmetrics.functional import *
    

device = "cuda" if torch.cuda.is_available() else "cpu"
def evaluate(model):
    process_begin = time.time()
    # Calculate the image features
    videoFeatures, videoLabels = getVideoFeatures(model, videoDataloader)
    textFeatures, textLabel = getTextFeatures(model, descDataloader)
    target = textLabel.view(-1, 1) == videoLabels.view(1, -1)

    logits_per_image, logits_per_text = model.calc_similarity(videoFeatures, textFeatures)

    total_process_time = time.time() - process_begin
    print(f"Total process time: {total_process_time:03f}")

    mAP = torch.tensor([retrieval_average_precision(logits_per_text[i], target[i]) for i in range(logits_per_text.size(0))]).mean()
    print(f"Mean Average Precision: {mAP}")
    k = 15 
    recK = torch.tensor([retrieval_recall(logits_per_text[i], target[i], k) for i in range(logits_per_text.size(0))]).mean()
    print(f"Mean Recall@{k}: {recK}")
    rr  = torch.tensor([retrieval_reciprocal_rank(logits_per_text[i], target[i]) for i in range(logits_per_text.size(0))]).mean()
    print(f"Mean Reciprocal Rank: {rr}")


['00004', '00007', '00015', '00041', '00047', '00063', '00067', '00084', '00089', '00100', '00113', '00129', '00145', '00175', '00194', '00197', '00219', '00229', '00236', '00239', '00243', '00248', '00342', '00361', '00389', '00392', '00412', '00440', '00445', '00459', '00492', '00533', '00534', '00535', '00543', '00545', '00554', '00562', '00583', '00591']
500 out of 500 videos accepted in ../../../images/TRECVid Data/testing_set/Frames.
1790 descriptions of 500 videos accepted in ../../../images/TRECVid Data/testing_set.


In [15]:
model.eval()
# from torch import nn
print
evaluate(model)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 63/63 [00:15<00:00,  4.19it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 224/224 [00:02<00:00, 103.73it/s]


Total process time: 17.191969
Mean Average Precision: 0.5955747365951538
Mean Recall@15: 0.6545391082763672
Mean Reciprocal Rank: 0.6981375217437744


In [7]:
logits_per_image, logits_per_text = model.calc_similarity(videoFeatures, textFeatures)

In [8]:
logits_per_text.shape

torch.Size([1790, 4000])

In [9]:
target = textLabel.view(-1, 1) == videoLabels.view(1, -1)

In [30]:
from torchmetrics.functional import *
print(f"Mean Average Precision: {retrieval_average_precision(logits_per_text, target)}")

Mean Average Precision: 0.39895907044410706


In [23]:

print(f"Mean Reciprocal Rank: {retrieval_recall(logits_per_text, target, k = 4000)}")

Mean Reciprocal Rank: 0.21305866539478302


In [16]:
from torch import tensor
a = tensor([0.5, 0.2, 0.3])
b = tensor([False, True ,False])
retrieval_recall(a, b, 2)

tensor(0.)

In [2]:
import clip

In [3]:
clip.available_models()

['RN50', 'RN101', 'RN50x4', 'RN50x16', 'ViT-B/32', 'ViT-B/16']

In [36]:
mAP = 0
rec15 = 0
rank = 0
for i in range(logits_per_text.size(0)):
    # print(f"AP - {i} - {textLabel[i]}: {retrieval_average_precision(logits_per_text[i], target[i])}")
    mAP += retrieval_average_precision(logits_per_text[i], target[i])
    rec15 += retrieval_recall(logits_per_text[i], target[i], k = 15)
    rank += retrieval_reciprocal_rank(logits_per_text[i], target[i])
mAP /= logits_per_text.size(0)
rec15 /= logits_per_text.size(0)
rank /=logits_per_text.size(0)

In [37]:
print(mAP, rec15, rank)

tensor(0.5625) tensor(0.6269) tensor(0.6977)


In [28]:
retrieval_recall(logits_per_text[0], target[0], k = 15)

tensor(0.)

In [29]:
recall

<function torchmetrics.functional.classification.precision_recall.recall(preds: torch.Tensor, target: torch.Tensor, average: str = 'micro', mdmc_average: Union[str, NoneType] = None, ignore_index: Union[int, NoneType] = None, num_classes: Union[int, NoneType] = None, threshold: float = 0.5, top_k: Union[int, NoneType] = None, multiclass: Union[bool, NoneType] = None) -> torch.Tensor>