In [23]:
import pandas as pd
import torch
import torch.nn as nn
from torch.optim import lr_scheduler
import numpy as np
import torchvision
from torchvision import datasets, models, transforms
import matplotlib.pyplot as plt
import time
import torch.nn.functional as F
from tqdm import tqdm
import random
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.optim as optim
import torch.utils.data
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from datetime import datetime
from torchvision import models
from transformers import CLIPProcessor, CLIPModel
import os
import csv
from peft import LoraConfig, get_peft_model
import json
from ast import literal_eval

In [15]:
# Detect whether the notebook runs on Google Colab and pick the data/model
# roots accordingly. Only a failed import means "not on Colab"; any error
# raised while mounting Drive is allowed to propagate instead of silently
# falling back to the local paths.
try:
    from google.colab import drive
except ImportError:
    # Local run: data and models live relative to the notebook location.
    root = "../../../"
    model_root = "../../../private_data/MODELS/"
    onColab = False
else:
    drive.mount('/content/drive')
    root = "/content/drive/MyDrive/MASTER_THESIS/"
    model_root = root + "models/"
    onColab = True

In [3]:
# Run on the first CUDA GPU when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

cuda:0


In [24]:
# One CSV per language; the "paths" column is stored as a Python-literal string
# and is parsed back into a Python object with `literal_eval` while reading.
captions_en_csv = root + "IconclassTest_Exploded/object_to_paths_en.csv"
captions_fr_csv = root + "IconclassTest_Exploded/object_to_paths_fr.csv"
captions_nl_csv = root + "IconclassTest_Exploded/object_to_paths_nl.csv"

CAPTIONS_EN = pd.read_csv(captions_en_csv, converters={'paths': literal_eval})
CAPTIONS_FR = pd.read_csv(captions_fr_csv, converters={'paths': literal_eval})
CAPTIONS_NL = pd.read_csv(captions_nl_csv, converters={'paths': literal_eval})

In [5]:
# Number of DataLoader worker processes (passed as `num_workers` when the loaders are built).
best_workers = 10
# Cross-entropy criterion applied to both the image->text and text->image logits in the benchmark.
criterion = nn.CrossEntropyLoss()

In [6]:
# Name of the model to benchmark. It selects the checkpoint, weights file and
# batch size in the model-selection cell and is embedded in the results filename.
model_name = "art-mini"
print(f"Running benchmark on: {model_name}")

Running benchmark on: art-mini


In [7]:
# Hugging Face checkpoint identifiers for the CLIP backbones.
# NOTE(review): the two constants below are not referenced in the cells visible
# here (the february/march branches use `base_model_base`) — confirm whether
# they are still needed.
base_model_february_finetuned = "openai/clip-vit-large-patch14" # art-base
base_model_march_finetuned = "openai/clip-vit-large-patch14" # art-base

# Backbones onto which the fine-tuned "art-*" weights are loaded.
base_model_mini = "openai/clip-vit-base-patch32" # art-mini
base_model_base = "openai/clip-vit-large-patch14" # art-base
base_model_large = "openai/clip-vit-large-patch14-336" # art-large

# Same checkpoints used as-is (no extra weights loaded) for the "basic-*" baselines.
basic_mini = "openai/clip-vit-base-patch32"
basic_base = "openai/clip-vit-large-patch14"
basic_large = "openai/clip-vit-large-patch14-336"

In [8]:
# Directory where the benchmark result CSVs are written. `exist_ok=True`
# creates it when missing and is a no-op when it already exists, without a
# separate existence check.
results_path = root + "IconClassTestingScriptAndResults/results/"
os.makedirs(results_path, exist_ok=True)

In [16]:
# Registry of supported models:
#   model_name -> (checkpoint id for processor/architecture,
#                  fine-tuned weights file under `model_root` or None,
#                  evaluation batch size)
MODEL_CONFIGS = {
    "february_finetuned": (base_model_base, "february_finetuned.pt", 32),
    "march_finetuned": (base_model_base, "march_finetuned.pt", 32),
    "art-mini": (base_model_mini, "art-mini.pt", 256),
    "art-base": (base_model_base, "art-base.pt", 32),
    "art-large": (base_model_large, "art-large.pt", 16),
    "basic-mini": (basic_mini, None, 256),
    "basic-base": (basic_base, None, 32),
    "basic-large": (basic_large, None, 16),
}

# Fail loudly on an unknown name instead of leaving `model`, `processor` and
# `BATCH_SIZE` undefined (or stale from a previous run of this cell).
if model_name not in MODEL_CONFIGS:
    raise ValueError(
        f"Unknown model_name {model_name!r}; expected one of {sorted(MODEL_CONFIGS)}"
    )

checkpoint, weights_file, BATCH_SIZE = MODEL_CONFIGS[model_name]
processor = CLIPProcessor.from_pretrained(checkpoint)
model = CLIPModel.from_pretrained(checkpoint).to(device)

if weights_file is not None:
    model_weights_path = model_root + weights_file
    # `map_location` lets weights saved on a GPU be loaded on a CPU-only machine.
    model.load_state_dict(
        torch.load(model_weights_path, weights_only=True, map_location=device)
    )

print(f"Running benchmark on: {model_name}")

# Local (non-Colab) runs are smoke tests: tiny batches on the first 5 captions only.
if not onColab:
    BATCH_SIZE = 2
    CAPTIONS_EN = CAPTIONS_EN.head(5)
    CAPTIONS_FR = CAPTIONS_FR.head(5)
    CAPTIONS_NL = CAPTIONS_NL.head(5)

Running benchmark on: art-mini


In [27]:
# Sanity check on the `literal_eval` converter used when reading the CSVs:
# the first "paths" cell is expected to be a Python list rather than a string.
type(CAPTIONS_EN["paths"][0]) # Should be "list"

list

In [None]:
class LanguageDataset(Dataset):
    """Dataset of (caption, image) pairs backed by a captions DataFrame.

    Each row is read through its "object" column (the caption text) and its
    "paths" column (a sequence of image paths relative to
    `root + "IconclassTest_Images/"`). One path is drawn at random on every
    access, so repeated reads of the same index may return different images.
    """

    def __init__(self, DATASET):
        """Store the DataFrame holding the "object" and "paths" columns."""
        self.DATASET = DATASET

    def __len__(self):
        """Return the number of rows (captions) in the DataFrame."""
        return len(self.DATASET)

    def __getitem__(self, idx):
        """Return `(caption, image)` for row `idx`, with a randomly chosen image.

        The image is returned as a fully loaded RGB `PIL.Image`.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        row = self.DATASET.iloc[idx]

        caption = row["object"]
        selected_path = random.choice(row["paths"])
        path = root + "IconclassTest_Images/" + selected_path

        # `Image.open` is lazy and keeps the file handle open until the pixel
        # data is read. Load the pixels inside a context manager so the handle
        # is closed deterministically; `convert` returns an in-memory copy.
        with Image.open(path) as img:
            image = img.convert("RGB")

        return caption, image

def customBatchBuilder(samples):
    """Collate a list of `(text, image)` samples into one processor batch.

    The samples are split into parallel tuples of texts and images, which are
    handed to the module-level `processor` with padding and truncation enabled
    and PyTorch tensors requested. The processor's output is returned as-is.
    """
    texts, pictures = zip(*samples)
    batch = processor(
        text=texts,
        images=pictures,
        return_tensors="pt",
        padding=True,
        truncation=True,
    )
    return batch

# One dataset / dataloader per caption language.
DATASETS = {
    "en": LanguageDataset(CAPTIONS_EN),
    "fr": LanguageDataset(CAPTIONS_FR),
    "nl": LanguageDataset(CAPTIONS_NL)
}
DATALOADERS = {}

# Reproducibility: seeding Python's `random` in the main process alone is not
# enough. The shuffle order comes from a torch RNG, and each worker process has
# its own `random` state seeded from a base seed drawn from the loader's
# generator. A seeded `torch.Generator` per loader pins both.
SEED = 0
random.seed(SEED)
torch.manual_seed(SEED)

# `prefetch_factor` and `persistent_workers` are only valid with worker
# processes; passing them with `num_workers == 0` raises a ValueError.
worker_options = {}
if best_workers > 0:
    worker_options = {"prefetch_factor": 2, "persistent_workers": True}

for lang in DATASETS.keys():
    loader_rng = torch.Generator()
    loader_rng.manual_seed(SEED)
    DATALOADER = DataLoader(
        DATASETS[lang],
        batch_size=BATCH_SIZE,
        shuffle=True,
        generator=loader_rng,

        num_workers=best_workers,
        # Pinned memory only helps host-to-GPU copies; skip it on CPU-only runs.
        pin_memory=torch.cuda.is_available(),

        collate_fn=customBatchBuilder,
        **worker_options
    )
    DATALOADERS[lang] = DATALOADER

DATALOADERS, DATASETS

({'en': <torch.utils.data.dataloader.DataLoader at 0x7f43ae355050>,
  'fr': <torch.utils.data.dataloader.DataLoader at 0x7f43b6900d10>,
  'nl': <torch.utils.data.dataloader.DataLoader at 0x7f43b6900610>},
 {'en': <__main__.LanguageDataset at 0x7f43ae3551d0>,
  'fr': <__main__.LanguageDataset at 0x7f43b690f590>,
  'nl': <__main__.LanguageDataset at 0x7f43b6900c10>})

In [None]:
def get_average_position(cosine_similarities):
    """
    Average rank of the matching item over all queries.

    Row i of `cosine_similarities` holds the scores of query i against every
    candidate; the correct candidate for query i is the one at index i.
    Each row is sorted by decreasing score and the 1-based rank of index i is
    recorded.

    Returns a pair `(average_position, positions)`:
    - `average_position` is the mean rank, between 1 and n (1 means the
      matching item is always ranked first);
    - `positions` is the list of per-query ranks as plain ints.
    """
    raw_ranks = [
        np.where(np.argsort(scores)[::-1] == query_idx)[0][0] + 1
        for query_idx, scores in enumerate(cosine_similarities)
    ]
    mean_rank = sum(raw_ranks) / len(cosine_similarities)
    return mean_rank, [int(r) for r in raw_ranks]

def get_MRR(cosine_similarities):
    """
    Mean Reciprocal Rank (MRR) of the matching item.

    Row i of `cosine_similarities` holds the scores of query i against every
    candidate; the correct candidate for query i is the one at index i.
    For each query the reciprocal 1/rank of the correct candidate (1-based rank
    in decreasing score order) is taken, and the values are averaged.

    The result lies in (0, 1]; 1 means the matching item is always ranked first.
    """
    reciprocal_sum = sum(
        1 / (np.where(np.argsort(scores)[::-1] == query_idx)[0][0] + 1)
        for query_idx, scores in enumerate(cosine_similarities)
    )
    return reciprocal_sum / len(cosine_similarities)

def get_recall_at_k(cosine_similarities, k):
    """
    Recall@k (hit rate) of the matching item.

    Row i of `cosine_similarities` holds the scores of query i against every
    candidate; the correct candidate for query i is the one at index i.
    A query counts as a hit when index i is among the k highest-scoring
    candidates of its row.

    Returns the fraction of queries that are hits, between 0 and 1
    (1 means the matching item is always within the top k).
    """
    hits = 0
    for query_idx, scores in enumerate(cosine_similarities):
        top_k = np.argsort(scores)[::-1][:k]
        if query_idx in top_k:
            hits += 1
    return hits / len(cosine_similarities)

def get_nDCG_at_k(cosine_similarities, k):
    """
    nDCG@k (normalized discounted cumulative gain) with one relevant item per query.

    Row i of `cosine_similarities` holds the scores of query i against every
    candidate; the correct candidate for query i is the one at index i.
    A query contributes 1 / log2(rank + 1) when the 1-based rank of its correct
    candidate is at most k, and 0 otherwise; contributions are averaged.

    The result lies between 0 and 1; 1 means the matching item is always
    ranked first.
    """
    def gain(query_idx, scores):
        rank = np.where(np.argsort(scores)[::-1] == query_idx)[0][0] + 1
        if rank <= k:
            return 1 / np.log2(rank + 1)
        return 0

    total_gain = sum(
        gain(query_idx, scores)
        for query_idx, scores in enumerate(cosine_similarities)
    )
    return total_gain / len(cosine_similarities)

In [None]:
def benchmark_on_dataloader(model, dataset, dataloader, device):
    """
    Evaluate `model` on every batch of `dataloader` and compute retrieval metrics.

    Parameters:
    - model: called with `input_ids`, `pixel_values` and `attention_mask`; its
      output must expose `logits_per_image`, `logits_per_text`, `image_embeds`
      and `text_embeds`.
    - dataset: only its length is used, to average the loss per pair.
    - dataloader: yields mappings with 'input_ids', 'attention_mask' and
      'pixel_values' tensors.
    - device: device the batch tensors are moved to.

    Returns a list, in this order:
    [loss per pair, average position, MRR,
     Recall@1, Recall@5, Recall@10, nDCG@1, nDCG@5, nDCG@10]

    Raises ValueError if the dataloader yields no batch.
    """
    model.eval()
    loss_tot = 0

    # Per-batch embeddings are collected in lists and concatenated once after
    # the loop; concatenating inside the loop would re-copy everything
    # accumulated so far on every batch.
    image_chunks = []
    text_chunks = []

    with torch.no_grad():
        for inputs in tqdm(dataloader):
            input_ids = inputs['input_ids'].to(device)
            attention_mask = inputs['attention_mask'].to(device)
            pixel_values = inputs['pixel_values'].to(device)

            # Generate model outputs
            outputs = model(input_ids=input_ids, pixel_values=pixel_values, attention_mask=attention_mask)
            logits_per_image = outputs.logits_per_image  # presumably [batch_size, batch_size] — TODO confirm
            logits_per_text = outputs.logits_per_text    # presumably [batch_size, batch_size] — TODO confirm

            # L2-normalize the embeddings (out of place, so the model's output
            # tensors are not modified).
            image_embeddings = outputs.image_embeds
            text_embeddings = outputs.text_embeds
            image_embeddings = image_embeddings / image_embeddings.norm(dim=-1, keepdim=True)
            text_embeddings = text_embeddings / text_embeddings.norm(dim=-1, keepdim=True)

            # Ground truth for this batch: pair i matches pair i (the diagonal).
            batch_size = logits_per_image.size(0)
            targets = torch.arange(batch_size, device=device)

            # Symmetric contrastive loss, weighted by batch size so the total
            # can be averaged per pair afterwards.
            texts_loss = criterion(logits_per_text, targets)
            images_loss = criterion(logits_per_image, targets)
            loss = (images_loss + texts_loss) / 2.0
            loss_tot += loss.item() * batch_size

            image_chunks.append(image_embeddings)
            text_chunks.append(text_embeddings)

    if not image_chunks:
        raise ValueError("dataloader yielded no batches; nothing to benchmark")

    # Average loss per pair
    loss_per_pair = loss_tot / len(dataset)

    # Cosine similarity between every image and every text (embeddings are unit-norm).
    images_embeddings = torch.cat(image_chunks, 0)
    texts_embeddings = torch.cat(text_chunks, 0)
    similarities = (images_embeddings @ texts_embeddings.T).cpu().numpy()

    # Ranking metrics
    average_position, _ = get_average_position(similarities)
    mrr = get_MRR(similarities)
    recalls = {k: get_recall_at_k(similarities, k) for k in (1, 5, 10)}
    ndcgs = {k: get_nDCG_at_k(similarities, k) for k in (1, 5, 10)}

    return [loss_per_pair, average_position, mrr, recalls[1], recalls[5], recalls[10], ndcgs[1], ndcgs[5], ndcgs[10]]

In [None]:
def run_benchmark(results, lang, fold):
    """Benchmark the global `model` on language `lang` and record one result row.

    The row `[lang, fold, *measures]` is appended to `results` at index
    `len(results)`; the same DataFrame is returned.
    """
    print(f"Running fold #{fold} benchmark on {lang}...")

    row = [lang, fold]
    row += benchmark_on_dataloader(model, DATASETS[lang], DATALOADERS[lang], device)
    results.loc[len(results)] = row

    return results

In [None]:
# One row per (language, fold); the column order matches the list returned by
# `benchmark_on_dataloader`, prefixed with the language and the fold number.
results = pd.DataFrame(
    columns=[
        "lang", "fold#",
        "loss", "average_position", "mrr",
        "recall@1", "recall@5", "recall@10",
        "nDCG@1", "nDCG@5", "nDCG@10",
    ]
)
nb_folds = 3
for lang in DATASETS:
    fold = 0
    while fold < nb_folds:
        results = run_benchmark(results, lang, fold)
        fold += 1

Running benchmark on en...


100%|██████████| 117/117 [00:52<00:00,  2.21it/s]


Running benchmark on fr...


100%|██████████| 117/117 [00:44<00:00,  2.64it/s]


Running benchmark on nl...


100%|██████████| 117/117 [00:44<00:00,  2.63it/s]


In [None]:
# Write the results next to the other benchmark outputs and show where they went.
results_file = results_path + f"exploded_{model_name}.csv"
results.to_csv(results_file, index=False)
print(results_file)
results

/content/drive/MyDrive/MASTER_THESIS/IconClassTestingScriptAndResults/results/res_art-large.csv


Unnamed: 0,lang,loss,average_position,mrr,recall@1,recall@5,recall@10,nDCG@1,nDCG@5,nDCG@10
0,en,1.49779,129.376142,0.176244,0.10317,0.235895,0.327781,0.10317,0.170296,0.199598
1,fr,1.831241,168.414293,0.122019,0.059645,0.167114,0.239656,0.059645,0.115087,0.138291
2,nl,2.144728,216.161204,0.08829,0.040301,0.119291,0.179473,0.040301,0.079834,0.099152
