In [1]:
from pathlib import Path
import os
from joblib import load
import sys
from datasets import load_from_disk
from sklearn.metrics.pairwise import cosine_similarity
import json
from transformers import CLIPProcessor, CLIPModel
import torch
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Put the parent of the current working directory on the import path so the
# project-local `src` package can be imported below.
# NOTE(review): assumes the notebook is started from a direct subdirectory of
# the repository root — confirm.
sys.path.append(os.fspath(Path.cwd().parent))
from src.utils.dirutils import (
    get_data_dir,
    get_models_dir,
)

In [3]:
# Precomputed embeddings, keyed by file name; each entry is indexed below by
# "img_embedding" and "caption_embedding".
# NOTE(review): joblib files are pickle-based — only load artifacts that were
# produced by this project.
gt_embeddings = load(
    get_data_dir() / "interim" / "clip" / "dataset_embeddings.joblib"
)

In [4]:
# Only the "test" split of the saved dataset is used in this notebook.
dataset = load_from_disk(
    get_data_dir() / "processed" / "captioning_dataset_augmented"
)["test"]

In [5]:
# File names belonging to the test split. Reading the "file_name" column
# directly avoids building a full example dict for every row just to pick out
# a single field, and yields the same set of values.
test_set_filenames = set(dataset["file_name"])

In [6]:
# Keep only the embeddings whose key is a test-split file name. Re-running
# this cell is harmless: filtering an already-filtered dict is a no-op.
gt_embeddings = {
    file_name: entry
    for file_name, entry in gt_embeddings.items()
    if file_name in test_set_filenames
}

In [7]:
# Baseline: mean cosine similarity between each image embedding and the
# embedding of its ground-truth caption.

# Fail with a clear message instead of an opaque ZeroDivisionError when the
# filtering step left nothing to score (e.g. keys not matching file names).
if not gt_embeddings:
    raise ValueError(
        "gt_embeddings is empty; check that its keys match the file_name "
        "values of the test split"
    )

tot_score = 0.0
# Iterate over values only: the key is not needed, and binding it to `_`
# would shadow IPython's "last output" variable.
for embeddings in gt_embeddings.values():
    # cosine_similarity takes 2-D inputs and returns a matrix; with one row on
    # each side the single score is at [0][0].
    tot_score += cosine_similarity(
        [embeddings["img_embedding"]], [embeddings["caption_embedding"]]
    )[0][0]
avg_score = tot_score / len(gt_embeddings)
print(avg_score)

0.25814173782307703


In [8]:
# Models to evaluate in this run; add more directory names to score several
# models in one pass.
model_names = ("microsoft-git-base-good-samples",)

# model name -> parsed contents of that model's test_outputs_flat.json
# (iterated later as key -> generated caption text).
outputs = {}

for model_name in model_names:
    outputs_path = (
        get_models_dir() / "captioning" / model_name / "test_outputs_flat.json"
    )
    # Explicit UTF-8: without it the file is decoded with the platform's
    # locale encoding, which can fail or corrupt non-ASCII caption text.
    with open(outputs_path, encoding="utf-8") as f:
        outputs[model_name] = json.load(f)

In [9]:
# Score each model's generated captions: embed the caption text with CLIP and
# average its cosine similarity to the stored image embedding of the same key.
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
clip = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip.train(False)  # inference mode (same effect as clip.eval())

# Hard-coded scores.
# NOTE(review): presumably recorded from earlier runs of this notebook —
# confirm. Entries for models listed in `model_names` are overwritten below.
avg_scores = {
    'blip-2': 0.28124519233363415,
    'microsoft-git-base': 0.2693781059735237,
    'microsoft-git-base-good-samples': 0.2787208376248433,
    'microsoft-git-base-no-weights': 0.2661253812732238,
    'microsoft-git-base-frozen': 0.26104122709153155
}

for model_name in model_names:
    captions = outputs[model_name]
    # Clear error instead of a ZeroDivisionError when nothing was loaded.
    if not captions:
        raise ValueError(f"no generated captions loaded for {model_name!r}")

    tot_score = 0.0
    # A single no_grad context around the whole loop: no autograd state is
    # needed for inference, and there is no reason to re-enter it per caption.
    with torch.no_grad():
        for key, value in tqdm(captions.items()):
            inputs = processor(
                text=[value],
                images=None,
                return_tensors="pt",
                padding=True,
                # Cap over-length captions at the tokenizer's maximum length
                # so one long caption cannot abort the whole run; captions
                # within the limit are unaffected.
                truncation=True,
            )

            text_features = clip.get_text_features(**inputs).detach().numpy()
            embedding = text_features[0]
            # A missing key raises KeyError on purpose: every generated
            # caption must have a matching ground-truth image embedding.
            tot_score += cosine_similarity(
                [embedding], [gt_embeddings[key]["img_embedding"]]
            )[0][0]

    # Every caption is scored exactly once, so the count is len(captions).
    avg_score = tot_score / len(captions)
    avg_scores[model_name] = avg_score

100%|██████████| 17696/17696 [11:22<00:00, 25.92it/s]


In [11]:
# Show the per-model average CLIP similarity scores (hard-coded entries plus
# the value computed for each model in `model_names`).
avg_scores

{'blip-2': 0.28124519233363415,
 'microsoft-git-base': 0.2693781059735237,
 'microsoft-git-base-good-samples': 0.2787208376248433,
 'microsoft-git-base-no-weights': 0.2661253812732238,
 'microsoft-git-base-frozen': 0.26104122709153155}

In [12]:
# Sanity check: number of ground-truth entries kept after filtering to the
# test split.
len(gt_embeddings)

17696