In [45]:
# OPTIONAL: Load the "autoreload" extension so that code can change
%load_ext autoreload
# OPTIONAL: always reload modules so that as you change code in src, it gets loaded
%autoreload 2

import sys
from munch import Munch

PROJECT_PATH = '/Users/mhendriksen/Desktop/repositories/evaluating-cmr-in-mm/'
CONFIG_PATH = '/Users/mhendriksen/Desktop/repositories/evaluating-cmr-in-mm/config/development_local_coco.yaml'

sys.path.append(PROJECT_PATH)

with open(CONFIG_PATH, 'rb') as f:
    config = Munch.fromYAML(f)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
import sys
from munch import Munch

PROJECT_PATH = '/Users/mhendriksen/Desktop/repositories/evaluating-cmr-in-mm/'
CONFIG_PATH = '/Users/mhendriksen/Desktop/repositories/evaluating-cmr-in-mm/config/development_local_coco.yaml'

sys.path.append(PROJECT_PATH)

with open(CONFIG_PATH, 'rb') as f:
    config = Munch.fromYAML(f)

In [3]:
from src.data.dataset import Dataset
from src.utils.dataset_preprocessing import load_json_annotations

json_file = load_json_annotations(config=config)

coco_test_split = Dataset(
    config=config,
    split='test',
    json_file=json_file)

Loaded annotations from  /Users/mhendriksen/Desktop/repositories/datasets/coco/annotations/dataset_coco.json


In [4]:
from sentence_transformers import util
from PIL import Image
import glob
import torch
import pickle
import zipfile
from IPython.display import display
from IPython.display import Image as IPImage
import os
from tqdm.autonotebook import tqdm
from munch import Munch
torch.set_num_threads(4)

from src.models.encoders.clip import CLIP

# load the model
model = CLIP(config=config)

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# load precomputed image embeddings
img_filenames, img_emb = model.compute_image_embeddings()

Image embeddigns are already precomputed
Loaded precomputed filenames and embeddings from  /Users/mhendriksen/Desktop/repositories/datasets/coco/coco-img-embeddings.pkl


In [91]:
from src.utils.dataset_preprocessing import get_precomputed_embeddings_path, dump_filenames_embs_to_pkl

img_emb_filename = get_precomputed_embeddings_path(config=config, dtype='img')
image_data_precomputed = (img_filenames, img_emb)


dump_filenames_embs_to_pkl(emb_file_path=img_emb_filename, data=image_data_precomputed)

Saved files to  /Users/mhendriksen/Desktop/repositories/datasets/coco/coco-img-embeddings.pkl


In [77]:
capt_ids, capts, capt_embs = model.compute_caption_embeddings(ds_split=coco_test_split)

Computing caption embeddings...


Batches: 100%|██████████| 196/196 [01:03<00:00,  3.06it/s]


In [92]:
from src.utils.dataset_preprocessing import get_precomputed_embeddings_path, dump_filenames_embs_to_pkl, load_filenames_embs_from_pkl

capt_emb_filename = get_precomputed_embeddings_path(config=config, dtype='capt')
caption_data_precomputed = (capt_ids, capts, capt_embs)

dump_filenames_embs_to_pkl(emb_file_path=capt_emb_filename, data=caption_data_precomputed)

Saved files to  /Users/mhendriksen/Desktop/repositories/datasets/coco/coco-capt-embeddings.pkl


In [95]:
from src.retrieval.retriever import Retriever
from src.metrics.recall_at_k import recall_at_k
from src.models.relevance_estimators.clip_based import RelevanceEstimator
from src.metrics.dcg import DCG

rel_estimator = RelevanceEstimator(config=config, dataset=coco_test_split)
retriever = Retriever(config=config, model=model)
dcg = DCG(config=config, rel_estimator=rel_estimator)

t2i_queries = []
t2i_targets = []
t2i_retrieved_documents = []
t2i_scores = []
t2i_recalls_at_1 = []
t2i_recalls_at_5 = []
t2i_recalls_at_10 = []
t2i_dcgs = []

print('Text to image evaluation...')
for datapoint in coco_test_split:
    # get textual query and target
    query = datapoint[0]
    target_filename = datapoint[4]

    retrieved_documents, scores = retriever.retrieve_top_k(
        query=query,
        documents=img_emb,
        documents_names=img_filenames,
        k=10
        )
    
    # metrics:
    # compute recall at k
    # t2i recall: there is only one correct item in the collection, i.e., total_in_collection=1
    t2i_recall_at_1 = recall_at_k(target_filename=target_filename, retrieved_documents=retrieved_documents, k=1, total_in_collection=1)
    t2i_recall_at_5 = recall_at_k(target_filename=target_filename, retrieved_documents=retrieved_documents, k=5, total_in_collection=1)
    t2i_recall_at_10 = recall_at_k(target_filename=target_filename, retrieved_documents=retrieved_documents, k=10, total_in_collection=1)
    # print('t2i: recalls at 1, 5, 10: ', t2i_recall_at_1, t2i_recall_at_5, t2i_recall_at_10)

    t2i_dcg = dcg.compute_dcg(query=query, target_filename=target_filename, retrieved_documents=retrieved_documents)
    # print('T2i_dcg: ', t2i_dcg)

    t2i_queries.append(query)
    t2i_targets.append(target_filename)
    t2i_retrieved_documents.append(retrieved_documents)
    t2i_scores.append(scores)
    t2i_recalls_at_1.append(t2i_recall_at_1)
    t2i_recalls_at_5.append(t2i_recall_at_5)
    t2i_recalls_at_10.append(t2i_recall_at_10)
    t2i_dcgs.append(t2i_dcg)

    if datapoint[-1] > 0 and datapoint[-1] % 100 == 0:
        print(f'Progress: {datapoint[-1]}/{len(coco_test_split)}')


Text to image evaluation...
Progress: 100/25010
Progress: 200/25010
Progress: 300/25010
Progress: 400/25010
Progress: 500/25010
Progress: 600/25010
Progress: 700/25010
Progress: 800/25010
Progress: 900/25010
Progress: 1000/25010
Progress: 1100/25010
Progress: 1200/25010
Progress: 1300/25010
Progress: 1400/25010
Progress: 1500/25010
Progress: 1600/25010
Progress: 1700/25010
Progress: 1800/25010
Progress: 1900/25010
Progress: 2000/25010
Progress: 2100/25010
Progress: 2200/25010
Progress: 2300/25010
Progress: 2400/25010
Progress: 2500/25010
Progress: 2600/25010
Progress: 2700/25010
Progress: 2800/25010
Progress: 2900/25010
Progress: 3000/25010
Progress: 3100/25010
Progress: 3200/25010
Progress: 3300/25010
Progress: 3400/25010
Progress: 3500/25010
Progress: 3600/25010
Progress: 3700/25010
Progress: 3800/25010
Progress: 3900/25010
Progress: 4000/25010
Progress: 4100/25010
Progress: 4200/25010
Progress: 4300/25010
Progress: 4400/25010
Progress: 4500/25010
Progress: 4600/25010
Progress: 4700/

: 

: 

In [None]:
data={
        't2i_queries': t2i_queries,
        't2i_targets': t2i_targets,
        't2i_retrieved_documents': t2i_retrieved_documents,
        't2i_scores': t2i_scores,
        't2i_recalls_at_1': t2i_recalls_at_1,
        't2i_recalls_at_5': t2i_recalls_at_5,
        't2i_recalls_at_10': t2i_recalls_at_10,
        't2i_dcgs': t2i_dcgs
        }

import pandas as pd

t2i_results = pd.DataFrame(
    data=data
)

print(t2i_results.describe())

In [None]:
from src.utils.dataset_preprocessing import save_results_dataframe

save_results_dataframe(config=config, dataf=t2i_results, filename='coco-t2i-results')