In [39]:
import os 
os.environ["http_proxy"] = "http://devproxy.bloomberg.com:82"
os.environ["https_proxy"] = "http://devproxy.bloomberg.com:82"

In [75]:
# OPTIONAL: Load the "autoreload" extension so that code can change
%load_ext autoreload
%autoreload 2

import sys
from munch import Munch

PROJECT_PATH = '/Users/mhendriksen/Desktop/repositories/evaluating-cmr-in-mm/'
CONFIG_PATH = '/Users/mhendriksen/Desktop/repositories/evaluating-cmr-in-mm/config/development_local_f30k.yaml'

sys.path.append(PROJECT_PATH)

with open(CONFIG_PATH, 'rb') as f:
    config = Munch.fromYAML(f)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [76]:
from src.data.dataset import Dataset
from src.utils.dataset_preprocessing import load_json_annotations

json_file = load_json_annotations(config=config)

f30k_test_split = Dataset(
    config=config,
    split='test',
    json_file=json_file)

Loaded annotations from  /Users/mhendriksen/Desktop/repositories/datasets/f30k/annotations/dataset_flickr30k.json


In [106]:
from sentence_transformers import util
from PIL import Image
import glob
import torch
import pickle
import zipfile
from IPython.display import display
from IPython.display import Image as IPImage
import os
from tqdm.autonotebook import tqdm
from munch import Munch
torch.set_num_threads(4)

from src.models.encoders.clip import CLIP

# load the model
model = CLIP(config=config)

<class 'src.models.encoders.clip.CLIP'>


# Text-to-image

In [43]:
# load precomputed image embeddings
f30k_img_filenames, f30k_img_emb = model.compute_image_embeddings()

Image embeddigns are already precomputed
Loaded precomputed filenames and embeddings from  /Users/mhendriksen/Desktop/repositories/datasets/f30k/f30k-img-embeddings.pkl


In [44]:
from src.retrieval.retriever import Retriever
from src.metrics.recall_at_k import recall_at_k
from src.models.relevance_estimators.clip_based import RelevanceEstimator
from src.metrics.dcg import DCG

rel_estimator = RelevanceEstimator(config=config, dataset=f30k_test_split)
retriever = Retriever(config=config, model=model)
dcg = DCG(config=config, rel_estimator=rel_estimator)

t2i_queries = []
t2i_targets = []
t2i_retrieved_documents = []
t2i_scores = []
t2i_recalls_at_1 = []
t2i_recalls_at_5 = []
t2i_recalls_at_10 = []
t2i_dcgs = []

print('Text to image evaluation...')
for datapoint in f30k_test_split:
    # get textual query and target
    query = datapoint[0]
    target_filename = datapoint[4]

    retrieved_documents, scores = retriever.retrieve_top_k(
        query=query,
        documents=f30k_img_emb,
        documents_names=f30k_img_filenames,
        k=10
        )
    
    # metrics:
    # compute recall at k
    # t2i recall: there is only one correct item in the collection, i.e., total_in_collection=1
    t2i_recall_at_1 = recall_at_k(target_filename=target_filename, retrieved_documents=retrieved_documents, k=1, total_in_collection=1)
    t2i_recall_at_5 = recall_at_k(target_filename=target_filename, retrieved_documents=retrieved_documents, k=5, total_in_collection=1)
    t2i_recall_at_10 = recall_at_k(target_filename=target_filename, retrieved_documents=retrieved_documents, k=10, total_in_collection=1)
    # print('t2i: recalls at 1, 5, 10: ', t2i_recall_at_1, t2i_recall_at_5, t2i_recall_at_10)

    t2i_dcg = dcg.compute_dcg(query=query, target_filename=target_filename, retrieved_documents=retrieved_documents)
    # print('T2i_dcg: ', t2i_dcg)

    t2i_queries.append(query)
    t2i_targets.append(target_filename)
    t2i_retrieved_documents.append(retrieved_documents)
    t2i_scores.append(scores)
    t2i_recalls_at_1.append(t2i_recall_at_1)
    t2i_recalls_at_5.append(t2i_recall_at_5)
    t2i_recalls_at_10.append(t2i_recall_at_10)
    t2i_dcgs.append(t2i_dcg)

    if datapoint[-1] > 0 and datapoint[-1] % 100 == 0:
        print(f'Progress: {datapoint[-1]}/{len(f30k_test_split)}')


Text to image evaluation...


KeyboardInterrupt: 

In [None]:
data={
        't2i_queries': t2i_queries,
        't2i_targets': t2i_targets,
        't2i_retrieved_documents': t2i_retrieved_documents,
        't2i_scores': t2i_scores,
        't2i_recalls_at_1': t2i_recalls_at_1,
        't2i_recalls_at_5': t2i_recalls_at_5,
        't2i_recalls_at_10': t2i_recalls_at_10,
        't2i_dcgs': t2i_dcgs
        }

import pandas as pd

t2i_results = pd.DataFrame(
    data=data
)

t2i_results.describe()

Unnamed: 0,t2i_recalls_at_1,t2i_recalls_at_5,t2i_recalls_at_10,t2i_dcgs
count,5000.0,5000.0,5000.0,5000.0
mean,0.193,0.3974,0.4922,1.702666
std,0.394692,0.489409,0.499989,0.266153
min,0.0,0.0,0.0,1.0621
25%,0.0,0.0,0.0,1.486975
50%,0.0,0.0,0.0,1.6168
75%,0.0,1.0,1.0,1.90115
max,1.0,1.0,1.0,2.3179


In [None]:
from src.utils.dataset_preprocessing import save_results_dataframe

save_results_dataframe(config=config, dataf=t2i_results, filename='f30k-t2i-results')

Saved dataframe to  /Users/mhendriksen/Desktop/repositories/evaluating-cmr-in-mm/results/f30k-t2i-results.pkl


In [None]:
f30k_img_emb

# Image-to-text

In [102]:
f30k_capt_ids, f30k_capts, f30k_capt_embs = model.compute_caption_embeddings(ds_split=f30k_test_split)

Caption embeddigns are already precomputed
Loaded precomputed filenames and embeddings from  /Users/mhendriksen/Desktop/repositories/datasets/f30k/f30k-capt-embeddings.pkl


In [103]:
f30k_capt_ids[0]

125

In [104]:
f30k_capts[0]

'The man with pierced ears is wearing glasses and an orange hat.'

In [105]:
f30k_capt_ids[0]

125

In [101]:
from src.utils.dataset_preprocessing import get_precomputed_embeddings_path, dump_filenames_embs_to_pkl

emb_path = get_precomputed_embeddings_path(config=config, dtype='capt')

dump_filenames_embs_to_pkl(emb_file_path=emb_path,
                           data=(f30k_capt_ids, f30k_capts, f30k_capt_embs))

Saved files to  /Users/mhendriksen/Desktop/repositories/datasets/f30k/f30k-capt-embeddings.pkl


In [None]:
from src.utils.dataset_preprocessing import get_precomputed_embeddings_path, dump_filenames_embs_to_pkl, load_filenames_embs_from_pkl

capt_emb_filename = get_precomputed_embeddings_path(config=config, dtype='capt')
caption_data_precomputed = (f30k_capt_ids, f30k_capts, f30k_capt_embs)

if not os.path.exists(img_emb_filename):
    dump_filenames_embs_to_pkl(emb_file_path=img_emb_filename, data=caption_data_precomputed)

9242

In [None]:
import pickle

path = '/Users/mhendriksen/Desktop/repositories/evaluating-cmr-in-mm/results/coco-i2t-results.pkl'

with open(path, 'rb') as f:
    data = pickle.load(f)

data.shape

(25010, 8)

In [None]:
from src.retrieval.retriever import Retriever
from src.metrics.recall_at_k import recall_at_k
from src.metrics.dcg import DCG

retriever = Retriever(config=config, model=model)
dcg = DCG(config=config, rel_estimator=rel_estimator)

i2t_queries = []
i2t_targets = []
i2t_retrieved_documents = []
i2t_scores = []
i2t_recalls_at_1 = []
i2t_recalls_at_5 = []
i2t_recalls_at_10 = []
i2t_dcgs = []

print('Image to text evaluation...')
for datapoint in f30k_test_split:
    # get textual query and target
    query = datapoint[1]
    target_filename = datapoint[3]


    retrieved_caption_ids, scores = retriever.retrieve_top_k(
        query=query,
        documents=f30k_capt_embs,
        documents_names=f30k_capt_ids,
        k=10
        )

    associated_img_ids = [f30k_test_split.captions[capt_id]['imgid'] for capt_id in retrieved_caption_ids]
    # print('retrieved_caption_ids: ', retrieved_caption_ids)
    # print('associated_img_ids: ', associated_img_ids)
    
    # metrics:
    # compute recall at k
    # i2t recall: there is only one correct item in the collection, i.e., total_in_collection=1
    i2t_recall_at_1 = recall_at_k(target_filename=target_filename, retrieved_documents=associated_img_ids, k=1, total_in_collection=5)
    i2t_recall_at_5 = recall_at_k(target_filename=target_filename, retrieved_documents=associated_img_ids, k=5, total_in_collection=5)
    i2t_recall_at_10 = recall_at_k(target_filename=target_filename, retrieved_documents=associated_img_ids, k=10, total_in_collection=5)
    # print('i2t: recalls at 1, 5, 10: ', i2t_recall_at_1, i2t_recall_at_5, i2t_recall_at_10)

    i2t_dcg = dcg.compute_dcg(query=query, target_filename=target_filename, retrieved_documents=associated_img_ids, caption_ids=retrieved_caption_ids)
    # print('i2t_dcg: ', i2t_dcg)

    i2t_queries.append(query)
    i2t_targets.append(target_filename)
    i2t_retrieved_documents.append(retrieved_documents)
    i2t_scores.append(scores)
    i2t_recalls_at_1.append(i2t_recall_at_1)
    i2t_recalls_at_5.append(i2t_recall_at_5)
    i2t_recalls_at_10.append(i2t_recall_at_10)
    i2t_dcgs.append(i2t_dcg)

    if datapoint[-1] > 0 and datapoint[-1] % 100 == 0:
        print(f'Progress: {datapoint[-1]}/{len(f30k_test_split)}')

Image to text evaluation...
Progress: 100/5000
Progress: 200/5000
Progress: 300/5000
Progress: 400/5000
Progress: 500/5000
Progress: 600/5000
Progress: 700/5000
Progress: 800/5000
Progress: 900/5000
Progress: 1000/5000
Progress: 1100/5000
Progress: 1200/5000
Progress: 1300/5000
Progress: 1400/5000
Progress: 1500/5000
Progress: 1600/5000
Progress: 1700/5000
Progress: 1800/5000
Progress: 1900/5000
Progress: 2000/5000
Progress: 2100/5000
Progress: 2200/5000
Progress: 2300/5000
Progress: 2400/5000
Progress: 2500/5000
Progress: 2600/5000
Progress: 2700/5000
Progress: 2800/5000
Progress: 2900/5000
Progress: 3000/5000
Progress: 3100/5000
Progress: 3200/5000
Progress: 3300/5000
Progress: 3400/5000
Progress: 3500/5000
Progress: 3600/5000
Progress: 3700/5000
Progress: 3800/5000
Progress: 3900/5000
Progress: 4000/5000
Progress: 4100/5000
Progress: 4200/5000
Progress: 4300/5000
Progress: 4400/5000
Progress: 4500/5000
Progress: 4600/5000
Progress: 4700/5000
Progress: 4800/5000
Progress: 4900/5000


In [None]:
data={
        'i2t_queries': i2t_targets,
        'i2t_targets': i2t_targets,
        'i2t_retrieved_documents': i2t_retrieved_documents,
        'i2t_scores': i2t_scores,
        'i2t_recalls_at_1': i2t_recalls_at_1,
        'i2t_recalls_at_5': i2t_recalls_at_5,
        'i2t_recalls_at_10': i2t_recalls_at_10,
        'i2t_dcgs': i2t_dcgs
        }


import pandas as pd

i2t_results = pd.DataFrame(
    data=data
)

i2t_results

Unnamed: 0,i2t_queries,i2t_targets,i2t_retrieved_documents,i2t_scores,i2t_recalls_at_1,i2t_recalls_at_5,i2t_recalls_at_10,i2t_dcgs
0,25,25,"[2149968397.jpg, 90168112.jpg, 3457856049.jpg,...","[0.3575, 0.3318, 0.3113, 0.297, 0.2944, 0.2887...",1.0,1.0,1.0,1.3534
1,25,25,"[2149968397.jpg, 90168112.jpg, 3457856049.jpg,...","[0.3575, 0.3318, 0.3113, 0.297, 0.2944, 0.2887...",1.0,1.0,1.0,1.3534
2,25,25,"[2149968397.jpg, 90168112.jpg, 3457856049.jpg,...","[0.3575, 0.3318, 0.3113, 0.297, 0.2944, 0.2887...",1.0,1.0,1.0,1.3534
3,25,25,"[2149968397.jpg, 90168112.jpg, 3457856049.jpg,...","[0.3575, 0.3318, 0.3113, 0.297, 0.2944, 0.2887...",1.0,1.0,1.0,1.3534
4,25,25,"[2149968397.jpg, 90168112.jpg, 3457856049.jpg,...","[0.3575, 0.3318, 0.3113, 0.297, 0.2944, 0.2887...",1.0,1.0,1.0,1.3534
...,...,...,...,...,...,...,...,...
4995,30943,30943,"[2149968397.jpg, 90168112.jpg, 3457856049.jpg,...","[0.3198, 0.3172, 0.3064, 0.3049, 0.299, 0.2976...",1.0,0.6,0.6,-0.2817
4996,30943,30943,"[2149968397.jpg, 90168112.jpg, 3457856049.jpg,...","[0.3198, 0.3172, 0.3064, 0.3049, 0.299, 0.2976...",1.0,0.6,0.6,-0.2817
4997,30943,30943,"[2149968397.jpg, 90168112.jpg, 3457856049.jpg,...","[0.3198, 0.3172, 0.3064, 0.3049, 0.299, 0.2976...",1.0,0.6,0.6,-0.2817
4998,30943,30943,"[2149968397.jpg, 90168112.jpg, 3457856049.jpg,...","[0.3198, 0.3172, 0.3064, 0.3049, 0.299, 0.2976...",1.0,0.6,0.6,-0.2817


In [None]:
print(len(t2i_queries))
print(len(t2i_targets))
print(len(t2i_retrieved_documents))
print(len(t2i_scores))
print(len(t2i_recalls_at_1))
print(len(t2i_recalls_at_5))
print(len(t2i_recalls_at_10))
print(len(t2i_dcgs))

11
11
11
11
11
11
11
11


In [None]:
import boto3

endpoint_url='http://s3.dev.obdc.bcs.bloomberg.com'
aws_access_key_id='J8I524PGEG4KVPV14HM7'
aws_secret_access_key='jhfJZBCQR6GHUq4VlImx8gOeAW3BA5wnSL08lqBJ'

client = boto3.client(
      's3',
      aws_access_key_id = aws_access_key_id,
      aws_secret_access_key = aws_secret_access_key,
      endpoint_url = endpoint_url
      )

In [48]:
client.upload_file('./test.txt', 'mariya', 'coco/test.txt')

In [60]:
import os

subset = []
prefix = './test'
for el in client.list_objects(Bucket='mariya')['Contents']:
    # print(el['Key'])
    # print(type(el['Key']))
    if el['Key'].startswith(prefix):
        subset.append(el)
        # client.download_file('mariya', el['Key'], el['Key'])
subset


[]

In [63]:
path = '/Users/mhendriksen/Desktop/repositories/evaluating-cmr-in-mm/results/coco-t2i-results.pkl'

with open(path, 'rb') as f:
    coco_t2i_results = pickle.load(f)

In [71]:
import PIL

PIL.Image.Image

PIL.Image.Image

In [72]:
which python

SyntaxError: invalid syntax (1722139382.py, line 1)