# Imports

In [1]:
import numpy as np

In [2]:
import os.path as osp
import numpy as np
from tqdm import tqdm
from utils import pickle_save, pickle_load
from pprint import pprint
from utils.data.delf import datum_io
from copy import deepcopy

In [3]:
import sacred
from sacred import SETTINGS
from sacred.utils import apply_backspaces_and_linefeeds
from numpy import linalg as LA
from utils import pickle_load, pickle_save
from utils.revisited import compute_metrics
from utils.data.delf import datum_io

In [4]:
# Show the signature of the compute_metrics imported from utils.revisited.
help(compute_metrics)

Help on function compute_metrics in module utils.revisited:

compute_metrics(dataset, ranks, gnd, sizes=[], kappas=[1, 5, 10])



In [5]:
# Create the sacred experiment object for this notebook.
# interactive=True: presumably required for sacred to accept a notebook/REPL session — confirm in the sacred docs.
# NOTE(review): the experiment name says 'RTT' while the dataset folder is 'viquae_for_rrt' — possible typo, confirm.
ex = sacred.Experiment('Prepare Top-K (VIQUAE FOR RTT)', interactive=True)
# Capture output at the 'sys' level and filter backspaces and linefeeds from it
SETTINGS.CAPTURE_MODE = 'sys'
ex.captured_out_filter = apply_backspaces_and_linefeeds

In [6]:
# feature_name selects the 'delg_<feature_name>' descriptor folder and the output file suffix;
# set_name is the prefix of every per-split file name.
feature_name = 'r50_gldv1'
set_name = 'tuto'
# Ground-truth pickle of the split: 'gnd_<set_name>.pkl'.
gnd_name = 'gnd_{}.pkl'.format(set_name)

In [7]:
import os

# Folder of the dataset under the data root.
dataset_name = 'viquae_for_rrt'
# Root folder that contains the dataset folders. It can be overridden with the DELG_DATA_ROOT
# environment variable so the notebook runs on other machines; the default is the original path.
data_root = os.environ.get(
    'DELG_DATA_ROOT',
    '/mnt/beegfs/home/smessoud/RerankingTransformer/models/research/delf/delf/python/delg/data',
)
data_dir = osp.join(data_root, dataset_name)

In [8]:
# Run switches.
# training: when True the saved file name gets a 'training_' prefix.
training = False
# use_aqe: when True the queries are expanded with their top neighbours before ranking.
use_aqe = False
# AQE settings: 'k' neighbours are averaged in, each weighted by similarity ** 'alpha'.
aqe_params = dict(k=2, alpha=0.3)

# Whether the ranked neighbour indices are pickled into data_dir.
save_nn_inds = True

In [9]:
# Names of the list files of the split: '<set_name>_query.txt', '<set_name>_gallery.txt'
# and '<set_name>_selection.txt'.
query_file, gallery_file, selection_file = (
    '{}_{}.txt'.format(set_name, kind) for kind in ('query', 'gallery', 'selection')
)

In [10]:
# Read the query list file: one entry per line, line terminators removed.
query_list_path = osp.join(data_dir, query_file)
with open(query_list_path) as fid:
    query_lines = fid.read().splitlines()

In [11]:
# Number of entries read from the query list file.
len(query_lines)

100

In [12]:
# Load one global descriptor per query line.
# The first ';;'-separated field of a line is treated as a path; its basename without
# extension names the '<name>.delg_global' file inside 'delg_<feature_name>'.
query_feat_dir = osp.join(data_dir, 'delg_' + feature_name)
query_feats = []
for line in tqdm(query_lines):
    first_field = line.split(';;')[0]
    name = osp.splitext(osp.basename(first_field))[0]
    path = osp.join(query_feat_dir, name + '.delg_global')
    query_feats.append(datum_io.ReadFromFile(path))

100%|██████████| 100/100 [00:00<00:00, 660.39it/s]


In [13]:
# Stack the per-query descriptors into one 2-D array, then L2-normalise each row.
query_feats = np.stack(query_feats, axis=0)
query_feats = query_feats / LA.norm(query_feats, axis=-1, keepdims=True)

In [14]:
# (number of queries, descriptor dimension)
query_feats.shape

(100, 2048)

In [15]:
# Read the selection list file: one entry per line, line terminators removed.
selection_list_path = osp.join(data_dir, selection_file)
with open(selection_list_path) as fid:
    selection_lines = fid.read().splitlines()

In [16]:
# Number of selection entries, shown as the shape of the stacked lines (same count as len(selection_lines)).
np.stack(selection_lines, axis=0).shape

(4838,)

In [17]:
# Load one global descriptor per selection line, in file order.
# The first ';;'-separated field of a line is treated as a path; its basename without
# extension names the '<name>.delg_global' file inside 'delg_<feature_name>'.
index_feat_dir = osp.join(data_dir, 'delg_' + feature_name)
index_feats = []
for line in tqdm(selection_lines):
    first_field = line.split(';;')[0]
    name = osp.splitext(osp.basename(first_field))[0]
    path = osp.join(index_feat_dir, name + '.delg_global')
    index_feats.append(datum_io.ReadFromFile(path))

100%|██████████| 4838/4838 [00:09<00:00, 517.33it/s]


In [18]:
# Zero-filled container with 100 candidate slots per query; slots that are never filled stay zero.
# NOTE(review): 100 is hard-coded — confirm no query has more than 100 candidates.
num_queries, feat_dim = query_feats.shape
selection_index_feats = np.zeros((num_queries, 100, feat_dim))
selection_index_feats.shape

(100, 100, 2048)

In [19]:
# Load the ground-truth pickle of the split.
gnd_path = osp.join(data_dir, gnd_name)
gnd_data = pickle_load(gnd_path)

In [20]:
# Number of entries in each query's 'simlist', i.e. how many candidates belong to that query.
simlist = gnd_data['simlist']
selection_index_sizes = [len(simlist[q]) for q in range(len(simlist))]
# Total number of candidates over all queries.
np.sum(selection_index_sizes)

4838

In [21]:
# Scatter the flat list of candidate descriptors into the per-query padded array:
# query i receives the next selection_index_sizes[i] descriptors, in file order.
num_slots = selection_index_feats.shape[1]
per_query_sizes = [selection_index_sizes[i] for i in range(selection_index_feats.shape[0])]

# A query with more candidates than slots would previously be truncated silently and
# shift every later query onto the wrong descriptors, so fail loudly instead.
if max(per_query_sizes, default=0) > num_slots:
    raise ValueError(
        'a query has {} candidates but only {} slots are allocated'.format(
            max(per_query_sizes), num_slots))
if sum(per_query_sizes) != len(index_feats):
    raise ValueError(
        'ground truth expects {} candidates but {} descriptors were loaded'.format(
            sum(per_query_sizes), len(index_feats)))

counter = 0  # offset of the next unused descriptor in index_feats
for i, n_items in enumerate(per_query_sizes):
    if n_items:  # an empty slice cannot be assigned from an empty list
        selection_index_feats[i, :n_items] = index_feats[counter:counter + n_items]
    counter += n_items

In [22]:
# (number of queries, candidate slots per query, descriptor dimension)
selection_index_feats.shape

(100, 100, 2048)

In [23]:
# L2-normalise every candidate descriptor in place.
# Padding slots are all-zero, so their norm is 0 and 0/0 yields NaN. The NaN is kept on purpose:
# it propagates into the similarities, and np.argsort places NaN last, which keeps padding
# behind every real candidate. Only the RuntimeWarning of that division is silenced.
with np.errstate(invalid='ignore'):
    selection_index_feats /= LA.norm(selection_index_feats, axis=-1, keepdims=True)

  


In [24]:
# Similarity of each query with its own candidate slots (dot product of normalised descriptors).
# NOTE(review): this rebinds index_feats, which held the flat descriptor list loaded earlier.
sims = []
for q_idx, gallery in enumerate(selection_index_feats):
    index_feats = np.stack(gallery, axis=0)
    sims.append(np.matmul(query_feats[q_idx], index_feats.T))

In [25]:
# Turn the list of per-query similarity rows into one 2-D array (queries x slots).
sims = np.stack(sims)
sims.shape

(100, 100)

In [26]:
# Inspect the similarities; the trailing NaNs in the recorded output come from unfilled (zero) slots.
sims

array([[0.16223355, 0.16238563, 0.18141674, ...,        nan,        nan,
               nan],
       [0.17460734, 0.44644287, 0.3944457 , ...,        nan,        nan,
               nan],
       [0.25068201, 0.05434434, 0.44644287, ...,        nan,        nan,
               nan],
       ...,
       [0.0691188 , 0.14343122, 0.13620838, ...,        nan,        nan,
               nan],
       [0.21718944, 0.12931053, 0.14343122, ...,        nan,        nan,
               nan],
       [0.14343122, 0.0806773 , 0.21771399, ...,        nan,        nan,
               nan]])

In [27]:
# (number of NaN similarities, number of valid similarities); the valid count is expected to
# equal the total number of candidates.
np.count_nonzero(np.isnan(sims)), np.count_nonzero(~np.isnan(sims))

(5162, 4838)

In [28]:
if use_aqe:
    # Alpha query expansion: each query is replaced by the mean of itself and its top-k
    # candidates, every candidate being weighted by similarity ** alpha.
    alpha = aqe_params['alpha']
    top_k = aqe_params['k']
    nn_inds = np.argsort(-sims, -1)
    query_aug = deepcopy(query_feats)
    for i in range(len(query_feats)):
        # Every query has its own candidate set; the candidates must come from slot i and not
        # from whatever `index_feats` was left pointing at by an earlier cell.
        gallery_feats = selection_index_feats[i]
        # Never go past the real candidates: the remaining slots are NaN padding.
        n_used = min(top_k, selection_index_sizes[i])
        new_q = [query_feats[i]]
        for nn_id in nn_inds[i, :n_used]:
            # A negative similarity raised to a fractional power is NaN; clip it to 0 so such a
            # neighbour contributes nothing instead of poisoning the query.
            weight = max(sims[i, nn_id], 0.0) ** alpha
            new_q.append(weight * gallery_feats[nn_id])
        new_q = np.mean(np.stack(new_q, 0), axis=0)
        query_aug[i] = new_q / LA.norm(new_q, axis=-1)
    # Re-score every expanded query against its own candidates (padding stays NaN).
    sims = np.stack(
        [np.matmul(query_aug[q], selection_index_feats[q].T) for q in range(len(query_aug))],
        axis=0,
    )

In [29]:
# Shape of one query's candidate block: (slots, descriptor dimension).
selection_index_feats[0].shape

(100, 2048)

In [30]:
# Rank every query's slots by decreasing similarity. NaN similarities (padding slots) are
# placed last by np.argsort, so real candidates always come first in each row.
nn_inds = np.argsort(-sims, -1)
# nn_dists[i, j] is the similarity of the j-th ranked slot of query i.
nn_dists = np.take_along_axis(sims, nn_inds, axis=-1)

In [31]:
# (number of queries, slots per query)
nn_inds.shape

(100, 100)

In [32]:
# Similarities in ranked order; rows are decreasing and end with the NaN padding.
nn_dists

array([[0.53691091, 0.48823032, 0.3786902 , ...,        nan,        nan,
               nan],
       [0.50516481, 0.46841804, 0.44644287, ...,        nan,        nan,
               nan],
       [0.50516481, 0.46841804, 0.45547155, ...,        nan,        nan,
               nan],
       ...,
       [0.24880777, 0.19662176, 0.19087987, ...,        nan,        nan,
               nan],
       [0.27114646, 0.24529736, 0.21771399, ...,        nan,        nan,
               nan],
       [0.26374199, 0.24880777, 0.22943023, ...,        nan,        nan,
               nan]])

In [33]:
# Sanity check: the first `size` ranked indices of a query must all point at real candidates,
# i.e. lie in [0, size - 1]. An index equal to `size` is already a padding slot, hence `>=`.
for i, size in enumerate(selection_index_sizes):
    if size == 0:
        continue  # no candidates for this query; max() of an empty slice would raise
    if nn_inds[i, :size].max() >= size:
        print('Ewwwwwwwwwwwww', 'query', i)

In [34]:
if save_nn_inds:
    # File name: '<set_name>[_aqe]_nn_inds_<feature_name>.pkl', with a 'training_' prefix
    # when the training switch is on.
    name_template = '_aqe_nn_inds_%s.pkl' if use_aqe else '_nn_inds_%s.pkl'
    output_file = set_name + name_template % feature_name
    if training:
        output_file = 'training_' + output_file

    # Pickle the ranked indices next to the input data.
    output_path = osp.join(data_dir, output_file)
    pickle_save(output_path, nn_inds)

In [35]:
from utils.revisited import compute_metrics

In [36]:
# Reload the ground truth and score the ranking.
# On a top-to-bottom run this call resolves to the compute_metrics imported from utils.revisited,
# because the notebook's own definition only appears in a later cell.
# nn_inds is transposed so that the array is indexed as [rank, query].
gnd_data = pickle_load(osp.join(data_dir, gnd_name))
compute_metrics('viquae', nn_inds.T, gnd_data['gnd'], selection_index_sizes, kappas=[1,5,10])

metrics:  ['hit_rate', 'mrr', 'precision']
m_list:  ['hit_rate', 'mrr', 'precision', 'hit_rate@1', 'mrr@1', 'precision@1', 'hit_rate@5', 'mrr@5', 'precision@5', 'hit_rate@10', 'mrr@10', 'precision@10']
ranks shape:  (100, 100)
starting qrels_dict
starting run_dict
starting evaluate
starting rounding
{'hit_rate': 99.0, 'mrr': 41.89, 'precision': 14.29, 'hit_rate@1': 30.0, 'mrr@1': 30.0, 'precision@1': 30.0, 'hit_rate@5': 53.0, 'mrr@5': 38.42, 'precision@5': 18.0, 'hit_rate@10': 67.0, 'mrr@10': 40.3, 'precision@10': 17.0}


{'hit_rate': 99.0,
 'mrr': 41.89,
 'precision': 14.29,
 'hit_rate@1': 30.0,
 'mrr@1': 30.0,
 'precision@1': 30.0,
 'hit_rate@5': 53.0,
 'mrr@5': 38.42,
 'precision@5': 18.0,
 'hit_rate@10': 67.0,
 'mrr@10': 40.3,
 'precision@10': 17.0}

In [37]:
from utils.revisited import compute_ap, compute_map
from ranx import Qrels, Run, evaluate

In [38]:
def compute_metrics(dataset, ranks, gnd, sizes=[], kappas=[1, 5, 10]):
    """Score a ranking with the evaluation protocol selected by ``dataset``.

    Args:
        dataset: protocol selector. Names starting with 'classic' are scored with
            ``compute_map``; names starting with 'viquae' are scored with ranx.
        ranks: 2-D array indexed as ``ranks[rank, query]``; each column lists the retrieved
            ids of one query, best first.
        gnd: per-query ground truth. For 'viquae', ``gnd[i]['r_easy']`` and
            ``gnd[i]['r_hard']`` hold the relevant ids of query ``i``.
        sizes: for 'viquae', number of leading entries of ``ranks[:, i]`` to score for each
            query. When empty, the whole column is scored. The default is never mutated.
        kappas: cut-offs; every metric is also reported as ``'<metric>@<kappa>'``.
            The default is never mutated.

    Returns:
        dict mapping metric name to its value multiplied by 100 and rounded.

    Raises:
        ValueError: if ``dataset`` matches no known protocol.
    """
    # old evaluation protocol
    if dataset.startswith('classic'):
        mean_ap, _, _, _ = compute_map(ranks, gnd)
        out = {'map': np.around(mean_ap*100, decimals=3)}
        print('>> {}: mAP {:.2f}'.format(dataset, out['map']))

    # new evaluation protocol for viquae dataset
    elif dataset.startswith('viquae'):
        metrics = ["map", "mrr", "precision", "hit_rate", "recall"]
        m_list = list(metrics)
        for kappa in kappas:
            m_list.extend(metric + '@' + str(kappa) for metric in metrics)

        qrels_dict = {}
        run_dict = {}

        for i in range(ranks.shape[1]):
            size = sizes[i] if len(sizes) else ranks.shape[0]
            q_str = "q_" + str(int(i))

            ok_inds = np.concatenate([gnd[i]['r_easy'], gnd[i]['r_hard']])
            if len(ok_inds) == 0:
                # A query without relevant documents gets a placeholder entry with relevance 0,
                # as in the inline evaluation cell of this notebook.
                qrels_dict[q_str] = {"DUMMY_RUN": 0}
            else:
                qrels_dict[q_str] = {'d_' + str(int(key)): 1 for key in ok_inds}

            # Scores strictly decrease with the rank so the order seen by ranx does not depend
            # on how it breaks ties between equal scores.
            retrieved = ranks[:size, i]
            n_retrieved = len(retrieved)
            run_dict[q_str] = {
                'd_' + str(int(key)): float(n_retrieved - pos)
                for pos, key in enumerate(retrieved)
            }

        qrels = Qrels(qrels_dict)
        run = Run(run_dict)
        out = evaluate(qrels, run, m_list)
        for key, value in out.items():
            out[key] = np.around(value*100, decimals=2)

    else:
        # Previously this fell through to print(out) and failed on the unbound name.
        raise ValueError(
            "unknown dataset {!r}: expected a name starting with 'classic' or 'viquae'".format(dataset))

    print(out)

    return out

In [39]:
# Inline evaluation of the ranking with ranx, using a placeholder qrels entry for queries
# that have no relevant document.
kappas = [1, 5, 10]
ranks, gnd, sizes = nn_inds.T, gnd_data['gnd'], selection_index_sizes

# Base metrics plus one '<metric>@<kappa>' variant per cut-off.
metrics = ["map", "mrr", "precision", "hit_rate", "recall"]
m_list = list(metrics)
for kappa in kappas:
    m_list += ['{}@{}'.format(metric, kappa) for metric in metrics]

qrels_dict, run_dict = {}, {}

# ranks is indexed as [rank, query]: one column per query.
for q_idx in range(ranks.shape[1]):
    q_str = "q_{}".format(q_idx)
    ok_inds = np.concatenate([gnd[q_idx]['r_easy'], gnd[q_idx]['r_hard']])

    if len(ok_inds):
        qrels_dict[q_str] = {'d_' + str(int(key)): 1 for key in ok_inds}
    else:
        qrels_dict[q_str] = {"DUMMY_RUN": 0}
    # NOTE(review): every retrieved id gets the same score 1, so the rank order seen by ranx
    # depends on how it orders ties — confirm it keeps insertion order.
    run_dict[q_str] = {'d_' + str(int(key)): 1 for key in ranks[:sizes[q_idx], q_idx]}

qrels = Qrels(qrels_dict)
run = Run(run_dict)
out = evaluate(qrels, run, m_list)
# Report the metrics as percentages with two decimals.
for key, value in out.items():
    out[key] = np.around(value*100, decimals=2)
print(out)

{'map': 29.13, 'mrr': 41.89, 'precision': 14.29, 'hit_rate': 99.0, 'recall': 99.0, 'map@1': 8.83, 'mrr@1': 30.0, 'precision@1': 30.0, 'hit_rate@1': 30.0, 'recall@1': 8.83, 'map@5': 15.27, 'mrr@5': 38.42, 'precision@5': 18.0, 'hit_rate@5': 53.0, 'recall@5': 22.82, 'map@10': 19.49, 'mrr@10': 40.3, 'precision@10': 17.0, 'hit_rate@10': 67.0, 'recall@10': 36.97}


In [40]:
# The bare `run_dict` is not the last expression of the cell, so it displays nothing.
run_dict
# Queries whose run is empty (nothing retrieved); the recorded output is an empty list.
[k for k,v in run_dict.items() if v == {}]

[]

In [41]:
# Reload the ground truth and score the ranking again.
# When the cells run in order, compute_metrics here is the function defined in this notebook,
# which shadows the one imported from utils.revisited.
gnd_data = pickle_load(osp.join(data_dir, gnd_name))
compute_metrics('viquae', nn_inds.T, gnd_data['gnd'], selection_index_sizes, kappas=[1,5,10])

{'map': 29.13, 'mrr': 41.89, 'precision': 14.29, 'hit_rate': 99.0, 'recall': 99.0, 'map@1': 8.83, 'mrr@1': 30.0, 'precision@1': 30.0, 'hit_rate@1': 30.0, 'recall@1': 8.83, 'map@5': 15.27, 'mrr@5': 38.42, 'precision@5': 18.0, 'hit_rate@5': 53.0, 'recall@5': 22.82, 'map@10': 19.49, 'mrr@10': 40.3, 'precision@10': 17.0, 'hit_rate@10': 67.0, 'recall@10': 36.97}


{'map': 29.13,
 'mrr': 41.89,
 'precision': 14.29,
 'hit_rate': 99.0,
 'recall': 99.0,
 'map@1': 8.83,
 'mrr@1': 30.0,
 'precision@1': 30.0,
 'hit_rate@1': 30.0,
 'recall@1': 8.83,
 'map@5': 15.27,
 'mrr@5': 38.42,
 'precision@5': 18.0,
 'hit_rate@5': 53.0,
 'recall@5': 22.82,
 'map@10': 19.49,
 'mrr@10': 40.3,
 'precision@10': 17.0,
 'hit_rate@10': 67.0,
 'recall@10': 36.97}