In [1]:
import json
import os
import numpy as np
import pandas as pd
from collections import defaultdict
from concurrent import futures
from tqdm.notebook import tqdm
from pprint import pprint

from common import weight_add_result, QrelDataLoader, weight_add_result_org
from beir.retrieval.evaluation import EvaluateRetrieval

  from tqdm.autonotebook import tqdm


In [23]:
def load_dataset_and_bm25_dense_result(dataset, data_dir_root, result_dir_root):
    """Load the test-split qrels plus the stored BM25 and dense runs of one dataset.

    Args:
        dataset: name of the dataset sub-directory under both root directories.
        data_dir_root: directory that contains one data folder per dataset.
        result_dir_root: directory that contains one result folder per dataset.

    Returns:
        Tuple ``(all_qids, qrels, bm25_result, dense_result)`` where ``all_qids``
        is the key view of ``qrels`` and the two results are the parsed
        ``analysis.json`` files.
    """
    loader = QrelDataLoader(data_folder=os.path.join(data_dir_root, dataset))
    # The loader also returns the queries; only the qrels are needed here.
    _queries, qrels = loader.load(split="test")

    dataset_result_dir = os.path.join(result_dir_root, dataset)
    loaded_runs = []
    for relative_path in (
        "result/bm25/analysis.json",
        "result/dot/mpnet-v3-mse-beir-dot/analysis.json",
    ):
        with open(os.path.join(dataset_result_dir, relative_path)) as run_file:
            loaded_runs.append(json.load(run_file))
    bm25_result, dense_result = loaded_runs

    return (qrels.keys(), qrels, bm25_result, dense_result)

def weight_add_result_per_dataset(weight, all_qids, qrels, bm25_result, dense_result, k_values=None):
    """Evaluate the weighted combination of a sparse and a dense run on one dataset.

    Args:
        weight: interpolation weight forwarded to ``weight_add_result``.
        all_qids: query ids forwarded to ``weight_add_result``.
        qrels: relevance judgements used for the evaluation.
        bm25_result: sparse run, passed as first argument to ``weight_add_result``.
        dense_result: dense run, passed as second argument to ``weight_add_result``.
        k_values: rank cutoffs to evaluate; defaults to
            ``[1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]``.

    Returns:
        The NDCG dictionary returned by ``EvaluateRetrieval.evaluate``
        (one ``"NDCG@k"`` entry per cutoff), not a single score.
    """
    if k_values is None:
        k_values = [1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
    results = weight_add_result(bm25_result, dense_result, all_qids, weight)
    # evaluate() returns (ndcg, map, recall, precision); only NDCG is reported.
    ndcg, _map, _recall, _precision = EvaluateRetrieval("").evaluate(qrels, results, k_values=k_values)
    return ndcg

def weight_add_result_per_dataset_wrcap(weight, all_qids, qrels, bm25_result, dense_result, k_values=None):
    """Evaluate a weighted sparse+dense combination and report NDCG, recall and R_cap.

    Args:
        weight: interpolation weight forwarded to ``weight_add_result``.
        all_qids: query ids forwarded to ``weight_add_result``.
        qrels: relevance judgements used for the evaluation.
        bm25_result: sparse run, passed as first argument to ``weight_add_result``.
        dense_result: dense run, passed as second argument to ``weight_add_result``.
        k_values: rank cutoffs to evaluate; defaults to
            ``[1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]``.

    Returns:
        ``{"ndcg": ..., "recall": ..., "rcap": ...}`` where each value is the
        per-cutoff dictionary produced by the evaluator.
    """
    if k_values is None:
        k_values = [1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
    results = weight_add_result(bm25_result, dense_result, all_qids, weight)
    evaluator = EvaluateRetrieval("")
    # evaluate() returns (ndcg, map, recall, precision); MAP and precision are not reported.
    ndcg, _map, recall, _precision = evaluator.evaluate(qrels, results, k_values=k_values)
    rcap = evaluator.evaluate_custom(qrels, results, k_values, metric="r_cap")
    return {"ndcg": ndcg, "recall": recall, "rcap": rcap}

def weight_add_org_result_per_dataset(weight, qrels, bm25_result, dense_result, k_values=None):
    """Evaluate the ``weight_add_result_org`` combination of two runs on one dataset.

    Args:
        weight: interpolation weight forwarded to ``weight_add_result_org``.
        qrels: relevance judgements; also passed as first argument to
            ``weight_add_result_org``.
        bm25_result: sparse run (third argument of ``weight_add_result_org``).
        dense_result: dense run (second argument of ``weight_add_result_org``).
        k_values: rank cutoffs to evaluate; defaults to
            ``[1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]``.

    Returns:
        The NDCG dictionary returned by ``EvaluateRetrieval.evaluate``
        (one ``"NDCG@k"`` entry per cutoff), not a single score.
    """
    if k_values is None:
        k_values = [1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
    # NOTE: the dense run is passed before the sparse run here, unlike weight_add_result.
    results = weight_add_result_org(qrels, dense_result, bm25_result, weight)
    # evaluate() returns (ndcg, map, recall, precision); only NDCG is reported.
    ndcg, _map, _recall, _precision = EvaluateRetrieval("").evaluate(qrels, results, k_values=k_values)
    return ndcg

In [3]:
def load_dataset_and_cbm25_dense_result(dataset, data_dir_root, result_dir_root):
    """Load the test-split qrels plus the stored C-BM25 (lss) and dense runs of one dataset.

    Args:
        dataset: name of the dataset sub-directory under both root directories.
        data_dir_root: directory that contains one data folder per dataset.
        result_dir_root: directory that contains one result folder per dataset.

    Returns:
        Tuple ``(all_qids, qrels, cbm25_result, dense_result)`` where ``all_qids``
        is the key view of ``qrels``.
    """
    loader = QrelDataLoader(data_folder=os.path.join(data_dir_root, dataset))
    _queries, qrels = loader.load(split="test")

    dataset_result_dir = os.path.join(result_dir_root, dataset)

    with open(os.path.join(dataset_result_dir, "result/lss/mpnet-tod/analysis.json")) as cbm25_file:
        cbm25_payload = json.load(cbm25_file)
    # The lss file wraps the run in an outer mapping; only its first value is used.
    cbm25_result = list(cbm25_payload.values())[0]

    with open(os.path.join(dataset_result_dir, "result/dot/mpnet-v3-mse-beir-dot/analysis.json")) as dense_file:
        dense_result = json.load(dense_file)

    return (qrels.keys(), qrels, cbm25_result, dense_result)

In [4]:
# Datasets covered by the sweeps; each name is a sub-directory of both roots below.
datasets = ["arguana", "climate-fever", "dbpedia-entity", "fever", "fiqa", "hotpotqa", "msmarco", "nfcorpus", "nq",
      "quora", "scidocs", "scifact", "trec-covid", "trec-robust04-title", "trec-robust04-desc", "webis-touche2020"]

# Root directories of the BEIR data and of the stored retrieval results.
# The defaults are the original machine-specific locations; set BEIR_DATA_DIR /
# BEIR_RESULT_DIR to run the notebook elsewhere without editing this cell.
data_dir_root = os.environ.get("BEIR_DATA_DIR", "/home/gaia_data/iida.h/BEIR/datasets/")
result_dir_root = os.environ.get("BEIR_RESULT_DIR", "/home/gaia_data/iida.h/BEIR/C-BM25/results/")

In [5]:
# all_bm25_result = {}
# all_dense_result = {}
# k_values = [1, 10, 100]
# for dataset in tqdm(datasets):
#     all_qids, qrels, bm25_result, dense_result = load_dataset_and_bm25_dense_result(dataset, data_dir_root, result_dir_root)
#     bm25_ndcg, _, _, _ = EvaluateRetrieval("").evaluate(qrels, bm25_result, k_values=k_values)
#     dense_ndcg, _, _, _ = EvaluateRetrieval("").evaluate(qrels, dense_result, k_values=k_values)
#     all_bm25_result[dataset] = bm25_ndcg["NDCG@10"]
#     all_dense_result[dataset] = dense_ndcg["NDCG@10"]
    
# pprint(all_bm25_result)
# pprint(all_dense_result)

In [7]:
# Baseline evaluation: NDCG at every cutoff for the BM25, C-BM25 and dense runs
# of each dataset, stored per dataset in three dictionaries.
# NOTE: k_values, qrels and dense_result stay bound after this cell and are
# read again by later cells.
k_values = [1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
bm25_ndcg = {}
cbm25_ndcg = {}
dense_ndcg = {}
for dataset in tqdm(datasets):
    # These two datasets are left out of the baselines (the same pair is
    # filtered out of the averages computed later in the notebook).
    if dataset in {"msmarco", "trec-robust04-title"}:
        continue
    # Both loaders read the qrels and the dense run; the second call rebinds
    # all_qids, qrels and dense_result with the values it loaded.
    all_qids, qrels, bm25_result, dense_result = load_dataset_and_bm25_dense_result(dataset, data_dir_root, result_dir_root)
    all_qids, qrels, cbm25_result, dense_result = load_dataset_and_cbm25_dense_result(dataset, data_dir_root, result_dir_root)
    # evaluate() returns (ndcg, map, recall, precision); only the NDCG dictionaries are kept.
    ndcg_bm25, map_bm25, recall_bm25, p_bm25 = EvaluateRetrieval("").evaluate(qrels, bm25_result, k_values=k_values)
    ndcg_d, map_d, recall_d, p_d = EvaluateRetrieval("").evaluate(qrels, dense_result, k_values=k_values)
    ndcg_cbm25, map_cbm25, recall_cbm25, p_cbm25 = EvaluateRetrieval("").evaluate(qrels, cbm25_result, k_values=k_values)
    bm25_ndcg[dataset] = ndcg_bm25
    cbm25_ndcg[dataset] = ndcg_cbm25
    dense_ndcg[dataset] = ndcg_d
    # Earlier variant that kept only the NDCG@10 score per dataset:
    # bm25_ndcg[dataset] = ndcg_bm25["NDCG@10"]
    # cbm25_ndcg[dataset] = ndcg_cbm25["NDCG@10"]
    # dense_ndcg[dataset] = ndcg_d["NDCG@10"]

  0%|          | 0/16 [00:00<?, ?it/s]

In [18]:
def average_each_result_on_each_rank(results):
    """Average every metric over all entries of ``results``.

    Args:
        results: mapping ``name -> {metric_name: score}``.

    Returns:
        Plain dict ``{metric_name: mean score}``, keyed in first-seen order.
        A metric missing from some entries is averaged over the entries that
        do contain it.
    """
    scores_by_metric = {}
    for per_entry_scores in results.values():
        for metric_name, score in per_entry_scores.items():
            scores_by_metric.setdefault(metric_name, []).append(score)
    return {
        metric_name: np.average(scores)
        for metric_name, scores in scores_by_metric.items()
    }

In [22]:
print("bm25")
pprint(average_each_result_on_each_rank(bm25_ndcg))
print("cbm25")
pprint(average_each_result_on_each_rank(cbm25_ndcg))
print("dense")
pprint(average_each_result_on_each_rank(dense_ndcg))
# print("bm25: ", np.average(list(bm25_ndcg.values())))
# print("cbm25: ", np.average(list(cbm25_ndcg.values())))
# print("dense25: ", np.average(list(dense_ndcg.values())))

bm25
{'NDCG@1': 0.3957285714285715,
 'NDCG@10': 0.4102328571428572,
 'NDCG@100': 0.4344328571428571,
 'NDCG@20': 0.4147392857142857,
 'NDCG@30': 0.41938785714285715,
 'NDCG@40': 0.4227535714285714,
 'NDCG@50': 0.4253642857142857,
 'NDCG@60': 0.42771714285714285,
 'NDCG@70': 0.42982785714285715,
 'NDCG@80': 0.431725,
 'NDCG@90': 0.43306857142857147}
cbm25
{'NDCG@1': 0.4832328571428572,
 'NDCG@10': 0.4864685714285714,
 'NDCG@100': 0.4857678571428571,
 'NDCG@20': 0.48733857142857145,
 'NDCG@30': 0.48754357142857135,
 'NDCG@40': 0.48839,
 'NDCG@50': 0.48836571428571424,
 'NDCG@60': 0.48830642857142864,
 'NDCG@70': 0.4873864285714286,
 'NDCG@80': 0.4870607142857143,
 'NDCG@90': 0.4864792857142857}
dense
{'NDCG@1': 0.42644857142857145,
 'NDCG@10': 0.43979428571428575,
 'NDCG@100': 0.45929785714285715,
 'NDCG@20': 0.44646357142857146,
 'NDCG@30': 0.44864928571428564,
 'NDCG@40': 0.4511785714285715,
 'NDCG@50': 0.45264428571428583,
 'NDCG@60': 0.4540828571428571,
 'NDCG@70': 0.4556800000000001

In [None]:
# Weight sweep (0.0 .. 1.0 in steps of 0.1) of the "org" BM25+dense combination.
# Resulting layout: weight_org_bm25_dense_ndcg[metric_name][weight][dataset] = score.
weight_org_bm25_dense_ndcg = defaultdict(dict)
for dataset in tqdm(datasets):
    all_qids, qrels, bm25_result, dense_result = load_dataset_and_bm25_dense_result(dataset, data_dir_root, result_dir_root)
    with futures.ProcessPoolExecutor(max_workers=2) as pool:
        # Submit all eleven weights first, then collect them in submission order.
        pending = []
        for step in range(11):
            weight = step * 0.1
            job = pool.submit(weight_add_org_result_per_dataset, weight, qrels, bm25_result, dense_result)
            pending.append((weight, job))
        for weight, job in pending:
            # Rounding removes float noise such as 0.30000000000000004 from the keys.
            rounded_weight = round(weight, 2)
            ndcg = job.result()
            for metric_name, score in ndcg.items():
                per_weight = weight_org_bm25_dense_ndcg[metric_name]
                per_weight.setdefault(rounded_weight, {})[dataset] = score

print(weight_org_bm25_dense_ndcg)

  0%|          | 0/16 [00:00<?, ?it/s]

In [19]:
with open("weight_sum_results/bm25_dense_ndcg_each_dataset_org.json", "w") as f:
    json.dump(weight_org_bm25_dense_ndcg, f)

In [20]:
# Average NDCG@10 per weight over the evaluated datasets.
# weight_org_bm25_dense_ndcg is laid out as [metric_name][weight][dataset], so the
# NDCG@10 slice has to be selected before iterating over the weights; iterating
# the top level would hand dictionaries (not scores) to np.mean.
excluded_datasets = {"msmarco", "trec-robust04-title"}
all_weight_org_bm25_dense_ndcg = {}
# .get avoids inserting an empty "NDCG@10" entry into the defaultdict when the sweep has not run.
for weight, ndcgs in weight_org_bm25_dense_ndcg.get("NDCG@10", {}).items():
    ndcgs = [value for key, value in ndcgs.items() if key not in excluded_datasets]
    all_weight_org_bm25_dense_ndcg[weight] = np.round(np.mean(ndcgs), 4)

pprint(all_weight_org_bm25_dense_ndcg)

{0.0: 0.4398,
 0.1: 0.4693,
 0.2: 0.4857,
 0.3: 0.4957,
 0.4: 0.4975,
 0.5: 0.4959,
 0.6: 0.4902,
 0.7: 0.4827,
 0.8: 0.4671,
 0.9: 0.4435,
 1.0: 0.4092}


In [21]:
with open("weight_sum_results/bm25_dense_ndcg_ave-all_org.json", "w") as f:
    json.dump(all_weight_org_bm25_dense_ndcg, f)

In [22]:
# Weight sweep (0.0 .. 1.0 in steps of 0.1) of the BM25+dense combination.
# Resulting layout: weight_bm25_dense_ndcg[weight][dataset] = NDCG@10.
weight_bm25_dense_ndcg = defaultdict(dict)
for dataset in tqdm(datasets):
    all_qids, qrels, bm25_result, dense_result = load_dataset_and_bm25_dense_result(dataset, data_dir_root, result_dir_root)
    # The sweep runs sequentially in this process.
    for weight in range(11):
        weight *= 0.1
        ndcg = weight_add_result_per_dataset(weight, all_qids, qrels, bm25_result, dense_result)
        # The helper returns the NDCG at every cutoff; the averaging and DataFrame
        # cells below need one scalar per (weight, dataset), so keep NDCG@10 only.
        ndcg10 = ndcg["NDCG@10"]
        weight_bm25_dense_ndcg[round(weight, 2)][dataset] = ndcg10

pprint(weight_bm25_dense_ndcg)

  0%|          | 0/16 [00:00<?, ?it/s]

defaultdict(<class 'dict'>,
            {0.0: {'arguana': 0.48661,
                   'climate-fever': 0.20619,
                   'dbpedia-entity': 0.35539,
                   'fever': 0.70594,
                   'fiqa': 0.31524,
                   'hotpotqa': 0.56278,
                   'msmarco': 0.70786,
                   'nfcorpus': 0.30557,
                   'nq': 0.49081,
                   'quora': 0.83627,
                   'scidocs': 0.13996,
                   'scifact': 0.57617,
                   'trec-covid': 0.56205,
                   'trec-robust04-desc': 0.42553,
                   'trec-robust04-title': 0.41011,
                   'webis-touche2020': 0.18861},
             0.1: {'arguana': 0.48534,
                   'climate-fever': 0.22046,
                   'dbpedia-entity': 0.36334,
                   'fever': 0.72911,
                   'fiqa': 0.32471,
                   'hotpotqa': 0.59238,
                   'msmarco': 0.71475,
                   'nfcorpu

In [23]:
with open("weight_sum_results/bm25_dense_ndcg_each_dataset.json", "w") as f:
    json.dump(weight_bm25_dense_ndcg, f)

In [24]:
# Mean score per weight over all datasets except msmarco and trec-robust04-title,
# rounded to four decimals.
all_weight_bm25_dense_ndcg = {}
for weight, per_dataset in weight_bm25_dense_ndcg.items():
    kept_scores = []
    for dataset_name, score in per_dataset.items():
        if dataset_name in {"msmarco", "trec-robust04-title"}:
            continue
        kept_scores.append(score)
    all_weight_bm25_dense_ndcg[weight] = np.round(np.mean(kept_scores), 4)

pprint(all_weight_bm25_dense_ndcg)

{0.0: 0.4398,
 0.1: 0.454,
 0.2: 0.465,
 0.3: 0.4751,
 0.4: 0.4837,
 0.5: 0.4902,
 0.6: 0.4928,
 0.7: 0.4883,
 0.8: 0.4716,
 0.9: 0.4431,
 1.0: 0.4102}


In [25]:
with open("weight_sum_results/bm25_dense_ndcg_ave-all.json", "w") as f:
    json.dump(all_weight_bm25_dense_ndcg, f)

In [26]:
# Weight sweep (0.0 .. 1.0 in steps of 0.1) of the "org" C-BM25+dense combination.
# Resulting layout: weight_org_cbm25_dense_ndcg[weight][dataset] = NDCG@10.
weight_org_cbm25_dense_ndcg = defaultdict(dict)
for dataset in tqdm(datasets):
    all_qids, qrels, cbm25_result, dense_result = load_dataset_and_cbm25_dense_result(dataset, data_dir_root, result_dir_root)
    with futures.ProcessPoolExecutor(max_workers=2) as executer:
        # Submit all eleven weights first, then collect them in submission order.
        furs = []
        for weight in range(11):
            weight *= 0.1
            furs.append((weight, executer.submit(weight_add_org_result_per_dataset, weight, qrels, cbm25_result, dense_result)))
        for weight, future in furs:
            # The helper returns the NDCG at every cutoff; the averaging cell below
            # needs one scalar per (weight, dataset), so keep NDCG@10 only.
            ndcg10 = future.result()["NDCG@10"]
            weight_org_cbm25_dense_ndcg[round(weight, 2)][dataset] = ndcg10

pprint(weight_org_cbm25_dense_ndcg)

  0%|          | 0/16 [00:00<?, ?it/s]

defaultdict(<class 'dict'>,
            {0.0: {'arguana': 0.48661,
                   'climate-fever': 0.20619,
                   'dbpedia-entity': 0.35539,
                   'fever': 0.70594,
                   'fiqa': 0.31524,
                   'hotpotqa': 0.56278,
                   'msmarco': 0.70786,
                   'nfcorpus': 0.30557,
                   'nq': 0.49081,
                   'quora': 0.83627,
                   'scidocs': 0.13996,
                   'scifact': 0.57617,
                   'trec-covid': 0.56205,
                   'trec-robust04-desc': 0.42553,
                   'trec-robust04-title': 0.41011,
                   'webis-touche2020': 0.18861},
             0.1: {'arguana': 0.4907,
                   'climate-fever': 0.24523,
                   'dbpedia-entity': 0.37946,
                   'fever': 0.7548,
                   'fiqa': 0.33877,
                   'hotpotqa': 0.6297,
                   'msmarco': 0.73441,
                   'nfcorpus':

In [27]:
with open("weight_sum_results/cbm25_dense_ndcg_each_dataset_org.json", "w") as f:
    json.dump(weight_org_cbm25_dense_ndcg, f)

In [1]:
# One-off probe: "org" C-BM25+dense combination on a single dataset at weight 0.5.
# NOTE: this cell needs load_dataset_and_cbm25_dense_result,
# weight_add_org_result_per_dataset, data_dir_root and result_dir_root from earlier
# cells; the recorded output is a NameError because it was run before they were defined.
# NOTE(review): "bioask" is not in the main `datasets` list — confirm the directory name.
dataset="bioask"
all_qids, qrels, cbm25_result, dense_result = load_dataset_and_cbm25_dense_result(dataset, data_dir_root, result_dir_root)
weight = 0.5
# Despite the name, this holds whatever weight_add_org_result_per_dataset returns.
ndcg10 = weight_add_org_result_per_dataset(weight, qrels, cbm25_result, dense_result)
ndcg10

NameError: name 'load_dataset_and_cbm25_dense_result' is not defined

In [28]:
# Mean score per weight over all datasets except msmarco and trec-robust04-title,
# rounded to four decimals.
all_weight_org_cbm25_dense_ndcg = {}
for weight, per_dataset in weight_org_cbm25_dense_ndcg.items():
    kept_scores = [
        score
        for dataset_name, score in per_dataset.items()
        if not (dataset_name in {"msmarco", "trec-robust04-title"})
    ]
    mean_score = np.mean(kept_scores)
    all_weight_org_cbm25_dense_ndcg[weight] = np.round(mean_score, 4)

pprint(all_weight_org_cbm25_dense_ndcg)

{0.0: 0.4398,
 0.1: 0.4779,
 0.2: 0.495,
 0.3: 0.504,
 0.4: 0.5068,
 0.5: 0.5055,
 0.6: 0.5025,
 0.7: 0.4995,
 0.8: 0.4961,
 0.9: 0.492,
 1.0: 0.4866}


In [29]:
with open("weight_sum_results/cbm25_dense_ndcg_ave-all_org.json", "w") as f:
    json.dump(all_weight_org_cbm25_dense_ndcg, f)

In [30]:
# Weight sweep (0.0 .. 1.0 in steps of 0.1) of the C-BM25+dense combination.
# Resulting layout: weight_cbm25_dense_ndcg[weight][dataset] = NDCG@10.
weight_cbm25_dense_ndcg = defaultdict(dict)
for dataset in tqdm(datasets):
    all_qids, qrels, cbm25_result, dense_result = load_dataset_and_cbm25_dense_result(dataset, data_dir_root, result_dir_root)
    # The sweep runs sequentially in this process.
    for weight in range(11):
        weight *= 0.1
        ndcg = weight_add_result_per_dataset(weight, all_qids, qrels, cbm25_result, dense_result)
        # The helper returns the NDCG at every cutoff; the averaging and DataFrame
        # cells below need one scalar per (weight, dataset), so keep NDCG@10 only.
        ndcg10 = ndcg["NDCG@10"]
        weight_cbm25_dense_ndcg[round(weight, 2)][dataset] = ndcg10


pprint(weight_cbm25_dense_ndcg)

  0%|          | 0/16 [00:00<?, ?it/s]

defaultdict(<class 'dict'>,
            {0.0: {'arguana': 0.48661,
                   'climate-fever': 0.20619,
                   'dbpedia-entity': 0.35539,
                   'fever': 0.70594,
                   'fiqa': 0.31524,
                   'hotpotqa': 0.56278,
                   'msmarco': 0.70786,
                   'nfcorpus': 0.30557,
                   'nq': 0.49081,
                   'quora': 0.83627,
                   'scidocs': 0.13996,
                   'scifact': 0.57617,
                   'trec-covid': 0.56205,
                   'trec-robust04-desc': 0.42553,
                   'trec-robust04-title': 0.41011,
                   'webis-touche2020': 0.18861},
             0.1: {'arguana': 0.4921,
                   'climate-fever': 0.24448,
                   'dbpedia-entity': 0.37716,
                   'fever': 0.7538,
                   'fiqa': 0.33766,
                   'hotpotqa': 0.62701,
                   'msmarco': 0.73383,
                   'nfcorpus'

In [31]:
with open("weight_sum_results/cbm25_dense_ndcg_each_dataset.json", "w") as f:
    json.dump(weight_cbm25_dense_ndcg, f)

In [12]:
# Spot check of NDCG / recall / R_cap for the C-BM25+dense combination at weight 0.5.
# The subset gets its own name: rebinding the notebook-wide `datasets` list here
# would silently shrink every later `for dataset in tqdm(datasets)` loop to these five.
# NOTE(review): "bioask" is not in the main `datasets` list — confirm the directory name.
rcap_datasets = ["bioask", "nfcorpus", "trec-covid", "scidocs", "scifact"]
weight = 0.5
all_result = {}
for dataset in rcap_datasets:
    all_qids, qrels, cbm25_result, dense_result = load_dataset_and_cbm25_dense_result(dataset, data_dir_root, result_dir_root)
    result = weight_add_result_per_dataset_wrcap(weight, all_qids, qrels, cbm25_result, dense_result)
    all_result[dataset] = result
pprint(all_result)

{'bioask': {'ndcg': {'NDCG@1': 0.538, 'NDCG@10': 0.51793, 'NDCG@100': 0.58044},
            'rcap': {'R_cap@1': 0.538,
                     'R_cap@10': 0.55879,
                     'R_cap@100': 0.74038},
            'recall': {'Recall@1': 0.26085,
                       'Recall@10': 0.54073,
                       'Recall@100': 0.74038}},
 'nfcorpus': {'ndcg': {'NDCG@1': 0.40867,
                       'NDCG@10': 0.33133,
                       'NDCG@100': 0.29265},
              'rcap': {'R_cap@1': 0.42724,
                       'R_cap@10': 0.31168,
                       'R_cap@100': 0.29764},
              'recall': {'Recall@1': 0.05144,
                         'Recall@10': 0.16359,
                         'Recall@100': 0.28237}},
 'scidocs': {'ndcg': {'NDCG@1': 0.196, 'NDCG@10': 0.16509, 'NDCG@100': 0.22964},
             'rcap': {'R_cap@1': 0.196,
                      'R_cap@10': 0.16983,
                      'R_cap@100': 0.35945},
             'recall': {'Recall@1': 0.03993

In [32]:
# weight_cbm25_dense_ndcg = defaultdict(dict)
# datasets = ["arguana", "climate-fever", "dbpedia-entity", "fever", "fiqa", "hotpotqa", "msmarco", "nfcorpus", "nq",
#       "quora", "scidocs", "scifact", "trec-covid", "trec-robust04-title", "trec-robust04-desc", "webis-touche2020"]

# data_dir_root = "/home/gaia_data/iida.h/BEIR/datasets/"
# result_dir_root = "/home/gaia_data/iida.h/BEIR/C-BM25/results/"
# for dataset in tqdm(datasets):
#     print(dataset)
#     all_qids, qrels, cbm25_result, dense_result = load_dataset_and_cbm25_dense_result(dataset, data_dir_root, result_dir_root)
#     for weight in range(11):
#         weight *= 0.1
#         ndcg10 = weight_add_result_per_dataset(weight, all_qids, qrels, cbm25_result, dense_result)
#         weight_cbm25_dense_ndcg[round(weight, 2)][dataset] = ndcg10
            
# pprint(weight_cbm25_dense_ndcg)

In [33]:
# Mean score per weight over all datasets except msmarco and trec-robust04-title,
# rounded to four decimals.
all_weight_cbm25_dense_ndcg = {}
for weight in weight_cbm25_dense_ndcg:
    per_dataset = weight_cbm25_dense_ndcg[weight]
    kept_scores = [
        per_dataset[dataset_name]
        for dataset_name in per_dataset
        if dataset_name not in {"msmarco", "trec-robust04-title"}
    ]
    all_weight_cbm25_dense_ndcg[weight] = np.round(np.mean(kept_scores), 4)

pprint(all_weight_cbm25_dense_ndcg)

{0.0: 0.4398,
 0.1: 0.4764,
 0.2: 0.4929,
 0.3: 0.5023,
 0.4: 0.5066,
 0.5: 0.5052,
 0.6: 0.502,
 0.7: 0.4988,
 0.8: 0.4954,
 0.9: 0.4914,
 1.0: 0.4877}


In [34]:
with open("weight_sum_results/cbm25_dense_ndcg_ave-all.json", "w") as f:
    json.dump(all_weight_cbm25_dense_ndcg, f)

In [35]:
# Ad-hoc check of the dense run alone.
# NOTE: qrels, dense_result and k_values are not defined in this cell; they are
# whatever the most recently executed cells left bound, so the numbers shown
# depend on execution order.
ndcg, map_, recall, p = EvaluateRetrieval("").evaluate(qrels, dense_result, k_values=k_values)
ndcg

{'NDCG@1': 0.21429, 'NDCG@10': 0.18861, 'NDCG@100': 0.32733}

In [39]:
df_weight_bm25_dense_ndcg = pd.DataFrame(weight_bm25_dense_ndcg)
df_weight_bm25_dense_ndcg

Unnamed: 0,0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0
arguana,0.48661,0.48534,0.46509,0.44774,0.42866,0.4157,0.40186,0.39135,0.37988,0.37085,0.36264
climate-fever,0.20619,0.22046,0.23385,0.24165,0.24364,0.23996,0.2326,0.221,0.20265,0.18173,0.15782
dbpedia-entity,0.35539,0.36334,0.37116,0.37867,0.38849,0.39846,0.40455,0.40345,0.38316,0.33899,0.28464
fever,0.70594,0.72911,0.74792,0.75909,0.76602,0.76793,0.75936,0.73941,0.70538,0.65207,0.57684
fiqa,0.31524,0.32471,0.33314,0.34015,0.35014,0.35065,0.34493,0.33394,0.30706,0.27286,0.23607
hotpotqa,0.56278,0.59238,0.6206,0.64319,0.65948,0.66961,0.67232,0.66633,0.64627,0.61048,0.56742
msmarco,0.70786,0.71475,0.7254,0.73537,0.74034,0.74021,0.73457,0.70865,0.68611,0.60624,0.50583
nfcorpus,0.30557,0.31107,0.31337,0.31595,0.32394,0.32848,0.33426,0.3313,0.32742,0.32012,0.3301
nq,0.49081,0.50212,0.5113,0.5132,0.50954,0.49793,0.47673,0.44229,0.39103,0.3158,0.24278
quora,0.83627,0.84693,0.85563,0.86436,0.87096,0.87673,0.87829,0.87622,0.86625,0.8451,0.78859


In [40]:
df_weight_bm25_dense_ndcg.T.idxmax()

arguana                0.0
climate-fever          0.4
dbpedia-entity         0.6
fever                  0.5
fiqa                   0.5
hotpotqa               0.6
msmarco                0.4
nfcorpus               0.6
nq                     0.3
quora                  0.6
scidocs                0.6
scifact                0.5
trec-covid             0.7
trec-robust04-title    0.8
trec-robust04-desc     0.6
webis-touche2020       1.0
dtype: float64

In [41]:
df_weight_cbm25_dense_ndcg = pd.DataFrame(weight_cbm25_dense_ndcg)
df_weight_cbm25_dense_ndcg

Unnamed: 0,0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0
arguana,0.48661,0.4921,0.47757,0.46593,0.46067,0.45703,0.45305,0.45107,0.44989,0.44936,0.44868
climate-fever,0.20619,0.24448,0.25974,0.26261,0.26275,0.26105,0.25776,0.2562,0.25271,0.24857,0.24401
dbpedia-entity,0.35539,0.37716,0.3938,0.40318,0.40273,0.39607,0.39185,0.38662,0.38152,0.3736,0.36306
fever,0.70594,0.7538,0.77419,0.77882,0.77832,0.77453,0.77014,0.76669,0.76397,0.75928,0.75392
fiqa,0.31524,0.33766,0.34799,0.35511,0.35532,0.35174,0.34687,0.34388,0.33904,0.33208,0.32172
hotpotqa,0.56278,0.62701,0.65646,0.66801,0.66873,0.66876,0.66817,0.66763,0.66684,0.66538,0.66271
msmarco,0.70786,0.73383,0.7311,0.71593,0.71156,0.70487,0.70409,0.69518,0.68278,0.67138,0.66874
nfcorpus,0.30557,0.31434,0.31828,0.3222,0.32805,0.33133,0.33507,0.33925,0.34026,0.3403,0.3607
nq,0.49081,0.51529,0.5121,0.49933,0.48277,0.46713,0.4514,0.43684,0.42568,0.41681,0.40857
quora,0.83627,0.85828,0.86713,0.86972,0.87091,0.86794,0.86482,0.86083,0.85578,0.84956,0.84367


In [None]:
df_weight_cbm25_dense_ndcg.T.idxmax()

# SimCSE

In [None]:
def load_dataset_and_simcse_bm25_dense_result(dataset, data_dir_root, result_dir_root):
    """Load the test-split qrels plus the stored BM25 and SimCSE dense runs of one dataset.

    Args:
        dataset: name of the dataset sub-directory under both root directories.
        data_dir_root: directory that contains one data folder per dataset.
        result_dir_root: directory that contains one result folder per dataset.

    Returns:
        Tuple ``(all_qids, qrels, bm25_result, dense_result)`` where ``all_qids``
        is the key view of ``qrels``.
    """
    # The loader also returns the queries; only the qrels are needed here.
    _queries, qrels = QrelDataLoader(data_folder=os.path.join(data_dir_root, dataset)).load(split="test")

    dataset_result_dir = os.path.join(result_dir_root, dataset)
    with open(os.path.join(dataset_result_dir, "result/bm25/analysis.json")) as bm25_file:
        bm25_result = json.load(bm25_file)
    with open(os.path.join(dataset_result_dir, "result/cos_sim/simcse/analysis.json")) as dense_file:
        dense_result = json.load(dense_file)

    return (qrels.keys(), qrels, bm25_result, dense_result)

In [None]:
def load_dataset_and_simcse_cbm25_dense_result(dataset, data_dir_root, result_dir_root):
    """Load the test-split qrels plus the stored SimCSE C-BM25 (lss) and dense runs.

    Args:
        dataset: name of the dataset sub-directory under both root directories.
        data_dir_root: directory that contains one data folder per dataset.
        result_dir_root: directory that contains one result folder per dataset.

    Returns:
        Tuple ``(all_qids, qrels, cbm25_result, dense_result)`` where ``all_qids``
        is the key view of ``qrels``.
    """
    _queries, qrels = QrelDataLoader(data_folder=os.path.join(data_dir_root, dataset)).load(split="test")

    def _read_json(relative_path):
        # Parse one result file located below this dataset's result directory.
        with open(os.path.join(result_dir_root, dataset, relative_path)) as handle:
            return json.load(handle)

    # The lss file wraps the run in an outer mapping; only its first value is used.
    cbm25_result = list(_read_json("result/lss/simcse/analysis.json").values())[0]
    dense_result = _read_json("result/cos_sim/simcse/analysis.json")

    return (qrels.keys(), qrels, cbm25_result, dense_result)

In [18]:
# BM25 + SimCSE dense combination at the single weight 0.5.
# Resulting layout: weight_simcse_bm25_dense_ndcg[0.5][dataset] = NDCG@10.
weight_simcse_bm25_dense_ndcg = defaultdict(dict)
for dataset in tqdm(datasets):
    all_qids, qrels, bm25_result, dense_result = load_dataset_and_simcse_bm25_dense_result(dataset, data_dir_root, result_dir_root)
    weight = 0.5
    ndcg = weight_add_result_per_dataset(weight, all_qids, qrels, bm25_result, dense_result)
    # The helper returns the NDCG at every cutoff; this table holds one scalar
    # per dataset, so keep NDCG@10 only.
    ndcg10 = ndcg["NDCG@10"]
    weight_simcse_bm25_dense_ndcg[round(weight, 2)][dataset] = ndcg10


  0%|          | 0/16 [00:00<?, ?it/s]

In [19]:
weight_simcse_bm25_dense_ndcg

defaultdict(dict,
            {0.5: {'arguana': 0.36527,
              'climate-fever': 0.16358,
              'dbpedia-entity': 0.29008,
              'fever': 0.58445,
              'fiqa': 0.23681,
              'hotpotqa': 0.57229,
              'msmarco': 0.51167,
              'nfcorpus': 0.30174,
              'nq': 0.25095,
              'quora': 0.80461,
              'scidocs': 0.14061,
              'scifact': 0.66441,
              'trec-covid': 0.53538,
              'trec-robust04-title': 0.45282,
              'trec-robust04-desc': 0.41573,
              'webis-touche2020': 0.4627}})

In [21]:
# C-BM25 + SimCSE dense combination at the single weight 0.5.
# Resulting layout: weight_simcse_cbm25_dense_ndcg[0.5][dataset] = NDCG@10.
weight_simcse_cbm25_dense_ndcg = defaultdict(dict)
for dataset in tqdm(datasets):
    all_qids, qrels, cbm25_result, dense_result = load_dataset_and_simcse_cbm25_dense_result(dataset, data_dir_root, result_dir_root)
    weight = 0.5
    ndcg = weight_add_result_per_dataset(weight, all_qids, qrels, cbm25_result, dense_result)
    # The helper returns the NDCG at every cutoff; this table holds one scalar
    # per dataset, so keep NDCG@10 only.
    ndcg10 = ndcg["NDCG@10"]
    weight_simcse_cbm25_dense_ndcg[round(weight, 2)][dataset] = ndcg10


  0%|          | 0/16 [00:00<?, ?it/s]

In [22]:
weight_simcse_cbm25_dense_ndcg

defaultdict(dict,
            {0.5: {'arguana': 0.46527,
              'climate-fever': 0.19916,
              'dbpedia-entity': 0.32758,
              'fever': 0.6653,
              'fiqa': 0.27699,
              'hotpotqa': 0.64212,
              'msmarco': 0.52741,
              'nfcorpus': 0.31746,
              'nq': 0.3378,
              'quora': 0.82081,
              'scidocs': 0.15574,
              'scifact': 0.69569,
              'trec-covid': 0.67654,
              'trec-robust04-title': 0.44068,
              'trec-robust04-desc': 0.44887,
              'webis-touche2020': 0.37021}})

# Test

In [3]:
k_values=[1,10,100]
test_qrel = {
    'q1': {
        'd1': 0,
        'd2': 1,
        'd3': 0,
    },
    'q2': {
        'd2': 1,
        'd3': 1,
    },
}

test_run = {
    'q1': {
        'd1': 1.0,
        'd2': 0.1,
        'd3': 1.5,
    },
    'q2': {
        'd1': 1.5,
        'd2': 0.2,
        'd3': 0.5,
    }
}

test_run2 = {
    'q1': {
        'd1': 1.0,
        'd2': 0.2,
        'd3': 1.5,
        'd4': 0.4,
    },
    'q2': {
        'd1': 1.5,
        'd2': 0.2,
        'd3': 0.5,
        'd4': 0.01,       
    }
}

ndcg, map_, recall, p = EvaluateRetrieval("").evaluate(test_qrel, test_run, k_values=k_values)
ndcg

{'NDCG@1': 0.0, 'NDCG@10': 0.59671, 'NDCG@100': 0.59671}

In [4]:
ndcg, map_, recall, p = EvaluateRetrieval("").evaluate(test_qrel, test_run2, k_values=k_values)
ndcg

{'NDCG@1': 0.0, 'NDCG@10': 0.56205, 'NDCG@100': 0.56205}

In [5]:
test_run3 = {
    'q1': {
        'd1': 1.0,
        'd2': 0.2,
        'd3': 1.5,
        'd4': 0.4,
        "d5": 0.5,
        "d6": 0.6,
        "d7": 0.7,
        "d8": 0.8,
        "d9": 0.9,
        "d10": 0.95,
    },
    'q2': {
        'd1': 1.5,
        'd2': 0.2,
        'd3': 0.5,
        'd4': 0.01,
        "d5": 0.02,
        "d6": 0.03,
        "d7": 0.04,
        "d8": 0.05,
        "d9": 0.06,
        "d10": 0.07,
    }
}

test_run4 = {
    'q1': {
        'd1': 1.0,
        'd2': 0.2,
        'd3': 1.5,
        'd4': 0.4,
        "d5": 0.5,
        "d6": 0.6,
        "d7": 0.7,
        "d8": 0.8,
        "d9": 0.9,
        "d10": 0.95,
        "d11": 0.5,
    },
    'q2': {
        'd1': 1.5,
        'd2': 0.2,
        'd3': 0.5,
        'd4': 0.01,
        "d5": 0.02,
        "d6": 0.03,
        "d7": 0.04,
        "d8": 0.05,
        "d9": 0.06,
        "d10": 0.07,
        "d11": 0.05,
        
    }
}

ndcg, map_, recall, p = EvaluateRetrieval("").evaluate(test_qrel, test_run3, k_values=k_values)
ndcg

{'NDCG@1': 0.0, 'NDCG@10': 0.49125, 'NDCG@100': 0.49125}

In [6]:
ndcg, map_, recall, p = EvaluateRetrieval("").evaluate(test_qrel, test_run4, k_values=k_values)
ndcg

{'NDCG@1': 0.0, 'NDCG@10': 0.34671, 'NDCG@100': 0.48618}

In [7]:
test_qrel = {
    'q1': {
        'd1': 0,
        'd2': 1,
        'd3': 0,
    },
    'q2': {
        'd2': 1,
        'd3': 1,
    },
    "q3":{
        "d4": 1,
    }
}

test_run = {
    'q1': {
        'd1': 1.0,
        'd2': 0.1,
        'd3': 1.5,
    },
    'q2': {
        'd1': 1.5,
        'd2': 0.2,
        'd3': 0.5,
    }
}

test_run2 = {
    'q1': {
        'd1': 1.0,
        'd2': 0.2,
        'd3': 1.5,
        'd4': 0.4,
    },
    'q2': {
        'd1': 1.5,
        'd2': 0.2,
        'd3': 0.5,
        'd4': 0.01,       
    },
    "q3": {
        'd1': 1.5,
        'd2': 0.2,
        'd3': 0.5,
        'd4': 0.01, 
    }
}

ndcg, map_, recall, p = EvaluateRetrieval("").evaluate(test_qrel, test_run, k_values=k_values)
print(ndcg)

ndcg, map_, recall, p = EvaluateRetrieval("").evaluate(test_qrel, test_run2, k_values=k_values)
print(ndcg)

    
test_run5 = weight_add_result(test_run, test_run2, {"q1", "q2", "q3"}, 1.0)
print(test_run5)
ndcg, map_, recall, p = EvaluateRetrieval("").evaluate(test_qrel, test_run5, k_values=k_values)
ndcg

{'NDCG@1': 0.0, 'NDCG@10': 0.59671, 'NDCG@100': 0.59671}
{'NDCG@1': 0.0, 'NDCG@10': 0.51826, 'NDCG@100': 0.51826}
{'q1': {'d4': 0.099, 'd3': 1.5, 'd1': 1.0, 'd2': 0.1}, 'q2': {'d4': 0.199, 'd3': 0.5, 'd1': 1.5, 'd2': 0.2}}


{'NDCG@1': 0.0, 'NDCG@10': 0.59671, 'NDCG@100': 0.59671}

In [6]:
from sklearn.metrics import ndcg_score

In [26]:
y_true1 =  [[1,1,0,0,0,0,0,0,0,0],[0,1,1,0,0,0,0,0,0,0]]
y_true2 = [[1,1,0,0,0,0,0,0,0,0,0],[0,1,1,0,0,0,0,0,0,0,0]]

run1 = [[1.0, 0.0, 1.5],[1.5, 0.2, 0.5]]
run2 = [[1.0, 0.1, 1.5, 0.0],[1.5, 0.2, 0.5, 0.19]]

print(ndcg_score(y_true1, run1))
print(ndcg_score(y_true2, run2))

        

0.5967132018086354
0.5967132018086354
