In [1]:
import gc
import json
import os
import logging
from typing import Tuple, Dict
from collections import defaultdict
import numpy as np
import pytrec_eval
from scipy.stats import ttest_rel
from beir.retrieval.evaluation import EvaluateRetrieval

from beir.datasets.data_loader import GenericDataLoader

In [2]:
logger = logging.getLogger(__name__)

In [3]:
class QrelDataLoader(GenericDataLoader):
    def load(self, split="test") -> Tuple[Dict[str, str], Dict[str, Dict[str, int]]]:
        
        self.qrels_file = os.path.join(self.qrels_folder, split + ".tsv")
        self.check(fIn=self.query_file, ext="jsonl")
        self.check(fIn=self.qrels_file, ext="tsv")
        
        if not len(self.queries):
            logger.info("Loading Queries...")
            self._load_queries()
        
        if os.path.exists(self.qrels_file):
            self._load_qrels()
            self.queries = {qid: self.queries[qid] for qid in self.qrels}
            logger.info("Loaded %d %s Queries.", len(self.queries), split.upper())
            logger.info("Query Example: %s", list(self.queries.values())[0])
        
        return self.queries, self.qrels

In [56]:
def recall_cap(qrels, results, k_values):
    
    capped_recall = {qid: {} for qid in qrels}
    k_max = max(k_values)
    logging.info("\n")
    
    for query_id, doc_scores in results.items():
        top_hits = sorted(doc_scores.items(), key=lambda item: item[1], reverse=True)[0:k_max]   
        query_relevant_docs = [doc_id for doc_id in qrels[query_id] if qrels[query_id][doc_id] > 0]
        for k in k_values:
            retrieved_docs = [row[0] for row in top_hits[0:k] if qrels[query_id].get(row[0], 0) > 0]
            denominator = min(len(query_relevant_docs), k)
            capped_recall[query_id][f"R_cap@{k}"] = (len(retrieved_docs) / denominator)

    return capped_recall

In [57]:
def each_q_evaluate(qrels, results, k_values=[1, 10, 100]):
    map_string = "map_cut." + ",".join([str(k) for k in k_values])
    recall_string = "recall." + ",".join([str(k) for k in k_values])
    precision_string = "P." + ",".join([str(k) for k in k_values])
    ndcg_string = "ndcg_cut." + ",".join([str(k) for k in k_values])
    evaluator = pytrec_eval.RelevanceEvaluator(qrels, {map_string, ndcg_string, recall_string, precision_string})
    # evaluator = pytrec_eval.RelevanceEvaluator(qrels, {ndcg_string})
    r_cap = recall_cap(qrels, results, k_values)
    scores = evaluator.evaluate(results)
    for qid in qrels:
        scores[qid].update(r_cap[qid])
    
    return scores

In [58]:
def pair_t_test(qrels, results1, results2, metric="ndcg_cut_10"):
    evaluate_result1 = each_q_evaluate(qrels, results1)
    evaluate_result2 = each_q_evaluate(qrels, results2)
    result1_scores = [evaluate_result1[qid][metric] if qid in evaluate_result1 else 0 for qid in qrels.keys()]
    result2_scores = [evaluate_result2[qid][metric] if qid in evaluate_result2 else 0 for qid in qrels.keys()]
    test_result = ttest_rel(result1_scores, result2_scores)
    return test_result

In [59]:
def test_per_dataset_ndcg(source_result, target_result, dataset, data_dir_root):
    data_dir = os.path.join(data_dir_root, dataset)
    queries, qrels = QrelDataLoader(data_folder=data_dir).load(split="test")
    qids = list(qrels.keys())
    test_result = pair_t_test(qrels, source_result, target_result)
    return test_result

In [60]:
def test_per_dataset_recall(source_result, target_result, dataset, data_dir_root):
    data_dir = os.path.join(data_dir_root, dataset)
    queries, qrels = QrelDataLoader(data_folder=data_dir).load(split="test")
    qids = list(qrels.keys())
    test_result = pair_t_test(qrels, source_result, target_result, metric="recall_100")
    return test_result

In [61]:
def test_per_dataset_recap(source_result, target_result, dataset, data_dir_root):
    data_dir = os.path.join(data_dir_root, dataset)
    queries, qrels = QrelDataLoader(data_folder=data_dir).load(split="test")
    qids = list(qrels.keys())
    test_result = pair_t_test(qrels, source_result, target_result, metric="R_cap@100")
    return test_result

In [62]:
# def multiple_test(test_pvalues, alpha):
#     not_rejects = list()
#     all_ds = set(list(test_pvalues.keys()))
#     sorted_pvalues = sorted(test_pvalues.items(), key=lambda x: -x[1][1])
#     sorted_pvalues = [(ds, pvalue) for ds, pvalue in sorted_pvalues if ds not in {"msmarco", "trec-robust04-title"}]
#     M = len(sorted_pvalues)
#     q = sorted_pvalues[0][1][1]
#     for i, (ds, pvalue) in enumerate(sorted_pvalues):
#         fixed_pvalue = pvalue[1] * M / ((M-i) * sum([1/j for j in range(1,(M-i+1))]))
#         q = min(fixed_pvalue, q)
#         if fixed_pvalue < alpha:
#             break
#         not_rejects.append(ds)
#     return all_ds - set(not_rejects)

def multiple_test(test_pvalues, alpha):
    rejects = {}
    not_rejects = list()
    all_ds = {ds for ds in test_pvalues.keys() if ds not in {"msmarco", "trec-robust04-title"}}
    sorted_pvalues = sorted(test_pvalues.items(), key=lambda x: -x[1][1])
    sorted_pvalues = [(ds, pvalue) for ds, pvalue in sorted_pvalues if ds not in {"msmarco", "trec-robust04-title"}]
    M = len(sorted_pvalues)
    q = sorted_pvalues[0][1][1]
    for i, (ds, pvalue) in enumerate(sorted_pvalues):
        # fixed_pvalue = pvalue * M / ((M-i) * sum([1/j for j in range(1,(M-i+1))]))
        fixed_pvalue = pvalue[1] * M / (M-i)
        q = min(fixed_pvalue, q)
        if fixed_pvalue < alpha:
            break
        not_rejects.append(ds)
    return all_ds - set(not_rejects)

In [63]:
data_dir_root = "/home/gaia_data/iida.h/BEIR/datasets/"
result_dir_root = "/home/gaia_data/iida.h/BEIR/splade_vocab/evaluation/"
    
adasp_ndcg_test_pvalues = dict()
adasp_recall_test_pvalues = dict()
adaidfsp_ndcg_test_pvalues = dict()
adaidfsp_recall_test_pvalues = dict()

datasets = ["nfcorpus", "scidocs", "scifact", "trec-covid", "bioask"]
for dataset in datasets:
    print(dataset)
    if dataset in ["bioask", "nfcorpus", "trec-covid"]:
        vocab_num = "71694"
    else:
        vocab_num = "62783"
    adalm_splade_result_path = os.path.join(result_dir_root, dataset, "mlm-splade-org", vocab_num, "analysis.json")
    adalmidf_splade_result_path = os.path.join(result_dir_root, dataset, "mlm-splade-idf", vocab_num, "analysis.json")
    splade_result_path = os.path.join(result_dir_root, dataset, "splade-org", "analysis.json")
    with open(adalm_splade_result_path) as f:
        adalm_splade_result = list(json.load(f).values())[0]
    with open(adalmidf_splade_result_path) as f:
        adalmidf_splade_result = list(json.load(f).values())[0]
    with open(splade_result_path) as f:
        splade_result = list(json.load(f).values())[0]
    
    test_dataset_result = test_per_dataset_ndcg(adalm_splade_result, splade_result, dataset, data_dir_root)
    adasp_ndcg_test_pvalues[dataset] = test_dataset_result
    test_dataset_result = test_per_dataset_ndcg(adalmidf_splade_result, splade_result, dataset, data_dir_root)
    adaidfsp_ndcg_test_pvalues[dataset] = test_dataset_result
    
    if dataset == "trec-covid":
        test_dataset_result = test_per_dataset_recap(adalm_splade_result, splade_result, dataset, data_dir_root)
    else:
        test_dataset_result = test_per_dataset_recall(adalm_splade_result, splade_result, dataset, data_dir_root)
    adasp_recall_test_pvalues[dataset] = test_dataset_result
    
    if dataset == "trec-covid":
        test_dataset_result = test_per_dataset_recap(adalmidf_splade_result, splade_result, dataset, data_dir_root)
    else:
        test_dataset_result = test_per_dataset_recall(adalmidf_splade_result, splade_result, dataset, data_dir_root)
    adaidfsp_recall_test_pvalues[dataset] = test_dataset_result

nfcorpus
scidocs
scifact
trec-covid
bioask


In [64]:
print(adasp_ndcg_test_pvalues)
print(multiple_test(adasp_ndcg_test_pvalues, 0.05))
print(adaidfsp_ndcg_test_pvalues)
print(multiple_test(adaidfsp_ndcg_test_pvalues, 0.05))

{'nfcorpus': Ttest_relResult(statistic=2.2475027586465606, pvalue=0.025284549406894334), 'scidocs': Ttest_relResult(statistic=0.14559175666800792, pvalue=0.8842730457903776), 'scifact': Ttest_relResult(statistic=2.517440709304737, pvalue=0.012343719038532113), 'trec-covid': Ttest_relResult(statistic=3.249898910382027, pvalue=0.0020896874399561173), 'bioask': Ttest_relResult(statistic=2.1439735848597805, pvalue=0.0325175475431364)}
{'nfcorpus', 'trec-covid', 'bioask', 'scifact'}
{'nfcorpus': Ttest_relResult(statistic=3.074237143057271, pvalue=0.0022909437283110882), 'scidocs': Ttest_relResult(statistic=1.9280313453301805, pvalue=0.054134403755170264), 'scifact': Ttest_relResult(statistic=1.515059563135391, pvalue=0.1308137212143696), 'trec-covid': Ttest_relResult(statistic=3.0200528767582964, pvalue=0.004006876213049558), 'bioask': Ttest_relResult(statistic=4.093979372578527, pvalue=4.9455518910711567e-05)}
{'nfcorpus', 'trec-covid', 'bioask'}


In [66]:
print(adasp_recall_test_pvalues)
print(multiple_test(adasp_recall_test_pvalues, 0.05))
print(adaidfsp_recall_test_pvalues)
print(multiple_test(adaidfsp_recall_test_pvalues, 0.05))

{'nfcorpus': Ttest_relResult(statistic=2.9129135061721327, pvalue=0.0038310038835629168), 'scidocs': Ttest_relResult(statistic=-0.7048405979042536, pvalue=0.4810737292408862), 'scifact': Ttest_relResult(statistic=1.8647954251982921, pvalue=0.06318964441695314), 'trec-covid': Ttest_relResult(statistic=6.053665917942173, pvalue=1.9345915652592484e-07), 'bioask': Ttest_relResult(statistic=4.500899981614259, pvalue=8.429606723964885e-06)}
{'nfcorpus', 'trec-covid', 'bioask'}
{'nfcorpus': Ttest_relResult(statistic=3.0926341083767483, pvalue=0.0021574598830007225), 'scidocs': Ttest_relResult(statistic=2.3009288378512673, pvalue=0.021600843731476464), 'scifact': Ttest_relResult(statistic=2.071681932321691, pvalue=0.039152389281036135), 'trec-covid': Ttest_relResult(statistic=5.942464669884818, pvalue=2.8699791583917415e-07), 'bioask': Ttest_relResult(statistic=5.893560776390349, pvalue=6.963168040698144e-09)}
{'nfcorpus', 'scidocs', 'scifact', 'bioask', 'trec-covid'}


In [68]:
data_dir_root = "/home/gaia_data/iida.h/BEIR/datasets/"
result_dir_root = "/home/gaia_data/iida.h/BEIR/splade_vocab/evaluation/"
    
adadsp_ndcg_test_pvalues = dict()
adadsp_recall_test_pvalues = dict()

datasets = ["nfcorpus", "scidocs", "scifact", "trec-covid", "bioask"]
for dataset in datasets:
    print(dataset)
    if dataset in ["bioask", "nfcorpus", "trec-covid"]:
        vocab_num = "71694"
    else:
        vocab_num = "62783"
    if dataset != "bioask":
        adalm_splade_result_path = os.path.join(result_dir_root, dataset, "mlm-splade-tk-d-idf", vocab_num, "analysis.json")
    else:
        adalm_splade_result_path = os.path.join(result_dir_root, dataset, "mlm-splade-tk-idf", vocab_num, "analysis.json")
    splade_result_path = os.path.join(result_dir_root, dataset, "splade-d-org", "analysis.json")
    with open(adalm_splade_result_path) as f:
        adalm_splade_result = list(json.load(f).values())[0]
    with open(splade_result_path) as f:
        splade_result = list(json.load(f).values())[0]
    
    test_dataset_result = test_per_dataset_ndcg(adalm_splade_result, splade_result, dataset, data_dir_root)
    if dataset == "trec-covid":
        test_dataset_result = test_per_dataset_recap(adalm_splade_result, splade_result, dataset, data_dir_root)
    else:
        test_dataset_result = test_per_dataset_recall(adalm_splade_result, splade_result, dataset, data_dir_root)
    adadsp_ndcg_test_pvalues[dataset] = test_dataset_result
    adadsp_recall_test_pvalues[dataset] = test_dataset_result

nfcorpus
scidocs
scifact
trec-covid
bioask


In [69]:
print(adadsp_ndcg_test_pvalues)
print(multiple_test(adadsp_ndcg_test_pvalues, 0.05))
print(adadsp_recall_test_pvalues)
print(multiple_test(adadsp_recall_test_pvalues, 0.05))

{'nfcorpus': Ttest_relResult(statistic=4.231366797729489, pvalue=3.0322649888833377e-05), 'scidocs': Ttest_relResult(statistic=5.284989663114507, pvalue=1.543547873262622e-07), 'scifact': Ttest_relResult(statistic=3.295200440047044, pvalue=0.0011018631833086623), 'trec-covid': Ttest_relResult(statistic=3.993788561291641, pvalue=0.00021773784912087866), 'bioask': Ttest_relResult(statistic=6.822299858447168, pvalue=2.6026269649668756e-11)}
{'nfcorpus', 'scidocs', 'scifact', 'bioask', 'trec-covid'}
{'nfcorpus': Ttest_relResult(statistic=4.231366797729489, pvalue=3.0322649888833377e-05), 'scidocs': Ttest_relResult(statistic=5.284989663114507, pvalue=1.543547873262622e-07), 'scifact': Ttest_relResult(statistic=3.295200440047044, pvalue=0.0011018631833086623), 'trec-covid': Ttest_relResult(statistic=3.993788561291641, pvalue=0.00021773784912087866), 'bioask': Ttest_relResult(statistic=6.822299858447168, pvalue=2.6026269649668756e-11)}
{'nfcorpus', 'scidocs', 'scifact', 'bioask', 'trec-covid'

In [70]:
# def weight_add_result(bm25_result, dense_result, all_qids, weight1=0.5):
#     weight2 = 1 - weight1
#     result = {}
#     for qid in all_qids:
#         d_result1 = bm25_result.get(qid, None)
#         d_result2 = dense_result.get(qid, None)
#         result[qid] = {}
#         if not d_result1 and not d_result2:
#             continue
#         elif not d_result2:
#             result[qid] = d_result1
#             continue
#         elif not d_result1:
#             result[qid] = d_result2
#             continue
#         all_dids = set(list(d_result1.keys())) | set(list(d_result2.keys()))
#         try:
#             min_score1 = sorted(d_result1.values())[0]
#         except:
#             print(qid, d_result1)
#             raise ValueError()
#         min_score2 = sorted(d_result2.values())[0]
#         for did in all_dids:
#             d_score1 = d_result1.get(did, min_score1)
#             d_score2 = d_result2.get(did, min_score2)
#             result[qid][did] = weight1 * d_score1 + weight2 * d_score2
#     return result

MIN_DISCOUNT = 1e-3

def add_result(bm25_result, dense_result, top_k=100):
    result = {}
    all_qids = set(list(bm25_result.keys()))
    all_qids |= set(list(dense_result.keys()))
    for qid in all_qids:
        d_result1 = bm25_result.get(qid, None)
        d_result2 = dense_result.get(qid, None)
        if not d_result1 and not d_result2:
            continue
        elif not d_result2:
            d_result1 = sorted(d_result1.items(), key=lambda x: -x[1])[:top_k]
            result[qid] = {k: v for k, v in d_result1}
            continue
        elif not d_result1:
            d_result2 = sorted(d_result2.items(), key=lambda x: -x[1])[:top_k]
            result[qid] = {k: v for k, v in d_result2}
            continue
        d_result1 = {k: v for k, v in sorted(d_result1.items(), key=lambda x: -x[1])[:top_k]}
        d_result2 = {k: v for k, v in sorted(d_result2.items(), key=lambda x: -x[1])[:top_k]}
        # all_dids = set([did for did, _ in d_result1]) | set([did for did, _ in d_result2])
        all_dids = set(list(d_result1.keys())) | set(list(d_result2.keys()))
        result[qid] = {}
        try:
            min_score1 = sorted(d_result1.values())[0] - MIN_DISCOUNT
            # min_score1 = d_result1[-1][1] - MIN_DISCOUNT
        except:
            print(qid, d_result1)
            raise ValueError()
        min_score2 = sorted(d_result2.values())[0] - MIN_DISCOUNT
        # min_score2 = d_result2[-1][1] - MIN_DISCOUNT
        for did in all_dids:
            d_score1 = d_result1.get(did, min_score1)
            d_score2 = d_result2.get(did, min_score2)
            result[qid][did] = d_score1 + d_score2
    return result

In [71]:
def weight_add_result_per_dataset(weight, all_qids, qrels, bm25_result, dense_result):
    k_values = [1, 10, 100]
    results = weight_add_result(bm25_result, dense_result, all_qids, weight)
    ndcg, map_, recall, p = EvaluateRetrieval("").evaluate(qrels, results, k_values=k_values)
    return {"ndcg@10": ndcg["NDCG@10"], "recall@100": recall["Recall@100"]}

In [83]:
data_dir_root = "/home/gaia_data/iida.h/BEIR/datasets/"
result_dir_root = "/home/gaia_data/iida.h/BEIR/splade_vocab/evaluation/"
    
datasets = ["nfcorpus", "scidocs", "scifact", "trec-covid", "bioask"]
adahsp_ndcg_test_pvalues = dict()
adahsp_recall_test_pvalues = dict()
for dataset in datasets:
    print(dataset)
    data_dir = os.path.join(data_dir_root, dataset)
    queries, qrels = QrelDataLoader(data_folder=data_dir).load(split="test")

    bm25_result_path = os.path.join(result_dir_root, dataset, "bm25", "analysis.json")
    laprador_result_path = os.path.join(result_dir_root, "laprador", dataset, "analysis.json")
    if dataset in ["bioask", "nfcorpus", "trec-covid"]:
        vocab_num = "71694"
    else:
        vocab_num = "62783"
    adalm_splade_idf_result_path = os.path.join(result_dir_root, dataset, "mlm-splade-idf", vocab_num, "analysis.json")
    splade_result_path = os.path.join(result_dir_root, dataset, "splade-org", "analysis.json")
    with open(bm25_result_path) as f:
        bm25_result = json.load(f)
    with open(adalm_splade_idf_result_path) as f:
        adalm_splade_idf_result = list(json.load(f).values())[0]
    with open(splade_result_path) as f:
        splade_result = list(json.load(f).values())[0]

    all_qids = list(qrels.keys())
    hybrid_results = add_result(bm25_result, adalm_splade_idf_result)
    test_dataset_result = test_per_dataset_ndcg(hybrid_results, splade_result, dataset, data_dir_root)
    adahsp_ndcg_test_pvalues[dataset] = test_dataset_result

    if dataset == "trec-covid":
        test_dataset_result = test_per_dataset_recap(hybrid_results, splade_result, dataset, data_dir_root)
    else:
        test_dataset_result = test_per_dataset_recall(hybrid_results, splade_result, dataset, data_dir_root)
    adahsp_recall_test_pvalues[dataset] = test_dataset_result

nfcorpus
scidocs
scifact
trec-covid
bioask


In [84]:
print(adahsp_ndcg_test_pvalues)
print(multiple_test(adahsp_ndcg_test_pvalues, 0.05))
print(adahsp_recall_test_pvalues)
print(multiple_test(adahsp_recall_test_pvalues, 0.05))

{'nfcorpus': Ttest_relResult(statistic=3.6733800647349097, pvalue=0.00028021717824415346), 'scidocs': Ttest_relResult(statistic=3.5734497481444176, pvalue=0.00036913769302006024), 'scifact': Ttest_relResult(statistic=2.3934647627922225, pvalue=0.017306525898939305), 'trec-covid': Ttest_relResult(statistic=3.6750954386599894, pvalue=0.0005890403577777368), 'bioask': Ttest_relResult(statistic=7.1709240435230015, pvalue=2.7031038188938624e-12)}
{'nfcorpus', 'scidocs', 'scifact', 'bioask', 'trec-covid'}
{'nfcorpus': Ttest_relResult(statistic=3.1727479901413473, pvalue=0.001655621342096557), 'scidocs': Ttest_relResult(statistic=3.4818145944548418, pvalue=0.00051958194636552), 'scifact': Ttest_relResult(statistic=2.452957819153296, pvalue=0.014739655055563345), 'trec-covid': Ttest_relResult(statistic=5.4119896754614265, pvalue=1.8573863125545907e-06), 'bioask': Ttest_relResult(statistic=8.458623973876955, pvalue=3.011577810327624e-16)}
{'nfcorpus', 'scidocs', 'scifact', 'bioask', 'trec-covid

In [78]:
data_dir_root = "/home/gaia_data/iida.h/BEIR/datasets/"
result_dir_root = "/home/gaia_data/iida.h/BEIR/splade_vocab/evaluation/"
    
datasets = ["nfcorpus", "scidocs", "scifact", "trec-covid", "bioask"]
adahdsp_ndcg_test_pvalues = dict()
adahdsp_recall_test_pvalues = dict()
for dataset in datasets:
    print(dataset)
    data_dir = os.path.join(data_dir_root, dataset)
    queries, qrels = QrelDataLoader(data_folder=data_dir).load(split="test")

    bm25_result_path = os.path.join(result_dir_root, dataset, "bm25", "analysis.json")
    laprador_result_path = os.path.join(result_dir_root, "laprador", dataset, "analysis.json")
    if dataset in ["bioask", "nfcorpus", "trec-covid"]:
        vocab_num = "71694"
    else:
        vocab_num = "62783"
    if dataset != "bioask":
        adalm_splade_idf_result_path = os.path.join(result_dir_root, dataset, "mlm-splade-tk-d-idf", vocab_num, "analysis.json")
    else:
        adalm_splade_idf_result_path = os.path.join(result_dir_root, dataset, "mlm-splade-tk-idf", vocab_num, "analysis.json")
    splade_result_path = os.path.join(result_dir_root, dataset, "splade-d-org", "analysis.json")
    with open(bm25_result_path) as f:
        bm25_result = json.load(f)
    with open(adalm_splade_idf_result_path) as f:
        adalm_splade_idf_result = list(json.load(f).values())[0]
    with open(splade_result_path) as f:
        splade_result = list(json.load(f).values())[0]
        
    all_qids = list(qrels.keys())
    hybrid_results = add_result(bm25_result, adalm_splade_idf_result)
    test_dataset_result = test_per_dataset_ndcg(hybrid_results, splade_result, dataset, data_dir_root)
    adahdsp_ndcg_test_pvalues[dataset] = test_dataset_result

    if dataset == "trec-covid":
        test_dataset_result = test_per_dataset_recap(hybrid_results, splade_result, dataset, data_dir_root)
    else:
        test_dataset_result = test_per_dataset_recall(hybrid_results, splade_result, dataset, data_dir_root)
    
    test_dataset_result = test_per_dataset_recall(hybrid_results, splade_result, dataset, data_dir_root)
    adahdsp_recall_test_pvalues[dataset] = test_dataset_result

nfcorpus
scidocs
scifact
trec-covid
bioask


In [79]:
print(adahdsp_ndcg_test_pvalues)
print(multiple_test(adahdsp_ndcg_test_pvalues, 0.05))
print(adahdsp_recall_test_pvalues)
print(multiple_test(adahdsp_recall_test_pvalues, 0.05))

{'nfcorpus': Ttest_relResult(statistic=3.823598676829075, pvalue=0.00015789647018340952), 'scidocs': Ttest_relResult(statistic=4.779132824968111, pvalue=2.0245031702181492e-06), 'scifact': Ttest_relResult(statistic=2.7534841509216013, pvalue=0.00625732939644317), 'trec-covid': Ttest_relResult(statistic=3.542631685906574, pvalue=0.0008809726240429204), 'bioask': Ttest_relResult(statistic=7.763008526999719, pvalue=4.7333293985210757e-14)}
{'nfcorpus', 'scidocs', 'scifact', 'bioask', 'trec-covid'}
{'nfcorpus': Ttest_relResult(statistic=4.05196892452071, pvalue=6.371716496058156e-05), 'scidocs': Ttest_relResult(statistic=5.504699171997972, pvalue=4.7009654752664634e-08), 'scifact': Ttest_relResult(statistic=2.7550504242403075, pvalue=0.006228203612933054), 'trec-covid': Ttest_relResult(statistic=4.2062155668742935, pvalue=0.00011006107619234856), 'bioask': Ttest_relResult(statistic=9.466826013620615, pvalue=1.1449578804960093e-19)}
{'nfcorpus', 'scidocs', 'scifact', 'bioask', 'trec-covid'}

In [80]:
data_dir_root = "/home/gaia_data/iida.h/BEIR/datasets/"
result_dir_root = "/home/gaia_data/iida.h/BEIR/splade_vocab/evaluation/"
    
datasets = ["nfcorpus", "scidocs", "scifact", "trec-covid", "bioask"]
laprador_ndcg_test_pvalues = dict()
lapradord_ndcg_test_pvalues = dict()
laprador_recall_test_pvalues = dict()
lapradord_recall_test_pvalues = dict()

for dataset in datasets:
    print(dataset)
    data_dir = os.path.join(data_dir_root, dataset)
    queries, qrels = QrelDataLoader(data_folder=data_dir).load(split="test")

    bm25_result_path = os.path.join(result_dir_root, dataset, "bm25", "analysis.json")
    laprador_result_path = os.path.join(result_dir_root, "laprador", dataset, "analysis.json")
    if dataset in ["bioask", "nfcorpus", "trec-covid"]:
        vocab_num = "71694"
    else:
        vocab_num = "62783"
    adalm_splade_idf_result_path = os.path.join(result_dir_root, dataset, "mlm-splade-idf", vocab_num, "analysis.json")
    if dataset != "bioask":
        adalm_spladed_idf_result_path = os.path.join(result_dir_root, dataset, "mlm-splade-tk-d-idf", vocab_num, "analysis.json")
    else:
        adalm_spladed_idf_result_path = os.path.join(result_dir_root, dataset, "mlm-splade-tk-idf", vocab_num, "analysis.json")
    with open(bm25_result_path) as f:
        bm25_result = json.load(f)
    with open(adalm_splade_idf_result_path) as f:
        adalm_splade_idf_result = list(json.load(f).values())[0]
    with open(adalm_spladed_idf_result_path) as f:
        adalm_spladed_idf_result = list(json.load(f).values())[0]
    with open(laprador_result_path) as f:
        laprador_result = json.load(f)

    all_qids = list(qrels.keys())
    hybrid_results = add_result(bm25_result, adalm_splade_idf_result)
    hybridd_results = add_result(bm25_result, adalm_spladed_idf_result)
    
    test_dataset_result = test_per_dataset_ndcg(hybrid_results, laprador_result, dataset, data_dir_root)
    laprador_ndcg_test_pvalues[dataset] = test_dataset_result
    testd_dataset_result = test_per_dataset_ndcg(hybridd_results, laprador_result, dataset, data_dir_root)
    lapradord_ndcg_test_pvalues[dataset] = testd_dataset_result

    
    if dataset == "trec-covid":
        test_dataset_result = test_per_dataset_recap(hybrid_results, laprador_result, dataset, data_dir_root)
    else:
        test_dataset_result = test_per_dataset_recall(hybrid_results, laprador_result, dataset, data_dir_root)
    laprador_recall_test_pvalues[dataset] = test_dataset_result
    
    if dataset == "trec-covid":
        testd_dataset_result = test_per_dataset_recap(hybridd_results, laprador_result, dataset, data_dir_root)
    else:
        testd_dataset_result = test_per_dataset_recall(hybridd_results, laprador_result, dataset, data_dir_root)
    lapradord_recall_test_pvalues[dataset] = testd_dataset_result


nfcorpus
scidocs
scifact
trec-covid
bioask


In [81]:
print(laprador_ndcg_test_pvalues)
print(multiple_test(laprador_ndcg_test_pvalues, 0.05))
print(lapradord_ndcg_test_pvalues)
print(multiple_test(lapradord_ndcg_test_pvalues, 0.05))

{'nfcorpus': Ttest_relResult(statistic=1.210564346462779, pvalue=0.22695023798009156), 'scidocs': Ttest_relResult(statistic=-5.615570334291971, pvalue=2.538545620173972e-08), 'scifact': Ttest_relResult(statistic=1.400730036583805, pvalue=0.16233175420406637), 'trec-covid': Ttest_relResult(statistic=0.6675072939839094, pvalue=0.5075798516825706), 'bioask': Ttest_relResult(statistic=6.941679064870572, pvalue=1.2104560064882623e-11)}
{'scidocs', 'bioask'}
{'nfcorpus': Ttest_relResult(statistic=0.2699427610603684, pvalue=0.7873772074397768), 'scidocs': Ttest_relResult(statistic=-6.286964173849373, pvalue=4.830235861121876e-10), 'scifact': Ttest_relResult(statistic=1.187363693564348, pvalue=0.23602662048123896), 'trec-covid': Ttest_relResult(statistic=-3.052778517856003, pvalue=0.0036578456316423), 'bioask': Ttest_relResult(statistic=7.52241138855869, pvalue=2.5226847566732576e-13)}
{'scidocs', 'trec-covid', 'bioask'}


In [82]:
print(laprador_recall_test_pvalues)
print(multiple_test(laprador_recall_test_pvalues, 0.05))
print(lapradord_recall_test_pvalues)
print(multiple_test(lapradord_recall_test_pvalues, 0.05))

{'nfcorpus': Ttest_relResult(statistic=2.5133821006826884, pvalue=0.012445777186440592), 'scidocs': Ttest_relResult(statistic=-6.701301016965774, pvalue=3.446276555062554e-11), 'scifact': Ttest_relResult(statistic=2.5758320031921436, pvalue=0.010480038625340169), 'trec-covid': Ttest_relResult(statistic=5.867373573514401, pvalue=3.7440611348602545e-07), 'bioask': Ttest_relResult(statistic=9.619086290654423, pvalue=3.3069889998013343e-20)}
{'nfcorpus', 'scidocs', 'scifact', 'bioask', 'trec-covid'}
{'nfcorpus': Ttest_relResult(statistic=1.0441089932315812, pvalue=0.2972181744410112), 'scidocs': Ttest_relResult(statistic=-8.012058070176979, pvalue=3.1282422164995076e-15), 'scifact': Ttest_relResult(statistic=2.005035677295267, pvalue=0.045860346168231245), 'trec-covid': Ttest_relResult(statistic=1.3150843395149743, pvalue=0.19460286045006692), 'bioask': Ttest_relResult(statistic=10.00835821901801, pvalue=1.3023065135526875e-21)}
{'scidocs', 'bioask'}


In [93]:
data_dir_root = "/home/gaia_data/iida.h/BEIR/datasets/"
result_dir_root = "/home/gaia_data/iida.h/BEIR/splade_vocab/evaluation/"
cbm25_dir_root = "/home/gaia_data/iida.h/BEIR/C-BM25/results"

datasets = ["nfcorpus", "scidocs", "scifact", "trec-covid", "bioask"]
hcbm25_ndcg_test_pvalues = dict()
hcbm25d_ndcg_test_pvalues = dict()
hcbm25_recall_test_pvalues = dict()
hcbm25d_recall_test_pvalues = dict()

for dataset in datasets:
    print(dataset)
    data_dir = os.path.join(data_dir_root, dataset)
    queries, qrels = QrelDataLoader(data_folder=data_dir).load(split="test")

    bm25_result_path = os.path.join(result_dir_root, dataset, "bm25", "analysis.json")
    cbm25_result_path = os.path.join(cbm25_dir_root, dataset, "result", "lss", "mpnet-tod", "analysis.json")
    cbm25_dense_result_path = os.path.join(cbm25_dir_root, dataset, "result", "dot", "mpnet-v3-mse-beir-dot", "analysis.json")
    if dataset in ["bioask", "nfcorpus", "trec-covid"]:
        vocab_num = "71694"
    else:
        vocab_num = "62783"
    adalm_splade_idf_result_path = os.path.join(result_dir_root, dataset, "mlm-splade-idf", vocab_num, "analysis.json")
    if dataset != "bioask":
        adalm_spladed_idf_result_path = os.path.join(result_dir_root, dataset, "mlm-splade-tk-d-idf", vocab_num, "analysis.json")
    else:
        adalm_spladed_idf_result_path = os.path.join(result_dir_root, dataset, "mlm-splade-tk-idf", vocab_num, "analysis.json")
    with open(bm25_result_path) as f:
        bm25_result = json.load(f)
    with open(adalm_splade_idf_result_path) as f:
        adalm_splade_idf_result = list(json.load(f).values())[0]
    with open(adalm_spladed_idf_result_path) as f:
        adalm_spladed_idf_result = list(json.load(f).values())[0]
    with open(cbm25_result_path) as f:
        cbm25_result = json.load(f)["maxsim_bm25_qtf"]
    with open(cbm25_dense_result_path) as f:
        cbm25_dense_result = json.load(f)

    all_qids = list(qrels.keys())
    hybrid_results = add_result(bm25_result, adalm_splade_idf_result)
    hybridd_results = add_result(bm25_result, adalm_spladed_idf_result)
    hcbm25_results = add_result(cbm25_result, cbm25_dense_result)
    
    test_dataset_result = test_per_dataset_ndcg(hybrid_results, hcbm25_results, dataset, data_dir_root)
    hcbm25_ndcg_test_pvalues[dataset] = test_dataset_result
    testd_dataset_result = test_per_dataset_ndcg(hybridd_results, hcbm25_results, dataset, data_dir_root)
    hcbm25d_ndcg_test_pvalues[dataset] = testd_dataset_result

    
    if dataset == "trec-covid":
        test_dataset_result = test_per_dataset_recap(hybrid_results, hcbm25_results, dataset, data_dir_root)
    else:
        test_dataset_result = test_per_dataset_recall(hybrid_results, hcbm25_results, dataset, data_dir_root)
    hcbm25_recall_test_pvalues[dataset] = test_dataset_result
    
    if dataset == "trec-covid":
        testd_dataset_result = test_per_dataset_recap(hybridd_results, hcbm25_results, dataset, data_dir_root)
    else:
        testd_dataset_result = test_per_dataset_recall(hybridd_results, hcbm25_results, dataset, data_dir_root)
    hcbm25d_recall_test_pvalues[dataset] = testd_dataset_result

nfcorpus
scidocs
scifact
trec-covid
bioask


In [94]:
print(hcbm25_ndcg_test_pvalues)
print(multiple_test(hcbm25_ndcg_test_pvalues, 0.05))
print(hcbm25d_ndcg_test_pvalues)
print(multiple_test(hcbm25d_ndcg_test_pvalues, 0.05))

{'nfcorpus': Ttest_relResult(statistic=3.522577085727193, pvalue=0.0004891444762129307), 'scidocs': Ttest_relResult(statistic=-0.07936088152747584, pvalue=0.9367614762023505), 'scifact': Ttest_relResult(statistic=-0.20314600927633622, pvalue=0.8391590865845979), 'trec-covid': Ttest_relResult(statistic=-0.05456570867624501, pvalue=0.9567063337571217), 'bioask': Ttest_relResult(statistic=4.739362221259084, pvalue=2.8002560057267213e-06)}
{'nfcorpus', 'bioask'}
{'nfcorpus': Ttest_relResult(statistic=2.3632515110197376, pvalue=0.01870916765085692), 'scidocs': Ttest_relResult(statistic=-0.6007226734378505, pvalue=0.5481609749451911), 'scifact': Ttest_relResult(statistic=-0.5051653475251823, pvalue=0.6138148515108555), 'trec-covid': Ttest_relResult(statistic=-4.550106028161058, pvalue=3.548253667524291e-05), 'bioask': Ttest_relResult(statistic=4.84365026753713, pvalue=1.7036307367895056e-06)}
{'nfcorpus', 'trec-covid', 'bioask'}


In [95]:
print(hcbm25_recall_test_pvalues)
print(multiple_test(hcbm25_recall_test_pvalues, 0.05))
print(hcbm25d_recall_test_pvalues)
print(multiple_test(hcbm25d_recall_test_pvalues, 0.05))

{'nfcorpus': Ttest_relResult(statistic=1.8566121847699308, pvalue=0.06427950824337361), 'scidocs': Ttest_relResult(statistic=4.040549886533918, pvalue=5.741738297681192e-05), 'scifact': Ttest_relResult(statistic=1.0, pvalue=0.31811909747888467), 'trec-covid': Ttest_relResult(statistic=9.110122853677721, pvalue=4.042809975923777e-12), 'bioask': Ttest_relResult(statistic=6.49299063849586, pvalue=2.0357028080011217e-10)}
{'scidocs', 'trec-covid', 'bioask'}
{'nfcorpus': Ttest_relResult(statistic=0.07832381733978595, pvalue=0.9376191209495476), 'scidocs': Ttest_relResult(statistic=2.8037595655788627, pvalue=0.005148899491954889), 'scifact': Ttest_relResult(statistic=0.24273842465371434, pvalue=0.8083744668553937), 'trec-covid': Ttest_relResult(statistic=5.0840467135578535, pvalue=5.794996896097016e-06), 'bioask': Ttest_relResult(statistic=7.083795805746707, pvalue=4.800272519395655e-12)}
{'scidocs', 'trec-covid', 'bioask'}


In [None]:
gc.collect()

In [None]:
data_dir_root = "/home/gaia_data/iida.h/BEIR/datasets/"
result_dir_root = "/home/gaia_data/iida.h/BEIR/splade_vocab/evaluation/"
    
datasets = ["nfcorpus", "scidocs", "scifact", "trec-covid", "bioask"]
for dataset in datasets:
    data_dir = os.path.join(data_dir_root, dataset)
    queries, qrels = QrelDataLoader(data_folder=data_dir).load(split="test")

    bm25_result_path = os.path.join(result_dir_root, dataset, "bm25", "analysis.json")
    if dataset in ["bioask", "nfcorpus", "trec-covid"]:
        vocab_num = "71694"
    else:
        vocab_num = "62783"
    adalm_splade_result_path = os.path.join(result_dir_root, dataset, "mlm-splade-org", vocab_num, "analysis.json")
    adalm_splade_idf_result_path = os.path.join(result_dir_root, dataset, "mlm-splade-idf", vocab_num, "analysis.json")
    with open(bm25_result_path) as f:
        bm25_result = json.load(f)
    with open(adalm_splade_result_path) as f:
        adalm_splade_result = json.load(f)["org"]
    with open(adalm_splade_idf_result_path) as f:
        adalm_splade_idf_result = json.load(f)["idf"]

    print(dataset, weight_add_org_result_per_dataset(0.5, qrels, bm25_result, adalm_splade_result))
    print(dataset, weight_add_org_result_per_dataset(0.5, qrels, bm25_result, adalm_splade_idf_result))


In [None]:
data_dir_root = "/home/gaia_data/iida.h/BEIR/datasets/"
result_dir_root = "/home/gaia_data/iida.h/BEIR/splade_vocab/evaluation/"
    
datasets = ["nfcorpus", "scidocs", "scifact", "trec-covid"]
for dataset in datasets:
    data_dir = os.path.join(data_dir_root, dataset)
    queries, qrels = QrelDataLoader(data_folder=data_dir).load(split="test")

    bm25_result_path = os.path.join(result_dir_root, dataset, "bm25", "analysis.json")
    if dataset in ["bioask", "nfcorpus", "trec-covid"]:
        vocab_num = "71694"
    else:
        vocab_num = "62783"
    adalm_splade_result_path = os.path.join(result_dir_root, dataset, "mlm-splade-org", vocab_num, "analysis.json")
    adalm_splade_idf_result_path = os.path.join(result_dir_root, dataset, "mlm-splade-idf", vocab_num, "analysis.json")
    with open(bm25_result_path) as f:
        bm25_result = json.load(f)
    with open(adalm_splade_result_path) as f:
        adalm_splade_result = json.load(f)["org"]
    with open(adalm_splade_idf_result_path) as f:
        adalm_splade_idf_result = json.load(f)["idf"]

    print(dataset, add_result_per_dataset(qrels, bm25_result, adalm_splade_result))
    print(dataset, add_result_per_dataset(qrels, bm25_result, adalm_splade_idf_result))


In [None]:
for dataset in datasets:
    print(dataset)
    laprador_result_path = os.path.join(result_dir_root, "laprador", dataset, "analysis.json")
    with open(laprador_result_path) as f:
        laprador_result = json.load(f)
    print(laprador_result)