In [31]:
import json
from copy import deepcopy
from collections import defaultdict
from pathlib import Path

import numpy as np
from pytrec_eval import RelevanceEvaluator
from beir.datasets.data_loader import GenericDataLoader
from common import get_result_colbert, get_result, get_result_bm25

In [9]:
def get_result_bm25(path: Path) -> dict:
    with path.open(mode="r") as f:
        result = json.load(f)
    return result

In [10]:
def doc_dup_topk(result1, result2, topk=10):
    dup_rate_per_query = []
    for qid, d2score1 in result1.items():
        if qid not in result2:
            dup_rate = 0.0
        else:
            d2score2 = result2[qid]
            sorted_d2score1 = sorted(d2score1.items(), key=lambda x: -x[1])[:topk]
            sorted_d2score2 = sorted(d2score2.items(), key=lambda x: -x[1])[:topk]
            topk_did1 = set([d2s[0] for d2s in sorted_d2score1])
            topk_did2 = set([d2s[0] for d2s in sorted_d2score2])
            dup_rate = len(topk_did1 & topk_did2) / len(topk_did1 | topk_did2)
        dup_rate_per_query.append(dup_rate)
    return (np.mean(dup_rate_per_query), np.std(dup_rate_per_query, ddof=1))
            
        
            

In [108]:
def doc_dup_topk_rel(result1, result2, qrels, topk=10):
    def add_results(result1, result2):
        merge_result = deepcopy(result1)
        for qid, d2score in result2.items():
            if qid not in result1:
                merge_result[qid] = d2score
            for did, score in d2score.items():
                if did not in merge_result[qid]:
                    merge_result[qid][did] = score
                else:
                    merge_result[qid][did] += score
        return merge_result 
    
    dup_rate_per_query = []
    dup_rate_per_query_rel = []
    rel_q = set()
    s_only_q = set()
    t_only_q = set()
    non_rel_q = set()
    rel_q_ndcg = []
    s_only_q_ndcg = []
    t_only_q_ndcg = []
    non_rel_q_ndcg = []
    n_all_q = len(qrels)
    evaluator = RelevanceEvaluator(qrels, {'ndcg_cut'})
    merge_result = add_results(result1, result2)
    ndcg_result1 = evaluator.evaluate(result1)
    ndcg_result_m = evaluator.evaluate(merge_result)
    for qid, d2score in qrels.items():
        rel_did = set(d2score.keys())
        if qid not in result1 or qid not in result2:
            non_rel_q.add(qid)
            continue
        else:
            q_ndcg_result1 = ndcg_result1[qid]["ndcg_cut_10"]
            q_ndcg_result_m = ndcg_result_m[qid]["ndcg_cut_10"]
            q_ndcg_diff = q_ndcg_result_m - q_ndcg_result1
            d2score1 = result1[qid]
            d2score2 = result2[qid]
            sorted_d2score1 = sorted(d2score1.items(), key=lambda x: -x[1])
            sorted_d2score2 = sorted(d2score2.items(), key=lambda x: -x[1])[:topk]
            topk_did1 = set([d2s[0] for d2s in sorted_d2score1])
            topk_did2 = set([d2s[0] for d2s in sorted_d2score2])
            rel_topk_did1 = topk_did1 & rel_did
            rel_topk_did2 = topk_did2 & rel_did
            both_rel = len(rel_topk_did1 & rel_topk_did2)
            either_rel = len(rel_topk_did1 | rel_topk_did2)
            s_only_rel = len(rel_topk_did1 - rel_topk_did2)
            t_only_rel = len(rel_topk_did2 - rel_topk_did1)
            if not either_rel:
                non_rel_q.add(qid)
                non_rel_q_ndcg.append(q_ndcg_diff)
                continue
            elif both_rel:
                rel_q.add(qid)
                rel_q_ndcg.append(q_ndcg_diff)
            elif s_only_rel:
                s_only_q.add(qid)
                s_only_q_ndcg.append(q_ndcg_diff)
            else:
                t_only_q.add(qid)
                t_only_q_ndcg.append(q_ndcg_diff)
                t_only =rel_topk_did2 - rel_topk_did1
                print(q_ndcg_diff, [(t_d, merge_result[qid][t_d]) for t_d in t_only])
                print([s2d for s2d in sorted_d2score1 if s2d[0] in t_only])
                print([s2d for s2d in sorted_d2score2 if s2d[0] in t_only])
            
            dup_rate = len(topk_did1 & topk_did2) / len(topk_did1 | topk_did2)
            rel_dup_rate = both_rel / either_rel 
        dup_rate_per_query.append(dup_rate)
        dup_rate_per_query_rel.append(rel_dup_rate)
    doc_dup_info = {"dup-all": (round(np.mean(dup_rate_per_query),3), round(np.std(dup_rate_per_query, ddof=1),3)),
                "dup-rel": (round(np.mean(dup_rate_per_query_rel),3), round(np.std(dup_rate_per_query_rel, ddof=1),3))}
    q_dup_info = {"both": (len(rel_q), round(len(rel_q) / n_all_q, 3), round(np.mean(rel_q_ndcg),3)),
                  "s_only": (len(s_only_q), round(len(s_only_q) / n_all_q, 3), round(np.mean(s_only_q_ndcg), 3)),
                  "t_only": (len(t_only_q), round(len(t_only_q) /n_all_q, 3), round(np.mean(t_only_q_ndcg), 3)),
                  "non-rel": (len(non_rel_q), round(len(non_rel_q) / n_all_q, 3), round(np.mean(non_rel_q_ndcg), 3))
                 }
    return doc_dup_info, q_dup_info
            
        
            

In [48]:
def query_dup_topk(qrels, results, topk=10):
    def rel_q_in_topk(qrels, result, topk):
        in_rel = {"rel": set(), "not_rel": set()}
        for qid, t_d2score in qrels.items():
            t_d = set(t_d2score.keys())
            if qid in result:
                d2score = result[qid]
            else:
                in_rel["not_rel"].add(qid)
                continue
            sorted_d2score = sorted(d2score.items(), key=lambda x: -x[1])[:topk]
            topk_did = set([d2s[0] for d2s in sorted_d2score])
            if len(topk_did & t_d):
                in_rel["rel"].add(qid)
            else:
                in_rel["not_rel"].add(qid)
        return in_rel
    
    all_in_rel = {}
    for model_name, result in results.items():
        all_in_rel[model_name] = rel_q_in_topk(qrels, result, topk)
        
    all_rel = set(qrels.keys())
    n_all_rel = len(all_rel)
    all_rel_dup = defaultdict(dict)
    for s_model_name, s_in_rel in all_in_rel.items():
        for t_model_name, t_in_rel in all_in_rel.items():
            both = len(s_in_rel["rel"] & t_in_rel["rel"])
            s_only = len(s_in_rel["rel"] - t_in_rel["rel"])
            t_only = len(t_in_rel["rel"] - s_in_rel["rel"])
            non_rel = len(all_rel - s_in_rel["rel"] - t_in_rel["rel"])
            all_rel_dup[s_model_name][t_model_name] = {"both": (both, both / n_all_rel), "s_only": (s_only, s_only / n_all_rel), "t_only": (t_only, t_only /n_all_rel) , "non-rel": (non_rel, non_rel / n_all_rel)}
    return all_rel_dup
            

In [49]:
def topk_max_min(result, qrel, topk=10):
    # in_rel_all = {"all-max": [], "all-min": [], "rel-max": [], "rel-min": [], "not_rel-max": [],  "not_rel-min": []}
    in_rel_all = {"all-max": [], "all-min": [], "all-diff": []}
    in_rel = {}
    for qid, t_d2score in qrels.items():
        t_d = set(t_d2score.keys())
        if qid in result:
            d2score = result[qid]
        else:
            continue
        sorted_d2score = sorted(d2score.items(), key=lambda x: -x[1])[:topk]
        topk_did = set([d2s[0] for d2s in sorted_d2score])
        topk_max = sorted_d2score[0][1]
        topk_min = sorted_d2score[-1][1]
        in_rel_all["all-max"].append(topk_max)
        in_rel_all["all-min"].append(topk_min)
        in_rel_all["all-diff"].append(topk_max - topk_min)
        # if len(topk_did & t_d):
        #     in_rel_all["rel-max"].append(topk_max)
        #     in_rel_all["rel-min"].append(topk_min)
        # else:
        #     in_rel_all["not_rel-max"].append(topk_max)
        #     in_rel_all["not_rel-min"].append(topk_min)
    for k, v in in_rel_all.items():
        in_rel[k] = np.mean(v)
        
    return in_rel


In [50]:
def topk_diff_next(result, topk=10):
    next_diff = []
    for qid, d2score in result.items():
        sorted_d2score = sorted(d2score.items(), key=lambda x: -x[1])[:topk]
        prev = 0.0
        q_diff = []
        for d2s in sorted_d2score:
            if not prev:
                prev = d2s[1]
                continue
            diff = prev - d2s[1]
            q_diff.append(diff)
        if not q_diff:
            continue
        m_q_diff = np.mean(q_diff)
        next_diff.append(m_q_diff)
    m_next_diff = np.mean(next_diff)
    return m_next_diff

In [38]:
def top10_diff(result, qrel):
    all_diff = []
    rel_diff = []
    for qid, d2score in result.items():
        sorted_d2score = sorted(d2score.items(), key=lambda x: -x[1])[:10]
        if len(sorted_d2score) < 10:
            continue
        diff = sorted_d2score[0][1] - sorted_d2score[-1][1]
        all_diff.append(diff)
        
        if len([d2s[0] for d2s in sorted_d2score if d2s[0] in qrel[qid]]):
            rel_diff.append(diff)
            
        
    return np.mean(all_diff), np.mean(rel_diff)
        
        

In [39]:
datasets = ["nfcorpus", "trec-covid", "scidocs", "scifact"]
models_name = ["dense-org",  "splade-org"]
root_path = Path("/home/gaia_data/iida.h/BEIR/datasets")
model_root_dir = Path("/home/gaia_data/iida.h/BEIR/splade_vocab/evaluation")
colbert_root_dir = Path("/home/gaia_data/iida.h/BEIR/colbert/")

In [40]:
all_result = defaultdict(dict)
all_qrels = {}
for dataset in datasets:
    data_path = root_path / dataset
    corpus, queries, qrels = GenericDataLoader(data_path).load(split="test")
    all_qrels[dataset] = qrels
    bm25_dir = model_root_dir.joinpath(dataset, "bm25")
    bm25_path = list(bm25_dir.glob("**/analysis.json"))[0]
    bm25_result = get_result_bm25(bm25_path)
    all_result[dataset]["bm25"] = bm25_result
    for model_name in models_name:
        model_dir = model_root_dir.joinpath(dataset, model_name)
        model_path = list(model_dir.glob("**/analysis.json"))[0]
        model_result = get_result(model_path)       
        all_result[dataset][model_name] = model_result
        # dup = check_dup_w_bm25_top10(bm25_result, model_result)
        # all_dup[dataset][model_name] = dup
        
    if dataset in {"bioask", "nfcorpus", "trec-covid"}:
        colbert_dir = colbert_root_dir.joinpath("experiments", "DistilAdaLM-bio", dataset)
    elif dataset in {"scidocs", "scifact"}:
        colbert_dir = colbert_root_dir.joinpath("experiments", "DistilAdaLM-sci", dataset)
    if dataset == "bioask":
        colbert_path = list(colbert_dir.glob("*search/2023-01/01/**/*.tsv"))[0]
    else:
        colbert_path = list(colbert_dir.glob("*search/2022-12/29/**/*.tsv"))[0]
    colbert_dataset_dir = colbert_root_dir.joinpath("datasets", dataset)
    colbert_result = get_result_colbert(colbert_path, colbert_dataset_dir)
    all_result[dataset]["colbert"] = colbert_result
    # dup = check_dup_w_bm25_top10(bm25_result, colbert_result)
    # all_dup[dataset]["colbert"] = dup
        
    

  0%|          | 0/3633 [00:00<?, ?it/s]

  0%|          | 0/171332 [00:00<?, ?it/s]

  0%|          | 0/25657 [00:00<?, ?it/s]

  0%|          | 0/5183 [00:00<?, ?it/s]

In [109]:
all_doc_rel_dup = {dataset: defaultdict(dict) for dataset in datasets}
all_query_rel_dup = {dataset: defaultdict(dict) for dataset in datasets}
for dataset, results in all_result.items():
    qrels = all_qrels[dataset]
    for model_name1, result1 in results.items():
        for model_name2, result2 in results.items():
            if model_name2 == "bm25" and model_name1 == "dense-org":
                print(dataset)
                doc_dup, query_dup = doc_dup_topk_rel(result1, result2, qrels, 100)
                all_doc_rel_dup[dataset][model_name1][model_name2] = doc_dup
                all_query_rel_dup[dataset][model_name1][model_name2] = query_dup

nfcorpus
0.0 [('MED-4313', 3.035999059677124)]
[]
[('MED-4313', 3.035999059677124)]
trec-covid
scidocs
0.0 [('383ca85aaca9f306ea7ae04fb0b6b76f1e393395', 6.383900165557861)]
[]
[('383ca85aaca9f306ea7ae04fb0b6b76f1e393395', 6.383900165557861)]
0.0 [('691564e0f19d5f62597adc0720d0e51ddbce9b89', 4.130698204040527)]
[]
[('691564e0f19d5f62597adc0720d0e51ddbce9b89', 4.130698204040527)]
0.0 [('54ee58c3623b7fdf8b7e9e29355a24478f574eea', 7.860400199890137)]
[]
[('54ee58c3623b7fdf8b7e9e29355a24478f574eea', 7.860400199890137)]
0.0 [('0704bb7b7918cd512b5e66ea4b4993e50b8ae92f', 8.571200370788574)]
[]
[('0704bb7b7918cd512b5e66ea4b4993e50b8ae92f', 8.571200370788574)]
0.0 [('64ee1a00b500cc1924c58d2f3073f7ba653359bd', 5.7118000984191895)]
[]
[('64ee1a00b500cc1924c58d2f3073f7ba653359bd', 5.7118000984191895)]
0.0 [('92ac57eb4ed7a380b44bfc751824177326832449', 6.140699863433838)]
[]
[('92ac57eb4ed7a380b44bfc751824177326832449', 6.140699863433838)]
scifact
0.0 [('8290953', 8.809599876403809)]
[]
[('8290953', 

In [89]:
all_doc_rel_dup

{'nfcorpus': defaultdict(dict,
             {'bm25': {'dense-org': {'dup-all': (0.147, 0.111),
                'dup-rel': (0.539, 0.322)}}}),
 'trec-covid': defaultdict(dict,
             {'bm25': {'dense-org': {'dup-all': (0.12, 0.091),
                'dup-rel': (0.166, 0.093)}}}),
 'scidocs': defaultdict(dict,
             {'bm25': {'dense-org': {'dup-all': (0.182, 0.1),
                'dup-rel': (0.55, 0.383)}}}),
 'scifact': defaultdict(dict,
             {'bm25': {'dense-org': {'dup-all': (0.145, 0.072),
                'dup-rel': (0.875, 0.328)}}})}

In [90]:
all_query_rel_dup

{'nfcorpus': defaultdict(dict,
             {'bm25': {'dense-org': {'both': (237, 0.734, 0.026),
                's_only': (16, 0.05, -0.102),
                't_only': (17, 0.053, 0.006),
                'non-rel': (53, 0.164, 0.0)}}}),
 'trec-covid': defaultdict(dict,
             {'bm25': {'dense-org': {'both': (50, 1.0, 0.174),
                's_only': (0, 0.0, nan),
                't_only': (0, 0.0, nan),
                'non-rel': (0, 0.0, nan)}}}),
 'scidocs': defaultdict(dict,
             {'bm25': {'dense-org': {'both': (653, 0.653, 0.036),
                's_only': (134, 0.134, -0.034),
                't_only': (71, 0.071, 0.005),
                'non-rel': (142, 0.142, 0.0)}}}),
 'scifact': defaultdict(dict,
             {'bm25': {'dense-org': {'both': (254, 0.847, 0.052),
                's_only': (24, 0.08, -0.283),
                't_only': (11, 0.037, 0.0),
                'non-rel': (11, 0.037, 0.0)}}})}

In [148]:
all_max_min = {dataset: defaultdict(dict) for dataset in datasets}
for dataset, results in all_result.items():
    qrels = all_qrels[dataset]
    for model_name, result in results.items():
        all_max_min[dataset][model_name] = topk_max_min(result, qrels, 100)

In [149]:
all_max_min

{'nfcorpus': defaultdict(dict,
             {'bm25': {'all-max': 6.590889936143702,
               'all-min': 2.6047314705399724,
               'all-diff': 3.9861584656037294},
              'dense-org': {'all-max': 98.28113985799783,
               'all-min': 90.58089432849235,
               'all-diff': 7.700245529505491},
              'splade-org': {'all-max': 14.627990955169725,
               'all-min': 4.743741205233169,
               'all-diff': 9.884249749936556},
              'colbert': {'all-max': 19.90613377278803,
               'all-min': 12.583914113118553,
               'all-diff': 7.32221965966948}}),
 'trec-covid': defaultdict(dict,
             {'bm25': {'all-max': 12.134977970123291,
               'all-min': 8.46019570350647,
               'all-diff': 3.6747822666168215},
              'dense-org': {'all-max': 111.96072509765625,
               'all-min': 107.50048248291016,
               'all-diff': 4.460242614746094},
              'splade-org': {'all-max':

# 残骸

In [150]:
all_diff_next = {dataset: defaultdict(dict) for dataset in datasets}
for dataset, results in all_result.items():
    for model_name, result in results.items():
        all_diff_next[dataset][model_name] = topk_diff_next(result, 10)

In [142]:
all_diff_next

{'nfcorpus': defaultdict(dict,
             {'bm25': 1.9273199555804599,
              'dense-org': 3.4154916926680214,
              'splade-org': 4.200879156158928,
              'colbert': 2.8200586471190188}),
 'trec-covid': defaultdict(dict,
             {'bm25': 1.2420617283715143,
              'dense-org': 1.3722926500108508,
              'splade-org': 1.4703080685933432,
              'colbert': 0.7282026799519856}),
 'scidocs': defaultdict(dict,
             {'bm25': 3.053150831699371,
              'dense-org': 3.0126202418009442,
              'splade-org': 3.775889957533942,
              'colbert': 1.6391117045084633}),
 'scifact': defaultdict(dict,
             {'bm25': 7.430970640535708,
              'dense-org': 4.705268842909071,
              'splade-org': 9.724245428862396,
              'colbert': 2.5699158354158755})}

In [106]:
all_dup = {dataset: defaultdict(dict) for dataset in datasets}
for dataset, results in all_result.items():
    for model_name1, result1 in results.items():
        for model_name2, result2 in results.items():
            dup = doc_dup_topk(result1, result2, 100)
            all_dup[dataset][model_name1][model_name2] = dup

In [107]:
all_dup

{'nfcorpus': defaultdict(dict,
             {'bm25': {'bm25': (1.0, 0.0),
               'dense-org': (0.13855368919847513, 0.10950600931036651),
               'splade-org': (0.2530064329438162, 0.1741406795636098),
               'colbert': (0.2206851914742352, 0.16205460331566082)},
              'dense-org': {'bm25': (0.13211930734715277, 0.11084087198763913),
               'dense-org': (1.0, 0.0),
               'splade-org': (0.22396902144824404, 0.12698934370651468),
               'colbert': (0.1953825212617262, 0.11853433617972559)},
              'splade-org': {'bm25': (0.24125690819410334, 0.1782015134380032),
               'dense-org': (0.22396902144824404, 0.12698934370651468),
               'splade-org': (1.0, 0.0),
               'colbert': (0.2816412054787035, 0.177451803536901)},
              'colbert': {'bm25': (0.2104366531704782, 0.1649293467628078),
               'dense-org': (0.1953825212617262, 0.11853433617972559),
               'splade-org': (0.2816412054

In [111]:
all_rel_dup = {dataset: defaultdict(dict) for dataset in datasets}
for dataset, results in all_result.items():
    qrels = all_qrels[dataset]
    all_rel_dup[dataset] = query_dup_topk(qrels, results)
    
all_rel_dup

{'nfcorpus': defaultdict(dict,
             {'bm25': {'bm25': {'both': (222, 0.6873065015479877),
                's_only': (0, 0.0),
                't_only': (0, 0.0),
                'non-rel': (101, 0.3126934984520124)},
               'dense-org': {'both': (198, 0.6130030959752322),
                's_only': (24, 0.07430340557275542),
                't_only': (15, 0.04643962848297214),
                'non-rel': (86, 0.26625386996904027)},
               'splade-org': {'both': (212, 0.6563467492260062),
                's_only': (10, 0.030959752321981424),
                't_only': (13, 0.04024767801857585),
                'non-rel': (88, 0.2724458204334365)},
               'colbert': {'both': (205, 0.6346749226006192),
                's_only': (17, 0.05263157894736842),
                't_only': (16, 0.04953560371517028),
                'non-rel': (85, 0.2631578947368421)}},
              'dense-org': {'bm25': {'both': (198, 0.6130030959752322),
                's_only': (15

In [None]:
all_diff = defaultdict(dict)
all_dup = defaultdict(dict)
for dataset in datasets:
    data_path = root_path / dataset
    corpus, queries, qrels = GenericDataLoader(data_path).load(split="test")
    bm25_dir = model_root_dir.joinpath(dataset, "bm25")
    bm25_path = list(bm25_dir.glob("**/analysis.json"))[0]
    bm25_result = get_result_bm25(bm25_path)
    diff, rel_diff = top10_diff(bm25_result, qrels)
    all_diff[dataset]["bm25"] = {"all_diff": diff, "rel_diff": rel_diff}
    for model_name in models_name:
        model_dir = model_root_dir.joinpath(dataset, model_name)
        model_path = list(model_dir.glob("**/analysis.json"))[0]
        model_result = get_result(model_path)
        diff, rel_diff = top10_diff(model_result, qrels)
        all_diff[dataset][model_name] = {"all_diff": diff, "rel_diff": rel_diff}
        
        dup = check_dup_w_bm25_top10(bm25_result, model_result)
        all_dup[dataset][model_name] = dup
        
    # if dataset in {"bioask", "nfcorpus", "trec-covid"}:
    #     colbert_dir = colbert_root_dir.joinpath("experiments", "DistilAdaLM-bio", dataset)
    # elif dataset in {"scidocs", "scifact"}:
    #     colbert_dir = colbert_root_dir.joinpath("experiments", "DistilAdaLM-sci", dataset)
    # if dataset == "bioask":
    #     colbert_path = list(colbert_dir.glob("*search/2023-01/01/**/*.tsv"))[0]
    # else:
    #     colbert_path = list(colbert_dir.glob("*search/2022-12/29/**/*.tsv"))[0]
    colbert_dir = colbert_root_dir.joinpath("experiments", "Distilcolbert-self", dataset)
    colbert_path = list(colbert_dir.glob("*search/2022-12/29/**/*.tsv"))[0]
    colbert_dataset_dir = colbert_root_dir.joinpath("datasets", dataset)
    colbert_result = get_result_colbert(colbert_path, colbert_dataset_dir)
    diff, rel_diff = top10_diff(colbert_result, qrels)
    all_diff[dataset]["colbert"] = {"all_diff": diff, "rel_diff": rel_diff}
    dup = check_dup_w_bm25_top10(bm25_result, colbert_result)
    all_dup[dataset]["colbert"] = dup
        
    

  0%|          | 0/3633 [00:00<?, ?it/s]

NameError: name 'check_dup_w_bm25_top10' is not defined