In [1]:
import json
from collections import defaultdict, Counter

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from beir.datasets.data_loader import GenericDataLoader
from pyserini.analysis import Analyzer, get_lucene_analyzer
from tqdm.notebook import tqdm

  from tqdm.autonotebook import tqdm


In [2]:
# Load the MS MARCO collection through the BEIR loader. Of the three returned
# objects, only ms_corpus is consumed by the cells visible in this notebook.
data_path = "/home/gaia_data/iida.h/BEIR/datasets/msmarco"
ms_loader = GenericDataLoader(data_folder=data_path)
ms_corpus, ms_queries, ms_qrels = ms_loader.load(split="test")

  0%|          | 0/8841823 [00:00<?, ?it/s]

In [3]:
# Shared tokenizer used by get_corpus_freq. It wraps whatever analyzer
# get_lucene_analyzer() returns when called with no arguments.
# NOTE(review): the default is presumably an English analyzer with stemming
# and stopword removal -- confirm against the installed Pyserini version.
default_lucene_analyzer = get_lucene_analyzer()
analyzer = Analyzer(default_lucene_analyzer)

In [4]:
def get_corpus_freq(corpus):
    """Compute the relative term-frequency distribution of a corpus.

    Each document's title and text are joined with a single space and
    tokenized with the module-level ``analyzer``; the resulting tokens are
    counted across the whole corpus. Counts are then divided by the total
    number of tokens, so the returned values sum to 1 for a non-empty corpus.

    Parameters
    ----------
    corpus : dict
        Mapping of document id to a dict holding a ``"text"`` entry and an
        optional ``"title"`` entry.

    Returns
    -------
    collections.Counter
        Token -> relative frequency. Empty when the corpus yields no tokens.

    Raises
    ------
    KeyError
        If a document has no ``"text"`` entry.
    """
    freq = Counter()
    for doc in tqdm(corpus.values()):
        # A missing or None title is treated as empty rather than raising.
        title = doc.get("title") or ""
        freq.update(analyzer.analyze(title + " " + doc["text"]))

    # Built-in sum works directly on the values view; no list copy or numpy
    # scalar is needed. When freq is empty the loop below never divides.
    total = sum(freq.values())
    for token in freq:
        freq[token] /= total
    return freq

In [5]:
# Reference distribution: relative term frequencies over the MS MARCO corpus.
# Every other dataset is compared against this one further down.
msmarco_freq = get_corpus_freq(corpus=ms_corpus)

  0%|          | 0/8841823 [00:00<?, ?it/s]

In [6]:
# Directory names of the BEIR datasets compared against MS MARCO.
# NOTE(review): "bioask" must match the local directory name; the upstream
# BEIR dataset is presumably spelled "bioasq" -- confirm before renaming.
datasets = [
    "arguana",
    "bioask",
    "climate-fever",
    "fever",
    "dbpedia-entity",
    "fiqa",
    "hotpotqa",
    "nfcorpus",
    "nq",
    "quora",
    "scidocs",
    "scifact",
    "trec-covid",
    "trec-robust04-desc",
    "webis-touche2020",
]


def weighted_jaccard(freq_a, freq_b):
    """Weighted Jaccard similarity between two term-frequency mappings.

    Computes ``sum(min(a_t, b_t)) / sum(max(a_t, b_t))`` over the union of
    the two vocabularies, treating a token missing from one mapping as
    having frequency 0.0 there.

    Parameters
    ----------
    freq_a, freq_b : Mapping[str, float]
        Token -> frequency mappings (e.g. the output of get_corpus_freq).

    Returns
    -------
    float
        Similarity value; 0.0 when the denominator is zero (both empty).
    """
    numerator = 0.0
    denominator = 0.0
    # Key views support set union directly, avoiding two list copies.
    for token in freq_a.keys() | freq_b.keys():
        a = freq_a.get(token, 0.0)
        b = freq_b.get(token, 0.0)
        # Plain comparison instead of np.min/np.max on a 2-tuple: same
        # result, without building an array per token.
        if a < b:
            numerator += a
            denominator += b
        else:
            numerator += b
            denominator += a
    return numerator / denominator if denominator else 0.0


jac_sims_from_msmarco = {}
for dataset in datasets:
    # Use a loop-local name so the MS MARCO `data_path` is not overwritten.
    dataset_path = f"/home/gaia_data/iida.h/BEIR/datasets/{dataset}"
    corpus, queries, qrels = GenericDataLoader(data_folder=dataset_path).load(split="test")
    freq = get_corpus_freq(corpus)
    jac_sims_from_msmarco[dataset] = weighted_jaccard(freq, msmarco_freq)

  0%|          | 0/8674 [00:00<?, ?it/s]

  0%|          | 0/8674 [00:00<?, ?it/s]

  0%|          | 0/14914714 [00:00<?, ?it/s]

  0%|          | 0/14914604 [00:00<?, ?it/s]

  0%|          | 0/5416593 [00:00<?, ?it/s]

  0%|          | 0/5416593 [00:00<?, ?it/s]

  0%|          | 0/5416568 [00:00<?, ?it/s]

  0%|          | 0/5416568 [00:00<?, ?it/s]

  0%|          | 0/4635922 [00:00<?, ?it/s]

  0%|          | 0/4635922 [00:00<?, ?it/s]

  0%|          | 0/57638 [00:00<?, ?it/s]

  0%|          | 0/57638 [00:00<?, ?it/s]

  0%|          | 0/5233329 [00:00<?, ?it/s]

  0%|          | 0/5233329 [00:00<?, ?it/s]

  0%|          | 0/3633 [00:00<?, ?it/s]

  0%|          | 0/3633 [00:00<?, ?it/s]

  0%|          | 0/2681468 [00:00<?, ?it/s]

  0%|          | 0/2681468 [00:00<?, ?it/s]

  0%|          | 0/522931 [00:00<?, ?it/s]

  0%|          | 0/522931 [00:00<?, ?it/s]

  0%|          | 0/25657 [00:00<?, ?it/s]

  0%|          | 0/25657 [00:00<?, ?it/s]

  0%|          | 0/5183 [00:00<?, ?it/s]

  0%|          | 0/5183 [00:00<?, ?it/s]

  0%|          | 0/171332 [00:00<?, ?it/s]

  0%|          | 0/171332 [00:00<?, ?it/s]

  0%|          | 0/528155 [00:00<?, ?it/s]

  0%|          | 0/528155 [00:00<?, ?it/s]

  0%|          | 0/382545 [00:00<?, ?it/s]

  0%|          | 0/382545 [00:00<?, ?it/s]

In [9]:
# Datasets ranked from most to least similar to MS MARCO.
sorted(
    jac_sims_from_msmarco.items(),
    key=lambda name_and_sim: name_and_sim[1],
    reverse=True,
)

[('nq', 0.5227529700174028),
 ('trec-robust04-desc', 0.4748049297056996),
 ('webis-touche2020', 0.410316363481542),
 ('fiqa', 0.4065016741115812),
 ('quora', 0.39517891692510543),
 ('arguana', 0.3847033117101627),
 ('climate-fever', 0.3841834695396349),
 ('fever', 0.384175878010209),
 ('hotpotqa', 0.3418787430686798),
 ('dbpedia-entity', 0.3340601872026008),
 ('scidocs', 0.3266259521409272),
 ('bioask', 0.3173238405914384),
 ('trec-covid', 0.3154687841692155),
 ('nfcorpus', 0.2849955868606359),
 ('scifact', 0.27297960167490715)]

In [10]:
# Same ranking, with similarities rounded to three decimals for reporting.
ranked_sims = sorted(jac_sims_from_msmarco.items(), key=lambda pair: pair[1], reverse=True)
[(name, round(sim, 3)) for name, sim in ranked_sims]

[('nq', 0.523),
 ('trec-robust04-desc', 0.475),
 ('webis-touche2020', 0.41),
 ('fiqa', 0.407),
 ('quora', 0.395),
 ('arguana', 0.385),
 ('climate-fever', 0.384),
 ('fever', 0.384),
 ('hotpotqa', 0.342),
 ('dbpedia-entity', 0.334),
 ('scidocs', 0.327),
 ('bioask', 0.317),
 ('trec-covid', 0.315),
 ('nfcorpus', 0.285),
 ('scifact', 0.273)]