In [1]:
import os
from collections import defaultdict
from pathlib import Path

import matplotlib.pyplot as plt
import pandas
import pandas as pd
import seaborn as sns
from beir import util
from beir.retrieval import models
from beir.datasets.data_loader import GenericDataLoader
from beir.retrieval.evaluation import EvaluateRetrieval
from beir.retrieval.search.dense import DenseRetrievalExactSearch as DRES
from pyserini.analysis import Analyzer, get_lucene_analyzer
from pyserini.search.lucene import LuceneSearcher
from pytrec_eval import RelevanceEvaluator
from tqdm.notebook import tqdm

from rsj_analysis import InvertedIndex, RSJCalculator, NumBinner, analyze

  from tqdm.autonotebook import tqdm
  return torch._C._cuda_getDeviceCount() > 0


In [2]:
# Wrap pyserini's Lucene analyzer (requested with no arguments, so the
# library's defaults apply). `analyzer.analyze` is later passed as the second
# argument to InvertedIndex — presumably as its tokenizer; confirm in rsj_analysis.
analyzer = Analyzer(get_lucene_analyzer())

In [3]:
# Download the BEIR archive for the chosen dataset and unpack it into a
# "datasets" folder under the current working directory.
dataset = "nfcorpus"
out_dir = os.path.join(Path.cwd(), "datasets")
os.makedirs(out_dir, exist_ok=True)

url = "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{}.zip".format(dataset)
data_path = util.download_and_unzip(url, out_dir)

In [4]:
# Relative path of the unpacked dataset folder. This rebinds `data_path`
# (the download cell also assigns it), so the load does not depend on the
# value returned by the download helper.
data_path = f"./datasets/{dataset}"
# Load the "test" split; the result is unpacked into the corpus, the queries
# and the relevance judgments (qrels), in that order.
corpus, queries, qrels = GenericDataLoader(data_folder=data_path).load(split="test")

  0%|          | 0/3633 [00:00<?, ?it/s]

In [None]:
#### Load the SBERT model and retrieve with exact dense search, scoring by dot product
# (the score function configured below is "dot", not cosine similarity).
model = DRES(models.SentenceBERT("msmarco-distilbert-base-tas-b"), batch_size=16)
retriever = EvaluateRetrieval(model, score_function="dot") # or "cos_sim" for cosine similarity
# NOTE(review): no cutoff is passed to EvaluateRetrieval here, while the BM25
# cell keeps top_k = 100 hits per query — confirm both result sets are meant
# to have comparable depth before they are compared.
dense_results = retriever.retrieve(corpus, queries)

Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Batches:   0%|          | 0/228 [00:00<?, ?it/s]

In [None]:
def hits_iterator(hits):
    """Yield ``(docid, rank, score, hit)`` for each search hit, in order.

    ``docid`` is ``hit.docid`` with surrounding whitespace stripped, ``rank``
    counts from 1 in iteration order, ``score`` is ``hit.score`` and ``hit``
    is the original hit object.
    """
    for rank, hit in enumerate(hits, start=1):
        yield hit.docid.strip(), rank, hit.score, hit

In [None]:
# Sparse baseline: BM25 over the prebuilt pyserini index for BEIR NFCorpus.
searcher = LuceneSearcher.from_prebuilt_index('beir-v1.0.0-nfcorpus.flat')
top_k = 100

# qid -> {docid: score}. An entry is created only when a query returns at
# least one hit, because the inner assignment is what touches the defaultdict.
bm25_results = defaultdict(dict)
for qid, query in tqdm(queries.items()):
    hits = searcher.search(query, top_k, query_generator=None, fields={})
    for did, _rank, score, _hit in hits_iterator(hits):
        bm25_results[qid][did] = score

In [None]:
# Build the analysis objects from rsj_analysis: an inverted index over the
# corpus (given the Lucene analyzer's `analyze` callable), an RSJ calculator
# from the index, the queries and the qrels, and a binner over the
# calculator's `rsj` attribute.
# NOTE(review): the 5 is presumably the number of bins — confirm against NumBinner.
index = InvertedIndex(corpus, analyzer.analyze)
rsj_calculator = RSJCalculator(index, queries, qrels)
binner = NumBinner(rsj_calculator.rsj, 5)

# Run the same analysis on both result sets, passing the same calculator and
# binner objects to each call.
df_bin_bm25 = analyze(bm25_results, rsj_calculator, binner)
df_bin_dense = analyze(dense_results, rsj_calculator, binner)

In [None]:
# Tag every row with the retriever that produced it. `assign` broadcasts the
# label over the frame's own index, so rows cannot be misaligned (or padded
# with NaN) when the index is not 0..n-1, and re-running this cell overwrites
# the column instead of appending a duplicate "retriever" column.
df_bin_bm25 = df_bin_bm25.assign(retriever="BM25")
df_bin_dense = df_bin_dense.assign(retriever="Dense")

# Stack both frames into one long table with a fresh 0..n-1 index.
df_result = pd.concat([df_bin_bm25, df_bin_dense], ignore_index=True)
# Use the string form of each bin as its label for plotting.
df_result["bin"] = df_result["bin"].apply(str)
df_result

In [None]:
# Quick check of which retriever labels ended up in the combined frame.
df_result["retriever"].unique()

In [None]:
# Box plot of ΔRSJ per RSJ bin, with one box per retriever in each bin.

# Set the seaborn style before creating the axes: a style only affects axes
# created after it is set.
sns.set_style(style="whitegrid")

# Bin labels in display order (sorted by their first character, as before).
bin_names = df_result["bin"].unique()
bin_names = sorted(bin_names, key=lambda x: x[0])

fig, ax = plt.subplots(figsize=(12, 6))
# Pass the labels as `order` so seaborn draws the boxes in exactly this
# sequence; relabelling the ticks afterwards could pair a label with the
# wrong box whenever seaborn's default order differed from `bin_names`.
p = sns.boxplot(x='bin', y='ΔRSJ', hue="retriever", data=df_result, order=bin_names, ax=ax)
# Resize the tick labels without replacing their text, so the y axis keeps
# matplotlib's own number formatting and stays correct if the ticks change.
p.tick_params(axis="both", labelsize=16)
p.set_xlabel("RSJ$_{t, Q}$", fontsize=20)
# Raw string: "\D" is not a valid escape sequence in a normal string literal.
p.set_ylabel(r"$\Delta$ RSJ$_{t, Q}$", fontsize=20)
p.legend(fontsize='x-large')
# Keep `fig` bound to the figure that holds the plot; no extra empty figure
# is created.
fig = p.get_figure()