In [1]:
from time import time
from beir import util, LoggingHandler
from beir.datasets.data_loader import GenericDataLoader
from beir.retrieval.evaluation import EvaluateRetrieval
from beir.retrieval.search.dense import DenseRetrievalExactSearch as DRES
from bert_retrieval import BERT
from beir.retrieval import models
import logging
import random

  from tqdm.autonotebook import tqdm


In [76]:
# Configure root logging: timestamped messages at INFO level, emitted through
# BEIR's LoggingHandler.
log_settings = {
    "format": '%(asctime)s - %(message)s',
    "datefmt": '%Y-%m-%d %H:%M:%S',
    "level": logging.INFO,
    "handlers": [LoggingHandler()],
}
logging.basicConfig(**log_settings)

# Folder the dataset is read from.
data_path = "data/nfcorpus"

# Load the "test" split: documents (corpus), queries and qrels.
loader = GenericDataLoader(data_folder=data_path)
corpus, queries, qrels = loader.load(split="test")

# Dense retriever: a SentenceBERT encoder wrapped in BEIR's exact-search class,
# scored with the dot product.
# NOTE(review): corpus_chunk_size is far larger than the 3633 loaded documents,
# which matches the single "Encoding Batch 1/1" in the log — confirm that is intended.
encoder = models.SentenceBERT("all-MiniLM-L6-v2")
model = DRES(encoder, batch_size=256, corpus_chunk_size=512*9999)
retriever_bert = EvaluateRetrieval(model, score_function="dot")

# Wall-clock timing of the retrieval call (includes encoding of corpus and queries).
start_time = time()
results_bert = retriever_bert.retrieve(corpus, queries)
end_time = time()
elapsed_seconds = end_time - start_time
print("Time taken to retrieve: {:.2f} seconds".format(elapsed_seconds))

2024-12-20 22:58:36 - Loading Corpus...


  0%|          | 0/3633 [00:00<?, ?it/s]

2024-12-20 22:58:37 - Loaded 3633 TEST Documents.
2024-12-20 22:58:37 - Doc Example: {'text': 'Recent studies have suggested that statins, an established drug group in the prevention of cardiovascular mortality, could delay or prevent breast cancer recurrence but the effect on disease-specific mortality remains unclear. We evaluated risk of breast cancer death among statin users in a population-based cohort of breast cancer patients. The study cohort included all newly diagnosed breast cancer patients in Finland during 1995–2003 (31,236 cases), identified from the Finnish Cancer Registry. Information on statin use before and after the diagnosis was obtained from a national prescription database. We used the Cox proportional hazards regression method to estimate mortality among statin users with statin use as time-dependent variable. A total of 4,151 participants had used statins. During the median follow-up of 3.25 years after the diagnosis (range 0.08–9.0 years) 6,011 participants die

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

2024-12-20 22:58:41 - Sorting Corpus by document length (Longest first)...
2024-12-20 22:58:41 - Scoring Function: Dot Product (dot)
2024-12-20 22:58:41 - Encoding Batch 1/1...


Batches:   0%|          | 0/15 [00:00<?, ?it/s]

Time taken to retrieve: 125.55 seconds


In [77]:
# Cut-offs (k) at which every metric below is computed.
bert_cutoffs = retriever_bert.k_values
logging.info("Retriever evaluation for k in: {}".format(bert_cutoffs))

# Standard metrics returned together by evaluate().
ndcg, _map, recall, precision = retriever_bert.evaluate(qrels, results_bert, bert_cutoffs)

# Additional metrics, requested one at a time and in this order: MRR, capped recall, hole.
mrr, recall_cap, hole = (
    retriever_bert.evaluate_custom(qrels, results_bert, bert_cutoffs, metric=metric_name)
    for metric_name in ("mrr", "r_cap", "hole")
)

# Log one summary line per metric dictionary.
logging.info("Retriever evaluation results:")
for summary_line in (
    f"  nDCG: {ndcg}",
    f"  Mean Average Precision: { _map}",
    f"  Recall: {recall}",
    f"  Precision: {precision}",
    f"  MRR: {mrr}",
    f"  Recall Cap: {recall_cap}",
    f"  Hole: {hole}",
):
    logging.info(summary_line)



2024-12-20 23:01:24 - Retriever evaluation for k in: [1, 3, 5, 10, 100, 1000]
2024-12-20 23:01:24 - For evaluation, we ignore identical query and document ids (default), please explicitly set ``ignore_identical_ids=False`` to ignore this.
2024-12-20 23:01:24 - 

2024-12-20 23:01:24 - NDCG@1: 0.3947
2024-12-20 23:01:24 - NDCG@3: 0.3635
2024-12-20 23:01:24 - NDCG@5: 0.3416
2024-12-20 23:01:24 - NDCG@10: 0.3159
2024-12-20 23:01:24 - NDCG@100: 0.2945
2024-12-20 23:01:24 - NDCG@1000: 0.3828
2024-12-20 23:01:24 - 

2024-12-20 23:01:24 - MAP@1: 0.0432
2024-12-20 23:01:24 - MAP@3: 0.0772
2024-12-20 23:01:24 - MAP@5: 0.0923
2024-12-20 23:01:24 - MAP@10: 0.1105
2024-12-20 23:01:24 - MAP@100: 0.1424
2024-12-20 23:01:24 - MAP@1000: 0.1568
2024-12-20 23:01:24 - 

2024-12-20 23:01:24 - Recall@1: 0.0432
2024-12-20 23:01:24 - Recall@3: 0.0905
2024-12-20 23:01:24 - Recall@5: 0.1196
2024-12-20 23:01:24 - Recall@10: 0.1550
2024-12-20 23:01:24 - Recall@100: 0.3115
2024-12-20 23:01:24 - Recall@1000: 0.6321

In [78]:
import matplotlib.pyplot as plt
from ipywidgets import interact, Dropdown

def plot_metrics(k):
    """Plot and save the evaluation scores at cut-off ``k`` as a horizontal bar chart.

    The scores are looked up, at call time, in the notebook-level dictionaries
    ``ndcg``, ``_map``, ``recall``, ``precision``, ``mrr``, ``recall_cap`` and
    ``hole``. Later cells rebind those names for other systems, so re-run the
    BERT evaluation cell before interacting with this widget again; otherwise
    the chart is still titled "BERT" but shows whatever was evaluated last.

    Args:
        k: Cut-off to display. Each dictionary must hold a ``<Metric>@k`` key.

    Raises:
        KeyError: If one of the dictionaries has no entry for ``k``.

    Side effects:
        Writes ``bert_evaluation_metrics_k_<k>.png`` (relative path) and
        displays the figure.
    """
    metrics = ['nDCG', 'MAP', 'Recall', 'Precision', 'MRR', 'Recall Cap', 'Hole']
    values = [ndcg[f'NDCG@{k}'], _map[f'MAP@{k}'], recall[f'Recall@{k}'], precision[f'P@{k}'], mrr[f'MRR@{k}'], recall_cap[f'R_cap@{k}'], hole[f'Hole@{k}']]

    plt.figure(figsize=(10, 6))
    plt.barh(metrics, values, color=['#4CAF50', '#2196F3', '#FF5722', '#FFC107', '#9C27B0', '#00BCD4', '#E91E63'])
    plt.xlabel('Scores', fontsize=12)
    plt.ylabel('Metrics', fontsize=12)
    plt.title(f'BERT Evaluation Metrics with K = {k}', fontsize=14)
    plt.xlim(0, 1)

    # Print each value just right of its bar, or inside the bar when the bar
    # ends close to the right edge of the fixed 0-1 axis.
    for i, v in enumerate(values):
        plt.text(v + 0.01 if v < 0.9 else v - 0.1, i, f"{v:.2f}", va='center', fontsize=10, color='black')

    # Apply the layout BEFORE saving so the PNG on disk matches the figure shown.
    plt.tight_layout()
    plt.savefig(f'bert_evaluation_metrics_k_{k}.png')
    plt.show()

# Cut-offs offered in the dropdown; they match the k values logged by the evaluation.
k_values = [1, 3, 5, 10, 100, 1000]
dropdown = Dropdown(options=k_values, value=1000, description='Select K')
interact(plot_metrics, k=dropdown);



interactive(children=(Dropdown(description='Select K', index=5, options=(1, 3, 5, 10, 100, 1000), value=1000),…

In [5]:
# Show the best-scoring documents for one randomly chosen query.
# NOTE: no random seed is set here, so a different query is picked on every run.
top_k = 10
query_id, ranking_scores = random.choice(list(results_bert.items()))

# Highest score first.
scores_sorted = sorted(ranking_scores.items(), key=lambda item: item[1], reverse=True)
logging.info("Query : %s\n", queries[query_id])

# Slice instead of indexing range(top_k): a query with fewer than top_k
# retrieved documents is then listed in full rather than raising IndexError.
for rank, (doc_id, _score) in enumerate(scores_sorted[:top_k], start=1):
    doc = corpus[doc_id]
    logging.info("Rank %d: %s [%s] - %s\n", rank, doc_id, doc.get("title"), doc.get("text"))

2024-12-20 20:29:10 - Query : cadaverine

2024-12-20 20:29:10 - Rank 1: MED-729 [Transfer of spinal cord material to subsequent bovine carcasses at splitting.] - During the slaughter process, cattle carcasses are split by sawing centrally down the vertebral column, resulting in contamination of each half with spinal cord material. Using a novel method based on a real-time PCR assay, we measured saw-mediated tissue transfer among carcasses. Up to 2.5% of the tissue recovered from each of the five subsequent carcasses by swabbing the split vertebral face came from the first carcass to be split; approximately 9 mg was spinal cord tissue. Under controlled conditions in an experimental abattoir, between 23 and 135 g of tissue accumulated in the saw after splitting five to eight carcasses. Of the total tissue recovered, between 10 and 15% originated from the first carcass, and between 7 and 61 mg was spinal cord tissue from the first carcass. At commercial plants in the United Kingdom, betwe

In [6]:
from beir.retrieval.search.lexical import BM25Search as BM25
from beir.retrieval.evaluation import EvaluateRetrieval
# Elasticsearch endpoint and index used for the BM25 baseline.
# With initialize=True the captured log shows the existing index being deleted
# and a fresh one created before the corpus is bulk-indexed.
hostname = "http://localhost:9200"
index_name = "nfcorpus_key"
initialize = True

# Gather the constructor arguments in one place, then build the lexical searcher.
bm25_options = {
    "index_name": index_name,
    "hostname": hostname,
    "initialize": initialize,
}
model = BM25(**bm25_options)

# Evaluation wrapper around the BM25 searcher; retrieve() indexes and queries.
retriever = EvaluateRetrieval(model)
results = retriever.retrieve(corpus, queries)

2024-12-20 20:29:13 - Activating Elasticsearch....
2024-12-20 20:29:13 - Elastic Search Credentials: {'hostname': 'http://localhost:9200', 'index_name': 'nfcorpus_key', 'keys': {'title': 'title', 'body': 'txt'}, 'timeout': 100, 'retry_on_timeout': True, 'maxsize': 24, 'number_of_shards': 'default', 'language': 'english'}
2024-12-20 20:29:13 - Deleting previous Elasticsearch-Index named - nfcorpus_key
2024-12-20 20:29:14 - DELETE http://localhost:9200/nfcorpus_key [status:200 duration:0.416s]
2024-12-20 20:29:16 - Creating fresh Elasticsearch-Index named - nfcorpus_key
2024-12-20 20:29:16 - PUT http://localhost:9200/nfcorpus_key [status:200 duration:0.359s]


  0%|          | 1/3633 [00:00<20:44,  2.92docs/s]

2024-12-20 20:29:17 - PUT http://localhost:9200/nfcorpus_key/_bulk [status:200 duration:0.324s]


 28%|██▊       | 1001/3633 [00:00<00:01, 1601.66docs/s]

2024-12-20 20:29:17 - PUT http://localhost:9200/nfcorpus_key/_bulk [status:200 duration:0.251s]
2024-12-20 20:29:17 - PUT http://localhost:9200/nfcorpus_key/_bulk [status:200 duration:0.159s]


 41%|████▏     | 1501/3633 [00:00<00:01, 2025.75docs/s]

2024-12-20 20:29:17 - PUT http://localhost:9200/nfcorpus_key/_bulk [status:200 duration:0.155s]


 55%|█████▌    | 2001/3633 [00:01<00:00, 2092.55docs/s]

2024-12-20 20:29:18 - PUT http://localhost:9200/nfcorpus_key/_bulk [status:200 duration:0.210s]


 69%|██████▉   | 2501/3633 [00:01<00:00, 2230.31docs/s]

2024-12-20 20:29:18 - PUT http://localhost:9200/nfcorpus_key/_bulk [status:200 duration:0.180s]


  0%|          | 0/3633 [00:00<?, ?docs/s]             


2024-12-20 20:29:18 - PUT http://localhost:9200/nfcorpus_key/_bulk [status:200 duration:0.175s]
2024-12-20 20:29:18 - PUT http://localhost:9200/nfcorpus_key/_bulk [status:200 duration:0.055s]


que:  33%|███▎      | 1/3 [00:00<00:01,  1.90it/s]

2024-12-20 20:29:20 - POST http://localhost:9200/_msearch [status:200 duration:0.469s]


que:  67%|██████▋   | 2/3 [00:00<00:00,  2.35it/s]

2024-12-20 20:29:21 - POST http://localhost:9200/_msearch [status:200 duration:0.295s]


que: 100%|██████████| 3/3 [00:01<00:00,  2.62it/s]

2024-12-20 20:29:21 - POST http://localhost:9200/_msearch [status:200 duration:0.208s]





In [40]:
# Cut-offs (k) at which the BM25 results are scored.
bm25_cutoffs = retriever.k_values
logging.info("Retriever evaluation for k in: {}".format(bm25_cutoffs))

# Standard metrics returned together by evaluate().
# NOTE: this rebinds ndcg/_map/recall/precision (and mrr/recall_cap/hole below),
# replacing the values computed for the dense retriever earlier in the notebook.
ndcg, _map, recall, precision = retriever.evaluate(qrels, results, bm25_cutoffs)

# Additional metrics, requested one at a time and in this order: MRR, capped recall, hole.
mrr, recall_cap, hole = (
    retriever.evaluate_custom(qrels, results, bm25_cutoffs, metric=metric_name)
    for metric_name in ("mrr", "r_cap", "hole")
)

2024-12-20 21:26:52 - Retriever evaluation for k in: [1, 3, 5, 10, 100, 1000]
2024-12-20 21:26:52 - For evaluation, we ignore identical query and document ids (default), please explicitly set ``ignore_identical_ids=False`` to ignore this.
2024-12-20 21:26:52 - 

2024-12-20 21:26:52 - NDCG@1: 0.4497
2024-12-20 21:26:52 - NDCG@3: 0.4025
2024-12-20 21:26:52 - NDCG@5: 0.3770
2024-12-20 21:26:52 - NDCG@10: 0.3428
2024-12-20 21:26:52 - NDCG@100: 0.2894
2024-12-20 21:26:52 - NDCG@1000: 0.3206
2024-12-20 21:26:52 - 

2024-12-20 21:26:52 - MAP@1: 0.0594
2024-12-20 21:26:52 - MAP@3: 0.1005
2024-12-20 21:26:52 - MAP@5: 0.1133
2024-12-20 21:26:52 - MAP@10: 0.1297
2024-12-20 21:26:52 - MAP@100: 0.1542
2024-12-20 21:26:52 - MAP@1000: 0.1600
2024-12-20 21:26:52 - 

2024-12-20 21:26:52 - Recall@1: 0.0594
2024-12-20 21:26:52 - Recall@3: 0.1132
2024-12-20 21:26:52 - Recall@5: 0.1331
2024-12-20 21:26:52 - Recall@10: 0.1660
2024-12-20 21:26:52 - Recall@100: 0.2602
2024-12-20 21:26:52 - Recall@1000: 0.3900

In [8]:
import matplotlib.pyplot as plt
from ipywidgets import interact, Dropdown

def plot_metrics(k):
    """Render and save a horizontal bar chart of the BM25 scores at cut-off ``k``.

    Values come from the notebook-level dictionaries ``ndcg``, ``_map``,
    ``recall``, ``precision``, ``mrr``, ``recall_cap`` and ``hole`` as they are
    bound when the function is called.

    Args:
        k: Cut-off to display; every dictionary needs a ``<Metric>@k`` entry.

    Raises:
        KeyError: If any dictionary lacks an entry for ``k``.

    Side effects:
        Saves the figure as a PNG (relative path) and displays it.
    """
    metrics = ['nDCG', 'MAP', 'Recall', 'Precision', 'MRR', 'Recall Cap', 'Hole']
    values = [
        ndcg[f'NDCG@{k}'],
        _map[f'MAP@{k}'],
        recall[f'Recall@{k}'],
        precision[f'P@{k}'],
        mrr[f'MRR@{k}'],
        recall_cap[f'R_cap@{k}'],
        hole[f'Hole@{k}'],
    ]
    bar_colors = ['#4CAF50', '#2196F3', '#FF5722', '#FFC107', '#9C27B0', '#00BCD4', '#E91E63']

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.barh(metrics, values, color=bar_colors)
    ax.set_xlabel('Scores', fontsize=12)
    ax.set_ylabel('Metrics', fontsize=12)
    ax.set_title(f'BM25 Evaluation Metrics with K = {k}', fontsize=14)
    ax.set_xlim(0, 1)

    # Value labels: just past the bar end, or pulled inside the bar when it
    # ends near the right edge of the fixed 0-1 axis.
    for row, score in enumerate(values):
        if score < 0.9:
            label_x = score + 0.01
        else:
            label_x = score - 0.1
        ax.text(label_x, row, f"{score:.2f}", va='center', fontsize=10, color='black')

    fig.tight_layout()
    # NOTE(review): the filename says "valuation" where the other plots say
    # "evaluation"; kept as-is because the file name may be referenced elsewhere.
    fig.savefig(f'BM25_valuation_metrics_k_{k}.png')
    plt.show()

# Cut-offs offered in the dropdown, with the largest one preselected.
k_values = [1, 3, 5, 10, 100, 1000]
dropdown = Dropdown(description='Select K', options=k_values, value=1000)
interact(plot_metrics, k=dropdown);

interactive(children=(Dropdown(description='Select K', index=5, options=(1, 3, 5, 10, 100, 1000), value=1000),…

In [68]:
from beir.reranking.models import CrossEncoder
from beir.reranking import Rerank
# Cross-encoder checkpoint and batch size used for re-ranking.
cross_encoder_name = 'cross-encoder/ms-marco-TinyBERT-L-2-v2'
rerank_batch_size = 128

# Load the cross-encoder, then wrap it in BEIR's Rerank helper.
cross_encoder_model = CrossEncoder(cross_encoder_name)
reranker = Rerank(cross_encoder_model, batch_size=rerank_batch_size)

config.json:   0%|          | 0.00/787 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/17.6M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/525 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

2024-12-20 22:19:10 - Use pytorch device: cpu


In [69]:
# Re-rank the dense retriever's results (results_bert) with the cross-encoder.
# NOTE(review): top_k=20 presumably limits re-ranking to the 20 best first-stage
# documents per query. The evaluation log for this run reports identical
# MAP/Recall at k=100 and k=1000, which is consistent with that — confirm.
rerank_results = reranker.rerank(corpus, queries, results_bert, top_k=20)

2024-12-20 22:19:21 - Starting To Rerank Top-20....


Batches:   0%|          | 0/51 [00:00<?, ?it/s]

In [70]:
# Announce the cut-offs used to score the re-ranked results. `retriever` is the
# evaluator created for BM25; only its k_values attribute is read here.
rerank_cutoffs = retriever.k_values
logging.info("Retriever evaluation for k in: {}".format(rerank_cutoffs))


2024-12-20 22:20:13 - Retriever evaluation for k in: [1, 3, 5, 10, 100, 1000]


In [71]:
# Score the cross-encoder re-ranking of the dense results.
# `retriever` is the evaluator built for BM25; here it only supplies the
# evaluate()/evaluate_custom() calls and the list of cut-offs.
reranked_bert_cutoffs = retriever.k_values

# Standard metrics returned together by evaluate().
ndcg, _map, recall, precision = retriever.evaluate(qrels, rerank_results, reranked_bert_cutoffs)

# Additional metrics, requested one at a time and in this order: MRR, capped recall, hole.
mrr, recall_cap, hole = (
    retriever.evaluate_custom(qrels, rerank_results, reranked_bert_cutoffs, metric=metric_name)
    for metric_name in ("mrr", "r_cap", "hole")
)

# Log one summary line per metric dictionary.
logging.info("Retriever evaluation results:")
for summary_line in (
    f"  nDCG: {ndcg}",
    f"  Mean Average Precision: { _map}",
    f"  Recall: {recall}",
    f"  Precision: {precision}",
    f"  MRR: {mrr}",
    f"  Recall Cap: {recall_cap}",
    f"  Hole: {hole}",
):
    logging.info(summary_line)

2024-12-20 22:20:15 - For evaluation, we ignore identical query and document ids (default), please explicitly set ``ignore_identical_ids=False`` to ignore this.
2024-12-20 22:20:15 - 

2024-12-20 22:20:15 - NDCG@1: 0.4226
2024-12-20 22:20:15 - NDCG@3: 0.3720
2024-12-20 22:20:15 - NDCG@5: 0.3518
2024-12-20 22:20:15 - NDCG@10: 0.3187
2024-12-20 22:20:15 - NDCG@100: 0.2337
2024-12-20 22:20:15 - NDCG@1000: 0.2271
2024-12-20 22:20:15 - 

2024-12-20 22:20:15 - MAP@1: 0.0542
2024-12-20 22:20:15 - MAP@3: 0.0893
2024-12-20 22:20:15 - MAP@5: 0.1023
2024-12-20 22:20:15 - MAP@10: 0.1183
2024-12-20 22:20:15 - MAP@100: 0.1314
2024-12-20 22:20:15 - MAP@1000: 0.1314
2024-12-20 22:20:15 - 

2024-12-20 22:20:15 - Recall@1: 0.0542
2024-12-20 22:20:15 - Recall@3: 0.0981
2024-12-20 22:20:15 - Recall@5: 0.1196
2024-12-20 22:20:15 - Recall@10: 0.1493
2024-12-20 22:20:15 - Recall@100: 0.1889
2024-12-20 22:20:15 - Recall@1000: 0.1889
2024-12-20 22:20:15 - 

2024-12-20 22:20:15 - P@1: 0.4427
2024-12-20 22:20:15

In [72]:
import matplotlib.pyplot as plt
from ipywidgets import interact, Dropdown

def plot_metrics(k):
    """Plot and save the re-ranked BERT scores at cut-off ``k`` as a horizontal bar chart.

    The scores are looked up, at call time, in the notebook-level dictionaries
    ``ndcg``, ``_map``, ``recall``, ``precision``, ``mrr``, ``recall_cap`` and
    ``hole``. Other cells rebind those names for other systems, so re-run the
    matching evaluation cell before interacting with this widget again.

    Args:
        k: Cut-off to display. Each dictionary must hold a ``<Metric>@k`` key.

    Raises:
        KeyError: If one of the dictionaries has no entry for ``k``.

    Side effects:
        Writes ``Reranking_BERT_evaluation_metrics_k_<k>.png`` (relative path)
        and displays the figure.
    """
    metrics = ['nDCG', 'MAP', 'Recall', 'Precision', 'MRR', 'Recall Cap', 'Hole']
    values = [ndcg[f'NDCG@{k}'], _map[f'MAP@{k}'], recall[f'Recall@{k}'], precision[f'P@{k}'], mrr[f'MRR@{k}'], recall_cap[f'R_cap@{k}'], hole[f'Hole@{k}']]

    plt.figure(figsize=(10, 6))
    plt.barh(metrics, values, color=['#4CAF50', '#2196F3', '#FF5722', '#FFC107', '#9C27B0', '#00BCD4', '#E91E63'])
    plt.xlabel('Scores', fontsize=12)
    plt.ylabel('Metrics', fontsize=12)
    plt.title(f'Reranking BERT Evaluation Metrics with K = {k}', fontsize=14)
    plt.xlim(0, 1)

    # Print each value just right of its bar, or inside the bar when the bar
    # ends close to the right edge of the fixed 0-1 axis.
    for i, v in enumerate(values):
        plt.text(v + 0.01 if v < 0.9 else v - 0.1, i, f"{v:.2f}", va='center', fontsize=10, color='black')

    # Apply the layout BEFORE saving so the PNG on disk matches the figure shown.
    plt.tight_layout()
    plt.savefig(f'Reranking_BERT_evaluation_metrics_k_{k}.png')
    plt.show()

# Cut-offs offered in the dropdown; K = 10 is preselected.
k_values = [1, 3, 5, 10, 100, 1000]
dropdown = Dropdown(options=k_values, value=10, description='Select K')
interact(plot_metrics, k=dropdown);





interactive(children=(Dropdown(description='Select K', index=3, options=(1, 3, 5, 10, 100, 1000), value=10), O…

In [73]:
# Re-rank the BM25 results (`results`) with the same cross-encoder.
# NOTE: this rebinds `rerank_results`, replacing the re-ranking of the dense
# results produced earlier in the notebook; anything that reads the name after
# this point sees the BM25-based re-ranking.
rerank_results = reranker.rerank(corpus, queries, results, top_k=20)

2024-12-20 22:20:26 - Starting To Rerank Top-20....


Batches:   0%|          | 0/40 [00:00<?, ?it/s]

In [74]:
# Score the cross-encoder re-ranking of the BM25 results.
reranked_bm25_cutoffs = retriever.k_values

# Standard metrics returned together by evaluate().
ndcg, _map, recall, precision = retriever.evaluate(qrels, rerank_results, reranked_bm25_cutoffs)

# Additional metrics, requested one at a time and in this order: MRR, capped recall, hole.
mrr, recall_cap, hole = (
    retriever.evaluate_custom(qrels, rerank_results, reranked_bm25_cutoffs, metric=metric_name)
    for metric_name in ("mrr", "r_cap", "hole")
)

# Log one summary line per metric dictionary.
logging.info("Retriever evaluation results:")
for summary_line in (
    f"  nDCG: {ndcg}",
    f"  Mean Average Precision: { _map}",
    f"  Recall: {recall}",
    f"  Precision: {precision}",
    f"  MRR: {mrr}",
    f"  Recall Cap: {recall_cap}",
    f"  Hole: {hole}",
):
    logging.info(summary_line)

2024-12-20 22:21:06 - For evaluation, we ignore identical query and document ids (default), please explicitly set ``ignore_identical_ids=False`` to ignore this.
2024-12-20 22:21:06 - 

2024-12-20 22:21:06 - NDCG@1: 0.4854
2024-12-20 22:21:06 - NDCG@3: 0.4190
2024-12-20 22:21:06 - NDCG@5: 0.3903
2024-12-20 22:21:06 - NDCG@10: 0.3511
2024-12-20 22:21:06 - NDCG@100: 0.2492
2024-12-20 22:21:06 - NDCG@1000: 0.2424
2024-12-20 22:21:06 - 

2024-12-20 22:21:06 - MAP@1: 0.0645
2024-12-20 22:21:06 - MAP@3: 0.1038
2024-12-20 22:21:06 - MAP@5: 0.1182
2024-12-20 22:21:06 - MAP@10: 0.1349
2024-12-20 22:21:06 - MAP@100: 0.1448
2024-12-20 22:21:06 - MAP@1000: 0.1448
2024-12-20 22:21:06 - 

2024-12-20 22:21:06 - Recall@1: 0.0645
2024-12-20 22:21:06 - Recall@3: 0.1134
2024-12-20 22:21:06 - Recall@5: 0.1385
2024-12-20 22:21:06 - Recall@10: 0.1681
2024-12-20 22:21:06 - Recall@100: 0.1903
2024-12-20 22:21:06 - Recall@1000: 0.1903
2024-12-20 22:21:06 - 

2024-12-20 22:21:06 - P@1: 0.5065
2024-12-20 22:21:06

In [75]:
import matplotlib.pyplot as plt
from ipywidgets import interact, Dropdown

def plot_metrics(k):
    """Plot and save the re-ranked BM25 scores at cut-off ``k`` as a horizontal bar chart.

    The scores are looked up, at call time, in the notebook-level dictionaries
    ``ndcg``, ``_map``, ``recall``, ``precision``, ``mrr``, ``recall_cap`` and
    ``hole``. Other cells rebind those names for other systems, so re-run the
    matching evaluation cell before interacting with this widget again.

    Args:
        k: Cut-off to display. Each dictionary must hold a ``<Metric>@k`` key.

    Raises:
        KeyError: If one of the dictionaries has no entry for ``k``.

    Side effects:
        Writes ``Reranking_BM25_evaluation_metrics_k_<k>.png`` (relative path)
        and displays the figure.
    """
    metrics = ['nDCG', 'MAP', 'Recall', 'Precision', 'MRR', 'Recall Cap', 'Hole']
    values = [ndcg[f'NDCG@{k}'], _map[f'MAP@{k}'], recall[f'Recall@{k}'], precision[f'P@{k}'], mrr[f'MRR@{k}'], recall_cap[f'R_cap@{k}'], hole[f'Hole@{k}']]

    plt.figure(figsize=(10, 6))
    plt.barh(metrics, values, color=['#4CAF50', '#2196F3', '#FF5722', '#FFC107', '#9C27B0', '#00BCD4', '#E91E63'])
    plt.xlabel('Scores', fontsize=12)
    plt.ylabel('Metrics', fontsize=12)
    plt.title(f'Reranking BM25 Evaluation Metrics with K = {k}', fontsize=14)
    plt.xlim(0, 1)

    # Print each value just right of its bar, or inside the bar when the bar
    # ends close to the right edge of the fixed 0-1 axis.
    for i, v in enumerate(values):
        plt.text(v + 0.01 if v < 0.9 else v - 0.1, i, f"{v:.2f}", va='center', fontsize=10, color='black')

    # Apply the layout BEFORE saving so the PNG on disk matches the figure shown.
    plt.tight_layout()
    plt.savefig(f'Reranking_BM25_evaluation_metrics_k_{k}.png')
    plt.show()

# Cut-offs offered in the dropdown; K = 10 is preselected.
k_values = [1, 3, 5, 10, 100, 1000]
dropdown = Dropdown(options=k_values, value=10, description='Select K')
interact(plot_metrics, k=dropdown);


interactive(children=(Dropdown(description='Select K', index=3, options=(1, 3, 5, 10, 100, 1000), value=10), O…