BM25S Approach

https://github.com/xhluca/bm25s

In [None]:
#pip install bm25s

In [None]:
# Import necessary libraries
import json
import os
import re
import nltk
import math
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import SnowballStemmer
from collections import defaultdict

#from tokenizers import Tokenizer

import bm25s


# Download the NLTK resources used by this notebook; packages that are already
# installed are simply reported as up-to-date.
# https://www.nltk.org/data.html
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')  # Ensure stopwords are downloaded


# Data locations.
# Both locations can be overridden through environment variables, so the
# notebook runs on another machine without editing this cell. When a variable
# is not set, the original hard-coded location is used unchanged.

# Folder that contains the *.jsonl document files.
# For a quick dev run, point LONGEVAL_DOCUMENTS_DIR at a smaller folder, e.g.
#   c:\Users\hubin\TULokal\AIRLocal\longeval_sci_training_2025_abstract\longeval_sci_training_2025_abstract\docShort
data_path_abstract = os.environ.get(
    "LONGEVAL_DOCUMENTS_DIR",
    r"c:\Users\Adrian\Development\air\longeval_sci_training_2025_abstract\longeval_sci_training_2025_abstract\documents",
)

# Folder that contains queries.txt and qrels.txt.
data_path_abstract_q = os.environ.get(
    "LONGEVAL_DATA_DIR",
    r"C:\Users\Adrian\Development\air\longeval_sci_training_2025_abstract\longeval_sci_training_2025_abstract",
)

# os.path.join(data_path_abstract_q, file_name)
#data_folder = r"c:\Users\hubin\TULokal\AIRLocal\longeval_sci_training_2025_abstract\longeval_sci_training_2025_abstract\documents"


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hubin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\hubin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hubin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [14]:
import os
import json

class FolderLoader:
    """Re-iterable handle on a folder of documents.

    The loader itself only remembers the folder location. Every call to
    ``iter()`` builds a brand-new ``FolderIterator`` for that folder, so the
    same loader can be traversed more than once.
    """

    def __init__(self, folder_path):
        # Nothing is opened or listed here; the path is just recorded.
        self.folder_path = folder_path

    def __iter__(self):
        folder_iterator = FolderIterator(self.folder_path)
        return folder_iterator


class FolderIterator:
    """Iterate over the documents of every ``.jsonl`` file in a folder.

    Files are visited one after another in sorted file-name order; the
    documents of each file are produced by a ``DocumentIterator``.

    Attributes:
        filepaths: full paths of the ``.jsonl`` files, in visiting order.
        file_index: position in ``filepaths`` of the file currently being read.
        current_iterator: the open ``DocumentIterator`` for that file, or
            ``None`` when the next file still has to be opened.
    """

    def __init__(self, folder_path):
        # os.listdir() returns entries in arbitrary order. Sorting makes the
        # document order -- and therefore the positions inside any index built
        # from it -- identical on every pass and on every machine.
        self.filepaths = [
            os.path.join(folder_path, name)
            for name in sorted(os.listdir(folder_path))
            if name.endswith('.jsonl')
        ]
        self.file_index = 0
        self.current_iterator = None

    def __iter__(self):
        return self

    def __next__(self):
        """Return the next document, moving on to the next file when one is exhausted.

        Raises:
            StopIteration: when every file has been fully read.
        """
        while self.file_index < len(self.filepaths):
            if self.current_iterator is None:
                # Open the next file lazily, only when it is actually needed.
                filepath = self.filepaths[self.file_index]
                print(f"Processing file: {filepath}")
                self.current_iterator = DocumentIterator(filepath)

            try:
                return next(self.current_iterator)
            except StopIteration:
                # Current file is finished: advance and retry with the next one.
                self.current_iterator = None
                self.file_index += 1

        raise StopIteration


class DocumentIterator:
    """Iterate over the JSON documents stored one per line in a ``.jsonl`` file.

    Each item is a dict ``{"id": ..., "text": ...}``. ``text`` is the title,
    the abstract and the lower-cased author names joined by single spaces, so
    the document id travels together with the text that gets indexed.
    """

    def __init__(self, filepath):
        self.file = open(filepath, 'r', encoding='utf-8')

    def __iter__(self):
        return self

    def close(self):
        """Close the underlying file; safe to call more than once."""
        if not self.file.closed:
            self.file.close()

    def __next__(self):
        """Return the next document of the file.

        Raises:
            StopIteration: at end of file (the file is closed at that point),
                and on every later call.
            json.JSONDecodeError: if a non-blank line is not valid JSON.
        """
        # Once exhausted the file is closed; keep signalling exhaustion instead
        # of failing with "I/O operation on closed file".
        if self.file.closed:
            raise StopIteration

        line = self.file.readline()
        # Skip blank lines (e.g. a trailing empty line) instead of handing
        # them to json.loads, which would raise.
        while line and not line.strip():
            line = self.file.readline()
        if not line:
            self.close()
            raise StopIteration

        doc = json.loads(line)

        # `or ''` / `or []` also covers fields that are present but null;
        # otherwise the literal string "None" would end up in the indexed
        # text, or the author handling would raise.
        title = doc.get('title') or ''
        abstract = doc.get('abstract') or ''
        authors_text = ' '.join(
            (author.get('name') or '').lower()
            for author in (doc.get('authors') or [])
        )

        # Return a dictionary so the document id is preserved next to the text.
        return {"id": doc.get('id'), "text": f"{title} {abstract} {authors_text}"}
            

Load the data, tokenize it, and build the index

In [15]:
# Lazy handle on every document below data_path_abstract: constructing the
# loader only stores the path; files are listed and read when it is iterated.
all_documents = FolderLoader(data_path_abstract)
# NOTE: FolderLoader defines no __len__, so len(all_documents) is not available here.

In [16]:
# from nltk.tokenize.destructive import NLTKWordTokenizer
import numpy as np
import Stemmer

# Read the folder exactly once and keep the full records ({"id", "text"}).
# all_documents is a lazy loader that re-reads every file each time it is
# iterated, and it cannot be indexed by position; a list avoids the repeated
# disk passes and gives the retriever a corpus it can look documents up in.
documents = list(all_documents)

# Only the text is tokenized; the strings are shared with `documents`.
corpus = [doc["text"] for doc in documents]

# optional: create a stemmer
stemmer = Stemmer.Stemmer("english")

# Tokenize the corpus and only keep the ids (faster and saves memory)
corpus_tokens = bm25s.tokenize(corpus, stopwords="en", stemmer=stemmer)

# Create the BM25 model and index the corpus; the full records are attached
# as corpus so the document ids are preserved.
retriever = bm25s.BM25(corpus=documents)
retriever.index(corpus_tokens)

# Save the index together with the records so ids survive a reload.
retriever.save("bm25_index", corpus=documents)


Processing file: c:\Users\hubin\TULokal\AIRLocal\longeval_sci_training_2025_abstract\longeval_sci_training_2025_abstract\docShort\documents_000001.jsonl
Processing file: c:\Users\hubin\TULokal\AIRLocal\longeval_sci_training_2025_abstract\longeval_sci_training_2025_abstract\docShort\documents_000002.jsonl
Processing file: c:\Users\hubin\TULokal\AIRLocal\longeval_sci_training_2025_abstract\longeval_sci_training_2025_abstract\docShort\documents_000021.jsonl


DEBUG:bm25s:Building index from IDs objects                              
                                                                                

Processing file: c:\Users\hubin\TULokal\AIRLocal\longeval_sci_training_2025_abstract\longeval_sci_training_2025_abstract\docShort\documents_000001.jsonl
Processing file: c:\Users\hubin\TULokal\AIRLocal\longeval_sci_training_2025_abstract\longeval_sci_training_2025_abstract\docShort\documents_000002.jsonl
Processing file: c:\Users\hubin\TULokal\AIRLocal\longeval_sci_training_2025_abstract\longeval_sci_training_2025_abstract\docShort\documents_000021.jsonl


Finding newlines for mmindex: 100%|██████████| 256M/256M [00:09<00:00, 27.8MB/s] 


In [18]:
# Reload the persisted index (memory-mapped) together with its stored corpus,
# so each hit comes back as the saved document record.
retriever = bm25s.BM25.load("bm25_index", mmap=True, load_corpus=True)

# Tokenize the query with the same stemmer that was used for the corpus.
query = "retrieval quantum"
query_tokens = bm25s.tokenize(query, stemmer=stemmer)

# Both returned arrays are indexed as (query, rank); only one query is issued here.
results, scores = retriever.retrieve(query_tokens, k=2)

for i, (doc, score) in enumerate(zip(results[0], scores[0])):
    print(f"Rank {i+1} (score: {score:.2f}): ID: {doc['id']}")

print(results)


                                                     

Rank 1 (score: 5.91): ID: 1051705
Rank 2 (score: 5.87): ID: 71415670
[[{'id': '1051705', 'text': 'Covariant Hamiltonian formalisms for particles and antiparticles The hyperplane and proper time formalisms are discussed mainly for the\nspin-half particles in the quantum case. A connection between these covariant\nHamiltonian formalisms is established. It is showed that choosing the\nspace-like hyperplanes instantaneously orthogonal to the direction of motion of\nthe particle the proper time formalism is retrieved on the mass shell. As a\nconsequence, the relation between the St\\"uckelberg-Feynman picture and the\nstandard canonical picture of quantum field theory is clarified.Comment: 19 pages, Latex, to be published in Int. J. Theor. Phy alvarez, edgardo t. garcia gaioli, fabian h.'}
  {'id': '71415670', 'text': 'Frames and Phase Retrieval Phase retrieval tackles the problem of recovering a signal after loss of phase. The phase problem shows up in many different settings such as X-ray



# Evaluate 

- Search Information includes i) unique (anonymous) identifiers for individual user session; ii) search query; iii) returned results.
- Click Information records, for each click, i) a unique (anonymous) identifier for individual user session; ii) the link that was clicked in the results list; iii) the position of clicked link in results list.

queries:
training queries
│-- queries.txt # Tab-separated plain text file with queries and IDs 
- ID, search query

qrels:
│-- qrels.txt # Relevance judgments file in TREC format 
click information 
- ID, date, document ID, relevance

(1) nDCG scores calculated on provided test sets. Such a classical evaluation measure is consistent with Web search, for which the discount emphasises the ordering of the top results.

(2) Relative nDCG Drop (RnD) measured by computing the difference between snapshots test sets. This measure supports the evaluation of the impact of the data changes on the systems’ results.

In [None]:
# Parse Queries.txt
def load_queries(filepath):
    """Load the queries file into a ``{query_id: [token, ...]}`` dictionary.

    Each non-blank line holds a query id followed by the query text,
    separated by whitespace (a tab in the provided file). The id is the first
    field; everything after it is the query text, which is split on
    whitespace into tokens.

    Args:
        filepath: path of the queries file (read as UTF-8).

    Returns:
        dict mapping each query id (str) to the list of its query tokens.
        A line that holds only an id maps to an empty list.
    """
    queries = {}
    with open(filepath, 'r', encoding='utf-8') as file:
        for line in file:
            # maxsplit=1 separates the id from the *whole* query text; a plain
            # split() followed by parts[1] kept only the first query word.
            parts = line.strip().split(None, 1)
            if not parts:
                continue  # blank line
            query_id = parts[0]
            query_text = parts[1] if len(parts) > 1 else ''
            queries[query_id] = query_text.split()  # Tokenize query
    return queries



# Parse qrels.txt
def load_qrels(filepath):
    """Load relevance judgments into a ``{query_id: {doc_id: relevance}}`` mapping.

    Each non-blank line is split on whitespace; field 0 is used as the query
    id, field 2 as the document id and field 3 as the integer relevance.
    Field 1 is ignored.

    Args:
        filepath: path of the qrels file (read as UTF-8).

    Returns:
        defaultdict(dict) mapping query id -> {document id -> relevance (int)}.

    Raises:
        IndexError: if a non-blank line has fewer than four fields.
        ValueError: if the relevance field is not an integer.
    """
    qrels = defaultdict(dict)
    with open(filepath, 'r', encoding='utf-8') as file:
        for line in file:
            parts = line.split()
            if not parts:
                continue  # tolerate blank lines, e.g. at the end of the file
            query_id = parts[0]
            doc_id = parts[2]
            relevance = int(parts[3])
            qrels[query_id][doc_id] = relevance
    return qrels

# Resolve both evaluation files inside the dataset folder, then parse them.
data_path_queries, data_path_qrels = (
    os.path.join(data_path_abstract_q, file_name)
    for file_name in ("queries.txt", "qrels.txt")
)

queries = load_queries(data_path_queries)
qrels = load_qrels(data_path_qrels)