### Use Pyserini (BM25) to retrieve relevant papers from the LitSearch corpus

In [2]:
from datasets import load_from_disk
import json
import os

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Load the dataset previously saved on disk under "LitSearch_corpus_clean".
# Later cells iterate over its "full" split.
corpus = load_from_disk("LitSearch_corpus_clean")

In [None]:
## Export the corpus as JSON Lines so Pyserini can index it:
## one {"id", "contents"} object per line.
output_dir = "corpus_jsonl"
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir, "corpus.jsonl")

with open(output_path, "w", encoding="utf-8") as jsonl_file:
    for paper in corpus["full"]:
        record = {
            "id": str(paper["corpusid"]),
            # A missing title or abstract contributes an empty string.
            "contents": " ".join((paper["title"] or "", paper["abstract"] or "")),
        }
        jsonl_file.write(json.dumps(record) + "\n")

print(" Saved corpus.jsonl for Pyserini indexing.")

 Saved corpus.jsonl for Pyserini indexing.


In [7]:
## Build the Lucene index by running Pyserini's command-line indexer
import subprocess
import sys

index_dir = "pyserini_index"
os.makedirs(index_dir, exist_ok=True)
index_path = os.path.abspath(index_dir)

# Directory containing corpus.jsonl (the folder the export cell writes to).
# Resolved against the current working directory instead of being hardcoded
# to one user's home directory, so the notebook also runs on other machines.
corpus_jsonl_dir = os.path.abspath("corpus_jsonl")

cmd = [
    # sys.executable is the interpreter running this kernel; a bare "python"
    # could resolve to a different environment on PATH.
    sys.executable, "-m", "pyserini.index.lucene",
    "--collection", "JsonCollection",
    "--input", corpus_jsonl_dir,
    "--index", index_path,
    "--generator", "DefaultLuceneDocumentGenerator",
    "--threads", "1",
    "--storePositions", "--storeDocvectors", "--storeRaw"
]

# Run the indexer, capturing its output as text
result = subprocess.run(cmd, capture_output=True, text=True)

# Show the indexer's logs (stdout first, then stderr)
print(result.stdout)
print(result.stderr)

# Raise CalledProcessError on a non-zero exit status so a failed indexing run
# stops here, after its logs have been printed, instead of passing silently.
result.check_returncode()

2025-10-15 13:57:00,868 INFO  [main] index.AbstractIndexer (AbstractIndexer.java:205) - Setting log level to INFO
2025-10-15 13:57:00,877 INFO  [main] index.AbstractIndexer (AbstractIndexer.java:209) - AbstractIndexer settings:
2025-10-15 13:57:00,878 INFO  [main] index.AbstractIndexer (AbstractIndexer.java:210) -  + DocumentCollection path: /home/olagh48652/irg_course_assig/irg_project/corpus_jsonl
2025-10-15 13:57:00,879 INFO  [main] index.AbstractIndexer (AbstractIndexer.java:211) -  + CollectionClass: JsonCollection
2025-10-15 13:57:00,880 INFO  [main] index.AbstractIndexer (AbstractIndexer.java:212) -  + Index path: /home/olagh48652/irg_course_assig/irg_project/pyserini_index
2025-10-15 13:57:00,881 INFO  [main] index.AbstractIndexer (AbstractIndexer.java:213) -  + Threads: 1
2025-10-15 13:57:00,882 INFO  [main] index.AbstractIndexer (AbstractIndexer.java:214) -  + Optimize (merge segments)? false
2025-10-15 13:57:00,978 INFO  [main] index.IndexCollection (IndexCollection.java:246

In [9]:
# Load the saved query dataset and pull the first two queries of its "full"
# split to use as sample inputs.
dataset_query = load_from_disk("LitSearch_query")
full_split = dataset_query["full"]

query_0 = full_split[0]["query"]
print("query_0: ", query_0)

query_1 = full_split[1]["query"]
print("query_1: ", query_1)

query_0:  Are there any research papers on methods to compress large-scale language models using task-agnostic knowledge distillation techniques?
query_1:  Are there any resources available for translating Tunisian Arabic dialect that contain both manually translated comments by native speakers and additional data augmented through methods like segmentation at stop words level?


In [11]:
from pyserini.search.lucene import LuceneSearcher
import json

# BM25 searcher over the Lucene index located at index_path
lucene_bm25_searcher = LuceneSearcher(index_path)
lucene_bm25_searcher.set_bm25(k1=0.9, b=0.4)  # optional: explicit BM25 parameters

# Sample requests, numbered from 1 (replace with the real request list)
requests = [
    {"request_id": request_id, "query": query}
    for request_id, query in enumerate((query_0, query_1), start=1)
]

# JSONL file that receives one record per request
output_file = "litsearch_top3_results.jsonl"

with open(output_file, "w", encoding="utf-8") as results_file:
    for request in requests:
        query_text = request["query"]
        hits = lucene_bm25_searcher.search(query_text, k=3)

        # Ranks are 1-based, in the order the searcher returned the hits
        top3 = [
            {"doc_id": hit.docid, "score": hit.score, "rank": rank}
            for rank, hit in enumerate(hits, start=1)
        ]

        record = {"qid": request["request_id"], "query": query_text, "top3": top3}
        results_file.write(json.dumps(record) + "\n")

print(f"✅ Top-3 results per query written to {output_file}")


Oct 15, 2025 2:04:24 PM org.apache.lucene.store.MemorySegmentIndexInputProvider <init>
INFO: Using MemorySegmentIndexInput with Java 21; to disable start with -Dorg.apache.lucene.store.MMapDirectory.enableMemorySegments=false


✅ Top-3 results per query written to litsearch_top3_results.jsonl
