In [1]:
import json
import os
import xml.etree.ElementTree as ET
from pathlib import Path

import ir_datasets
import ir_measures
from ir_measures import calc_aggregate
from ir_measures.measures import RR, P, nDCG
from pyserini.search.lucene import LuceneSearcher
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def build_clinical_trials_index(output_dir):
    """Build a Lucene index from the TREC Clinical Trials 2021 collection.

    Side effects, all under ``output_dir``:
      * ``qrels.txt``  - relevance judgments, one tab-separated line per qrel
      * ``docs.jsonl`` - one JSON object per document with ``id``/``contents``
      * ``index/``     - Lucene index produced by ``pyserini.index.lucene``

    Args:
        output_dir: Directory to write the intermediate files and index into.
            It is created (with parents) if it does not exist.

    Returns:
        Path of the index directory, ``f"{output_dir}/index"``.

    Raises:
        RuntimeError: If the Pyserini indexing subprocess exits with a
            non-zero status.
    """
    # Local imports: only this function needs them.
    import subprocess
    import sys

    print("Building index from clinical trials 2021 dataset...")

    # Create output directory
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    # Temporary JSONL file consumed by the indexer
    docs_file = f"{output_dir}/docs.jsonl"
    index_dir = f"{output_dir}/index"

    # Load dataset
    clinical_trials = ir_datasets.load("clinicaltrials/2021/trec-ct-2021")

    # Save qrels to file
    # NOTE(review): qrels.txt lives in the same directory that is passed as
    # --input below; confirm the collection reader ignores non-JSON files.
    qrels_output_file = f"{output_dir}/qrels.txt"
    with open(qrels_output_file, "w", encoding="utf-8") as f:
        for qrel in clinical_trials.qrels_iter():
            f.write(f"{qrel.query_id}\t0\t{qrel.doc_id}\t{qrel.relevance}\n")
    print(f"Saved qrels to {qrels_output_file}\n")

    # Write documents to JSONL format
    print("Extracting documents...")
    doc_count = 0
    with open(docs_file, "w", encoding="utf-8") as f:
        for doc in clinical_trials.docs_iter():
            # One searchable "contents" field built from all text fields
            doc_dict = {
                "id": doc.doc_id,
                "contents": f"{doc.title} {doc.condition} {doc.summary} {doc.detailed_description} {doc.eligibility}",
            }
            f.write(json.dumps(doc_dict) + "\n")
            doc_count += 1
            if doc_count % 10000 == 0:
                print(f"  Processed {doc_count} documents...")

    print(f"Total documents: {doc_count}")

    # Build index using Pyserini. An argument list (no shell) keeps paths with
    # spaces or shell metacharacters intact, and sys.executable guarantees the
    # indexer runs in the same environment as this kernel.
    print("Building Lucene index...")
    cmd = [
        sys.executable,
        "-m",
        "pyserini.index.lucene",
        "--collection",
        "JsonCollection",
        "--input",
        str(output_dir),
        "--index",
        index_dir,
        "--generator",
        "DefaultLuceneDocumentGenerator",
        "--threads",
        "4",
        "--storePositions",
        "--storeDocvectors",
        "--storeRaw",
    ]

    completed = subprocess.run(cmd, check=False)
    if completed.returncode != 0:
        # Do not report success (or hand back a path) for a failed build.
        raise RuntimeError(
            f"Pyserini indexing failed with exit code {completed.returncode}"
        )

    print(f"✅ Index built successfully at {index_dir}\n")
    return index_dir

In [3]:
def load_topics_from_file(topic_file):
    """Parse a topics XML file into ``{topic number: {"title": text}}``.

    Every ``<topic>`` element that is a direct child of the document root
    contributes one entry. The key is the element's ``number`` attribute and
    the title is the concatenation of all text inside the element (nested
    elements included) with surrounding whitespace stripped.
    """
    document_root = ET.parse(topic_file).getroot()
    return {
        node.get("number"): {"title": "".join(node.itertext()).strip()}
        for node in document_root.findall("topic")
    }


In [4]:
# Topic files grouped by category, listed in the order they are processed:
#   "original"       - topics in each source language (two-letter code suffix)
#   "backtranslated" - topics translated back to English, named by the
#                      <language>_<script> tag of the source they came from
topic_files = {
    "original": [
        f"../topics/topics2021_{code}.xml"
        for code in ("en", "el", "es", "eu", "tr", "pl", "bn", "it")
    ],
    "backtranslated": [
        f"../topics_src_to_eng/topics_from_{tag}.xml"
        for tag in (
            "ben_Beng",
            "ell_Grek",
            "eus_Latn",
            "ita_Latn",
            "pol_Latn",
            "spa_Latn",
            "tur_Latn",
        )
    ],
}

In [5]:
# Reuse the Lucene index when its directory is already on disk; otherwise
# build it from the dataset.
output_dir = "tmp/indexes/clinical_trials_2021"
index_path = f"{output_dir}/index"
if os.path.exists(index_path):
    print(f"Using existing index at {index_path}\n")
else:
    index_path = build_clinical_trials_index(output_dir=output_dir)

Using existing index at tmp/indexes/clinical_trials_2021/index



In [6]:
# Relevance judgments for the collection, used as ground truth in evaluation.
qrels = ir_datasets.load(
    "clinicaltrials/2021/trec-ct-2021"
).qrels

In [7]:
# Open the locally built index and configure BM25 scoring explicitly.
searcher = LuceneSearcher(index_path)
searcher.set_bm25(k1=0.9, b=0.4)

Feb 12, 2026 8:51:56 PM org.apache.lucene.store.MemorySegmentIndexInputProvider <init>
INFO: Using MemorySegmentIndexInput with Java 21; to disable start with -Dorg.apache.lucene.store.MMapDirectory.enableMemorySegments=false


In [8]:
# Measures reported for every run; RR and P use a relevance threshold of 2.
metrics_eval = [
    nDCG@10,
    RR(rel=2),
    P(rel=2)@10,
    nDCG@5,
    nDCG@1000,
]

In [None]:
def _topic_language(topic_file):
    """Return the language tag encoded in a topic file name.

    ``topics2021_en.xml`` -> ``en``; ``topics_from_eus_Latn.xml`` -> ``eus_Latn``.
    The whole tag after the known prefix is kept: taking only the text after
    the last underscore would map every ``*_Latn`` file to ``Latn``, so their
    run files and result rows would overwrite one another.
    """
    stem = Path(topic_file).stem
    for prefix in ("topics2021_", "topics_from_"):
        if stem.startswith(prefix):
            return stem[len(prefix):]
    return stem


# Retrieve with BM25 for every topic file, save each run in TREC format, and
# collect the aggregate metrics keyed by "<lang>_<category>".
results_dict = {}
for category, files in topic_files.items():
    for topic_file in files:
        lang = _topic_language(topic_file)
        print(f"{'=' * 60}")
        print(f"Processing: {topic_file} (Language: {lang})")
        print(f"{'=' * 60}")

        # Load topics; a missing or malformed file skips this language only.
        try:
            topics = load_topics_from_file(f"{os.getcwd()}/{topic_file}")
            print(f"Loaded {len(topics)} topics")
        except Exception as e:
            print(f"❌ Error loading topics: {e}\n")
            continue

        # Run retrieval
        all_results = []
        topics_with_results = 0

        for topic_id, topic in topics.items():
            try:
                hits = searcher.search(topic["title"], k=1000)
                if hits:
                    topics_with_results += 1
                # Ranks are 1-based, following the order returned by the searcher.
                for rank, hit in enumerate(hits, start=1):
                    all_results.append(
                        {
                            "query_id": str(topic_id),
                            "doc_id": hit.docid,
                            "rank": rank,
                            "score": hit.score,
                        }
                    )
            except Exception as e:
                # Best effort: one failing query should not abort the whole run.
                print(f"⚠️  Error searching for topic {topic_id}: {e}")
                continue

        print(f"Retrieved results for {topics_with_results}/{len(topics)} topics")
        print(f"Total results: {len(all_results)}")

        if not all_results:
            print(f"❌ No results found for {topic_file}\n")
            continue

        # Save run to file (TREC format: qid Q0 docid rank score tag)
        run_output_file = f"tmp/runs/bm25_clinical_trials_2021_{lang}.tsv"
        os.makedirs("tmp/runs", exist_ok=True)
        with open(run_output_file, "w", encoding="utf-8") as f:
            for r in all_results:
                f.write(
                    f"{r['query_id']}\tQ0\t{r['doc_id']}\t{r['rank']}\t{r['score']}\tBM25\n"
                )
        print(f"Saved run to {run_output_file}")

        # Evaluate the run exactly as it was written to disk.
        run = ir_measures.read_trec_run(run_output_file)

        metrics = calc_aggregate(metrics_eval, qrels, run)

        print("Evaluation Results:")
        for measure, value in metrics.items():
            print(f"  {measure}: {value:.4f}")
        print("\n")

        results_dict[f"{lang}_{category}"] = metrics

Processing: ../topics/topics2021_en.xml (Language: en)
Loaded 75 topics


In [None]:
# One row per "<lang>_<category>" run, one column per evaluation measure.
results_df = pd.DataFrame.from_dict(results_dict, orient="index")
print(results_df)
print(results_df.sort_index())

# The results directory is not created anywhere else in the notebook, so make
# sure it exists before writing; to_csv does not create missing directories.
results_csv = Path("tmp/results/bm25_clinical_trials_2021_results.csv")
results_csv.parent.mkdir(parents=True, exist_ok=True)
results_df.to_csv(results_csv)