## Pipeline (Thesaurus-based Query Expansion > lnc.nnn TF-IDF > LLM Rerank)

Please make sure you have already downloaded the "LitSearch_corpus_clean" and "LitSearch_query" datasets into the dataset folder. <br>
To download the datasets, run getting_started/get_dataset.ipynb.

### Set all paths

In [1]:
import os
from datasets import load_from_disk
from pathlib import Path

# Repository root: two directories above the notebook's working directory.
# Kept as a plain string because later cells combine it with os.path.join.
project_root = os.path.abspath(os.path.join(Path.cwd(), os.pardir, os.pardir))

# Names of the on-disk datasets inside the dataset folder.
corpus_config = "LitSearch_corpus_clean"
query_config = "LitSearch_query"

# Every other location used by the notebook is derived from the root.
dataset_dir = os.path.join(project_root, "dataset")
run_dir = os.path.join(project_root, "run_files")
stopwords_path = os.path.join(dataset_dir, "stopwords.txt")
corpus_path = os.path.join(dataset_dir, corpus_config)
query_path = os.path.join(dataset_dir, query_config)

print(f"Project root directory: {project_root}")
print(f"Dataset directory: {dataset_dir}")
print(f"Run files directory: {run_dir}")
print(f"Corpus path: {corpus_path}")
print(f"Query path: {query_path}")

import sys

# Make the project's `src` package importable. The membership check keeps the
# cell idempotent: re-running it no longer appends a duplicate entry each time.
if project_root not in sys.path:
    sys.path.append(project_root)


Project root directory: /home/akash/UNH/CS853_IR/Project/irg_final_project
Dataset directory: /home/akash/UNH/CS853_IR/Project/irg_final_project/dataset
Run files directory: /home/akash/UNH/CS853_IR/Project/irg_final_project/run_files
Corpus path: /home/akash/UNH/CS853_IR/Project/irg_final_project/dataset/LitSearch_corpus_clean
Query path: /home/akash/UNH/CS853_IR/Project/irg_final_project/dataset/LitSearch_query


### Load dataset

In [2]:
# Load the corpus first, then the queries, keeping only the 'full' entry of
# each object read from disk.
corpus_full, queries_full = (
    load_from_disk(saved_path)['full'] for saved_path in (corpus_path, query_path)
)
print(f"Corpus details: {corpus_full}")
print(f"Queries details: {queries_full}")

# Plain list of query strings, in dataset order.
queries = [record['query'] for record in queries_full]
print(f"Number of queries loaded: {len(queries)}")

Corpus details: Dataset({
    features: ['corpusid', 'title', 'abstract', 'citations', 'full_paper'],
    num_rows: 64183
})
Queries details: Dataset({
    features: ['query_set', 'query', 'specificity', 'quality', 'corpusids'],
    num_rows: 597
})
Number of queries loaded: 597


### Query Expansion (Thesaurus)

In [3]:
import json

expansion_path = os.path.join(dataset_dir, "expansions_thesaurus.jsonl")

if os.path.exists(expansion_path):
    # Cached expansions are available: read one JSON object per line and keep
    # its "expanded_query" field. Blank lines are skipped so a stray empty line
    # in the file does not abort the load.
    print("expansions_thesaurus.jsonl already exists.")
    with open(expansion_path, "r", encoding="utf8") as f:
        expanded_queries = [
            json.loads(line)["expanded_query"] for line in f if line.strip()
        ]
    print(f"Loaded {len(expanded_queries)} expanded queries from expansions_thesaurus.jsonl")
else:
    # No cache yet: expand the queries now and let the expander save them.
    from src.classes.query_expansion_thesaurus import QueryExpansionThesaurus
    qe = QueryExpansionThesaurus(max_synonyms_per_word=3, stopwords_path=stopwords_path)
    expanded_queries = qe.expand_queries(queries, save_path=expansion_path)
    print(f"Saved {len(expanded_queries)} expanded queries to expansions_thesaurus.jsonl")

# Ensure we have plain text strings for expanded queries
# (items may be dicts carrying an "expanded_query" key, or already strings).
expanded_query_texts = [
    item["expanded_query"] if isinstance(item, dict) else item
    for item in expanded_queries
]

# The retrieval cell uses the list position as the query id, so a cache built
# from a different query set would silently misalign ids. Fail loudly instead.
if len(expanded_query_texts) != len(queries):
    raise ValueError(
        f"Expected {len(queries)} expanded queries but got "
        f"{len(expanded_query_texts)}; delete {expansion_path} and re-run this cell."
    )

expansions_thesaurus.jsonl already exists.
Loaded 597 expanded queries from expansions_thesaurus.jsonl


### TF-IDF(lnc-nnn)
lnc > Document <br>
nnn > Expanded Query

In [4]:
from src.classes.query import Query
from src.classes.tf_idf import TF_IDF
from tqdm import tqdm

# Method tag recorded on every run-file line; the run file is named after it.
runfile_method_tag = "tfidf_lnc_nnn_ThesaurusExp"
output_run_file = f"{runfile_method_tag}.run"

# Index directory handed to the TF_IDF searcher.
index_path = os.path.join(project_root, "indexes", "pyserini_index")
tfidf = TF_IDF(index_path)

# Make sure the run-file directory exists before anything is written into it.
os.makedirs(run_dir, exist_ok=True)


def run_tfidf_lnc_nnn(query_texts, output_filename, method_name="tfidf_lnc_nnn", k=50):
    """Search the index for every query and write the hits to a run file.

    The run file is created inside the module-level ``run_dir``. If a file with
    that name already exists, nothing is generated. Otherwise each hit is
    written as one line in the form
    ``<qid> Q0 <docid> <rank> <score> <method_name>``, where ``qid`` is the
    zero-based position of the query in ``query_texts`` and ``rank`` starts
    at 1.

    The output is first written to a temporary ``.partial`` file and renamed
    only after every query has been processed, so an interrupted or failed run
    never leaves a truncated run file that a later call would mistake for a
    finished one.

    Args:
        query_texts: Iterable of query strings, in query-id order.
        output_filename: File name of the run file inside ``run_dir``.
        method_name: Tag written as the last column of every line and shown
            in the progress bar.
        k: Passed through to ``tfidf.search`` (presumably the number of hits
            to retrieve per query -- confirm against ``TF_IDF.search``).

    Returns:
        None.
    """
    run_path = os.path.join(run_dir, output_filename)
    if os.path.exists(run_path):
        print(f"Run file {run_path} already exists. Skipping generation.")
        return

    partial_path = run_path + ".partial"
    try:
        with open(partial_path, "w", encoding="utf8") as outf:
            for qid, text in enumerate(tqdm(query_texts, desc=f"Running {method_name}")):
                # Build the query-side representation, then score it against
                # the index using the "lnc" scheme on the document side.
                q_embed = Query(text).get_nnn(index_path)
                hits = tfidf.search(q_embed, "lnc", k)
                for rank, (docid, score) in enumerate(hits, start=1):
                    outf.write(f"{qid} Q0 {docid} {rank} {score:.6f} {method_name}\n")
        # Publish the finished file in a single step.
        os.replace(partial_path, run_path)
    finally:
        # Still present only if the run did not complete; remove the leftovers.
        if os.path.exists(partial_path):
            os.remove(partial_path)
    print(f"Wrote {run_path}")

# Baseline on the original (unexpanded) queries -- left disabled:
# run_tfidf_lnc_nnn(queries, "tfidf_lnc_nnn.run")

# Retrieve with the thesaurus-expanded queries and write the run file.
run_tfidf_lnc_nnn(
    expanded_query_texts,
    output_filename=output_run_file,
    method_name=runfile_method_tag,
)


Nov 30, 2025 1:03:21 AM org.apache.lucene.store.MemorySegmentIndexInputProvider <init>
INFO: Using MemorySegmentIndexInput with Java 21; to disable start with -Dorg.apache.lucene.store.MMapDirectory.enableMemorySegments=false
Running tfidf_lnc_nnn_ThesaurusExp: 100%|██████████| 597/597 [24:18<00:00,  2.44s/it]

Wrote /home/akash/UNH/CS853_IR/Project/irg_final_project/run_files/tfidf_lnc_nnn_ThesaurusExp.run





### LLM Reranking

In [None]:
from src.classes.llm_reranker import LLMReranker

# First-stage run file to rerank; the reranked run is written next to it with
# "_LLM-Rerank" inserted before the extension.
initial_run_path = os.path.join(run_dir, "tfidf_lnc_nnn_ThesaurusExp.run")
initial_stem, run_ext = os.path.splitext(initial_run_path)
output_run_path = f"{initial_stem}_LLM-Rerank{run_ext}"

# Tag for the reranked run; also used as the log file's base name.
rerank_tag = os.path.basename(initial_stem) + "_LLM-Rerank"

# Corpus in jsonl format. A dedicated name is used so the dataset `corpus_path`
# defined in the path-setup cell is not overwritten (re-running the dataset
# loading cell afterwards would otherwise point at this file).
corpus_jsonl_path = os.path.join(project_root, "data", "corpus_jsonl", "corpus.jsonl")

# Log of the LLM reranker activity. The directory is created up front in case
# the reranker does not create it itself.
log_path = os.path.join(project_root, "logs", f"{rerank_tag}.jsonl")
os.makedirs(os.path.dirname(log_path), exist_ok=True)

reranker = LLMReranker()  # uses OLLAMA_MODEL or defaults to "llama3.1:8b-instruct-q8_0-16k"
reranker.rerank_runfile(
    initial_run_path=initial_run_path,
    corpus_path=corpus_jsonl_path,
    output_run_path=output_run_path,
    queries=expanded_query_texts,  # query list, indexed by the qid used in the run file
    log_path=log_path,
    top_k=20,
    run_tag=rerank_tag,
)

Using Ollama model: llama3.1:8b-instruct-q8_0-16k


LLM reranking: 100%|██████████| 597/597 [1:08:03<00:00,  6.84s/it]


## Evaluation

In [3]:
# Score the LLM-reranked run file against evaluation/litsearch.qrel with the ndcg@50 metric.
# The command first changes two directories up so the relative paths below resolve.
!cd ../.. && \
python3 evaluation/evaluate.py \
  --qrels evaluation/litsearch.qrel \
  --metric ndcg@50 \
  --output evaluation/results/lnc-nnn_ThesaurusExp_LLMRerank \
  --runs run_files/tfidf_lnc_nnn_ThesaurusExp_LLM-Rerank.run

Summary (ndcg@50)
- tfidf_lnc_nnn_ThesaurusExp_LLM-Rerank: mean=0.193371, stderr=0.014296
Summary file written to: evaluation/results/lnc-nnn_ThesaurusExp_LLMRerank/summary_ndcg@50.csv


In [4]:
# Score the LLM-reranked run file against evaluation/litsearch.qrel with the map metric.
# The command first changes two directories up so the relative paths below resolve.
!cd ../.. && \
python3 evaluation/evaluate.py \
  --qrels evaluation/litsearch.qrel \
  --metric map \
  --output evaluation/results/lnc-nnn_ThesaurusExp_LLMRerank \
  --runs run_files/tfidf_lnc_nnn_ThesaurusExp_LLM-Rerank.run

Summary (map)
- tfidf_lnc_nnn_ThesaurusExp_LLM-Rerank: mean=0.171006, stderr=0.013663
Summary file written to: evaluation/results/lnc-nnn_ThesaurusExp_LLMRerank/summary_map.csv
