## Pipeline (lnc.nnn TF-IDF), No Query Expansion

Please make sure you have already downloaded "LitSearch_corpus_clean" and "LitSearch_query" into the dataset folder. <br>
To download the datasets, run getting_started/get_dataset.ipynb.

### Set all paths

In [1]:
import os
from datasets import load_from_disk
from pathlib import Path

# The project root sits two directory levels above the notebook's working directory.
project_root = os.path.abspath(os.path.join(Path.cwd(), os.pardir, os.pardir))

# Folder names of the saved corpus and query datasets inside the dataset directory.
corpus_config = "LitSearch_corpus_clean"
query_config = "LitSearch_query"

# Directories and files derived from the project root.
dataset_dir = os.path.join(project_root, "dataset")
run_dir = os.path.join(project_root, "run_files")
stopwords_path = os.path.join(dataset_dir, "stopwords.txt")
corpus_path, query_path = (
    os.path.join(dataset_dir, name) for name in (corpus_config, query_config)
)

# Echo the resolved locations so a wrong working directory is obvious at a glance.
print(
    "\n".join(
        [
            f"Project root directory: {project_root}",
            f"Dataset directory: {dataset_dir}",
            f"Run files directory: {run_dir}",
            f"Corpus path: {corpus_path}",
            f"Query path: {query_path}",
        ]
    )
)

# Make the project's `src` package importable from this notebook.
import sys
sys.path.append(project_root)


Project root directory: /home/akash/UNH/CS853_IR/Project/irg_final_project
Dataset directory: /home/akash/UNH/CS853_IR/Project/irg_final_project/dataset
Run files directory: /home/akash/UNH/CS853_IR/Project/irg_final_project/run_files
Corpus path: /home/akash/UNH/CS853_IR/Project/irg_final_project/dataset/LitSearch_corpus_clean
Query path: /home/akash/UNH/CS853_IR/Project/irg_final_project/dataset/LitSearch_query


### Load dataset

In [2]:
# Load both saved datasets (corpus first, then queries) and keep their 'full' entry.
corpus_full, queries_full = (
    load_from_disk(saved_path)["full"] for saved_path in (corpus_path, query_path)
)
print(f"Corpus details: {corpus_full}")
print(f"Queries details: {queries_full}")

# Only the raw query text is needed for retrieval; list position later serves as the query id.
queries = [record["query"] for record in queries_full]
print(f"Number of queries loaded: {len(queries)}")

Corpus details: Dataset({
    features: ['corpusid', 'title', 'abstract', 'citations', 'full_paper'],
    num_rows: 64183
})
Queries details: Dataset({
    features: ['query_set', 'query', 'specificity', 'quality', 'corpusids'],
    num_rows: 597
})
Number of queries loaded: 597


### TF-IDF (lnc.nnn)
lnc > Document <br>
nnn > Query (original, no expansion)

In [3]:
from src.classes.query import Query
from src.classes.tf_idf import TF_IDF
from tqdm import tqdm

# Name of the run file produced by this notebook and the method tag written
# into the last column of each of its lines.
output_run_file = "tfidf_lnc_nnn.run"
runfile_method_tag = "tfidf_lnc_nnn"

# Location of the prebuilt index, resolved relative to the project root.
index_path = os.path.join(
    project_root,
    "indexes",
    "pyserini_index",
)

# Scorer bound to that index; shared by every query in the run below.
tfidf = TF_IDF(index_path)

# The run-file directory may not exist on a fresh checkout.
os.makedirs(run_dir, exist_ok=True)


def run_tfidf_lnc_nnn(query_texts, output_filename, method_name="tfidf_lnc_nnn", k=50):
    """Score every query against the index and write a TREC-style run file.

    Each query text is converted to its ``nnn`` representation with
    ``Query(text).get_nnn(index_path)`` and searched through the module-level
    ``tfidf`` object using the ``"lnc"`` document scheme. One line per hit is
    written in the form ``qid Q0 docid rank score method_name``.

    If the target run file already exists, nothing is recomputed. To keep that
    shortcut trustworthy, results are first written to a temporary sibling
    file and renamed into place only after every query has been processed, so
    an interrupted run never leaves a truncated file that would be skipped.

    Args:
        query_texts: Iterable of query strings. A query's 0-based position in
            this iterable is used as its query id in the run file.
        output_filename: File name of the run file, created inside ``run_dir``.
        method_name: Tag written as the last column of every line and shown in
            the progress-bar description.
        k: Forwarded to ``tfidf.search``; presumably the maximum number of hits
            returned per query (confirm against ``TF_IDF.search``).

    Returns:
        None. The result is the run file written to disk.
    """
    run_path = os.path.join(run_dir, output_filename)
    if os.path.exists(run_path):
        print(f"Run file {run_path} already exists. Skipping generation.")
        return

    # Temporary file lives next to the final one so the rename stays on the
    # same filesystem.
    partial_path = f"{run_path}.partial"
    try:
        with open(partial_path, "w", encoding="utf8") as outf:
            for qid, text in enumerate(tqdm(query_texts, desc=f"Running {method_name}")):
                q_embed = Query(text).get_nnn(index_path)
                hits = tfidf.search(q_embed, "lnc", k)
                # Ranks are 1-based in the run file.
                for rank, (docid, score) in enumerate(hits, start=1):
                    outf.write(f"{qid} Q0 {docid} {rank} {score:.6f} {method_name}\n")
    except BaseException:
        # BaseException so that a kernel interrupt (KeyboardInterrupt) also
        # cleans up the incomplete file before propagating.
        if os.path.exists(partial_path):
            os.remove(partial_path)
        raise

    # Publish the finished run file under its final name.
    os.replace(partial_path, run_path)
    print(f"Wrote {run_path}")

# Run for the original (unexpanded) queries. The run-file name and method tag
# come from the constants defined earlier in this cell so they are spelled out
# in exactly one place.
run_tfidf_lnc_nnn(queries, output_run_file, method_name=runfile_method_tag)


Nov 30, 2025 5:45:49 PM org.apache.lucene.store.MemorySegmentIndexInputProvider <init>
INFO: Using MemorySegmentIndexInput with Java 21; to disable start with -Dorg.apache.lucene.store.MMapDirectory.enableMemorySegments=false
Running tfidf_lnc_nnn: 100%|██████████| 597/597 [17:43<00:00,  1.78s/it]

Wrote /home/akash/UNH/CS853_IR/Project/irg_final_project/run_files/tfidf_lnc_nnn.run





## Evaluation

In [2]:
# Evaluate the run file with nDCG@50 against evaluation/litsearch.qrel; the command is executed from two directories above the notebook so the relative paths resolve.
!cd ../.. && \
python3 evaluation/evaluate.py \
  --qrels evaluation/litsearch.qrel \
  --metric ndcg@50 \
  --output evaluation/results/tfidf_lnc_nnn \
  --runs run_files/tfidf_lnc_nnn.run

Summary (ndcg@50)
- tfidf_lnc_nnn: mean=0.228165, stderr=0.012864
Summary file written to: evaluation/results/tfidf_lnc_nnn/summary_ndcg@50.csv


In [4]:
# Evaluate the same run file with MAP against evaluation/litsearch.qrel; the command is executed from two directories above the notebook so the relative paths resolve.
!cd ../.. && \
python3 evaluation/evaluate.py \
  --qrels evaluation/litsearch.qrel \
  --metric map \
  --output evaluation/results/tfidf_lnc_nnn \
  --runs run_files/tfidf_lnc_nnn.run

Summary (map)
- tfidf_lnc_nnn: mean=0.163201, stderr=0.012397
Summary file written to: evaluation/results/tfidf_lnc_nnn/summary_map.csv
