In [7]:
from datasets import load_from_disk
import math 
import numpy as np
from tqdm import tqdm
from collections import Counter
import json 
from sklearn.feature_extraction.text import TfidfVectorizer

### Step 1: Load documents and queries

In [3]:
# Location of the cleaned LitSearch corpus saved with `datasets`.
CORPUS_PATH = "dataset/LitSearch_corpus_clean"
data = load_from_disk(CORPUS_PATH)
data["full"].shape

(64183, 5)

In [4]:
# Map each corpus id (as a string) to "<title> <abstract>"; a missing
# title or abstract contributes an empty string.
documents = {
    str(item["corpusid"]): " ".join((item["title"] or "", item["abstract"] or ""))
    for item in tqdm(data["full"], desc="Processing documents")
}

Processing documents: 100%|██████████| 64183/64183 [00:03<00:00, 19001.80it/s]


In [14]:
# Parallel lists in dict insertion order: docids[i] is the id of corpus[i].
docids = list(documents)
corpus = list(documents.values())
len(corpus)

64183

In [6]:
dataset_query = load_from_disk("dataset/LitSearch_query")
# Key each lowercased query text by its 0-based position in the "full" split.
requests = {
    i: row['query'].lower()
    for i, row in enumerate(dataset_query['full'])
}
print(f"Total queries processed: {len(requests)}")

Total queries processed: 597


### Step 2: Build the TF-IDF index with scikit-learn

In [8]:
# TF-IDF configuration shared by indexing and querying.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 1),     # unigrams only; widen (e.g. (1, 2)) to add bigrams
    stop_words="english",   # scikit-learn's built-in English list; None keeps every token
    lowercase=True,
)

In [11]:
# Fit the vocabulary and IDF weights on the corpus and build the sparse
# document-term matrix (one row per entry of `corpus`).
X = vectorizer.fit_transform(corpus)

In [12]:
# (number of documents, vocabulary size)
X.shape

(64183, 121688)

### Step 3: Search

In [15]:
def tfidf_search(query, top_n=5):
    """Rank the indexed documents against ``query`` by TF-IDF similarity.

    Uses the notebook-level ``vectorizer`` and document-term matrix ``X``.

    Args:
        query: Query text; it is transformed with the already fitted vectorizer.
        top_n: Number of best-scoring documents to return.

    Returns:
        A pair ``(indices, scores)``: row indices into ``X`` ordered from the
        highest to the lowest score, and the matching scores.
    """
    # Project the query into the same term space as the documents: (1, n_terms).
    query_vec = vectorizer.transform([query])

    # The dot product equals cosine similarity when both sides are
    # L2-normalized, which is TfidfVectorizer's default output.
    similarities = (X @ query_vec.T).toarray().ravel()  # (n_docs,)

    # Ascending argsort reversed gives best-first order; keep the head.
    ranking = np.argsort(similarities)[::-1]
    best = ranking[:top_n]
    return best, similarities[best]

In [19]:
# First-stage retrieval: for every query keep the 50 best TF-IDF hits as
# (corpus ids, scores). A plain loop is used so `qid` stays bound afterwards.
results = {}

for qid, query in tqdm(requests.items()):
    top_docs, top_scores = tfidf_search(query, top_n=50)
    results[qid] = ([docids[i] for i in top_docs], top_scores)

100%|██████████| 597/597 [00:07<00:00, 77.61it/s]


In [20]:
# Inspect the ranking of the last query processed (`qid` keeps its final loop value).
results[qid]

(['260091821',
  '256358823',
  '254044147',
  '258557287',
  '261276856',
  '250072949',
  '247011082',
  '245144350',
  '252815987',
  '249395201',
  '248476097',
  '259937490',
  '252668297',
  '246823323',
  '252596252',
  '256900618',
  '257220165',
  '252544798',
  '261582259',
  '202766449',
  '258967241',
  '256846836',
  '1877320',
  '244908617',
  '263909446',
  '247011539',
  '256390009',
  '263310331',
  '257255036',
  '256662277',
  '258833483',
  '256459906',
  '222140788',
  '263830894',
  '253018768',
  '238198403',
  '261682404',
  '256846467',
  '257505182',
  '5696027',
  '214802971',
  '258987792',
  '252734897',
  '252917661',
  '249626510',
  '256615188',
  '259376511',
  '108306764',
  '247748837',
  '235254145'],
 array([0.38432167, 0.37389917, 0.35688192, 0.35517256, 0.3480686 ,
        0.32155803, 0.31922398, 0.31802561, 0.3167818 , 0.31517764,
        0.3129188 , 0.30936904, 0.30893813, 0.3085916 , 0.30573575,
        0.2997836 , 0.29560475, 0.28062914, 0.279

In [28]:
method_name = "sklearn"
output_file = "sklearn.run"

# One line per (query, document): "<query_id> Q0 <doc_id> <rank> <score> <tag>".
with open(output_file, "w", encoding="utf-8") as out_f:
    # request_id is a 0-based counter over `results` in insertion order.
    for request_id, (ranked_ids, ranked_scores) in enumerate(results.values()):
        # Ranks are 1-based.
        for rank, (doc_id, score) in enumerate(zip(ranked_ids, ranked_scores), start=1):
            out_f.write(f"{request_id} Q0 {doc_id} {rank} {score:.6f} {method_name}\n")

In [None]:
# python evaluate_run.py --metric ndcg@50  --qrel litsearch.qrel --run sklearn.run
#  Mean ndcg@50: 0.2996 ± 0.0050 (n=597)
# python evaluate_run.py --metric map  --qrel litsearch.qrel --run sklearn.run
#  Mean map: 0.2246 ± 0.0051 (n=597)

### Step 4: Rerank the TF-IDF candidates with a cross-encoder (CE)

In [34]:
from sentence_transformers import CrossEncoder
from typing import List, Tuple

In [29]:
# First-stage retrieval again, this time keeping 100 TF-IDF hits per query
# as (corpus ids, scores).
results = {}

for qid, query in tqdm(requests.items()):
    top_docs, top_scores = tfidf_search(query, top_n=100)
    results[qid] = ([docids[i] for i in top_docs], top_scores)

100%|██████████| 597/597 [00:07<00:00, 78.18it/s]


In [32]:
# Pretrained cross-encoder checkpoint used to score (query, document) pairs.
CROSS_ENCODER_MODEL = "cross-encoder/ms-marco-MiniLM-L-6-v2"
# Instantiate the model once and reuse it for every query.
reranker = CrossEncoder(CROSS_ENCODER_MODEL)

In [35]:
def rerank_cross_encoder(model: CrossEncoder, query: str, candidates: List[int], docs: List[str]) -> List[Tuple[int, float]]:
    """Reorder first-stage candidates by cross-encoder relevance.

    Args:
        model: Object exposing ``predict(pairs, convert_to_numpy=True)`` that
            returns one score per ``(query, document)`` pair.
        query: Query text.
        candidates: Indices into ``docs`` selected by the first stage.
        docs: Document texts, indexable by the values in ``candidates``.

    Returns:
        ``(candidate, score)`` tuples sorted by descending score. Candidates
        with equal scores keep their first-stage order. An empty list is
        returned when there are no candidates.
    """
    # Nothing to score. This also avoids handing an empty batch to the model,
    # which may index the first pair and fail. `len()` is used rather than
    # truthiness so array-like candidates work too.
    if len(candidates) == 0:
        return []

    # Build (query, doc) pairs for the candidates.
    pairs = [(query, docs[i]) for i in candidates]
    scores = np.asarray(model.predict(pairs, convert_to_numpy=True))

    # Descending by score; a stable sort makes tie order deterministic.
    order = np.argsort(-scores, kind="stable")
    return [(candidates[int(i)], float(scores[int(i)])) for i in order]

In [None]:
# Worked example for a single query: first-stage TF-IDF hits as
# (corpus_index, score) pairs. `first_stage` was previously undefined here.
first_stage = list(zip(*tfidf_search(requests[0], top_n=100)))
candidates = [int(doc_idx) for doc_idx, _ in first_stage]
candidates[:10]

In [None]:
# Worked example: rerank the TF-IDF candidates of the first query. The query,
# candidate indices and document list are taken from this notebook's own
# variables so the cell runs on a fresh kernel.
q = requests[0]
candidates = [int(i) for i in tfidf_search(q, top_n=100)[0]]
reranked = rerank_cross_encoder(reranker, q, candidates, corpus)  # [(corpus_index, xenc_score)]

In [53]:
# Two-stage retrieval: TF-IDF proposes 100 candidates per query, then the
# cross-encoder reorders them. Each entry is (corpus ids, scores), best first.
results = {}
for qid, query in tqdm(requests.items()):
    # Stage 1: row indices into corpus/docids; the TF-IDF scores are not needed.
    top_docs, _ = tfidf_search(query, top_n=100)

    # Stage 2: [(corpus_index, cross_encoder_score)] in descending score order.
    reranked = rerank_cross_encoder(reranker, query, top_docs.tolist(), corpus)

    # Store the real scores aligned with the ranking. The previous
    # np.sort(-scores) stored negated scores in ascending order, so the run
    # file's scores rose as rank got worse and tools that order by score
    # would read the ranking backwards.
    results[qid] = (
        [docids[idx] for idx, _ in reranked],
        np.array([score for _, score in reranked]),
    )

100%|██████████| 597/597 [01:35<00:00,  6.28it/s]


In [54]:
method_name = "ce"
output_file = "ce.run"

# One line per (query, document): "<query_id> Q0 <doc_id> <rank> <score> <tag>".
with open(output_file, "w", encoding="utf-8") as out_f:
    # request_id is a 0-based counter over `results` in insertion order.
    for request_id, (ranked_ids, ranked_scores) in enumerate(results.values()):
        # Ranks are 1-based.
        for rank, (doc_id, score) in enumerate(zip(ranked_ids, ranked_scores), start=1):
            out_f.write(f"{request_id} Q0 {doc_id} {rank} {score:.6f} {method_name}\n")

In [None]:
# python evaluate_run.py --metric ndcg@50  --qrel litsearch.qrel --run ce.run
# Mean ndcg@50: 0.4589 ± 0.0071 (n=597)
# python evaluate_run.py --metric map  --qrel litsearch.qrel --run ce.run
# Mean map: 0.4017 ± 0.0076 (n=597)