In [1]:
from datasets import load_from_disk
import math 
import numpy as np
from tqdm import tqdm
from collections import Counter
import json 
# from sklearn.feature_extraction.text import TfidfVectorizer
from datasets import load_from_disk
import math 
import numpy as np
from tqdm import tqdm
from collections import Counter, defaultdict
import json 
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import nltk 
for pkg in ["punkt", "punkt_tab"]:
    try:
        nltk.data.find(f"tokenizers/{pkg}")
    except LookupError:
        nltk.download(pkg)

  from .autonotebook import tqdm as notebook_tqdm


### Step 1: Load documents and queries

In [2]:
data = load_from_disk("dataset/LitSearch_corpus_clean")
data['full'].shape

(64183, 5)

In [3]:
documents={}    
for item in tqdm(data["full"], desc="Processing documents"):
    doc_id = str(item["corpusid"])
    contents = (item["title"] or "") + " " + (item["abstract"] or "")
    documents[doc_id] = contents

Processing documents:   0%|          | 0/64183 [00:00<?, ?it/s]

Processing documents: 100%|██████████| 64183/64183 [00:03<00:00, 19024.39it/s]


In [4]:
docids=[]
corpus=[]
for doc_id, doc in documents.items():
    docids.append(doc_id)
    corpus.append(doc)
len(corpus)

64183

In [5]:
dataset_query = load_from_disk("dataset/LitSearch_query") 
requests={}
for i in range(len(dataset_query['full'])):
    query_text = dataset_query['full'][i]['query']
    requests[i] = query_text.lower()
print(f"Total queries processed: {len(requests)}")

Total queries processed: 597


### step 2: ltc

In [6]:
ps=PorterStemmer()
def tokenize(text):
    """nltk tokenizer with stemming."""
    tokens = word_tokenize(text)
    tokens = [ps.stem(token) for token in tokens]
    return tokens 

In [7]:
df = defaultdict(int)

for docid, doc in tqdm(documents.items(), 'tokenizing documents...'):
    tokens = set(tokenize(doc))
    for term in tokens:
        df[term] += 1
N=len(documents)
print(f"Total terms: {len(df)} from {N} documents.")

idf = {term: np.log10(N / df_val) for term, df_val in df.items()}

tokenizing documents...:   0%|          | 34/64183 [00:00<03:16, 325.88it/s]

tokenizing documents...: 100%|██████████| 64183/64183 [01:44<00:00, 614.36it/s]


Total terms: 164654 from 64183 documents.


In [8]:
ltc_vectors = {}

#calculate tf-idf vector for each document
for docid, doc in tqdm(documents.items(), 'calculating tf-idf...'):
    tokens = tokenize(doc)
    tf_raw = Counter(tokens)

    # l and t weighting
    tfidf = {}
    for term, freq in tf_raw.items():
        tf = 1 + np.log10(freq)    
        tfidf[term] = tf * idf[term]  #lt

    # c normalization
    norm = np.sqrt(sum(v ** 2 for v in tfidf.values())) 
    if norm > 0:
        for term in tfidf:
            tfidf[term] /= norm

    ltc_vectors[docid] = tfidf

calculating tf-idf...: 100%|██████████| 64183/64183 [01:52<00:00, 570.79it/s]


### step 3: nnn query

In [9]:
def nnn_search(query, top_k=5):
    q_tokens=tokenize(query)
    q_tf = Counter(q_tokens)


    dists={}
    for docid, doc_vector in ltc_vectors.items():
        # nnn → just raw term frequency, no idf, no normalization
        nnn = 0
        for term, freq in q_tf.items():
            if term in doc_vector:
                nnn += freq * doc_vector[term]
        dists[docid] = nnn

    sorted_dists = sorted(dists.items(), key=lambda x: x[1], reverse=True)        #todo: find indices
    tops=sorted_dists[:top_k] 
    top_idx=[docid for docid, _ in tops]
    top_scores=[score for _, score in tops]
    return top_idx, top_scores

In [10]:
for qid, query in tqdm(requests.items(), desc="Processing queries"):
    break 

fs=nnn_search(query, top_k=10)
fs

Processing queries:   0%|          | 0/597 [00:00<?, ?it/s]




(['219309121',
  '237563200',
  '33742593',
  '248227350',
  '221995575',
  '235258277',
  '243865637',
  '227746078',
  '173990423',
  '259370686'],
 [np.float64(1.3814108904644935),
  np.float64(0.9178282515170628),
  np.float64(0.8812249261866014),
  np.float64(0.8712767075052897),
  np.float64(0.8126378975426907),
  np.float64(0.7982506364706634),
  np.float64(0.7827143891964788),
  np.float64(0.7730031368617788),
  np.float64(0.7506077390560029),
  np.float64(0.7502439349060066)])

In [11]:
all_query_results={}
top_k = 50

for qid, query in tqdm(requests.items(), desc="Processing queries"):
    q_tokens=tokenize(query)
    q_tf = Counter(q_tokens)


    dists={}
    for docid, doc_vector in ltc_vectors.items():
        # nnn → just raw term frequency, no idf, no normalization
        nnn = 0
        for term, freq in q_tf.items():
            if term in doc_vector:
                nnn += freq * doc_vector[term]
        dists[docid] = nnn

    sorted_dists = sorted(dists.items(), key=lambda x: x[1], reverse=True)
    all_query_results[qid] = sorted_dists[:top_k]  

Processing queries: 100%|██████████| 597/597 [03:26<00:00,  2.88it/s]


In [12]:
all_query_results 

{0: [('219309121', np.float64(1.3814108904644935)),
  ('237563200', np.float64(0.9178282515170628)),
  ('33742593', np.float64(0.8812249261866014)),
  ('248227350', np.float64(0.8712767075052897)),
  ('221995575', np.float64(0.8126378975426907)),
  ('235258277', np.float64(0.7982506364706634)),
  ('243865637', np.float64(0.7827143891964788)),
  ('227746078', np.float64(0.7730031368617788)),
  ('173990423', np.float64(0.7506077390560029)),
  ('259370686', np.float64(0.7502439349060066)),
  ('235294276', np.float64(0.7293929893677025)),
  ('257219883', np.float64(0.7175875136043364)),
  ('237433629', np.float64(0.7093132137473472)),
  ('257038997', np.float64(0.6995268460004744)),
  ('251439133', np.float64(0.6864559281460902)),
  ('53601909', np.float64(0.676353771239769)),
  ('246706128', np.float64(0.6672907074714844)),
  ('201670719', np.float64(0.6668294463603377)),
  ('256461337', np.float64(0.6645764249352432)),
  ('247741658', np.float64(0.6642022795750373)),
  ('22421874', np.fl

In [13]:
method_name = "ltc_nnn"
version="_stage1"


output_file = f"{method_name}_v{version}.run"
with open(output_file, "w", encoding="utf-8") as out_f:
    for qid in all_query_results:
        results = all_query_results[qid]
        
        rank = 0
        for doc_id, score in results:
            rank += 1
            line = f"{qid} Q0 {doc_id} {rank} {score:.6f} {method_name}\n"
            # print(line.strip())
            out_f.write(line)
        
print(f"Results written to {output_file}")

Results written to ltc_nnn_v_stage1.run


In [14]:
# python evaluate_run.py --metric ndcg@50  --qrel litsearch.qrel --run ltc_nnn_v_stage1.run
#  Mean ndcg@50: 0.2015 ± 0.0030 (n=597)
# python evaluate_run.py --metric map  --qrel litsearch.qrel --run ltc_nnn_v_stage1.run
#  Mean map: 0.1286 ± 0.0025 (n=597)

### now apply ce

In [15]:
from sentence_transformers import CrossEncoder
from typing import List, Tuple

In [16]:
# results={}

# for qid in tqdm(requests):
#     query = requests[qid]
#     top_docs, top_scores = tfidf_search(query, top_n=100)
#     index2docids=[docids[i] for i in top_docs]
#     results[qid]= (index2docids, top_scores)

In [17]:
CROSS_ENCODER_MODEL = "cross-encoder/ms-marco-MiniLM-L-6-v2"
reranker = CrossEncoder(CROSS_ENCODER_MODEL)

In [18]:
def rerank_cross_encoder(model: CrossEncoder, query: str, candidates: List[int], docs: List[str]) -> List[Tuple[int, float]]:
    # Build (query, doc) pairs for the candidates
    pairs = [(query, docs[i]) for i in candidates]
    scores = model.predict(pairs, convert_to_numpy=True)
    order = np.argsort(-scores)  # descending
    return [(candidates[int(i)], float(scores[int(i)])) for i in order]

In [19]:
candidates = [doc_id for doc_id, _ in all_query_results.values().__iter__().__next__()]
candidates

['219309121',
 '237563200',
 '33742593',
 '248227350',
 '221995575',
 '235258277',
 '243865637',
 '227746078',
 '173990423',
 '259370686',
 '235294276',
 '257219883',
 '237433629',
 '257038997',
 '251439133',
 '53601909',
 '246706128',
 '201670719',
 '256461337',
 '247741658',
 '22421874',
 '258865530',
 '8842519',
 '215238664',
 '235436167',
 '221702970',
 '258212842',
 '241583522',
 '234482764',
 '234487239',
 '222290473',
 '247451141',
 '199453123',
 '235652233',
 '3643430',
 '11023355',
 '222132977',
 '240288605',
 '4956384',
 '102483587',
 '248780145',
 '258461336',
 '38267620',
 '211205183',
 '256598338',
 '3518190',
 '252873105',
 '842523',
 '204801691',
 '215416219']

In [20]:
results={}
for qid in tqdm(requests):
    query = requests[qid]
    top_docs, top_scores = nnn_search(query, top_k=200)
    
    # pairs=[(query, corpus[index]) for index in top_docs]
    pairs=[(query, documents[index]) for index in top_docs]

    scores = reranker.predict(pairs, convert_to_numpy=True)
    order = np.argsort(-scores)  # descending

    reranked_top_docs=[top_docs[i]  for i in order] 
    results[qid]= (reranked_top_docs, np.sort(-scores))

  0%|          | 0/597 [00:00<?, ?it/s]

100%|██████████| 597/597 [04:50<00:00,  2.05it/s]


In [21]:
order

array([  0,  21,  16, 102,  86,   7, 175,  17, 197, 120, 192,  97,  19,
       186,  30,   2,  34,  24, 150,  66,  89,  37,  29, 142, 190,   8,
         1,  70,  15,  74,  28, 196, 127, 105,  51,  22,  27,  84, 117,
        50,  67,  98,  42,  39, 191, 171, 118, 116, 101,  23, 144,  93,
        59,  32, 156,  88, 113, 114, 164,  77, 132,  80,  44,  61, 100,
        52, 162,  90,  54, 167, 185,  75, 111, 136,  43, 165, 104, 112,
        72,  78, 129, 195,  25,   9, 194,  69,  10, 189,  94,  13,  38,
       174, 119, 168, 106, 177, 181, 145,  33,  79, 131,  95, 180,  63,
         6,  35, 158,  41,  11, 108, 135, 134, 188,  82, 170,  62, 198,
        26,  83, 147, 152, 172,  57, 139, 141, 155, 169,  56,  76, 124,
        58,  18, 159, 103, 121,  55, 140, 179, 125,  60,  20,  92,  45,
       182, 199,  81,  14, 157, 126,  71, 128, 130, 183,  53,  36, 151,
       173, 115, 146, 107, 184, 153,  46, 166, 154, 122,  73,  65,  85,
       110, 163, 178,  48,  64,  12, 193, 176, 161, 160,   3, 18

In [22]:
reranked_top_docs

['261276856',
 '222140788',
 '214802971',
 '256459906',
 '256358823',
 '247763065',
 '246823323',
 '258833483',
 '258187051',
 '257220165',
 '250311114',
 '261682404',
 '6496936',
 '247011732',
 '248476097',
 '38234424',
 '252815987',
 '259316542',
 '263334587',
 '259937490',
 '256846467',
 '260091821',
 '245144350',
 '9655643',
 '247476275',
 '235254145',
 '235694304',
 '250390927',
 '247518847',
 '14057517',
 '247996596',
 '258557287',
 '256615188',
 '249395201',
 '252734897',
 '11741466',
 '251402961',
 '259376511',
 '15288676',
 '238198403',
 '256900618',
 '252780361',
 '252596252',
 '70350011',
 '3073252',
 '263671852',
 '237940211',
 '244048238',
 '19009822',
 '7031949',
 '252595883',
 '237592862',
 '1046547',
 '235253968',
 '258968157',
 '196211466',
 '226283774',
 '254044147',
 '246863713',
 '248478169',
 '85498775',
 '27174168',
 '216868459',
 '256846836',
 '244729050',
 '738435',
 '246867139',
 '237513354',
 '220446045',
 '252385981',
 '258378297',
 '252819478',
 '51780574',


In [23]:
# corpus
# documents

In [24]:
method_name = "ce"
request_id = 0


output_file = "ce2.run"
with open(output_file, "w", encoding="utf-8") as out_f:
    for qid, result in results.items(): 
       
        rank = 0
        for doc_id, score in zip(*result):
            rank += 1
            line = f"{request_id} Q0 {doc_id} {rank} {score:.6f} {method_name}\n" 
            out_f.write(line)
        
        request_id += 1

In [None]:
# python evaluate_run.py --metric ndcg@50  --qrel litsearch.qrel --run ce2.run
# Mean ndcg@50: 0.4523 ± 0.0072 (n=597)
# python evaluate_run.py --metric map  --qrel litsearch.qrel --run ce2.run
# Mean map: 0.3976 ± 0.0077 (n=597)

In [None]:
# python evaluate_run.py --metric ndcg@50  --qrel litsearch.qrel --run /home/ns1254/irg_final_project/run_files/ce2.run