In [1]:
from datasets import load_from_disk
import math 
import numpy as np
from tqdm import tqdm
from collections import Counter, defaultdict
import json 
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import nltk 
for pkg in ["punkt", "punkt_tab"]:
    try:
        nltk.data.find(f"tokenizers/{pkg}")
    except LookupError:
        nltk.download(pkg)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
data = load_from_disk("LitSearch_corpus_clean")
documents={}    
for item in tqdm(data["full"], desc="Processing documents"):
    doc_id = str(item["corpusid"])
    contents = (item["title"].lower() or "") + " " + (item["abstract"].lower() or "")
    documents[doc_id] = contents

print(f"Total documents processed: {len(documents)}")

Processing documents: 100%|██████████| 64183/64183 [00:03<00:00, 18008.11it/s]

Total documents processed: 64183





In [3]:
documents['252715594']

'phenaki: variable length video generation from open domain textual descriptions we present phenaki, a model capable of realistic video synthesis, given a sequence of textual prompts. generating videos from text is particularly challenging due to the computational cost, limited quantities of high quality text-video data and variable length of videos. to address these issues, we introduce a new model for learning video representation which compresses the video to a small representation of discrete tokens. this tokenizer uses causal attention in time, which allows it to work with variable-length videos. to generate video tokens from text we are using a bidirectional masked transformer conditioned on pre-computed text tokens. the generated video tokens are subsequently de-tokenized to create the actual video. to address data issues, we demonstrate how joint training on a large corpus of image-text pairs as well as a smaller number of video-text examples can result in generalization beyond

In [4]:
stop_words=[
    'a',
    'an',
    'the',
    'are',
    'is'
]
def tokenize(text):
    """Simple whitespace + lowercase tokenizer."""
    tokens=[w  for w in text.split() if w not in stop_words] 
    return tokens 

In [5]:
ps=PorterStemmer()
def tokenize(text):
    """nltk tokenizer with stemming."""
    tokens = word_tokenize(text)
    tokens = [ps.stem(token) for token in tokens]
    return tokens 

### idf calculation

Document: ltc
Query: nnn 


$$
{l: } tf = 1 + log_{10}(f_{t,d}) \text{ if  }f_{t,d} > 0 , \text{ else } 0
$$

$$
{t: } 
idf = log_{10}\left( \frac{N}{df_t} \right)
$$

$$
\text{c: cosine normalization to unit length }
$$

In [6]:
df = defaultdict(int)

for docid, doc in tqdm(documents.items(), 'tokenizing documents...'):
    tokens = set(tokenize(doc))
    for term in tokens:
        df[term] += 1
N=len(documents)
print(f"Total terms: {len(df)} from {N} documents.")

idf = {term: np.log10(N / df_val) for term, df_val in df.items()}

tokenizing documents...: 100%|██████████| 64183/64183 [01:35<00:00, 671.23it/s]


Total terms: 165005 from 64183 documents.


In [7]:
ltc_vectors = {}

#calculate tf-idf vector for each document
for docid, doc in tqdm(documents.items(), 'calculating tf-idf...'):
    tokens = tokenize(doc)
    tf_raw = Counter(tokens)

    # l and t weighting
    tfidf = {}
    for term, freq in tf_raw.items():
        tf = 1 + np.log10(freq)    
        tfidf[term] = tf * idf[term]  #lt

    # c normalization
    norm = np.sqrt(sum(v ** 2 for v in tfidf.values())) 
    if norm > 0:
        for term in tfidf:
            tfidf[term] /= norm

    ltc_vectors[docid] = tfidf

calculating tf-idf...: 100%|██████████| 64183/64183 [01:43<00:00, 620.31it/s]


### nnn query

In [8]:
dataset_query = load_from_disk("LitSearch_query")
requests={}
for i in range(len(dataset_query['full'])):
    query_text = dataset_query['full'][i]['query']
    requests[i] = query_text.lower()
print(f"Total queries processed: {len(requests)}")

Total queries processed: 597


In [9]:
ltc_vectors.keys()

dict_keys(['252715594', '13002849', '239998253', '62841605', '253237531', '222291443', '223956716', '263605472', '212996548', '202719276', '220665539', '263152628', '264802502', '227068701', '251732759', '253523474', '162184036', '264555396', '251341969', '255340742', '53467348', '253801963', '249888901', '231632937', '246904522', '252846609', '257834209', '52980218', '232257804', '263831863', '5763832', '261245530', '21850704', '239009555', '43939886', '7305965', '261697392', '3536139', '220302524', '252596001', '259095643', '247595088', '259342096', '21196492', '247446857', '254926490', '52912260', '3535369', '263834989', '247595243', '252683543', '246240237', '263671656', '202660778', '235293695', '233378598', '238582772', '253116642', '252668582', '220302148', '245828046', '211132990', '213938729', '259952484', '259375870', '222141728', '252668746', '247476364', '253080406', '259833441', '238583049', '252439127', '173991084', '232075892', '213529244', '231918471', '7942973', '16848

In [10]:
all_query_results={}
top_k = 50

for qid, query in tqdm(requests.items(), desc="Processing queries"):
    q_tokens=tokenize(query)
    q_tf = Counter(q_tokens)


    dists={}
    for docid, doc_vector in ltc_vectors.items():
        # nnn → just raw term frequency, no idf, no normalization
        nnn = 0
        for term, freq in q_tf.items():
            if term in doc_vector:
                nnn += freq * doc_vector[term]
        dists[docid] = nnn

    sorted_dists = sorted(dists.items(), key=lambda x: x[1], reverse=True)
    all_query_results[qid] = sorted_dists[:top_k]  

Processing queries: 100%|██████████| 597/597 [03:22<00:00,  2.95it/s]


In [11]:
all_query_results 

{0: [('219309121', 1.3814108904644935),
  ('237563200', 0.917768406282614),
  ('33742593', 0.8812249261866014),
  ('248227350', 0.871276651982895),
  ('221995575', 0.8125881704085509),
  ('235258277', 0.7982505570093035),
  ('243865637', 0.7827143415509381),
  ('227746078', 0.7730030396549162),
  ('173990423', 0.7506076616219886),
  ('259370686', 0.7502438794599552),
  ('235294276', 0.7293827979357403),
  ('257219883', 0.7175874709588628),
  ('237433629', 0.7092411427341907),
  ('257038997', 0.6995185400781283),
  ('251439133', 0.686307002524984),
  ('53601909', 0.676353634677435),
  ('246706128', 0.6672906422772428),
  ('201670719', 0.6668137206605631),
  ('256461337', 0.6645451421394996),
  ('247741658', 0.664359257359433),
  ('22421874', 0.6524422990829981),
  ('258865530', 0.6496327046394759),
  ('8842519', 0.6449384343123928),
  ('215238664', 0.6437061327609805),
  ('235436167', 0.6412687251065421),
  ('221702970', 0.6329074808348353),
  ('258212842', 0.631247018955392),
  ('24158

In [12]:
method_name = "ltc_nnn"
version=2.0


output_file = f"outputs/{method_name}_v{version}.run"
with open(output_file, "w", encoding="utf-8") as out_f:
    for qid in all_query_results:
        results = all_query_results[qid]
        
        rank = 0
        for doc_id, score in results:
            rank += 1
            line = f"{qid} Q0 {doc_id} {rank} {score:.6f} {method_name}\n"
            # print(line.strip())
            out_f.write(line)
        
print(f"Results written to {output_file}")

Results written to outputs/ltc_nnn_v2.0.run


In [None]:
# python evaluate_run.py --metric map  --qrel outputs/litsearch.qrel --run outputs/ltc_nnn_v2.0.run
# Mean map: 0.1284 ± 0.0025 (n=597)
# python evaluate_run.py --metric ndcg@50  --qrel outputs/litsearch.qrel --run outputs/ltc_nnn_v2.0.run
# Mean ndcg@50: 0.2016 ± 0.0030 (n=597)