### TF-IDF From Scratch

In [31]:
from datasets import load_from_disk
import math 
import numpy as np
from tqdm import tqdm
from collections import Counter
import json 

In [2]:
# !pip install datasets

In [3]:
# from datasets import load_dataset
# import os

# configs = ["query", "corpus_clean", "corpus_s2orc"]


# # Loop through and download each if not already saved
# for config in configs:
#     save_path = f"LitSearch_{config}"
    
#     if os.path.exists(save_path):
#         print(f" {config} already downloaded at {save_path}, skipping.\n")
#         continue  # Skip download if directory already exists
    
#     print(f" Downloading configuration: {config}")
#     dataset = load_dataset("princeton-nlp/LitSearch", config)
#     dataset.save_to_disk(save_path)
#     print(f" Saved {config} to {save_path}\n")

### Load the dataset and make corpus
* corpus: dict of document_id -> list of tokens

In [4]:
# Load the dataset that was previously saved to disk under "LitSearch_corpus_clean"
# and report the (rows, columns) shape of its "full" split.
data = load_from_disk("LitSearch_corpus_clean")
data['full'].shape

(64183, 5)

In [5]:
# Show the summary of the "full" split (its features and number of rows).
data['full'] 

Dataset({
    features: ['corpusid', 'title', 'abstract', 'citations', 'full_paper'],
    num_rows: 64183
})

In [6]:
# documents: corpus id (as a string) -> "<title> <abstract>".
# A missing or empty title/abstract contributes an empty string, so the
# single separating space is always present.
documents = {
    str(record["corpusid"]): (record["title"] or "") + " " + (record["abstract"] or "")
    for record in tqdm(data["full"], desc="Processing documents")
}

Processing documents: 100%|██████████| 64183/64183 [00:03<00:00, 18111.31it/s]


In [7]:
# Spot-check one entry: the stored text is the title followed by the abstract.
documents['252715594']

'PHENAKI: VARIABLE LENGTH VIDEO GENERATION FROM OPEN DOMAIN TEXTUAL DESCRIPTIONS We present Phenaki, a model capable of realistic video synthesis, given a sequence of textual prompts. Generating videos from text is particularly challenging due to the computational cost, limited quantities of high quality text-video data and variable length of videos. To address these issues, we introduce a new model for learning video representation which compresses the video to a small representation of discrete tokens. This tokenizer uses causal attention in time, which allows it to work with variable-length videos. To generate video tokens from text we are using a bidirectional masked transformer conditioned on pre-computed text tokens. The generated video tokens are subsequently de-tokenized to create the actual video. To address data issues, we demonstrate how joint training on a large corpus of image-text pairs as well as a smaller number of video-text examples can result in generalization beyond

In [8]:
# corpus: docid -> list of words for the document
# (text is lower-cased and split on whitespace; punctuation stays attached)
corpus = {doc_id: text.lower().split() for doc_id, text in documents.items()}
print(f"#documents: {len(corpus)}")

#documents: 64183


In [9]:
# Number of tokens in every document, and the corpus-wide average.
lengths = [len(tokens) for tokens in corpus.values()]
print(f"#average document length: {sum(lengths)/len(lengths):.2f} words.")

# Vocabulary: every distinct token that occurs anywhere in the corpus.
all_words = set()
for tokens in corpus.values():
    all_words.update(tokens)

# Sort the vocabulary so its order is the same on every run. Iterating a set
# of strings directly can yield a different order in each interpreter session,
# which would make any position derived from this list non-reproducible.
unique_words = sorted(all_words)
print(f"#unique words: {len(unique_words)}")

#average document length: 131.95 words.
#unique words: 296439


In [10]:
# Recompute the per-document lengths and the vocabulary, then gather the
# headline corpus statistics in a single dictionary.
lengths = [len(tokens) for tokens in corpus.values()]
all_words = {word for tokens in corpus.values() for word in tokens}

stats = {
    "doc_size": len(corpus),                 # number of documents
    "vocab_size": len(all_words),            # vocabulary size
    "mean_dl": sum(lengths) / len(lengths),  # average document length
    "max_dl": max(lengths),                  # maximum document length
    "min_dl": min(lengths),                  # minimum document length
}

# stats['doc_lengths']={docid: len(corpus[docid]) for docid in corpus}

In [11]:
# Display the collected corpus statistics.
stats

{'doc_size': 64183,
 'vocab_size': 296439,
 'mean_dl': 131.94582677656078,
 'max_dl': 3347,
 'min_dl': 0}

In [12]:
# index_map: word -> its position in unique_words (used as the vector index)
index_map = dict(zip(unique_words, range(len(unique_words))))

### Make Inverted Index
* inverted_index: dict of word -> {document_id: number of times the word occurs in that document}

In [13]:
def make_inverted_index(corpus):
    """
    Build a lookup that tells, for every word, which documents contain it
    and how often.

    corpus: {docid -> list of words for the document}
    returns: {word -> {docid: frequency of the word in docid}}
    """
    postings = {}
    for docid in tqdm(corpus, desc="Building inverted index"):
        # Counter keeps words in first-occurrence order, so words enter the
        # index in the same order as a token-by-token scan would add them.
        for word, frequency in Counter(corpus[docid]).items():
            postings.setdefault(word, {})[docid] = frequency
    return postings

In [14]:
# Build the index over the whole corpus, then show how many distinct words it
# holds together with the first ten words in insertion order.
inverted_index = make_inverted_index(corpus)
len(inverted_index), list(inverted_index.keys())[:10]

Building inverted index:   0%|          | 0/64183 [00:00<?, ?it/s]

Building inverted index: 100%|██████████| 64183/64183 [00:02<00:00, 25088.63it/s]


(296439,
 ['phenaki:',
  'variable',
  'length',
  'video',
  'generation',
  'from',
  'open',
  'domain',
  'textual',
  'descriptions'])

In [15]:
# docids=list( inverted_index['the'].keys() )
# len(docids), docids[:10]

### TF-IDF Calculation

In [16]:
def calc_term_frequency(document_tokens, vocab_size, token_to_index=None):
    """
    Build the term-frequency vector of one document (or query).

    For each word: how important it is in this document
    (#times it appears in this doc / total #words in this doc).
    The total counts every token, including words that have no position in
    the vocabulary; those words themselves are ignored.

    document_tokens: list of tokens.
    vocab_size: length of the returned vector.
    token_to_index: optional {word -> position in the vector}. When omitted,
        the notebook-level ``index_map`` is used.
    returns: 1-D float numpy array of length ``vocab_size``.
    """
    if token_to_index is None:
        token_to_index = index_map

    tf = np.zeros(vocab_size, dtype=float)

    doc_len = len(document_tokens)
    if doc_len == 0:
        # Nothing to count; also avoids dividing by zero below.
        return tf

    for tok, count in Counter(document_tokens).items():
        idx = token_to_index.get(tok)
        if idx is not None:  # words outside the vocabulary are skipped
            tf[idx] = count / doc_len
    return tf


In [17]:
def calc_inverse_document_frequency(document_tokens, inverted_index, document_size, vocab_size, token_to_index=None):
    """
    Build the inverse-document-frequency vector for the words of one document
    (or query).

    For each word: how rare it is in the corpus, computed as
    log((total #documents + 1) / (#documents containing it + 1)).
    Positions of words that do not occur in ``document_tokens`` stay 0.

    document_tokens: list of tokens.
    inverted_index: {word -> {docid: frequency of the word in docid}}.
    document_size: total number of documents in the corpus.
    vocab_size: length of the returned vector.
    token_to_index: optional {word -> position in the vector}. When omitted,
        the notebook-level ``index_map`` is used.
    returns: 1-D float numpy array of length ``vocab_size``.
    """
    if token_to_index is None:
        token_to_index = index_map

    idf_vector = np.zeros(vocab_size, dtype=float)
    # Each distinct word needs to be handled only once.
    for token in set(document_tokens):
        token_index = token_to_index.get(token)
        if token_index is None:
            # Word unknown to the vocabulary (e.g. a query word never seen in
            # the corpus). It has no position in the vector, and indexing the
            # array with None would overwrite every element instead.
            continue
        # A word missing from the index is contained in zero documents.
        how_many_docs_contain_this_token = len(inverted_index.get(token, ()))
        idf_vector[token_index] = np.log((document_size + 1) / (how_many_docs_contain_this_token + 1.0))
    return idf_vector

In [18]:
# Vector length and corpus size that are passed to the TF / IDF helpers.
vocab_size= stats['vocab_size']
document_size= stats['doc_size']

In [19]:
# document_id = '252715594' 
# tokens = corpus[document_id] 
# tf_vector = calc_term_frequency(tokens, vocab_size)
# idf_vector = calc_inverse_document_frequency(tokens, inverted_index, document_size, vocab_size)
# tf_idf = tf_vector * idf_vector
# tf_idf

### calculate tf-idf vector for all the documents in the corpus
* requires about 155 GB of RAM, because every document gets its own dense float vector of vocabulary length!

In [20]:
# tf_idfs: docid -> TF-IDF vector, i.e. the element-wise product of the
# document's term-frequency and inverse-document-frequency vectors.
tf_idfs = {
    docid: calc_term_frequency(document_tokens, vocab_size)
    * calc_inverse_document_frequency(document_tokens, inverted_index, document_size, vocab_size)
    for docid, document_tokens in tqdm(corpus.items(), desc="Calculating TF-IDF for documents")
}

Calculating TF-IDF for documents: 100%|██████████| 64183/64183 [01:19<00:00, 810.33it/s]


### Load query documents

In [21]:
# Load the saved query set and take the text of its first two queries.
dataset_query = load_from_disk("LitSearch_query")

query_split = dataset_query["full"]
query_0, query_1 = (query_split[position]["query"] for position in (0, 1))

# requests: query id -> query text
requests = dict(enumerate([query_0, query_1]))

In [22]:
# Show the selected queries, keyed by query id.
requests 

{0: 'Are there any research papers on methods to compress large-scale language models using task-agnostic knowledge distillation techniques?',
 1: 'Are there any resources available for translating Tunisian Arabic dialect that contain both manually translated comments by native speakers and additional data augmented through methods like segmentation at stop words level?'}

### calculate tf-idf for each query

In [23]:
# Calculate the TF-IDF vector of every query. Queries are tokenised exactly
# like the documents (lower-cased, split on whitespace) so that their words
# line up with the vocabulary.
query_tf_idfs = {}

for qid, query_text in requests.items():
    query_tokens = query_text.lower().split()

    tf_vector = calc_term_frequency(query_tokens, vocab_size)
    idf_vector = calc_inverse_document_frequency(query_tokens, inverted_index, document_size, vocab_size)
    query_tf_idfs[qid] = tf_vector * idf_vector

### Calculate cosine similarity for each query and rank documents

In [26]:
all_query_results = {}

top_k = 50

# A document's norm is the same for every query, so compute each one once
# instead of once per (query, document) pair.
doc_norms = {docid: np.linalg.norm(doc_tf_idf) for docid, doc_tf_idf in tf_idfs.items()}

for qid, query_tf_idf in query_tf_idfs.items():
    # The query norm does not change while scanning the documents either.
    query_norm = np.linalg.norm(query_tf_idf)

    query_result = dict() # docid -> score
    # now calculate cosine similarity between the query and each document
    for docid in tqdm(tf_idfs):
        doc_norm = doc_norms[docid]
        if query_norm == 0 or doc_norm == 0:
            # An all-zero vector has no direction: score 0 rather than divide by zero.
            cosine_similarity = 0.0
        else:
            cosine_similarity = np.dot(query_tf_idf, tf_idfs[docid]) / (query_norm * doc_norm)
        query_result[docid] = cosine_similarity

    # Highest similarity first; keep only the best top_k documents.
    query_result_sorted = sorted(query_result.items(), key=lambda x: x[1], reverse=True)
    all_query_results[qid] = query_result_sorted[:top_k]

100%|██████████| 64183/64183 [00:02<00:00, 22481.60it/s]
100%|██████████| 64183/64183 [00:02<00:00, 22934.05it/s]


In [27]:
# Show the ranked (docid, score) pairs kept for every query.
all_query_results 

{0: [('257038997', 0.27946243631698464),
  ('259137871', 0.25003909563714743),
  ('11743245', 0.20766365431527253),
  ('235258277', 0.1938886061467414),
  ('201670719', 0.19212080978400298),
  ('215238664', 0.1891933070023115),
  ('218502458', 0.18135526472169056),
  ('221995575', 0.17857260379122605),
  ('258865530', 0.16888846091192222),
  ('256461337', 0.16026205980247404),
  ('8451212', 0.15753114231005344),
  ('227231666', 0.15437580370964962),
  ('222290473', 0.15291581010282185),
  ('241583522', 0.15280084750910733),
  ('237563200', 0.15014159265579255),
  ('235652233', 0.14962857723906986),
  ('253107373', 0.14834680466046865),
  ('3960960', 0.1468440997957143),
  ('222132977', 0.14419822337178312),
  ('226284011', 0.14379230484233985),
  ('259370760', 0.14377171107605005),
  ('256461264', 0.143290963771035),
  ('248780060', 0.1428707623746974),
  ('250390701', 0.1420915168716719),
  ('256461166', 0.1420206915048201),
  ('233307448', 0.1417215047032746),
  ('259370551', 0.14002

### save in .run file format

In [30]:
# Write the rankings as a run file, one line per retrieved document:
#   <request_id> Q0 <doc_id> <rank> <score> <method_name>
method_name = "tfidf_basic"
output_file = "litsearch_topk_results_tfidf_basic.run"

with open(output_file, "w", encoding="utf-8") as out_f:
    # request_id and rank both count from 1
    for request_id, results in enumerate(all_query_results.values(), start=1):
        out_f.writelines(
            f"{request_id} Q0 {doc_id} {rank} {score:.6f} {method_name}\n"
            for rank, (doc_id, score) in enumerate(results, start=1)
        )
        
    

### save in json format

In [36]:
# Save the three best hits of every query as JSON Lines: one object per query
# holding the query id, the query text and its ranked top-3 documents.
output_file = "litsearch_top3_results_tfidf_basic.jsonl"

with open(output_file, "w", encoding="utf-8") as out_f:
    for qid, results in all_query_results.items():
        # float() keeps the score JSON-serialisable whatever numeric type it has.
        top3 = [
            {"doc_id": doc_id, "score": float(score), "rank": rank}
            for rank, (doc_id, score) in enumerate(results[:3], start=1)
        ]

        out_f.write(json.dumps({
            "qid": qid,
            "query": requests[qid],
            "top3": top3
        }) + "\n")
        