In [3]:
import math
from functools import lru_cache
from datasets import load_dataset
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
from openai import OpenAI

In [4]:

# Load the "test" split of two configurations of the nuprl/engineering-llm-systems dataset.
# neu_wiki: the article corpus; later cells read each record's "title", "url" and "text".
neu_wiki = load_dataset("nuprl/engineering-llm-systems", name="wikipedia-northeastern-university", split="test")
# obscure_questions: multiple-choice questions; later cells read the "prompt" field.
obscure_questions = load_dataset("nuprl/engineering-llm-systems", name="obscure_questions", split="test")


In [3]:
# Sanity check: show the URL of the first article in the corpus.
neu_wiki[0]['url']

'https://en.wikipedia.org/wiki/British%20Columbia'

In [4]:
# Inspect the first question record to see which fields a question carries.
obscure_questions[0]

{'url': '',
 'article_id': '6849',
 'prompt': 'In what year was Cy Young elected to the National Baseball Hall of Fame?',
 'choices': ['A. 1935', 'B. 1937', 'C. 1940', 'D. 1956'],
 'correct_answer': 'B',
 'id': 0}

In [5]:
def term_frequency(document: str, term: str):
    """Return the log-scaled frequency of ``term`` inside ``document``.

    Occurrences are counted with ``str.count``, i.e. as non-overlapping,
    case-sensitive substring matches, so ``term`` also matches when it is
    embedded in a longer word.

    Args:
        document: Text to search.
        term: Substring to count.

    Returns:
        ``0`` if ``term`` is empty or does not occur in ``document``,
        otherwise ``1 + ln(count)``.
    """
    if not term:
        # str.count("") reports len(document) + 1 matches, which would hand
        # the empty string a large, meaningless weight.
        return 0

    occurrences = document.count(term)
    if occurrences == 0:
        return 0
    return 1 + math.log(occurrences)

@lru_cache(maxsize=None)
def inverse_document_frequency(term: str):
    """Return the smoothed inverse document frequency of ``term`` over ``neu_wiki``.

    A document counts as containing the term when ``term`` is a substring of
    its ``"text"`` field. The value is ``ln(N / (1 + df))`` where ``N`` is the
    number of documents and ``df`` the number containing the term; it is
    therefore negative when the term occurs in every document.

    Results are memoized per term, so each distinct term scans the corpus
    only once.
    """
    containing = 0
    for article in neu_wiki:
        if term in article["text"]:
            containing += 1

    # The +1 keeps the denominator non-zero for terms that never occur.
    smoothed_count = 1 + containing
    return math.log(len(neu_wiki) / smoothed_count)

def compute_tf_idf_vector_unnormalized(terms, document: str):
    """Build the TF-IDF weight of every entry of ``terms`` for ``document``.

    Args:
        terms: Iterable of term strings; the output has one weight per term,
            in the same order.
        document: Text the term frequencies are measured in.

    Returns:
        A list of ``term_frequency * inverse_document_frequency`` products.
    """
    weights = []
    for term in terms:
        tf = term_frequency(document, term)
        idf = inverse_document_frequency(term)
        weights.append(tf * idf)
    return weights

def compute_tf_idf_vector(terms, document: str):
    """Return the TF-IDF vector of ``document`` over ``terms``.

    This currently forwards to ``compute_tf_idf_vector_unnormalized`` and
    applies no length normalization of its own.
    """
    return compute_tf_idf_vector_unnormalized(terms, document)

def compute_cosine_similarity(vec1, vec2):
    """Return the cosine of the angle between ``vec1`` and ``vec2``.

    Args:
        vec1: First vector (any array-like NumPy accepts).
        vec2: Second vector, same length as ``vec1``.

    Returns:
        ``0`` if either vector has zero Euclidean norm, otherwise the dot
        product divided by the product of the two norms.
    """
    magnitudes = (np.linalg.norm(vec1), np.linalg.norm(vec2))

    # A zero-length vector has no direction; report "no similarity" instead
    # of dividing by zero.
    if any(magnitude == 0 for magnitude in magnitudes):
        return 0

    return np.dot(vec1, vec2) / (magnitudes[0] * magnitudes[1])

def rank_by_tf_idf(query: str):
    """Return every ``neu_wiki`` record, most similar to ``query`` first.

    The query is split on whitespace into terms. The query itself and each
    document's ``"text"`` are turned into TF-IDF vectors over those terms and
    compared by cosine similarity.

    Note: because the vectors only have one dimension per query term, a
    single-term query gives every document that contains the term the same
    score. ``sorted`` is stable, so such ties stay in dataset order.
    """
    terms = query.split()
    query_vec = compute_tf_idf_vector(terms, query)

    def similarity_to_query(article):
        article_vec = compute_tf_idf_vector(terms, article["text"])
        return compute_cosine_similarity(query_vec, article_vec)

    return sorted(neu_wiki, key=similarity_to_query, reverse=True)

In [10]:
# Rank the whole corpus against the one-word query "Northeastern".
# neu_docs is reused by the embedding re-ranking cell further down.
neu_docs = rank_by_tf_idf("Northeastern")
# Show title and URL of the 20 highest-ranked articles.
for item in neu_docs[:20]:
    print(item["title"], item["url"])


British Columbia https://en.wikipedia.org/wiki/British%20Columbia
Cy Young https://en.wikipedia.org/wiki/Cy%20Young
Car Talk https://en.wikipedia.org/wiki/Car%20Talk
Dartmouth College https://en.wikipedia.org/wiki/Dartmouth%20College
Dedham, Massachusetts https://en.wikipedia.org/wiki/Dedham%2C%20Massachusetts
Derek Walcott https://en.wikipedia.org/wiki/Derek%20Walcott
Distance education https://en.wikipedia.org/wiki/Distance%20education
Eindhoven University of Technology https://en.wikipedia.org/wiki/Eindhoven%20University%20of%20Technology
Fenway Park https://en.wikipedia.org/wiki/Fenway%20Park
Ice hockey https://en.wikipedia.org/wiki/Ice%20hockey
Massachusetts Institute of Technology https://en.wikipedia.org/wiki/Massachusetts%20Institute%20of%20Technology
Nu https://en.wikipedia.org/wiki/Nu
Susan B. Anthony https://en.wikipedia.org/wiki/Susan%20B.%20Anthony
Scheme (programming language) https://en.wikipedia.org/wiki/Scheme%20%28programming%20language%29
Siberian Husky https://en.wi

In [19]:
# Load the pretrained ModernBERT-base encoder and its matching tokenizer;
# the next cell uses them to embed the question and candidate documents.
model = AutoModel.from_pretrained("answerdotai/ModernBERT-base")
tokenizer = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-base")

In [21]:
import heapq

n = 2  # Number of top documents to keep track of
num_candidates = 2  # How many of the TF-IDF-ranked documents are re-scored with the encoder
top_docs = []  # Min-heap of (similarity, candidate index, document) entries
question = obscure_questions[0]

with torch.no_grad():
    # The hidden state of the first token of the first (only) sequence is
    # used as the embedding of the whole text.
    query_vec = model(**tokenizer(question['prompt'], return_tensors="pt")).last_hidden_state[0, 0]

    for idx, doc in enumerate(neu_docs[:num_candidates]):
        doc_vec = model(**tokenizer(doc["text"], return_tensors="pt", truncation=True)).last_hidden_state[0, 0]
        cosine_sim = compute_cosine_similarity(query_vec.numpy(), doc_vec.numpy())

        # The candidate index breaks ties: without it, two equal similarities
        # would make the heap compare the document dicts, which raises
        # TypeError because dicts are not orderable.
        entry = (cosine_sim, idx, doc)

        # Maintain a min-heap of top n documents
        if len(top_docs) < n:
            heapq.heappush(top_docs, entry)
        else:
            heapq.heappushpop(top_docs, entry)

# Sort in descending order based on similarity
sorted_top_docs = sorted(top_docs, key=lambda entry: entry[0], reverse=True)

print("Top documents:")
for sim, _idx, doc in sorted_top_docs:
    print(f"Similarity: {sim:.4f}")
    print(doc)

Top documents:
Similarity: 0.8184
{'id': '6849', 'url': 'https://en.wikipedia.org/wiki/Cy%20Young', 'title': 'Cy Young', 'text': 'Denton True "Cy" Young (March 29, 1867 – November 4, 1955) was an American Major League Baseball (MLB) pitcher. Born in Gilmore, Ohio, he worked on his family\'s farm as a youth before starting his professional baseball career. Young entered the major leagues in 1890 with the National League\'s Cleveland Spiders and pitched for them until 1898. He was then transferred to the St. Louis Cardinals franchise. In 1901, Young jumped to the American League and played for the Boston Red Sox franchise until 1908, helping them win the 1903 World Series. He finished his career with the Cleveland Naps and Boston Rustlers, retiring in 1911.\n\nYoung was one of the hardest-throwing pitchers in the game early in his career. After his speed diminished, he relied more on his control and remained effective into his forties. By the time Young retired, he had established numero