In [1]:
import math
from functools import lru_cache
from datasets import load_dataset
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel

In [2]:

# Corpus of Wikipedia articles that the retrieval functions below search over.
neu_wiki = load_dataset(
    "nuprl/engineering-llm-systems",
    name="wikipedia-northeastern-university",
    split="test",
)
# Question set from the same repository (a different configuration of it).
obscure_questions = load_dataset(
    "nuprl/engineering-llm-systems",
    name="obscure_questions",
    split="test",
)


test-00000-of-00001.parquet:   0%|          | 0.00/12.9M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Generating test split:   0%|          | 0/2434 [00:00<?, ? examples/s]

test-00000-of-00001.parquet:   0%|          | 0.00/108k [00:00<?, ?B/s]

tiny-00000-of-00001.parquet:   0%|          | 0.00/10.3k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/931 [00:00<?, ? examples/s]

Generating tiny split:   0%|          | 0/50 [00:00<?, ? examples/s]

In [3]:
# Sanity check: display the source URL of the first article in the corpus.
neu_wiki[0]['url']

'https://en.wikipedia.org/wiki/British%20Columbia'

In [4]:
# Display the first question record to see which fields each record carries.
obscure_questions[0]

{'url': '',
 'article_id': '6849',
 'prompt': 'In what year was Cy Young elected to the National Baseball Hall of Fame?',
 'choices': ['A. 1935', 'B. 1937', 'C. 1940', 'D. 1956'],
 'correct_answer': 'B',
 'id': 0}

In [5]:
def term_frequency(document: str, term: str):
    """Return the log-scaled frequency of ``term`` inside ``document``.

    Occurrences are counted with ``str.count``, so they are non-overlapping,
    case-sensitive *substring* matches rather than whole-word matches.

    Args:
        document: Text to search.
        term: Substring whose occurrences are counted.

    Returns:
        ``0`` when ``term`` does not occur in ``document``; otherwise
        ``1 + ln(count)``.
    """
    occurrences = document.count(term)
    if occurrences > 0:
        # Logarithmic damping: each additional occurrence adds less weight
        # than the previous one.
        return 1 + math.log(occurrences)
    return 0

@lru_cache(maxsize=None)
def inverse_document_frequency(term: str):
    """Return the smoothed inverse document frequency of ``term``.

    Every record of the module-level ``neu_wiki`` collection is scanned and a
    record counts as a match when ``term`` is a substring of its ``"text"``
    field. The result is ``ln(N / (1 + matches))`` where ``N`` is
    ``len(neu_wiki)``; the ``+ 1`` keeps the denominator non-zero for terms
    that never occur.

    The value is negative when the term occurs in every record, because
    ``N / (1 + N)`` is below one.

    Results are memoised per ``term`` with an unbounded ``lru_cache``, so the
    full scan runs once per distinct term. The cache key is the term only: if
    ``neu_wiki`` is rebound to different data, previously cached values are
    not recomputed.

    Args:
        term: Substring to look for in each record's text.

    Returns:
        The natural-log IDF weight as a float.
    """
    matching_docs = 0
    for article in neu_wiki:
        if term in article["text"]:
            matching_docs += 1
    return math.log(len(neu_wiki) / (matching_docs + 1))

def compute_tf_idf_vector_unnormalized(terms, document: str):
    """Build the raw TF-IDF weight vector of ``document`` over ``terms``.

    Args:
        terms: Iterable of terms; each one becomes one vector component, in
            iteration order.
        document: Text whose term frequencies are measured.

    Returns:
        A list with one entry per term holding
        ``term_frequency(document, term) * inverse_document_frequency(term)``.
        No length normalisation is applied.
    """
    weights = []
    for term in terms:
        tf = term_frequency(document, term)
        idf = inverse_document_frequency(term)
        weights.append(tf * idf)
    return weights

def compute_tf_idf_vector(terms, document: str):
    """Return the TF-IDF vector of ``document`` over ``terms``.

    This is a thin pass-through to ``compute_tf_idf_vector_unnormalized``:
    the vector is returned exactly as computed, with no length normalisation
    applied here.

    Args:
        terms: Iterable of terms defining the vector's components.
        document: Text to vectorise.

    Returns:
        The list produced by ``compute_tf_idf_vector_unnormalized``.
    """
    return compute_tf_idf_vector_unnormalized(terms, document)

def compute_cosine_similarity(vec1, vec2):
    """Return the cosine of the angle between ``vec1`` and ``vec2``.

    Args:
        vec1: First vector (any sequence accepted by NumPy).
        vec2: Second vector, of the same length as ``vec1``.

    Returns:
        ``0`` when either vector has zero Euclidean norm (the cosine is
        undefined there); otherwise ``dot(vec1, vec2) / (|vec1| * |vec2|)``.
    """
    magnitude_a, magnitude_b = (np.linalg.norm(v) for v in (vec1, vec2))

    # Guard against dividing by zero for all-zero (or empty) vectors.
    if 0 in (magnitude_a, magnitude_b):
        return 0

    denominator = magnitude_a * magnitude_b
    return np.dot(vec1, vec2) / denominator

def rank_by_tf_idf(query: str):
    """Rank every record of ``neu_wiki`` by TF-IDF cosine similarity to ``query``.

    The query is split on whitespace; those terms are the only dimensions of
    both the query vector and each document vector. Documents are returned
    in descending order of cosine similarity.

    NOTE(review): because the vectors have one component per query term, a
    one-word query gives every document that contains the word an identical
    score (two parallel 1-D vectors). The sort is stable, so such documents
    come back in their original dataset order rather than by how prominent
    the word is in them.

    Args:
        query: Whitespace-separated search terms.

    Returns:
        A new list containing every record of ``neu_wiki``, best match first.
    """
    # Tokenise once: the term list is the same for the query and for every
    # document, so there is no need to re-split it inside the sort key.
    query_terms = query.split()
    query_vec = compute_tf_idf_vector(query_terms, query)

    def similarity_to_query(item):
        """Score one corpus record against the pre-computed query vector."""
        doc_vec = compute_tf_idf_vector(query_terms, item["text"])
        return compute_cosine_similarity(query_vec, doc_vec)

    return sorted(neu_wiki, key=similarity_to_query, reverse=True)

In [6]:
# Rank the whole corpus against a one-word query, then list the title and URL
# of the 20 highest-ranked articles.
neu_docs = rank_by_tf_idf("Northeastern")
for item in neu_docs[:20]:
    print(item["title"], item["url"])


British Columbia https://en.wikipedia.org/wiki/British%20Columbia
Cy Young https://en.wikipedia.org/wiki/Cy%20Young
Car Talk https://en.wikipedia.org/wiki/Car%20Talk
Dartmouth College https://en.wikipedia.org/wiki/Dartmouth%20College
Dedham, Massachusetts https://en.wikipedia.org/wiki/Dedham%2C%20Massachusetts
Derek Walcott https://en.wikipedia.org/wiki/Derek%20Walcott
Distance education https://en.wikipedia.org/wiki/Distance%20education
Eindhoven University of Technology https://en.wikipedia.org/wiki/Eindhoven%20University%20of%20Technology
Fenway Park https://en.wikipedia.org/wiki/Fenway%20Park
Ice hockey https://en.wikipedia.org/wiki/Ice%20hockey
Massachusetts Institute of Technology https://en.wikipedia.org/wiki/Massachusetts%20Institute%20of%20Technology
Nu https://en.wikipedia.org/wiki/Nu
Susan B. Anthony https://en.wikipedia.org/wiki/Susan%20B.%20Anthony
Scheme (programming language) https://en.wikipedia.org/wiki/Scheme%20%28programming%20language%29
Siberian Husky https://en.wi

In [7]:
# Load the model and its tokenizer from the same checkpoint id; if the id is
# changed, change it on both lines so the two stay in sync.
model = AutoModel.from_pretrained("answerdotai/ModernBERT-base")
tokenizer = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-base")