In [141]:
import html
import os
import re

import numpy as np
import requests
from dotenv import load_dotenv
from langchain_huggingface import HuggingFaceEmbeddings
from sklearn.metrics.pairwise import cosine_similarity

# Load environment variables via python-dotenv, then read the Ed API token.
# The token is None when ED_API_KEY is not set in the environment.
load_dotenv()
ED_API_KEY = os.environ.get("ED_API_KEY")

In [13]:
# Embedding model shared by the thread fetch/search helpers and the query cells.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

In [127]:
def fetch_threads(limit=100, sort="new"):
  """
  Fetch up to `limit` threads from the Ed threads API and embed each one.

  The Ed API sets a hard limit of 100 threads per request, so larger limits
  are served by issuing several requests with increasing offsets. Fewer than
  `limit` threads are returned when the API runs out of threads first.

  Args:
    limit: Maximum number of threads to fetch.
    sort: Sort order forwarded to the API (e.g. "new").

  Returns:
    Dict mapping thread id to a dict with the keys "title", "content"
    (tags removed, surrounding whitespace stripped), "title_embedding" and
    "content_embedding".

  Raises:
    requests.HTTPError: if the API responds with an error status.
    requests.Timeout: if a request does not complete within the timeout.

  TODO: handle rate limiting, fetch all threads if limit is None
  """
  max_page_size = 100  # per-request cap imposed by the Ed API
  threads = []
  while len(threads) < limit:
    page_size = min(max_page_size, limit - len(threads))
    res = requests.get(
      url="https://us.edstem.org/api/courses/74827/threads",
      headers={"Authorization": f"Bearer {ED_API_KEY}"},
      params={"limit": page_size, "offset": len(threads), "sort": sort},
      timeout=30,  # requests waits forever by default
    )
    # Surface auth/rate-limit failures here instead of as a KeyError below.
    res.raise_for_status()
    page = res.json()['threads']
    threads.extend(page)
    if len(page) < page_size:
      # A short page means there is nothing left to fetch.
      break

  indexed = {}
  for thread in threads:
    # Clean the body once and reuse it for both the stored text and its embedding.
    content = clean_xml_tags(thread["content"]).strip()
    indexed[thread["id"]] = {
      "title": thread["title"],
      "content": content,
      "title_embedding": embeddings.embed_query(thread["title"]),
      "content_embedding": embeddings.embed_query(content),
    }
  return indexed

def search_threads(query, limit=20, sort="relevance", category=None, from_date=None, to_date=None):
  """
  Run a query against the Ed thread search endpoint and embed each hit.

  Args:
    query: Search text sent to the API.
    limit: Maximum number of hits requested.
    sort: Sort order forwarded to the API (default "relevance").
    category, from_date, to_date: Optional filters; each is only sent when
      it is not None.

  Returns:
    Dict mapping thread id to a dict with the keys "title", "content"
    (tags removed, surrounding whitespace stripped), "title_embedding" and
    "content_embedding". Insertion order follows the order of the API response.

  Raises:
    requests.HTTPError: if the API responds with an error status.
    requests.Timeout: if the request does not complete within the timeout.
  """
  candidate_params = {
    "query": query,
    "limit": limit,
    "sort": sort,
    "category": category,
    "from_date": from_date,
    "to_date": to_date,
  }
  # Drop unset filters so they are not sent as literal "None" values.
  params = {key: value for key, value in candidate_params.items() if value is not None}
  res = requests.get(
    url="https://us.edstem.org/api/courses/74827/threads/search",
    headers={"Authorization": f"Bearer {ED_API_KEY}"},
    params=params,
    timeout=30,  # requests waits forever by default
  )
  # Surface auth/rate-limit failures here instead of as a KeyError below.
  res.raise_for_status()

  indexed = {}
  for thread in res.json()['threads']:
    # Clean the body once and reuse it for both the stored text and its embedding.
    content = clean_xml_tags(thread["content"]).strip()
    indexed[thread["id"]] = {
      "title": thread["title"],
      "content": content,
      "title_embedding": embeddings.embed_query(thread["title"]),
      "content_embedding": embeddings.embed_query(content),
    }
  return indexed

# Compiled once; DOTALL lets a tag whose attributes wrap onto a new line still match.
_TAG_PATTERN = re.compile(r'<.*?>', re.DOTALL)

def clean_xml_tags(text):
  """
  Strip XML/HTML tags from `text` and decode character entities.

  Tags are removed without inserting a replacement, so text on either side
  of a tag is joined directly. Entities are decoded after tag removal so
  that an escaped "&lt;" in the text is never mistaken for a tag.

  Args:
    text: Markup string; None or an empty string yields "".

  Returns:
    The plain-text content as a str.
  """
  if not text:
    return ""
  without_tags = _TAG_PATTERN.sub('', text)
  return html.unescape(without_tags)

In [104]:
# Request up to 100 search hits for a sample query and display the raw result dict.
search_data = search_threads("diffusion math", limit=100)
search_data

{6436163: {'title': 'posterior and prior',
  'content': 'Just to clarify, in ELBO for Diffusion and VAE, what is the posterior and what is the prior?',
  'title_embedding': [0.0011683451011776924,
   0.013599355705082417,
   -0.03595339134335518,
   -0.0567273274064064,
   -0.000997835537418723,
   0.035150282084941864,
   0.00856383703649044,
   -0.00723556661978364,
   -0.063113734126091,
   -0.020842771977186203,
   -0.038782939314842224,
   -0.01830797828733921,
   0.0350690521299839,
   -0.0559927262365818,
   0.004460328258574009,
   -0.06655038893222809,
   0.016656987369060516,
   -0.015673702582716942,
   0.04300675541162491,
   -0.007126076612621546,
   -0.027779940515756607,
   -0.01762394979596138,
   -0.03316470608115196,
   0.0063379183411598206,
   0.0496227890253067,
   0.023239681497216225,
   0.030248280614614487,
   -0.03133752942085266,
   -0.00893498957157135,
   -0.0486350916326046,
   -0.023515434935688972,
   -0.002247982658445835,
   0.00458756648004055,
   -0.

In [137]:
def find_top_k_results(q, threads, k=5):
  """
  Rank threads by cosine similarity between the query embedding and each
  thread's content embedding, print the best k, and return them.

  Args:
    q: Query embedding (a sequence of floats).
    threads: Dict mapping thread id to a dict that has at least the keys
      "title", "content" and "content_embedding".
    k: Number of top results to print and return.

  Returns:
    A list of (thread_id, similarity) tuples, highest similarity first.
    Empty when `threads` is empty.
  """
  if not threads:
    # Nothing to rank; also avoids handing sklearn an empty matrix.
    return []

  thread_ids = list(threads)
  content_matrix = [threads[thread_id]['content_embedding'] for thread_id in thread_ids]
  # One batched call scores every thread at once instead of one sklearn call per thread.
  similarities = cosine_similarity([q], content_matrix)[0]

  ranked = sorted(zip(thread_ids, similarities), key=lambda pair: pair[1], reverse=True)
  top_results = ranked[:k]
  for thread_id, sim in top_results:
    print(f"Thread ID: {thread_id}, Similarity: {sim:.4f}")
    print(f"Title: {threads[thread_id]['title']}")
    print(f"Content: {threads[thread_id]['content'][:200]}...")
    print()
  return top_results

In [128]:
# Fetch the newest threads (at most 600) with embeddings, then show how many came back.
all_threads = fetch_threads(600, "new")
len(all_threads)

583

In [138]:
# Rank the locally embedded threads against a free-text question.
q = "how does unet relate to diffusion"
q_embedding = embeddings.embed_query(q)
results = find_top_k_results(q_embedding, all_threads, k=5)

Thread ID: 6424127, Similarity: 0.6824
Title: U-Net
Content: Can someone explain how diffusion uses U-Nets?...

Thread ID: 6433885, Similarity: 0.5423
Title: unconditional generation in GAN
Content: How is this different from the diffusion model?...

Thread ID: 6429171, Similarity: 0.4799
Title: Practice Prelim 2a
Content: 2a states "Has a learnable encoder" and the answer is only VAEs. Do we not consider the diffusion process to be an encoder?...

Thread ID: 6435541, Similarity: 0.4240
Title: UNets in VAEs
Content: Do VAEs use UNets?...

Thread ID: 6434498, Similarity: 0.4143
Title: GANS question
Content: When we use noise to generate an image, the generator is still employing a UNet architecture right?...



In [139]:
# Score the Ed search hits for `q` with the same embedding similarity, keeping
# the order of the API response so the two rankings can be compared.
search = search_threads(q, limit=5)
# Embed the query once; it is the same for every hit.
query_embedding = embeddings.embed_query(q)
search_sim = {
  thread_id: cosine_similarity([query_embedding], [data['content_embedding']])[0][0]
  for thread_id, data in search.items()
}
for thread_id, sim in search_sim.items():
  print(f"Thread ID: {thread_id}, Similarity: {sim:.4f}")
  print(f"Title: {search[thread_id]['title']}")
  print(f"Content: {search[thread_id]['content'][:200]}...")
  print()

Thread ID: 6424127, Similarity: 0.6824
Title: U-Net
Content: Can someone explain how diffusion uses U-Nets?...

Thread ID: 6436163, Similarity: 0.3941
Title: posterior and prior
Content: Just to clarify, in ELBO for Diffusion and VAE, what is the posterior and what is the prior?...

Thread ID: 6428501, Similarity: 0.3288
Title: Cycle Consistency
Content: Which of GANs, VAE, Diffusion have cycle consistency?...

Thread ID: 6433885, Similarity: 0.5423
Title: unconditional generation in GAN
Content: How is this different from the diffusion model?...

Thread ID: 6595774, Similarity: 0.1306
Title: P5 ddim_sample
Content: I am able to run all of P5 and the results I get look solid, but when submitting to gradescope I keep failing the ddim_sample tests that all sayTest Failed: Gaussian Diffusion ddim sample failed: Fals...

