In [5]:
import os
os.environ["OPENAI_API_KEY"] = "REDACTEDproj-sgRbY5gLQ7MLNRdyk44_gOHJ2JXXx4curgglvhI1vSTRqjVqHNrsesR5vpxu5gRsv6PGB-IDM5T3BlbkFJSSDwEtP8nm-HHpKqoqiA2Njs0MCh8AEYRd4Fl4qrIzVnkZ6aFxJ_3i2Q6_AI37zzTTynuE9moA"

In [7]:
# --- Cell 1: Setup & Sanity -----------------------------------------------
import os
import json
import math
import logging
from typing import List
import numpy as np

# If OPENAI_API_KEY lives in a .env file, load it into the environment.
# python-dotenv is an optional dependency, so only its absence is tolerated;
# any other failure surfaces instead of being silently discarded.
try:
    from dotenv import load_dotenv
except ImportError:
    pass
else:
    # The return value is not needed; it is deliberately not bound to `_`,
    # which IPython uses for the last cell output.
    load_dotenv()

# Console logger for this notebook. The handler, formatter and level are only
# configured when the logger has no handlers yet, so re-running the cell does
# not stack handlers and duplicate every log line.
logger = logging.getLogger("rag-setup")
if len(logger.handlers) == 0:
    h = logging.StreamHandler()
    h.setFormatter(logging.Formatter(fmt="[%(levelname)s] %(message)s"))
    logger.addHandler(h)
    logger.setLevel(logging.INFO)

def ensure_dirs(paths: List[str]) -> None:
    """
    Create each directory listed in ``paths``.

    Missing parent directories are created as well, and directories that
    already exist are left as they are (no error is raised for them).
    """
    for directory in paths:
        os.makedirs(directory, exist_ok=True)

# Make sure the data/, indexes/ and docs/ folders exist.
ensure_dirs(["data", "indexes", "docs"])

# Fail fast when the environment holds no usable (non-blank) OpenAI key.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if not (OPENAI_API_KEY and OPENAI_API_KEY.strip()):
    raise EnvironmentError(
        "OPENAI_API_KEY is missing. Set it via os.environ or in a .env file."
    )
logger.info("OPENAI_API_KEY detected.")

# OpenAI client (new SDK). Any failure to import it is re-raised with
# installation instructions, keeping the original error as the cause.
try:
    from openai import OpenAI
except Exception as import_error:
    raise RuntimeError(
        "OpenAI SDK not found. Run: pip install openai==1.*"
    ) from import_error

# Embedding model used by the helpers below: fast, cheap, strong baseline.
EMBED_MODEL = "text-embedding-3-small"

# No key is passed explicitly; the client reads the API key from the environment.
client = OpenAI()

logger.info(f"Embedding model set to: {EMBED_MODEL}")

def embed_texts(texts: List[str], batch_size: int = 2048) -> np.ndarray:
    """
    Get embeddings for a list of strings using OpenAI embeddings.

    Args:
        texts: Non-empty list of non-blank strings.
        batch_size: Maximum number of strings sent per API request. The
            default matches the per-request input cap documented for the
            embeddings endpoint; longer lists are split into several requests.

    Returns:
        A float32 numpy array of shape [n_texts, dim]. Row ``i`` is the
        embedding of ``texts[i]``, scaled to unit length so that the inner
        product of two rows equals their cosine similarity.

    Raises:
        ValueError: If ``texts`` is not a non-empty list of non-blank strings,
            or if ``batch_size`` is not a positive integer.
    """
    if not isinstance(texts, list) or not texts or not all(isinstance(t, str) and t.strip() for t in texts):
        raise ValueError("texts must be a non-empty list of non-empty strings.")
    if not isinstance(batch_size, int) or batch_size < 1:
        raise ValueError("batch_size must be a positive integer.")

    vecs = []
    for start in range(0, len(texts), batch_size):
        resp = client.embeddings.create(
            model=EMBED_MODEL, input=texts[start:start + batch_size]
        )
        # Each returned item carries the index of the input it belongs to;
        # order by it so rows stay aligned with `texts` regardless of the
        # order in which the response lists them.
        vecs.extend(d.embedding for d in sorted(resp.data, key=lambda d: d.index))

    # Normalize to unit length → inner product == cosine similarity.
    arr = np.array(vecs, dtype=np.float32)
    # The small epsilon avoids a division by zero for an all-zero embedding.
    norms = np.linalg.norm(arr, axis=1, keepdims=True) + 1e-12
    return arr / norms

def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    """
    Cosine similarity of two 1D vectors, in [-1, 1].

    The vectors do not have to be normalized: the dot product is divided by
    the product of their norms. For inputs that are already unit length this
    equals the plain dot product (up to floating-point rounding).

    Args:
        a: 1D vector.
        b: 1D vector of the same length as ``a``.

    Returns:
        The cosine similarity as a Python float; 0.0 when either vector has
        zero norm, where the cosine is undefined.

    Raises:
        ValueError: If either input is not 1D or the lengths differ.
    """
    a = np.asarray(a, dtype=np.float64)
    b = np.asarray(b, dtype=np.float64)
    if a.ndim != 1 or b.ndim != 1 or a.shape[0] != b.shape[0]:
        raise ValueError("Inputs must be 1D vectors of the same length.")
    denom = float(np.linalg.norm(a) * np.linalg.norm(b))
    if denom == 0.0:
        return 0.0
    # Clip so rounding error cannot push the result just outside [-1, 1].
    return float(np.clip(np.dot(a, b) / denom, -1.0, 1.0))

# Log that the setup cell has finished defining its helpers.
logger.info("Setup helpers loaded.")


[INFO] OPENAI_API_KEY detected.
[INFO] Embedding model set to: text-embedding-3-small
[INFO] Setup helpers loaded.


In [9]:
# --- Cell 2: Embedding Smoke Test -----------------------------------------
# Each tuple holds two phrases to compare; the last pair is an unrelated control.
pairs = [
    ("heart attack", "myocardial infarction"),
    ("hypertension", "high blood pressure"),
    ("antibiotic resistance", "bacteria resistant to antibiotics"),
    ("diabetes", "type 2 diabetes"),
    ("appendix", "banana")  # control: unrelated terms
]

# Split the pairs into their first and second phrases.
left = [first for first, _second in pairs]
right = [second for _first, second in pairs]

# One embedding row per phrase, in the same order as `pairs`.
L = embed_texts(left)
R = embed_texts(right)

# Similarity of each phrase with its partner (row i of L against row i of R).
sims = [cosine_similarity(left_vec, right_vec) for left_vec, right_vec in zip(L, R)]

results = [
    {"pair": pair, "cosine_similarity": round(sim, 3)}
    for pair, sim in zip(pairs, sims)
]

print(json.dumps(results, indent=2))


[
  {
    "pair": [
      "heart attack",
      "myocardial infarction"
    ],
    "cosine_similarity": 0.591
  },
  {
    "pair": [
      "hypertension",
      "high blood pressure"
    ],
    "cosine_similarity": 0.783
  },
  {
    "pair": [
      "antibiotic resistance",
      "bacteria resistant to antibiotics"
    ],
    "cosine_similarity": 0.734
  },
  {
    "pair": [
      "diabetes",
      "type 2 diabetes"
    ],
    "cosine_similarity": 0.715
  },
  {
    "pair": [
      "appendix",
      "banana"
    ],
    "cosine_similarity": 0.257
  }
]
