# Data 

Download the GloVe 6B archive from https://nlp.stanford.edu/data/glove.6B.zip (project page: https://nlp.stanford.edu/projects/glove/).

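Unzip it into `../artifacts/glove.6B/`, so that the files `glove.6B.50d.txt`, `glove.6B.100d.txt`, `glove.6B.200d.txt` and `glove.6B.300d.txt` sit directly in that directory.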

# GloVe ANN benchmark (base / queries / ground truth)

This notebook loads Stanford GloVe embeddings and constructs a simple ANN benchmark for cosine (angular) similarity.

- **Parsing:** the file `glove.6B.*d.txt` is parsed into a token list `words` and an embedding matrix $E \in \mathbb{R}^{M \times D}$, where each row corresponds to a word vector.
- **Base and queries:** the search base is defined as the first $N$ vectors $X \in \mathbb{R}^{N \times D}$, and the query set is formed by sampling a random subset of rows from $X$ using indices $q_{\text{idx}}$: $Q = X[q_{\text{idx}}] \in \mathbb{R}^{n_q \times D}$.
- **Cosine / angular setting:** every row of $X$ and $Q$ is L2-normalized, $x_n = \frac{x}{\|x\|_2}$ and $q_n = \frac{q}{\|q\|_2}$ (the normalized rows form $X_n$ and $Q_n$), so cosine similarity reduces to a dot product $s(q, x) = \frac{q^\top x}{\|q\|_2\|x\|_2} = q_n^\top x_n$.
- **Ground truth (exact top-$k$):** exact neighbors are computed via batched matrix multiplication $S = Q_n X_n^\top$, excluding the trivial self-match (since queries are drawn from the base). The benchmark stores $\text{gt\_ids} \in \mathbb{N}^{n_q \times k}$ (top-$k$ neighbor indices) and $\text{gt\_scores} \in \mathbb{R}^{n_q \times k}$ (corresponding cosine similarity values).


In [1]:
# %% [markdown]
# # 0_data — Build GloVe ANN benchmark artifacts (base / queries / ground truth)
# Produces comparable datasets across dimensions with a shared query split and a shared dataset_id.

# %%
import json
import hashlib
from pathlib import Path
from typing import List, Optional, Tuple

import numpy as np
from tqdm.auto import tqdm

# ----------------------------
# Paths / config
# ----------------------------
# Every location is derived from the parent of the current working directory.
REPO_ROOT = Path("..").resolve()
ARTIFACTS = REPO_ROOT.joinpath("artifacts").resolve()
OUT_DIR = ARTIFACTS.joinpath("data").resolve()
# Output directory is created eagerly so later saves cannot fail on a missing path.
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Directory holding the unzipped `glove.6B.<dim>d.txt` files, and the dims to process.
GLOVE_DIR = ARTIFACTS.joinpath("glove.6B").resolve()
GLOVE_DIMS = [50, 100, 200, 300]

N_BASE = 400_000    # number of leading GloVe rows used as the search base
NQ_TOTAL = 100_000  # base rows sampled (without replacement) as queries
NQ_TEST = 20_000    # first NQ_TEST sampled queries form the test split
TOP_K = 50          # neighbors stored per test query in the ground truth
SEED = 42           # seed for the query-sampling RNG

X_BATCH = 5_000     # base rows scored per step of the ground-truth search
Q_BATCH = 1_024     # NOTE(review): not referenced anywhere in the visible code

# ----------------------------
# Utilities
# ----------------------------
def l2_normalize(X: np.ndarray, eps: float = 1e-12) -> np.ndarray:
    """Scale every row of a 2-D array to unit Euclidean length.

    The input is viewed as float32 (no copy is made when it already has that
    dtype). Row norms are clamped from below by ``eps`` before the division,
    so an all-zero row comes back as zeros rather than NaNs.
    """
    rows = X.astype(np.float32, copy=False)
    row_norms = np.linalg.norm(rows, axis=1, keepdims=True)
    # Clamp instead of adding eps so non-degenerate rows are scaled exactly.
    safe_norms = np.maximum(row_norms, eps)
    return rows / safe_norms

def _topk_desc_1d(a: np.ndarray, k: int) -> np.ndarray:
    k = int(k)
    if k <= 0:
        return np.empty((0,), dtype=np.int64)
    if k >= a.size:
        return np.argsort(-a)
    idx = np.argpartition(-a, kth=k - 1)[:k]
    return idx[np.argsort(-a[idx])]

def compute_dataset_id(
    n_base: int,
    seed: int,
    q_idx_all: np.ndarray,
    q_idx_test: np.ndarray,
    version: str = "v1",
) -> str:
    """Derive a short identifier for a base-size / seed / query-split combination.

    The SHA-256 digest is fed, in order, with: the version tag, the decimal
    text of ``n_base``, the decimal text of ``seed``, and the raw int32 bytes
    of ``q_idx_all`` and ``q_idx_test``. The first 16 hex characters of the
    digest are returned.

    NOTE: the pieces are concatenated without separators; this layout is kept
    as-is so that previously generated ids stay valid.
    """
    pieces = (
        version.encode("utf-8"),
        str(int(n_base)).encode("utf-8"),
        str(int(seed)).encode("utf-8"),
        # Index arrays are forced to contiguous int32 so the bytes do not
        # depend on the dtype or memory layout the caller happened to use.
        np.ascontiguousarray(q_idx_all, dtype=np.int32).tobytes(),
        np.ascontiguousarray(q_idx_test, dtype=np.int32).tobytes(),
    )
    digest = hashlib.sha256()
    for piece in pieces:
        digest.update(piece)
    return digest.hexdigest()[:16]

def hash_tokens(tokens: List[str]) -> str:
    """Return the SHA-256 hex digest of a token sequence.

    Each token is UTF-8 encoded (unencodable characters are replaced) and
    followed by a newline byte, so both the tokens and their order affect
    the digest.
    """
    digest = hashlib.sha256()
    for token in tokens:
        digest.update(token.encode("utf-8", errors="replace") + b"\n")
    return digest.hexdigest()

# ----------------------------
# Fast GloVe loader
# ----------------------------
def load_glove_fast(path: str | Path, max_rows: Optional[int] = None) -> Tuple[List[str], np.ndarray]:
    words: List[str] = []
    vecs: List[np.ndarray] = []
    dim: Optional[int] = None

    path = str(path)
    with open(path, "r", encoding="utf-8", errors="replace") as f:
        for i, line in enumerate(f):
            if max_rows is not None and i >= int(max_rows):
                break
            parts = line.rstrip().split(" ")
            if len(parts) < 2:
                continue
            w = parts[0]
            v = np.asarray(parts[1:], dtype=np.float32)
            if dim is None:
                dim = int(v.size)
            if v.size != dim:
                continue
            words.append(w)
            vecs.append(v)

    if dim is None:
        raise RuntimeError(f"Failed to infer dim from: {path}")

    E = np.vstack(vecs).astype(np.float32, copy=False)
    return words, E

# ----------------------------
# Exact ground truth (cosine/IP on unit vectors)
# ----------------------------
def ground_truth_topk_cosine(
    Xn: np.ndarray,
    Qn: np.ndarray,
    k: int,
    exclude_ids: Optional[np.ndarray] = None,  # shape (nq,) or None
    x_batch: int = 5_000,
    show_progress: bool = True,
) -> tuple[np.ndarray, np.ndarray]:
    """
    Exact top-k neighbors by inner product, streamed over batches of base rows.

    The inputs are expected to be unit-normalized by the caller (this function
    does not normalize), in which case the inner product equals cosine
    similarity.

    Args:
        Xn: Base vectors, shape (n, d).
        Qn: Query vectors, shape (nq, d).
        k: Number of neighbors to keep per query.
        exclude_ids: Optional per-query base index to drop from the results
            (e.g. the query's own row when queries are drawn from the base).
        x_batch: Number of base rows scored per step; must be positive.
        show_progress: Wrap the batch loop in a tqdm progress bar.

    Returns:
        (gt_ids, gt_scores) with shapes (nq, k): int32 base indices and
        float32 scores, each row sorted by descending score. Slots for which
        no candidate exists (k larger than the number of non-excluded base
        rows) hold id -1 and score -inf.

    Raises:
        ValueError: If ``x_batch`` is not positive or ``k`` is negative.
    """
    Xn = np.ascontiguousarray(Xn, dtype=np.float32)
    Qn = np.ascontiguousarray(Qn, dtype=np.float32)

    nq = int(Qn.shape[0])
    n_base = int(Xn.shape[0])
    k = int(k)
    x_batch = int(x_batch)
    if x_batch <= 0:
        raise ValueError(f"x_batch must be positive, got {x_batch}")

    gt_ids = np.full((nq, k), -1, dtype=np.int32)
    gt_sc = np.full((nq, k), -np.inf, dtype=np.float32)
    if k == 0:
        # Nothing to select; the partition calls below need k >= 1.
        return gt_ids, gt_sc

    # The int64 view of the exclusion list does not change between batches.
    ex: Optional[np.ndarray] = None
    if exclude_ids is not None:
        ex = np.asarray(exclude_ids).astype(np.int64, copy=False)

    starts = range(0, n_base, x_batch)
    batches = tqdm(starts, desc="GT over X batches") if show_progress else starts
    for start in batches:
        end = min(n_base, start + x_batch)
        Xb = Xn[start:end]  # (b, d)

        sims = Qn @ Xb.T  # (nq, b)

        if ex is not None:
            # Mask the excluded base row for every query whose excluded id
            # falls inside the current window [start, end).
            m = (ex >= start) & (ex < end)
            if np.any(m):
                rows = np.nonzero(m)[0]
                cols = ex[m] - start
                sims[rows, cols] = -np.inf

        # Merge current batch top-k with running top-k
        b = sims.shape[1]
        kk = min(k, b)

        idx_local = np.argpartition(-sims, kth=kk - 1, axis=1)[:, :kk]  # (nq, kk)
        sc_local = np.take_along_axis(sims, idx_local, axis=1)          # (nq, kk)
        ids_local = (idx_local + start).astype(np.int32, copy=False)    # global ids

        ids_merge = np.concatenate([gt_ids, ids_local], axis=1)         # (nq, k+kk)
        sc_merge = np.concatenate([gt_sc, sc_local], axis=1)            # (nq, k+kk)

        # Keep best k from merged
        idx_keep = np.argpartition(-sc_merge, kth=k - 1, axis=1)[:, :k]  # (nq, k)
        sc_keep = np.take_along_axis(sc_merge, idx_keep, axis=1)
        ids_keep = np.take_along_axis(ids_merge, idx_keep, axis=1)

        # Sort final k descending for cleanliness
        order = np.argsort(-sc_keep, axis=1)
        gt_sc = np.take_along_axis(sc_keep, order, axis=1).astype(np.float32, copy=False)
        gt_ids = np.take_along_axis(ids_keep, order, axis=1).astype(np.int32, copy=False)

    # A masked self-match carries score -inf but a real id; when k exceeds the
    # number of usable candidates it can survive the merge. Report every such
    # slot as "no neighbor" instead of leaking the excluded id.
    gt_ids[np.isneginf(gt_sc)] = -1

    return gt_ids, gt_sc

# ----------------------------
# Shared query ids for all dims
# ----------------------------
# One seeded draw of base-row indices (without replacement) is made up front
# and reused for every embedding dimension.
rng = np.random.default_rng(SEED)
q_idx_all = rng.choice(N_BASE, size=NQ_TOTAL, replace=False).astype(np.int32)

# Positional split of the sampled order: the first NQ_TEST ids are the test
# queries and the remainder the train queries. Copies detach both halves from
# `q_idx_all`.
q_idx_test, q_idx_train = (part.copy() for part in np.split(q_idx_all, [NQ_TEST]))

DATASET_ID = compute_dataset_id(
    n_base=N_BASE,
    seed=SEED,
    q_idx_all=q_idx_all,
    q_idx_test=q_idx_test,
)
print("DATASET_ID:", DATASET_ID)

# ----------------------------
# Build artifacts per dim
# ----------------------------
# Vocabulary fingerprint of the first processed dimension; every later
# dimension must reproduce it so that row i means the same word everywhere.
master_words_hash: Optional[str] = None
master_words_sample: Optional[list[str]] = None

# Positions spot-checked across dimensions, restricted to valid indices so a
# small N_BASE cannot raise IndexError.
_sample_positions = tuple(i for i in (0, 1, 2, 10, 123, 999, N_BASE - 1) if 0 <= i < N_BASE)

for d in GLOVE_DIMS:
    print(f"\n=== Processing {d}d ===")
    glove_path = (GLOVE_DIR / f"glove.6B.{d}d.txt").resolve()
    words, E = load_glove_fast(glove_path, max_rows=N_BASE)

    # Fail early with a clear message: the shared query indices address rows
    # up to N_BASE - 1, so a short file would otherwise surface later as an
    # opaque IndexError.
    if len(words) < N_BASE:
        raise RuntimeError(
            f"Expected at least {N_BASE} rows in {glove_path}, got {len(words)}."
        )
    # The artifacts are labeled with `d`; refuse to write them for a file
    # whose vectors have a different width.
    if E.shape[1] != d:
        raise RuntimeError(
            f"Expected {d}-dimensional vectors in {glove_path}, got dim={E.shape[1]}."
        )

    base_words = words[:N_BASE]
    X = E[:N_BASE].astype(np.float32, copy=False)
    Xn = l2_normalize(X)

    # Strict vocab alignment across dims
    words_hash = hash_tokens(base_words)
    sample = [base_words[i] for i in _sample_positions]
    if master_words_hash is None:
        master_words_hash = words_hash
        master_words_sample = sample
    else:
        if words_hash != master_words_hash:
            raise RuntimeError(
                f"Vocabulary order mismatch across dims. "
                f"Expected hash={master_words_hash}, got hash={words_hash} for dim={d}."
            )
        # cheap sanity
        if sample != master_words_sample:
            raise RuntimeError(f"Vocabulary sample mismatch across dims for dim={d}.")

    # Raw and normalized query vectors are both taken from the base by index.
    Q_train = X[q_idx_train]
    Q_test = X[q_idx_test]
    Qn_train = Xn[q_idx_train]
    Qn_test = Xn[q_idx_test]

    # Ground truth is computed for the test split only.
    gt_ids, gt_scores = ground_truth_topk_cosine(
        Xn=Xn,
        Qn=Qn_test,
        k=TOP_K,
        exclude_ids=q_idx_test,      # exclude exact self-match
        x_batch=X_BATCH,
    )

    meta = {
        "dataset_id": DATASET_ID,
        "glove_path": str(glove_path),
        "dim": int(d),
        "N_BASE": int(N_BASE),
        "NQ_TOTAL": int(NQ_TOTAL),
        "NQ_TRAIN": int(q_idx_train.size),
        "NQ_TEST": int(q_idx_test.size),
        "TOP_K_TEST": int(TOP_K),
        "SEED": int(SEED),
        "metric": "cosine/ip (unit-normalized)",
        "base_words_sha256": master_words_hash,
        "notes": "Shared q_idx_train/q_idx_test reused across dimensions; self-match excluded in GT.",
    }
    # The same metadata and dataset id are embedded in all three archives.
    meta_arr = np.array(json.dumps(meta), dtype=np.str_)
    dataset_id_arr = np.array(DATASET_ID, dtype=np.str_)

    base_out = OUT_DIR / f"glove{d}d_base_N{N_BASE}__ds{DATASET_ID}.npz"
    queries_out = OUT_DIR / f"glove{d}d_queries_N{N_BASE}_nq{NQ_TOTAL}_test{NQ_TEST}_seed{SEED}__ds{DATASET_ID}.npz"
    gt_out = OUT_DIR / f"glove{d}d_gt_test_N{N_BASE}_nq{NQ_TEST}_k{TOP_K}_seed{SEED}__ds{DATASET_ID}.npz"

    np.savez_compressed(
        base_out,
        dataset_id=dataset_id_arr,
        base_words=np.array(base_words, dtype=np.str_),
        X=X,
        Xn=Xn,
        meta=meta_arr,
    )
    np.savez_compressed(
        queries_out,
        dataset_id=dataset_id_arr,
        q_idx_all=q_idx_all,
        q_idx_train=q_idx_train,
        q_idx_test=q_idx_test,
        Q_train=Q_train,
        Q_test=Q_test,
        Qn_train=Qn_train,
        Qn_test=Qn_test,
        meta=meta_arr,
    )
    np.savez_compressed(
        gt_out,
        dataset_id=dataset_id_arr,
        q_idx_test=q_idx_test,
        gt_ids=gt_ids,
        gt_scores=gt_scores,
        meta=meta_arr,
    )

    print("Saved:")
    print("-", base_out.name)
    print("-", queries_out.name)
    print("-", gt_out.name)

print("\nDone. DATASET_ID:", DATASET_ID)


DATASET_ID: 989456bc6cc2e6ed

=== Processing 50d ===


GT over X batches:   0%|          | 0/80 [00:00<?, ?it/s]

Saved:
- glove50d_base_N400000__ds989456bc6cc2e6ed.npz
- glove50d_queries_N400000_nq100000_test20000_seed42__ds989456bc6cc2e6ed.npz
- glove50d_gt_test_N400000_nq20000_k50_seed42__ds989456bc6cc2e6ed.npz

=== Processing 100d ===


GT over X batches:   0%|          | 0/80 [00:00<?, ?it/s]

Saved:
- glove100d_base_N400000__ds989456bc6cc2e6ed.npz
- glove100d_queries_N400000_nq100000_test20000_seed42__ds989456bc6cc2e6ed.npz
- glove100d_gt_test_N400000_nq20000_k50_seed42__ds989456bc6cc2e6ed.npz

=== Processing 200d ===


GT over X batches:   0%|          | 0/80 [00:00<?, ?it/s]

Saved:
- glove200d_base_N400000__ds989456bc6cc2e6ed.npz
- glove200d_queries_N400000_nq100000_test20000_seed42__ds989456bc6cc2e6ed.npz
- glove200d_gt_test_N400000_nq20000_k50_seed42__ds989456bc6cc2e6ed.npz

=== Processing 300d ===


GT over X batches:   0%|          | 0/80 [00:00<?, ?it/s]

Saved:
- glove300d_base_N400000__ds989456bc6cc2e6ed.npz
- glove300d_queries_N400000_nq100000_test20000_seed42__ds989456bc6cc2e6ed.npz
- glove300d_gt_test_N400000_nq20000_k50_seed42__ds989456bc6cc2e6ed.npz

Done. DATASET_ID: 989456bc6cc2e6ed
