# Embedding pipeline demo

Generate synthetic data, optionally extract or simulate embeddings, and run nearest-neighbor retrieval.

In [None]:
import sys
from pathlib import Path
# Resolve the directory to import from: the current working directory, or its
# parent when the notebook is being run from inside a "notebooks" folder.
ROOT = Path.cwd()
if ROOT.name == "notebooks":
    ROOT = ROOT.parent
# Put it first on sys.path so the `data` package imported below resolves there.
sys.path.insert(0, str(ROOT))

from data.synth import generate_synthetic
import numpy as np

In [None]:
# Synthetic cases with embeddings.
# The generator is called with a fixed seed and asked to include an embedding
# (embedding_dim=256) per record; presumably the first argument is the number
# of records -- confirm against data.synth.generate_synthetic.
records = generate_synthetic(
    50,
    seed=42,
    include_embedding=True,
    embedding_dim=256,
)
# Collect each record's "embedding" field into one float32 array.
embeddings = np.asarray(
    [record["embedding"] for record in records],
    dtype=np.float32,
)
print(embeddings.shape)

In [None]:
# L2-normalize each row (the original note says the synthetic embeddings are
# already normalized, so this is expected to be a no-op for them).
norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
# Any norm that is not strictly positive is replaced by 1.0 so the division
# below leaves that row unchanged instead of dividing by zero.
norms[~(norms > 0)] = 1.0
embeddings = np.divide(embeddings, norms)

In [None]:
# Nearest-neighbor retrieval: exact L2 search with FAISS when it is installed,
# otherwise a brute-force NumPy fallback.
# The query is the first embedding, which is itself part of the searched set,
# so it is expected to come back as its own nearest neighbor at distance 0.
try:
    # Only the import is guarded, so an ImportError raised anywhere else is
    # not silently treated as "FAISS is not installed".
    import faiss
except ImportError:
    # Plain L2 distance from every embedding to the first one.
    dists = np.linalg.norm(embeddings - embeddings[0:1], axis=1)
    # Stable sort: exact ties resolve to the lowest index deterministically.
    top3 = np.argsort(dists, kind="stable")[:3]
    # IndexFlatL2 reports *squared* L2 distances; square the values here so
    # both code paths print comparable numbers.
    print("Top-3 indices (numpy):", top3, "Distances:", dists[top3] ** 2)
else:
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    query = embeddings[:1]
    # D holds the (squared L2) distances, I the matching row indices; both
    # have one row per query.
    D, I = index.search(query, k=3)
    print("Top-3 indices:", I[0], "Distances:", D[0])