In [None]:
# !pip install -q sentence-transformers scikit-learn numpy pandas

import os
from pathlib import Path
import json
import numpy as np
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import normalize

# ---- Environment and paths (Gradient notebooks) ----
# Working folder on the Paperspace persistent volume; created on first use.
OUT_DIR = Path("/storage/ai")
OUT_DIR.mkdir(parents=True, exist_ok=True)

MODEL_NAME = "BAAI/bge-base-en-v1.5"

# Device and batch size are chosen together: large batches on GPU, small on CPU.
USE_CUDA = torch.cuda.is_available()
if USE_CUDA:
    DEVICE, BATCH_SIZE = "cuda", 512
    gpu_label = torch.cuda.get_device_name(0)
else:
    DEVICE, BATCH_SIZE = "cpu", 64
    gpu_label = "CPU"

print("Device:", DEVICE, "| GPU name:", gpu_label)

# ---- Load data ----
df = pd.read_json(OUT_DIR / "articles_df.json", orient="records")
# Missing titles/summaries become empty strings so concatenation never yields NaN.
for text_col in ("Title", "Summary"):
    df[text_col] = df[text_col].fillna("").astype(str)
# One text per article: "<Title> <Summary>", in DataFrame row order.
texts = [f"{title} {summary}" for title, summary in zip(df["Title"], df["Summary"])]
# IDs are kept as strings, in the same row order as `texts`.
ids = df["ID"].astype(str).tolist()

# ---- Load the embedding model on the selected device ----
model = SentenceTransformer(MODEL_NAME, device=DEVICE)

# Optional: slight speed-up on GPU by enabling fp16 autocast during encode
# SentenceTransformers will use the device; autocast here makes matmuls use fp16
def encode_in_batches(texts, batch_size=256):
    """Encode ``texts`` with the module-level ``model`` in fixed-size chunks.

    Args:
        texts: Sequence of strings to embed.
        batch_size: Number of texts handed to ``model.encode`` per call.

    Returns:
        A 2-D NumPy array with one row per input text, in input order.
        Embeddings are NOT normalized here. An empty ``texts`` yields an
        array of shape ``(0, dim)`` instead of raising.
    """
    from contextlib import nullcontext

    # np.vstack([]) raises ValueError, so short-circuit the empty case.
    if len(texts) == 0:
        dim = model.get_sentence_embedding_dimension() or 0
        return np.zeros((0, dim), dtype=np.float32)

    model.eval()
    # fp16 autocast only when CUDA is in use; on CPU use a true no-op context
    # (the original reused torch.no_grad() as a placeholder).
    # torch.autocast replaces the deprecated torch.cuda.amp.autocast.
    amp_ctx = (
        torch.autocast(device_type="cuda", dtype=torch.float16)
        if USE_CUDA
        else nullcontext()
    )

    chunks = []
    with torch.no_grad(), amp_ctx:
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]
            chunks.append(
                model.encode(
                    batch,
                    batch_size=len(batch),          # we already batch manually
                    convert_to_numpy=True,
                    normalize_embeddings=False,
                    show_progress_bar=False,
                )
            )
    return np.vstack(chunks)

# ---- Encode ----
embs = encode_in_batches(texts, BATCH_SIZE)
# L2-normalize each row so a plain dot product equals cosine similarity
embs = normalize(embs, norm="l2", copy=False).astype(np.float32)

# ---- Save artifacts (dense) ----
# Paths are defined once so the files written and the manifest entries
# (including the human-readable note) cannot drift apart.
embs_path = OUT_DIR / "bge_embeddings.npy"
ids_path = OUT_DIR / "ids_by_row.json"

# Row i of the embedding matrix belongs to ids[i]; keep both files together.
np.save(embs_path, embs)
with open(ids_path, "w", encoding="utf-8") as f:
    json.dump(ids, f, ensure_ascii=False, indent=2)

with open(OUT_DIR / "embedding_manifest.json", "w", encoding="utf-8") as f:
    json.dump({
        "model": MODEL_NAME,
        "dim": int(embs.shape[1]),
        "normalized": True,
        "ids_by_row": str(ids_path),
        "embeddings_npy": str(embs_path),
        # The note previously named "embeddings.npy", which is not the file written.
        "note": f"Rows in {embs_path.name} align with {ids_path.name}."
    }, f, ensure_ascii=False, indent=2)

print("Saved:", embs_path)


In [None]:
# - Reads embeddings from /storage/ai/bge_embeddings.npy (L2-normalized float32)
# - Recursively splits clusters > MAX_CLUSTER_SIZE using MiniBatchKMeans
# - Saves per-cluster vectors + ids + centroids for query routing

import json, math
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.cluster import MiniBatchKMeans

# ------------------------------
# CONFIG
# ------------------------------
OUT_DIR = Path("/storage/ai")              # <-- Paperspace persistent volume
INITIAL_K = 24                             # number of clusters in the first k-means pass
MAX_CLUSTER_SIZE = 1000                    # clusters larger than this are split again
RANDOM_STATE = 42                          # passed to MiniBatchKMeans for reproducibility
BATCH_SIZE = 4096                          # MiniBatchKMeans batch_size
MAX_ITER = 200                             # MiniBatchKMeans max_iter
N_INIT = 20                                # MiniBatchKMeans n_init
SHARD_DTYPE = "float32"                    # set to "float16" to halve size

# ------------------------------
# LOAD
# ------------------------------
embs_path = OUT_DIR / "bge_embeddings.npy"
ids_path  = OUT_DIR / "ids_by_row.json"
manifest_path = OUT_DIR / "embedding_manifest.json"

# Explicit raises instead of asserts: asserts are stripped under `python -O`.
for required_path, label in ((embs_path, "Embeddings"), (ids_path, "IDs")):
    if not required_path.exists():
        raise FileNotFoundError(f"{label} not found: {required_path}")

embs = np.load(embs_path)                  # (n, d), L2-normalized float32
if embs.ndim != 2:
    raise ValueError(f"Expected a 2-D embedding matrix, got shape {embs.shape}")

with open(ids_path, "r", encoding="utf-8") as f:
    ids_by_row = json.load(f)

# A stale ids file from an earlier run would silently mislabel rows (or fail
# much later with an IndexError), so verify the alignment up front.
if len(ids_by_row) != embs.shape[0]:
    raise ValueError(
        f"Row mismatch: {embs.shape[0]} embeddings in {embs_path} "
        f"vs {len(ids_by_row)} ids in {ids_path}"
    )

# Optional: read model name from embedding manifest (for provenance).
# This stays best-effort, but failures are reported instead of swallowed.
MODEL_NAME = None
if manifest_path.exists():
    try:
        with open(manifest_path, "r", encoding="utf-8") as f:
            MODEL_NAME = json.load(f).get("model")
    except (OSError, ValueError, AttributeError) as exc:
        # ValueError covers malformed JSON; AttributeError covers a non-dict manifest.
        print(f"Warning: could not read model name from {manifest_path}: {exc}")
        MODEL_NAME = None

n, d = embs.shape
print(f"Embeddings: {embs.shape} | dtype={embs.dtype} | shards dtype={SHARD_DTYPE}")

# ------------------------------
# HELPERS
# ------------------------------
def run_kmeans_dense(X_sub, k):
    """Cluster the rows of ``X_sub`` into ``k`` groups with MiniBatchKMeans.

    Args:
        X_sub: 2-D array of vectors to cluster.
        k: Number of clusters to fit.

    Returns:
        ``(labels, centers)``: the per-row cluster labels from
        ``fit_predict`` and the fitted cluster centers rescaled to unit
        L2 norm (so a dot product with unit vectors is a cosine).
    """
    kmeans_params = dict(
        n_clusters=k,
        random_state=RANDOM_STATE,
        batch_size=BATCH_SIZE,
        max_iter=MAX_ITER,
        n_init=N_INIT,
        init="k-means++",
        verbose=0,
    )
    estimator = MiniBatchKMeans(**kmeans_params)
    assignments = estimator.fit_predict(X_sub)

    unit_centers = estimator.cluster_centers_
    # The small epsilon guards against dividing by a zero-length center.
    row_norms = np.linalg.norm(unit_centers, axis=1, keepdims=True) + 1e-12
    unit_centers /= row_norms
    return assignments, unit_centers

def split_count(size, cap):
    """Return how many parts to split ``size`` items into so each fits ``cap``.

    The result is ``ceil(size / cap)`` but never less than 2, because a
    split into fewer than two parts would not be a split at all.
    """
    needed = math.ceil(size / cap)
    return needed if needed > 2 else 2

# ------------------------------
# FIRST PASS + RECURSIVE SPLIT
# ------------------------------
# Never ask k-means for more clusters than there are rows (it raises otherwise).
initial_k = min(INITIAL_K, embs.shape[0])
labels0, centers0 = run_kmeans_dense(embs, initial_k)
clusters = [np.where(labels0 == c)[0] for c in range(initial_k)]

final_clusters = []      # row-index arrays, each of size <= MAX_CLUSTER_SIZE
final_centroids = []     # unit-norm mean vector of each final cluster (same order)
queue = list(range(len(clusters)))   # positions in `clusters` still to be examined

while queue:
    i = queue.pop()
    idx = clusters[i]
    if idx.size == 0:
        continue

    if idx.size <= MAX_CLUSTER_SIZE:
        # Small enough: finalize. The centroid is the mean of the member
        # vectors, rescaled to unit length (left as-is if the mean is zero).
        centroid = embs[idx].mean(axis=0)
        norm = np.linalg.norm(centroid)
        if norm > 0:
            centroid /= norm
        final_clusters.append(idx)
        final_centroids.append(centroid.astype(np.float32))
        continue

    # Too large: split again with k-means.
    k_sub = split_count(idx.size, MAX_CLUSTER_SIZE)
    sub_labels, _ = run_kmeans_dense(embs[idx], k_sub)
    parts = [idx[np.where(sub_labels == sc)[0]] for sc in range(k_sub)]

    # If every row landed in a single sub-cluster (e.g. the vectors are all
    # identical), k-means made no progress. With a fixed random_state the same
    # thing would happen on every retry and the loop would never terminate.
    # Fall back to an even positional split, which guarantees every part is
    # at most MAX_CLUSTER_SIZE rows.
    if max(part.size for part in parts) == idx.size:
        parts = np.array_split(idx, k_sub)

    for sub_idx in parts:
        clusters.append(sub_idx)
        queue.append(len(clusters) - 1)

# ------------------------------
# SAVE SHARDS + INDEXES
# ------------------------------
# Shards are stored as float16 only when explicitly requested; otherwise float32.
shard_np_dtype = np.float16 if SHARD_DTYPE == "float16" else np.float32

# cluster id (1-based, as a string) -> size and file locations of that shard
cluster_index = {}
for shard_no, member_rows in enumerate(final_clusters, start=1):
    vecs_path = OUT_DIR / f"emb_cluster_{shard_no}.npy"
    ids_out = OUT_DIR / f"ids_cluster_{shard_no}.json"

    # Vectors and ids of one shard are written in the same row order.
    np.save(vecs_path, embs[member_rows].astype(shard_np_dtype))
    with open(ids_out, "w", encoding="utf-8") as f:
        json.dump(
            [str(ids_by_row[row]) for row in member_rows],
            f,
            ensure_ascii=False,
            indent=2,
        )

    cluster_index[str(shard_no)] = {
        "size": int(member_rows.size),
        "vectors_path": str(vecs_path),
        "ids_path": str(ids_out),
    }

# Centroid matrix: row r corresponds to cluster id r + 1. Always saved as float32.
if final_centroids:
    centroids = np.vstack(final_centroids)
else:
    centroids = np.zeros((0, d), dtype=np.float32)
np.save(OUT_DIR / "centroids.npy", centroids.astype(np.float32))

# Row -> cluster map, then article id -> cluster id
row_to_cluster = np.empty(n, dtype=np.int32)
for cluster_id, member_rows in enumerate(final_clusters, start=1):
    row_to_cluster[member_rows] = cluster_id

id_to_cluster = {}
for row in range(n):
    id_to_cluster[str(ids_by_row[row])] = int(row_to_cluster[row])

with open(OUT_DIR / "id_to_cluster.json", "w", encoding="utf-8") as f:
    json.dump(id_to_cluster, f, ensure_ascii=False, indent=2)

# Sizes CSV: one line per cluster
sizes_table = pd.DataFrame(
    {
        "cluster_id": [int(cid) for cid in cluster_index],
        "size": [entry["size"] for entry in cluster_index.values()],
    }
)
sizes_table.to_csv(OUT_DIR / "cluster_sizes_embeddings.csv", index=False)

# Manifest tying all clustering artifacts together
kmeans_manifest = {
    "model": MODEL_NAME or "unknown",
    "centroids_npy": str(OUT_DIR / "centroids.npy"),
    "cluster_index": cluster_index,
    "id_to_cluster": str(OUT_DIR / "id_to_cluster.json"),
    "cluster_sizes_csv": str(OUT_DIR / "cluster_sizes_embeddings.csv"),
    "notes": {
        "l2_norm": "All vectors and centroids L2-normalized; use dot product for cosine.",
        "routing": "Pick top 1–3 nearest centroids for query; then search in those shards.",
        "shard_dtype": SHARD_DTYPE,
    },
}
with open(OUT_DIR / "emb_kmeans_manifest.json", "w", encoding="utf-8") as f:
    json.dump(kmeans_manifest, f, ensure_ascii=False, indent=2)

# Summary line
cluster_sizes = [arr.size for arr in final_clusters]
print(
    f"✅ Final clusters: {len(final_clusters)} | "
    f"min={min(cluster_sizes)} "
    f"max={max(cluster_sizes)} | "
    f"saved under {OUT_DIR}"
)

In [None]:
# Query → nearest centroids → shard search (with full-record printing)
import json
import numpy as np
import pandas as pd
from pathlib import Path
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import normalize

# Folder this cell reads from: articles_df.json, centroids.npy and the
# per-cluster emb_cluster_<k>.npy / ids_cluster_<k>.json shard files.
OUT_DIR = Path("/storage/ai")
# Encoder used for queries; same model string as the one used to embed the corpus.
# NOTE(review): it is hard-coded here rather than read from the manifest — keep in sync.
MODEL_NAME = "BAAI/bge-base-en-v1.5"

# --- Build ID -> full record index (rebuilt every run) ---
def build_record_index(out_dir: Path):
    """Build an in-memory lookup from article ID (as a string) to its full record.

    Reads ``articles_df.json`` (records orientation) from ``out_dir``.

    Args:
        out_dir: Directory containing ``articles_df.json``.

    Returns:
        A dict mapping each ID string to a dict of that row's fields, with
        missing scalar values (None/NaN/NaT) removed. List- or dict-valued
        fields are always kept, even when empty. If an ID occurs more than
        once, the last row wins.

    Raises:
        KeyError: If the data has neither an ``ID`` nor an ``ArticleId`` column.
    """
    df = pd.read_json(out_dir / "articles_df.json", orient="records")

    # be robust to either 'ID' or 'ArticleId' ('ID' takes precedence)
    id_col = next((col for col in ("ID", "ArticleId") if col in df.columns), None)
    if id_col is None:
        raise KeyError("Neither 'ID' nor 'ArticleId' column found in articles_df.json")

    # ensure strings for keys
    df[id_col] = df[id_col].astype(str)

    is_scalar = pd.api.types.is_scalar

    def _present(value):
        # pd.isna() on a list/array returns an array, whose truth value is
        # ambiguous and raises ValueError; only scalars can be "missing".
        return not (is_scalar(value) and pd.isna(value))

    return {
        row[id_col]: {key: value for key, value in row.items() if _present(value)}
        for row in df.to_dict(orient="records")
    }

record_by_id = build_record_index(OUT_DIR)

# --- Query encoder (runs on CPU here; pass device="cuda" instead when a GPU is present) ---
model = SentenceTransformer(MODEL_NAME, device="cpu")

# --- Centroid matrix: row r (0-based) corresponds to cluster id r + 1 on disk ---
centroids = np.load(OUT_DIR / "centroids.npy", allow_pickle=False)
centroids = centroids.astype(np.float32)

def embed_query(text: str):
    """Encode one query string with the module-level model and L2-normalize it.

    Returns:
        A float32 array of shape (1, d), so a dot product against other
        unit-length vectors gives their cosine similarity.
    """
    raw = model.encode([text], convert_to_numpy=True, normalize_embeddings=False)
    unit = normalize(raw, norm="l2", copy=False)
    return unit.astype(np.float32)

def top_centroids(q_vec, topn=3):
    """Rank the module-level ``centroids`` by similarity to ``q_vec``.

    Returns:
        ``(indices, sims)``: the 0-based centroid row indices of the ``topn``
        most similar centroids (best first) and their dot-product scores.
    """
    # Dot product equals cosine because query and centroids are L2-normalized.
    similarity = np.ravel(q_vec @ centroids.T)
    ranked = np.argsort(-similarity)
    best = ranked[:topn]
    return best, similarity[best]

def _load_shard_vectors(vecs_path: Path) -> np.ndarray:
    """Read one shard's vector file and hand it back as float32.

    Shards may have been saved as float16 to save space; those are upcast,
    while float32 arrays are returned exactly as loaded.
    """
    vectors = np.load(vecs_path)
    if vectors.dtype == np.float32:
        return vectors
    return vectors.astype(np.float32)

def search_shards(q_vec, cluster_ids_1based, topk=10):
    """Score ``q_vec`` against every vector in the given shards.

    Args:
        q_vec: Query embedding, as produced by ``embed_query``.
        cluster_ids_1based: Cluster ids matching the on-disk file numbering.
        topk: Maximum number of matches kept per shard and overall.

    Returns:
        Up to ``topk`` dicts with keys ``id``, ``score`` and ``cluster_id``,
        sorted by descending score. Only strictly positive scores are kept.
    """
    candidates = []
    for cluster_id in cluster_ids_1based:
        ids_file = OUT_DIR / f"ids_cluster_{cluster_id}.json"
        vecs_file = OUT_DIR / f"emb_cluster_{cluster_id}.npy"

        # ids and vectors of a shard share the same row order
        with open(ids_file, "r", encoding="utf-8") as f:
            shard_ids = json.load(f)
        shard_vecs = _load_shard_vectors(vecs_file)

        # dot product doubles as cosine since everything is L2-normalized
        sims = (q_vec @ shard_vecs.T).ravel()
        best_local = np.argsort(-sims)[:topk]
        candidates.extend(
            {
                "id": str(shard_ids[pos]),
                "score": float(sims[pos]),
                "cluster_id": int(cluster_id),
            }
            for pos in best_local
            if sims[pos] > 0
        )

    # merge the per-shard winners and keep the global best
    ranked = sorted(candidates, key=lambda hit: -hit["score"])
    return ranked[:topk]

# --- Example query ---
query = (
    "Many people believe there was a conspiracy to kill JFK involving high-ranking officials. "
    "Declassified 2017 documents mentioned internal agency awareness of plans prior to Nov 22, 1963."
)
q_vec = embed_query(query)

# Route to nearest 1–3 centroids (centroids are 0-based; files are 1-based)
centroid_order, centroid_scores = top_centroids(q_vec, topn=3)
candidate_clusters = [int(i) + 1 for i in centroid_order]

# Search in those clusters
hits = search_shards(q_vec, candidate_clusters, topk=10)

# Print a summary line plus the full JSON record for each hit
for h in hits[:10]:
    _id = h["id"]
    cluster_id = h["cluster_id"]
    score = h["score"]
    print(f"[cluster {cluster_id}] {_id}  score={score:.4f}")

    rec = record_by_id.get(_id)
    if rec is not None:
        # default=str: records come from pandas and may hold values json cannot
        # encode natively (e.g. Timestamp from read_json's date conversion).
        # This is display-only output, so stringifying them is acceptable.
        print(json.dumps(rec, ensure_ascii=False, indent=2, default=str))
    else:
        print(f"(No full record found for ID={_id})")