# Component 2: Vector Quantization Benchmark

In [None]:
import sys
import os
import time
sys.path.append("Components")
from vector_quantization import fit_kmeans, vq_query

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import euclidean_distances

# Seed NumPy's global RNG so the np.random.choice calls that sample query
# documents later in this notebook pick the same documents on every run.
np.random.seed(42)
# Print arrays with 4 decimal places and without scientific notation.
np.set_printoptions(precision=4, suppress=True)

In [None]:
# Load the precomputed document vectors (presumably Word2Vec-based, going by
# the file name) and the document metadata table.
# NOTE(review): rows of doc_vectors are assumed to line up with rows of
# metadata -- nothing in this notebook checks that; confirm upstream.
doc_vectors = np.load("Data/processed/doc_vectors_w2v.npy")
metadata = pd.read_csv("Data/processed/doc_metadata.csv")

# Quick sanity check of what was loaded.
print(f"Vectors: {doc_vectors.shape}")
print(f"Metadata: {metadata.shape[0]} records")

## Helper Functions

In [None]:
def compute_ndcg_at_k(predicted, actual, k):
    """Return the binary-relevance nDCG@k of ``predicted`` against ``actual``.

    A predicted item in the first ``k`` positions counts as relevant when it
    appears anywhere in the true top-k (``actual[:k]``). Its gain is
    discounted by ``1 / log2(position + 2)`` for 0-based positions. The sum
    is normalised by the DCG of a ranking whose first
    ``min(len(actual), k)`` positions are all relevant.

    Args:
        predicted: Ranked sequence of retrieved item ids, best first.
        actual: Ranked sequence of ground-truth item ids, best first.
        k: Cut-off rank applied to both sequences.

    Returns:
        The nDCG score as a float; 0.0 when the normaliser is zero (for
        example when ``actual`` is empty or ``k`` is 0).
    """
    # Build the relevant set once instead of re-slicing and linearly scanning
    # ``actual`` for every predicted item.
    relevant = set(actual[:k])
    dcg = sum(
        1 / np.log2(position + 2)
        for position, doc in enumerate(predicted[:k])
        if doc in relevant
    )
    idcg = sum(1 / np.log2(position + 2) for position in range(min(len(actual), k)))
    # Always hand back a plain float so callers do not see int 0 in one branch
    # and a numpy scalar in the other.
    return float(dcg / idcg) if idcg > 0 else 0.0

def compute_recall_at_k(predicted, actual, k):
    """Return Recall@k: the fraction of the true top-k found in the predicted top-k.

    The denominator is the number of distinct ground-truth items within the
    first ``k`` positions of ``actual``. When ``actual`` holds at least ``k``
    distinct items this equals ``k``; when fewer are available, recall is
    measured against what could actually be retrieved rather than being
    capped below 1.

    Args:
        predicted: Ranked sequence of retrieved item ids, best first.
        actual: Ranked sequence of ground-truth item ids, best first.
        k: Cut-off rank applied to both sequences.

    Returns:
        Recall as a float in [0, 1]; 0.0 when there are no ground-truth
        items to find (empty ``actual`` or ``k`` of 0), instead of raising
        ``ZeroDivisionError``.
    """
    relevant = set(actual[:k])
    if not relevant:
        # Nothing to retrieve: define recall as 0 rather than dividing by zero.
        return 0.0
    hits = relevant & set(predicted[:k])
    return len(hits) / len(relevant)

## Experiment 1: Accuracy vs Efficiency (vary n_probes)

In [None]:
# Experiment 1 settings.
TOP_K = 10       # neighbours requested per query
N_QUERIES = 50   # number of distinct documents sampled as queries
C = 200          # passed to fit_kmeans as k (number of clusters)

query_indices = np.random.choice(len(doc_vectors), N_QUERIES, replace=False)

# Ground truth: brute-force Euclidean ranking of the whole corpus per query,
# truncated to the TOP_K closest documents.
exact_results = {
    qi: np.argsort(
        euclidean_distances(doc_vectors[qi:qi+1], doc_vectors).ravel()
    )[:TOP_K]
    for qi in query_indices
}

# The clustering is fitted once and shared by every n_probes setting.
print("Fitting KMeans...")
kmeans, assignments = fit_kmeans(doc_vectors, k=C)

results = []
for n_probes in (1, 2, 5, 10, 20):
    print(f"Testing n_probes={n_probes}")
    recalls, ndcgs, candidate_ratios, query_times = [], [], [], []

    for qi in query_indices:
        truth = exact_results[qi]

        # Only the vq_query call itself sits inside the timed region.
        started = time.perf_counter()
        indices, _ = vq_query(doc_vectors[qi], doc_vectors, kmeans, assignments,
                              top_n=TOP_K, n_probes=n_probes)
        query_times.append(time.perf_counter() - started)

        # Retrieval quality against the exact neighbours.
        recalls.append(compute_recall_at_k(indices, truth, TOP_K))
        ndcgs.append(compute_ndcg_at_k(indices, truth, TOP_K))

        # Fraction of the corpus assigned to the n_probes centroids nearest
        # to the query.
        # NOTE(review): assumes vq_query probes exactly these clusters -- confirm.
        centroid_dists = euclidean_distances(
            doc_vectors[qi:qi+1], kmeans.cluster_centers_
        ).ravel()
        cluster_ids = np.argsort(centroid_dists)[:n_probes]
        in_probed_cluster = np.isin(assignments, cluster_ids)
        candidate_ratios.append(np.sum(in_probed_cluster) / len(doc_vectors))

    # One summary row per n_probes value, averaged over all queries.
    results.append(dict(
        method="VQ",
        n_probes=n_probes,
        c_clusters=C,
        recall_at_k=np.mean(recalls),
        ndcg_at_k=np.mean(ndcgs),
        candidate_ratio=np.mean(candidate_ratios),
        query_time=np.mean(query_times),
        N=len(doc_vectors),
        dim=doc_vectors.shape[1],
    ))

df_exp1 = pd.DataFrame(results)
print("\n=== Experiment 1: Accuracy vs Efficiency ===")
display(df_exp1)

## Experiment 2: Scaling with N

In [None]:
# Corpus sizes to benchmark. Every target is clamped to the number of
# available documents and duplicates are dropped, so a small corpus neither
# repeats a run nor reports an N larger than the data actually sliced.
N_LIST = sorted({min(n, len(doc_vectors)) for n in (1000, 2000, 5000, 10000, 20000)})
TEST_QUERIES = 10   # timed queries per corpus size
BEST_N_PROBES = 5   # n_probes used for all scaling runs (also read by Experiment 3)

scaling_results = []

for N in N_LIST:
    print(f"Testing N={N}")
    X = doc_vectors[:N]
    # Query indices are drawn from the rows of X; never ask for more distinct
    # queries than there are rows.
    test_idx = np.random.choice(N, min(TEST_QUERIES, N), replace=False)

    # Build phase: time the clustering alone. The cluster count grows with N
    # (one per 10 documents), is capped at 200, and never drops below 1.
    t0 = time.perf_counter()
    kmeans, assignments = fit_kmeans(X, k=max(1, min(200, N // 10)))
    build_t = time.perf_counter() - t0

    # Query phase: time each vq_query call individually; results are discarded
    # because only latency is measured here.
    q_times = []
    for qi in test_idx:
        t1 = time.perf_counter()
        vq_query(X[qi], X, kmeans, assignments, top_n=TOP_K, n_probes=BEST_N_PROBES)
        q_times.append(time.perf_counter() - t1)

    scaling_results.append({
        "method": "VQ",
        "N": N,
        "dim": X.shape[1],
        "build_time": build_t,
        "query_time": np.mean(q_times),
    })

df_exp2 = pd.DataFrame(scaling_results)
print("\n=== Experiment 2: Scaling with N ===")
display(df_exp2)

## Experiment 3: Scaling with Dimensionality

In [None]:
# Dimensionalities to benchmark, obtained by keeping the first d components of
# each vector. Targets are clamped to the real embedding width and
# de-duplicated: slicing past the last column would silently return fewer
# columns while the results table still claimed the larger d.
DIM_LIST = sorted({min(d, doc_vectors.shape[1]) for d in (50, 100, 200)})
# Clamp the sample size too, so query indices always fall inside X_sample and
# the reported N matches the data used.
N_SAMPLE = min(10000, len(doc_vectors))
TEST_QUERIES = 10

dim_results = []
X_sample = doc_vectors[:N_SAMPLE]

for d in DIM_LIST:
    print(f"Testing d={d}")
    X = X_sample[:, :d]
    test_idx = np.random.choice(N_SAMPLE, min(TEST_QUERIES, N_SAMPLE), replace=False)

    # Build phase: time the clustering alone. 200 clusters for the default
    # 10000-document sample; scaled down (one per 10 documents, at least 1)
    # when the corpus is smaller, matching Experiment 2.
    t0 = time.perf_counter()
    kmeans, assignments = fit_kmeans(X, k=max(1, min(200, N_SAMPLE // 10)))
    build_t = time.perf_counter() - t0

    # Query phase: TOP_K and BEST_N_PROBES come from the earlier experiment
    # cells, which must have been run first.
    q_times = []
    for qi in test_idx:
        t1 = time.perf_counter()
        vq_query(X[qi], X, kmeans, assignments, top_n=TOP_K, n_probes=BEST_N_PROBES)
        q_times.append(time.perf_counter() - t1)

    dim_results.append({
        "method": "VQ",
        "N": N_SAMPLE,
        "dim": d,
        "build_time": build_t,
        "query_time": np.mean(q_times),
    })

df_exp3 = pd.DataFrame(dim_results)
print("\n=== Experiment 3: Scaling with Dimensionality ===")
display(df_exp3)

## Save Results

In [None]:
# Write each experiment's results table to CSV under a shared directory.
# results_dir is also read by the visualization cell when saving the figure.
results_dir = "Data/results"
os.makedirs(results_dir, exist_ok=True)  # no error if the directory already exists

df_exp1.to_csv(f"{results_dir}/vq_accuracy_efficiency.csv", index=False)
df_exp2.to_csv(f"{results_dir}/vq_scaling_N.csv", index=False)
df_exp3.to_csv(f"{results_dir}/vq_scaling_dim.csv", index=False)

# The check mark had been stored as mojibake (UTF-8 bytes decoded as cp1252).
print("✓ All VQ results saved")

## Visualization

In [None]:
def _plot_build_and_query(ax, frame, x_col, xlabel, title):
    """Plot build time on ``ax`` and query time on a twin y-axis against ``frame[x_col]``."""
    ax.plot(frame[x_col], frame["build_time"], 'o-', label="Build", linewidth=2)
    twin = ax.twinx()
    twin.plot(frame[x_col], frame["query_time"], 's-', color='orange',
              label="Query", linewidth=2)
    ax.set(xlabel=xlabel, ylabel="Build Time (s)", title=title)
    twin.set_ylabel("Query Time (s)")
    # Each axis owns one line, so each gets its own legend in opposite corners.
    ax.legend(loc='upper left')
    twin.legend(loc='upper right')
    ax.grid(alpha=0.3)


# 2x2 summary figure: accuracy panels on top, scaling panels below.
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
(ax_recall, ax_ndcg), (ax_scale_n, ax_scale_dim) = axes

# Top-left: recall against the fraction of the corpus scanned.
ax_recall.plot(df_exp1["candidate_ratio"], df_exp1["recall_at_k"], 'o-', linewidth=2)
ax_recall.set(xlabel="Candidate Ratio", ylabel="Recall@10",
              title="VQ: Accuracy vs Efficiency")
ax_recall.grid(alpha=0.3)

# Top-right: ranking quality against the fraction of the corpus scanned.
ax_ndcg.plot(df_exp1["candidate_ratio"], df_exp1["ndcg_at_k"], 's-',
             linewidth=2, color='orange')
ax_ndcg.set(xlabel="Candidate Ratio", ylabel="nDCG@10",
            title="VQ: Ranking Quality")
ax_ndcg.grid(alpha=0.3)

# Bottom row: build and query time versus corpus size and versus dimensionality.
_plot_build_and_query(ax_scale_n, df_exp2, "N",
                      xlabel="N (documents)", title="VQ: Scaling with N")
_plot_build_and_query(ax_scale_dim, df_exp3, "dim",
                      xlabel="Dimensionality", title="VQ: Scaling with Dimensions")

plt.tight_layout()
plt.savefig(f"{results_dir}/vq_benchmark_summary.png", dpi=150, bbox_inches="tight")
plt.show()