# Component 3: Locality-Sensitive Hashing (LSH)

In [None]:
import sys
sys.path.append("Components")
from lsh import build_lsh_from_vectors, lsh_query, lsh_candidates, _cosine_similarities

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Print NumPy arrays with 4 digits of precision and without scientific
# notation for small values, so similarity scores are easy to read.
np.set_printoptions(precision=4, suppress=True)

In [None]:
# Load the document vectors (.npy) and the metadata table (.csv) from
# Data/processed.
doc_vectors = np.load("Data/processed/doc_vectors_w2v.npy")
metadata = pd.read_csv("Data/processed/doc_metadata.csv")

# Report the size of both inputs.
vector_shape = doc_vectors.shape
n_records = len(metadata)
print(f"Vectors: {vector_shape}")
print(f"Metadata: {n_records} records")

# Trailing expression: shown as the cell output when run in a notebook.
metadata.head(3)

In [None]:
# LSH parameters, passed below as n_hashes and n_bands.
M = 80
B = 20

# Build the LSH structure with a fixed seed and pull out the two parts
# that the later cells use.
lsh_struct = build_lsh_from_vectors(
    doc_vectors,
    n_hashes=M,
    n_bands=B,
    random_state=42,
)
hyperplanes, index = lsh_struct["hyperplanes"], lsh_struct["index"]

r = index["rows_per_band"]
print(f"✓ LSH built: m={M}, b={B}, r={r}")

# Size of each band table, summarised as mean / min / max.
buckets = list(map(len, index["band_tables"]))
mean_buckets = np.mean(buckets)
fewest, most = np.min(buckets), np.max(buckets)
print(f"Buckets per band: {mean_buckets:.1f} (min={fewest}, max={most})")

In [None]:
QUERY_INDEX = 42
TOP_K = 10

query_vec = doc_vectors[QUERY_INDEX]

# Top-k neighbours as returned by the LSH query.
lsh_indices, lsh_sims = lsh_query(
    query_vec, doc_vectors, hyperplanes, index, top_k=TOP_K
)

# Reference ranking for comparison: similarities against doc_vectors,
# keeping the TOP_K highest.
exact_sims = _cosine_similarities(query_vec, doc_vectors)
exact_indices = np.argsort(-exact_sims)[:TOP_K]

# Candidate stats: build the query signature from the sign of each
# hyperplane projection, then look up its candidates in the index.
proj_q = hyperplanes @ query_vec
query_sig = (proj_q >= 0).astype(np.uint8)
candidates = lsh_candidates(query_sig, index)

query_title = metadata.loc[QUERY_INDEX, 'title']
query_decade = metadata.loc[QUERY_INDEX, 'decade']
n_candidates, n_docs = len(candidates), len(doc_vectors)
print(f"Query: {query_title} ({query_decade})")
print(f"Candidates: {n_candidates}/{n_docs} ({100*n_candidates/n_docs:.1f}%)")

print("\nNearest neighbors (LSH):")
print("-" * 60)
ranked_hits = zip(lsh_indices, lsh_sims)
for rank, (doc_idx, score) in enumerate(ranked_hits, start=1):
    title = metadata.loc[doc_idx, 'title']
    print(f"{rank:2d}. {title:40s} | cos={score:.4f}")

In [None]:
import os

# Intersect the LSH and exact top-k once and reuse it for both the metric
# and the printout. Casting to plain int keeps the printed list readable,
# since NumPy >= 2 shows integer scalars as `np.int64(3)` inside a list.
overlap = sorted({int(i) for i in lsh_indices} & {int(i) for i in exact_indices})
recall = len(overlap) / TOP_K
candidate_ratio = len(candidates) / len(doc_vectors)

print(f"\nRecall@{TOP_K}: {recall:.3f}")
print(f"Candidate ratio: {candidate_ratio:.3f}")
print(f"Overlap: {overlap}")

# Save benchmark result
results_dir = "Data/results"
os.makedirs(results_dir, exist_ok=True)

result = pd.DataFrame([{
    "method": "LSH",
    "m": M,
    "b": B,
    "r": r,
    "top_k": TOP_K,
    "recall_at_k": recall,
    "candidate_ratio": candidate_ratio,
}])

result_path = f"{results_dir}/lsh_benchmark_runs.csv"
# Append one row per run. The header is written when the file is new or
# exists but is empty; checking existence alone would leave an empty file
# with data rows and no column names.
write_header = not os.path.exists(result_path) or os.path.getsize(result_path) == 0
result.to_csv(result_path, mode="a", header=write_header, index=False)
print(f"✓ Result saved to {result_path}")

In [None]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))

# Similarity profile. Bars sit at 1-based positions so the "Rank" axis lines
# up with the 1-based ranks used when the neighbours are printed.
ranks = range(1, len(lsh_sims) + 1)
ax1.bar(ranks, lsh_sims)
ax1.set(xlabel="Rank", ylabel="Cosine similarity", title="LSH Similarity Profile")

# LSH vs Exact: similarity reported by the LSH query against the exact
# similarity of the same documents, with a dashed diagonal as reference.
exact_sims_for_lsh = exact_sims[lsh_indices]
ax2.scatter(exact_sims_for_lsh, lsh_sims, alpha=0.7)
ax2.plot([0, 1], [0, 1], 'r--', alpha=0.3)
ax2.set(xlabel="Exact cosine", ylabel="LSH cosine", title="LSH vs Exact")

plt.tight_layout()
plt.show()