# Similarity Search Benchmarking

This notebook generates a synthetic catalog, embeds it with the cached models, and benchmarks several HNSW index configurations for accuracy (recall@k) and latency.

In [1]:
"""Ensure repository modules are importable and ClickHouse vector support is enabled."""
import os
import sys
from pathlib import Path

project_root = Path.cwd().resolve()
if not (project_root / "python").exists() and (project_root.parent / "python").exists():
    project_root = project_root.parent

python_src = project_root / "python"
if python_src.exists() and str(python_src) not in sys.path:
    sys.path.insert(0, str(python_src))

os.environ.setdefault("PYTHONPATH", str(python_src))
os.environ.setdefault("CLICKHOUSE_ENABLE_VECTOR_EXPERIMENTAL_TYPE", "1")
project_root

PosixPath('/home/jovyan/work')

In [2]:
"""Import dependencies used across the benchmark."""
import random
import time
from dataclasses import dataclass
from typing import Dict, List

import numpy as np
import pandas as pd

from warehouse import config
from warehouse.clickhouse import client_session
from warehouse.embeddings import embed_texts

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
"""Load the application configuration so we know which hosts and models to target."""
cfg = config.load_config()
cfg

AppConfig(clickhouse=ClickHouseSettings(host='clickhouse-server', native_port=9000, http_port=8123, user='default', password='clickhouse', database='default'), s3=S3Settings(endpoint_url='http://seaweedfs:8333', region='us-east-1', access_key='s3admin', secret_key='s3secret', bucket='clickhouse-demo'), models=ModelSettings(primary='BAAI/bge-base-en-v1.5', secondary='Alibaba-NLP/gte-Qwen2-1.5B-instruct', active='BAAI/bge-base-en-v1.5'), paths=PathSettings(assets_dir=PosixPath('/home/jovyan/work/assets'), model_cache_dir=PosixPath('/home/jovyan/work/assets/models'), data_dir=PosixPath('/home/jovyan/work/assets/data')))

In [4]:
"""Synthesize a reproducible catalog of product-style descriptions."""
DATASET_SIZE = 600
random_seed = random.Random(1337)

category_templates: Dict[str, List[str]] = {
    "electronics": [
        "Wireless earbuds tuned for commuter-friendly noise blocking.",
        "Compact smart speaker with room-aware adaptive sound.",
        "Lightweight laptop built for remote-first engineering teams.",
        "4K action camera ready for cold-weather adventures.",
        "Portable projector for pop-up backyard movie nights.",
    ],
    "apparel": [
        "Breathable trail shoes designed for mixed terrain mileage.",
        "Waterproof shell jacket with recycled performance fibers.",
        "High-rise leggings that stay opaque through HIIT sessions.",
        "Classic denim jacket updated with stretch panels.",
        "Merino hiking socks that regulate temperature on long climbs.",
    ],
    "home": [
        "Smart thermostat that learns weekly occupancy rhythms.",
        "Self-watering herb garden for light-starved kitchens.",
        "Cordless vacuum built for pet-friendly apartments.",
        "Quiet air purifier targeted at open loft layouts.",
        "Stackable storage cubes for modular studio organization.",
    ],
    "beauty": [
        "Vitamin C serum blended for sensitive complexions.",
        "Matte lipstick that resists mask transfer.",
        "Hydrating night mask focused on barrier repair.",
        "Mineral sunscreen that vanishes on deeper skin tones.",
        "Detox scalp scrub balancing curl-friendly routines.",
    ],
    "outdoors": [
        "Ultralight backpack sized for fastpacking weekends.",
        "Carbon trekking poles tuned for alpine approaches.",
        "Four-season tent with storm-rated ventilation.",
        "Packable hammock built for riverbank campsites.",
        "Rechargeable lantern that doubles as a power bank.",
    ],
    "books": [
        "A space-opera opener following a reluctant diplomat.",
        "Climate fiction anthology curated by emerging voices.",
        "Design leadership handbook for distributed product teams.",
        "Slow-burn mystery set in a remote coastal village.",
        "Field guide celebrating edible plants of the northeast.",
    ],
}

feature_phrases = [
    "Ships with concierge onboarding and live chat support.",
    "Pairs with automation recipes shared by the community.",
    "Arrives in packaging that is fully curbside recyclable.",
    "Backed by lab-verified durability benchmarks.",
    "Optimized after A/B testing with power users.",
    "Configured for quick maintenance swaps in the field.",
    "Includes lifetime access to the how-to video library.",
]

context_phrases = [
    "Frequently bundled with complementary accessories for launch promotions.",
    "Documented in detail inside the internal runbook for customer success.",
    "Benchmarked against leading alternatives during the latest GTM sprint.",
    "Highlighted in usability testing notes from the spring cohort.",
]

records = []
for item_id in range(1, DATASET_SIZE + 1):
    category = random_seed.choice(list(category_templates.keys()))
    base = random_seed.choice(category_templates[category])
    feature = random_seed.choice(feature_phrases)
    description_parts = [base, feature]
    if item_id % 17 == 0:
        description_parts.append(random_seed.choice(context_phrases))
    description = " ".join(description_parts)
    records.append({"item_id": item_id, "category": category, "text": description})

dataset_df = pd.DataFrame(records)
dataset_df.head()

Unnamed: 0,item_id,category,text
0,1,outdoors,Rechargeable lantern that doubles as a power b...
1,2,home,Stackable storage cubes for modular studio org...
2,3,books,Climate fiction anthology curated by emerging ...
3,4,home,Quiet air purifier targeted at open loft layou...
4,5,home,Cordless vacuum built for pet-friendly apartme...


In [5]:
"""Embed the dataset with both cached models and keep normalized vectors in memory."""
MODEL_MAP = {
    "primary": cfg.models.primary,
    "secondary": cfg.models.secondary,
}

normalized_embeddings: Dict[str, np.ndarray] = {}
embedding_dimensions: Dict[str, int] = {}

for label, model_name in MODEL_MAP.items():
    vectors = np.asarray(
        embed_texts(dataset_df["text"].tolist(), model_name=model_name, config=cfg),
        dtype=np.float32,
    )
    norms = np.linalg.norm(vectors, axis=1, keepdims=True)
    vectors = vectors / np.clip(norms, a_min=1e-12, a_max=None)
    normalized_embeddings[label] = vectors
    embedding_dimensions[label] = vectors.shape[1]

embedding_dimensions

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  4.88it/s]
We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


{'primary': 768, 'secondary': 1536}

In [6]:
"""Utility helpers to manage ClickHouse tables and evaluate ANN recall/latency."""


@dataclass(frozen=True)
class IndexSpec:
    name: str
    params: Dict[str, object]
    ef_search: int
    description: str




def _format_index_literal(value: object) -> str:
    """Render values as ClickHouse literals for index configuration."""
    if isinstance(value, str):
        escaped = value.replace("'", "''")
        return f"'{escaped}'"
    if isinstance(value, bool):
        return "1" if value else "0"
    if isinstance(value, (int, float)):
        return str(value)
    raise TypeError(f"Unsupported index parameter type: {type(value)!r}")




def _vector_index_expression(dimension: int, spec: IndexSpec) -> str:
    """Build the vector_similarity(...) expression with optional HNSW overrides."""
    base_args = ["'hnsw'", "'cosineDistance'", str(dimension)]
    if spec.params:
        quantization = spec.params.get("quantization")
        m_value = spec.params.get("hnsw_max_connections_per_layer", spec.params.get("m"))
        ef_construction = spec.params.get(
            "hnsw_candidate_list_size_for_construction",
            spec.params.get("ef_construction"),
        )
        if quantization is None and (m_value is not None or ef_construction is not None):
            quantization = "bf16"
        if quantization is not None or m_value is not None or ef_construction is not None:
            optional_args = [
                _format_index_literal(quantization if quantization is not None else "bf16"),
                _format_index_literal(m_value if m_value is not None else 0),
                _format_index_literal(ef_construction if ef_construction is not None else 0),
            ]
            base_args.extend(optional_args)
    return f"vector_similarity({', '.join(base_args)})"




def recreate_vector_table(table_name: str, dimension: int, spec: IndexSpec) -> None:
    index_expression = _vector_index_expression(dimension, spec)
    create_sql = f"""
    CREATE TABLE IF NOT EXISTS {table_name} (
        item_id UInt32,
        category LowCardinality(String),
        embedding Array(Float32),
        CONSTRAINT embedding_length CHECK length(embedding) = {dimension},
        INDEX idx_embedding_hnsw embedding TYPE {index_expression} GRANULARITY 1
    ) ENGINE = MergeTree
    ORDER BY item_id
    """
    with client_session(cfg) as client:
        client.execute(f"DROP TABLE IF EXISTS {table_name}")
        client.execute(create_sql)




def insert_vectors(table_name: str, vectors: np.ndarray) -> None:
    payload = [
        (
            int(row.item_id),
            str(row.category),
            [float(value) for value in vector],
        )
        for row, vector in zip(dataset_df.itertuples(index=False), vectors)
    ]
    with client_session(cfg) as client:
        client.execute(
            f"INSERT INTO {table_name} (item_id, category, embedding) VALUES",
            payload,
        )




def drop_table(table_name: str) -> None:
    with client_session(cfg) as client:
        client.execute(f"DROP TABLE IF EXISTS {table_name}")




def benchmark_ann(
    table_name: str,
    vectors: np.ndarray,
    *,
    top_k: int,
    query_count: int,
    ef_search: int,
 ) -> Dict[str, float]:
    rng = np.random.default_rng(2025)
    sample_size = min(query_count, len(vectors))
    sample_indices = rng.choice(len(vectors), size=sample_size, replace=False)
    latencies_ms: List[float] = []
    recalls: List[float] = []


    query_sql = f"""
        SELECT item_id, cosineDistance(embedding, %(query_vector)s) AS score
        FROM {table_name}
        ORDER BY score ASC
        LIMIT %(limit)s
        SETTINGS hnsw_candidate_list_size_for_search = %(ef_search)s
    """


    with client_session(cfg) as client:
        for idx in sample_indices:
            query_vector_np = vectors[idx]
            query_vector = query_vector_np.tolist()
            scores = 1.0 - vectors @ query_vector_np
            exact_candidates = np.argpartition(scores, top_k)[:top_k]
            ordered = exact_candidates[np.argsort(scores[exact_candidates])]
            exact_ids = dataset_df.iloc[ordered]["item_id"].tolist()


            start = time.perf_counter()
            rows = client.execute(
                query_sql,
                {"query_vector": query_vector, "limit": top_k, "ef_search": ef_search},
            )
            latency = (time.perf_counter() - start) * 1000.0
            latencies_ms.append(latency)


            ann_ids = [row[0] for row in rows]
            overlap = len(set(ann_ids) & set(exact_ids))
            recalls.append(overlap / float(top_k))


    return {
        "queries": float(sample_size),
        "top_k": float(top_k),
        "mean_recall": float(np.mean(recalls)),
        "min_recall": float(np.min(recalls)),
        "mean_latency_ms": float(np.mean(latencies_ms)),
        "p95_latency_ms": float(np.percentile(latencies_ms, 95)),
    }



In [13]:
"""Define the index configurations and sweep parameters to compare."""

# ef_search (a.k.a. hnsw_candidate_list_size_for_search): how many neighbors HNSW keeps in its candidate queue while searching. Larger values scan a wider neighborhood, improving recall at the cost of higher latency per query.
# m (a.k.a. hnsw_max_connections_per_layer): how many edges each node is allowed to have in the HNSW graph layers. Higher m builds denser graphs that generally yield better recall but take longer to build and consume more memory.
# ef_construction (a.k.a. hnsw_candidate_list_size_for_construction): how broad the candidate queue is while the graph is built. Bigger values give the constructor more options when wiring up nodes, leading to higher-quality graphs and better eventual recall, but make ingest/index-build slower.
# ef_search during runtime (query setting): same knob as above but applied per query via ClickHouse setting (e.g., SETTINGS hnsw_candidate_list_size_for_search = ...). You can override the default to trade latency for recall without rebuilding the index.

index_specs = [
    IndexSpec(
        name="baseline",
        params={},
        ef_search=128,
        description="ClickHouse defaults (m=16, ef_construction=200, ef_search≈128)",
    ),
    IndexSpec(
        name="high_recall",
        params={"m": 64, "ef_construction": 800},
        ef_search=640,
        description="Denser graph and higher ef_search for improved recall",
    ),
    IndexSpec(
        name="low_latency",
        params={"m": 8, "ef_construction": 100},
        ef_search=64,
        description="Smaller graph with reduced ef_search to favor latency",
    ),
]

TOP_K = 10
QUERY_COUNT = 500
index_specs


[IndexSpec(name='baseline', params={}, ef_search=128, description='ClickHouse defaults (m=16, ef_construction=200, ef_search≈128)'),
 IndexSpec(name='high_recall', params={'m': 64, 'ef_construction': 800}, ef_search=640, description='Denser graph and higher ef_search for improved recall'),
 IndexSpec(name='low_latency', params={'m': 8, 'ef_construction': 100}, ef_search=64, description='Smaller graph with reduced ef_search to favor latency')]

In [14]:
"""Run the benchmark sweep across both embedding models and collect metrics."""
results = []

for model_label, model_name in MODEL_MAP.items():
    vectors = normalized_embeddings[model_label]
    dimension = embedding_dimensions[model_label]
    for spec in index_specs:
        table_name = f"benchmark_{model_label}_{spec.name}"
        recreate_vector_table(table_name, dimension, spec)
        insert_vectors(table_name, vectors)
        metrics = benchmark_ann(
            table_name,
            vectors,
            top_k=TOP_K,
            query_count=QUERY_COUNT,
            ef_search=spec.ef_search,
        )
        drop_table(table_name)
        entry = {
            "model_label": model_label,
            "model_name": model_name,
            "index_name": spec.name,
            "description": spec.description,
            "ef_search": spec.ef_search,
            "index_params": spec.params,
        }
        entry.update(metrics)
        results.append(entry)

results_df = pd.DataFrame(results)
results_df


Unnamed: 0,model_label,model_name,index_name,description,ef_search,index_params,queries,top_k,mean_recall,min_recall,mean_latency_ms,p95_latency_ms
0,primary,BAAI/bge-base-en-v1.5,baseline,"ClickHouse defaults (m=16, ef_construction=200...",128,{},500.0,10.0,0.952,0.7,4.600902,6.658008
1,primary,BAAI/bge-base-en-v1.5,high_recall,Denser graph and higher ef_search for improved...,640,"{'m': 64, 'ef_construction': 800}",500.0,10.0,0.9512,0.7,4.57758,6.919402
2,primary,BAAI/bge-base-en-v1.5,low_latency,Smaller graph with reduced ef_search to favor ...,64,"{'m': 8, 'ef_construction': 100}",500.0,10.0,0.951,0.8,4.284843,5.513726
3,secondary,Alibaba-NLP/gte-Qwen2-1.5B-instruct,baseline,"ClickHouse defaults (m=16, ef_construction=200...",128,{},500.0,10.0,0.9562,0.7,6.585902,8.981952
4,secondary,Alibaba-NLP/gte-Qwen2-1.5B-instruct,high_recall,Denser graph and higher ef_search for improved...,640,"{'m': 64, 'ef_construction': 800}",500.0,10.0,0.954,0.7,7.096757,10.207828
5,secondary,Alibaba-NLP/gte-Qwen2-1.5B-instruct,low_latency,Smaller graph with reduced ef_search to favor ...,64,"{'m': 8, 'ef_construction': 100}",500.0,10.0,0.941,0.8,6.914042,10.672908


Review the table above to pick the configuration that balances recall and latency for your workload. Re-run the benchmark after adjusting index parameters or adding more models.