# beyond-vector-search — Databricks demo

This notebook runs the repo **inside Databricks** (Repos checkout) without any external network calls.

It demonstrates:
- **Adaptive retrieval routing** (keyword vs vector)
- **Offline evaluation loop** that updates router weights
- **Telemetry** inspection (SQLite by default; Lakebase Postgres when `BVS_TELEMETRY=lakebase`, as in the inspection cells below)

> Tip: If you use SQLite telemetry and want the DB to persist across cluster restarts, set the `BVS_DB_PATH` environment variable to a DBFS location (see the example in the first code cell below).


In [None]:
from __future__ import annotations

import os
import sys
from pathlib import Path


def find_repo_root(start: Path | None = None, *, max_depth: int = 12) -> Path:
    """Find the repo root by walking up until we find pyproject.toml.

    Args:
        start: Directory to begin the search from. Defaults to the current
            working directory.
        max_depth: Maximum number of directories to examine, counting
            ``start`` itself. Defaults to 12.

    Returns:
        The nearest directory (``start`` or one of its ancestors) that
        contains a ``pyproject.toml`` entry.

    Raises:
        RuntimeError: If no ``pyproject.toml`` is found within ``max_depth``
            directories, or before the filesystem root is reached.
    """
    p = (start or Path.cwd()).resolve()
    for _ in range(max_depth):
        if (p / "pyproject.toml").exists():
            return p
        if p.parent == p:
            # At the filesystem root: its parent is itself, so walking
            # further would only re-check the same directory.
            break
        p = p.parent
    raise RuntimeError("Could not find repo root (pyproject.toml not found).")


# Locate the checkout and the directory that holds the package sources.
REPO_ROOT = find_repo_root()
SRC_DIR = REPO_ROOT.joinpath("src")

# Put src/ at the front of sys.path so the package imports without pip install.
if str(SRC_DIR) not in sys.path:
    sys.path.insert(0, str(SRC_DIR))

# Choosing a telemetry backend:
#   * SQLite is the default for local/dev use.
#   * On Databricks, Lakebase Postgres (OLTP) can be used instead.
#
# To use Lakebase, define these environment variables on the cluster
# (recommended) or in a notebook cell:
#   BVS_TELEMETRY=lakebase
#   BVS_LAKEBASE_DSN=postgresql://USER:PASSWORD@HOST:5432/DBNAME
#   BVS_LAKEBASE_RUNS_TABLE=beyond_vector_search_runs
#   BVS_LAKEBASE_STATE_TABLE=beyond_vector_search_router_state
#
# When staying on SQLite, BVS_DB_PATH may point at a DBFS path such as:
#   /dbfs/tmp/beyond_vector_search.sqlite
DB_PATH = os.getenv(
    "BVS_DB_PATH",
    str(REPO_ROOT.joinpath("runs", "beyond_vector_search.sqlite")),
)

# Echo the effective configuration; only whether a DSN is set is shown,
# never the DSN itself.
print(f"REPO_ROOT: {REPO_ROOT}")
print(f"PYTHONPATH[0]: {sys.path[0]}")
print(f"BVS_TELEMETRY: {os.getenv('BVS_TELEMETRY', 'sqlite')}")
print(f"SQLite DB_PATH (if used): {DB_PATH}")
print(f"Lakebase DSN set: {bool(os.getenv('BVS_LAKEBASE_DSN'))}")
print(f"Lakebase runs table: {os.getenv('BVS_LAKEBASE_RUNS_TABLE')}")
print(f"Lakebase state table: {os.getenv('BVS_LAKEBASE_STATE_TABLE')}")


In [None]:
import os

# --- Lakebase Postgres telemetry (Databricks OLTP) ---
# A Lakebase Postgres DSN is needed. Preferably keep the credentials in
# Databricks Secrets and assemble the DSN in this cell.
# DSN format, for example:
#   postgresql://USER:PASSWORD@HOST:5432/DBNAME
#
# Alternatively, define it as a cluster environment variable:
#   BVS_LAKEBASE_DSN=...
#
# NOTE: The cluster must have a Postgres driver installed (psycopg or psycopg2).

# TODO: replace with your real DSN (prefer secrets):
# os.environ["BVS_LAKEBASE_DSN"] = "postgresql://..."

# setdefault() only fills in values that are not already present, so
# cluster-level environment settings take precedence over these defaults.
# The two table names are optional overrides for the Lakebase database.
os.environ.setdefault("BVS_LAKEBASE_RUNS_TABLE", "beyond_vector_search_runs")
os.environ.setdefault("BVS_LAKEBASE_STATE_TABLE", "beyond_vector_search_router_state")
os.environ.setdefault("BVS_TELEMETRY", "lakebase")

# Report the effective settings (DSN presence only, not its value).
print(f"Using telemetry: {os.environ.get('BVS_TELEMETRY')}")
print(f"Runs table: {os.environ.get('BVS_LAKEBASE_RUNS_TABLE')}")
print(f"State table: {os.environ.get('BVS_LAKEBASE_STATE_TABLE')}")
print(f"DSN set: {bool(os.environ.get('BVS_LAKEBASE_DSN'))}")

from beyond_vector_search.run import run_once

# Run a single sample query; the trailing bare name makes the notebook
# display the result.
out = run_once(query="How to fix INC-10010?", k=5, db_path=DB_PATH)
out


In [None]:
from beyond_vector_search.evaluate import evaluate_all

# Run the evaluation, then display only the headline fields of the report.
report = evaluate_all(k=5, db_path=DB_PATH)
{field: report[field] for field in ("mean_score", "n", "router_state")}


In [None]:
# Inspect the most recent runs (Lakebase Postgres)
import os
from contextlib import closing

dsn = os.environ["BVS_LAKEBASE_DSN"]
runs_table = os.environ.get("BVS_LAKEBASE_RUNS_TABLE", "beyond_vector_search_runs")

# Pick whichever Postgres driver is installed. Only the import is guarded, so
# a connection or query error is reported as-is instead of being masked by a
# second attempt with the other driver.
try:
    import psycopg as pg_driver  # type: ignore
except ImportError:
    import psycopg2 as pg_driver  # type: ignore

# closing() guarantees the cursor and connection are closed even when the
# query raises. (A psycopg2 connection used directly as a context manager
# ends the transaction but does not close the connection.)
# NOTE: the table name is interpolated into the SQL because identifiers
# cannot be bound as query parameters; it comes from the environment.
with closing(pg_driver.connect(dsn)) as conn, closing(conn.cursor()) as cur:
    cur.execute(f"SELECT run_id, ts_unix, strategy, score, query FROM {runs_table} ORDER BY run_id DESC LIMIT 10")
    rows = cur.fetchall()

# Must be the last top-level expression of the cell for the notebook to
# display it.
rows


In [None]:
# Inspect the current router state (Lakebase Postgres)
import os
from contextlib import closing

dsn = os.environ["BVS_LAKEBASE_DSN"]
state_table = os.environ.get("BVS_LAKEBASE_STATE_TABLE", "beyond_vector_search_router_state")

# Pick whichever Postgres driver is installed. Only the import is guarded, so
# a connection or query error is reported as-is instead of being masked by a
# second attempt with the other driver.
try:
    import psycopg as pg_driver  # type: ignore
except ImportError:
    import psycopg2 as pg_driver  # type: ignore

# closing() guarantees the cursor and connection are closed even when the
# query raises. (A psycopg2 connection used directly as a context manager
# ends the transaction but does not close the connection.)
# NOTE: the table name is interpolated into the SQL because identifiers
# cannot be bound as query parameters; it comes from the environment.
with closing(pg_driver.connect(dsn)) as conn, closing(conn.cursor()) as cur:
    cur.execute(f"SELECT key, value_json FROM {state_table} ORDER BY key")
    rows = cur.fetchall()

# Must be the last top-level expression of the cell for the notebook to
# display it.
rows


## Notes

- If you want to **reset** learning, delete the SQLite file at `DB_PATH` (or point `BVS_DB_PATH` to a new file).
- The core decision logic lives in `src/beyond_vector_search/router.py`.
- The offline loop that updates weights lives in `src/beyond_vector_search/evaluate.py`.
- The architecture diagram is `diagrams/architecture.html`.
