# Search Telemetry Explorer
Use this notebook to hydrate `logs/search-history.jsonl`, inspect run-level details, and surface Ops Deck trends.

Parameters like `SEARCH_LEDGER_PATH`, `WINDOW_DAYS`, and `PATTERN_FILTER` can be injected via Papermill to scope analyses.

In [1]:
# Papermill-friendly parameters (can be overridden at runtime)
ROOT = globals().get("ROOT")
TELEMETRY_LOG_PATH = globals().get("TELEMETRY_LOG_PATH")
SEARCH_LEDGER_PATH = globals().get("SEARCH_LEDGER_PATH")
WINDOW_DAYS = globals().get("WINDOW_DAYS", 14)
PATTERN_FILTER = globals().get("PATTERN_FILTER")
PRESET_TAGS_PATH = globals().get("PRESET_TAGS_PATH")
PRESET_DRIFT_LOOKBACK = globals().get("PRESET_DRIFT_LOOKBACK", 50)

PARAM_ROOT = ROOT
PARAM_LOG_PATH = TELEMETRY_LOG_PATH
PARAM_LEDGER_PATH = SEARCH_LEDGER_PATH
WINDOW_DAYS_PARAM = WINDOW_DAYS
PATTERN_FILTER_PARAM = PATTERN_FILTER
PRESET_TAGS_PATH_PARAM = PRESET_TAGS_PATH
PRESET_DRIFT_LOOKBACK_PARAM = PRESET_DRIFT_LOOKBACK


In [2]:
# Parameters
DB_PATH = "C:\\Users\\Nihil\\AppData\\Local\\Temp\\pytest-of-Nihil\\pytest-92\\test_notebooks_execute_search_0\\interactions.db"
SEARCH_LEDGER_PATH = "C:\\Users\\Nihil\\AppData\\Local\\Temp\\pytest-of-Nihil\\pytest-92\\test_notebooks_execute_search_0\\search_telemetry.json"


In [3]:
from __future__ import annotations

from pathlib import Path
import sys

ROOT = Path(PARAM_ROOT or Path.cwd().parents[1])
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))

import pandas as pd
import plotly.express as px

from datalab.scripts import search_telemetry as telemetry

LOG_PATH = Path(PARAM_LOG_PATH or ROOT / "logs" / "search-history.jsonl")
SEARCH_LEDGER_PATH = Path(PARAM_LEDGER_PATH or ROOT / "data" / "search_telemetry.json")
try:
    WINDOW_DAYS = int(WINDOW_DAYS_PARAM if WINDOW_DAYS_PARAM not in (None, "") else 14)
except (TypeError, ValueError):
    WINDOW_DAYS = 14
PATTERN_FILTER = (str(PATTERN_FILTER_PARAM).strip() or None) if PATTERN_FILTER_PARAM else None
PRESET_TAGS_PATH = Path(PRESET_TAGS_PATH_PARAM) if PRESET_TAGS_PATH_PARAM else ROOT / "configs" / "search_preset_tags.json"
try:
    PRESET_DRIFT_LOOKBACK = (
        int(PRESET_DRIFT_LOOKBACK_PARAM)
        if PRESET_DRIFT_LOOKBACK_PARAM not in (None, "")
        else 50
    )
except (TypeError, ValueError):
    PRESET_DRIFT_LOOKBACK = 50

## Hydrate the log
The ingestion helper keeps inserts idempotent by hashing each JSON line before writing the consolidated ledger.

In [4]:
stats = telemetry.ingest_search_history(LOG_PATH, SEARCH_LEDGER_PATH)
stats.as_dict()

2025-11-18 04:33:50,761 INFO sqlalchemy.engine.Engine BEGIN (implicit)


2025-11-18 04:33:50,771 INFO sqlalchemy.engine.Engine INSERT INTO tail_log_entries (id, message, source, created_at) VALUES (?, ?, ?, ?)


2025-11-18 04:33:50,774 INFO sqlalchemy.engine.Engine [generated in 0.00387s] ('4f1fae50-ca8c-4acf-83e0-fc26b7022f9a', 'search-ledger ingest: 1 runs (24h 0) · match-rate 1.00', 'search-ledger', '2025-11-17 17:33:50.737845')


2025-11-18 04:33:50,778 INFO sqlalchemy.engine.Engine COMMIT


2025-11-18 04:33:50,783 INFO sqlalchemy.engine.Engine BEGIN (implicit)


2025-11-18 04:33:50,787 INFO sqlalchemy.engine.Engine SELECT tail_log_entries.id, tail_log_entries.message, tail_log_entries.source, tail_log_entries.created_at 
FROM tail_log_entries 
WHERE tail_log_entries.id = ?


2025-11-18 04:33:50,790 INFO sqlalchemy.engine.Engine [generated in 0.00275s] ('4f1fae50-ca8c-4acf-83e0-fc26b7022f9a',)


2025-11-18 04:33:50,792 INFO sqlalchemy.engine.Engine ROLLBACK


{'total_runs': 1,
 'inserted': 0,
 'previous_total': 1,
 'summary': {'generated_at': '2025-11-17T17:33:50.707306+00:00',
  'source_log': 'D:\\Files\\Code 3\\ChatAI-DataLab\\logs\\search-history.jsonl',
  'total_runs': 1,
  'runs_with_matches': 1,
  'runs_last_24h': 0,
  'avg_duration_ms': 977.0,
  'avg_match_density': 0.0157,
  'last_ingest_at': '2025-11-15T13:50:46.784818+00:00',
  'match_rate': 1.0,
  'top_patterns': [{'pattern': 'GraphExecutor',
    'runs': 1,
    'total_matches': 14,
    'avg_files_scanned': 889.0}],
  'daily_metrics': [{'event_date': '2025-11-15',
    'runs': 1,
    'files_scanned': 889,
    'matches': 14,
    'runs_with_matches': 1,
    'avg_duration_ms': 977.0,
    'avg_match_density': 0.015748031496062992}],
  'preset_drift': [],
  'runs': [{'timestamp': '2025-11-15T13:50:46.784818+00:00',
    'pattern': 'GraphExecutor',
    'preset': '',
    'files_scanned': 889,
    'matches': 14,
    'duration_ms': 977}],
  'metadata': {'log_entries': 1,
   'log_path': 'D:\\

## Dataset snapshot & filters
Confirm how many runs are available after applying any window or pattern filters.

In [5]:
window_start = None
filter_summary = {
    "total_runs_loaded": 0,
    "runs_after_filters": 0,
    "window_start": "—",
    "pattern_filter": PATTERN_FILTER or "—"
}
runs_df = telemetry.load_search_runs(SEARCH_LEDGER_PATH)
if runs_df.empty:
    filtered_runs_df = runs_df
    display(pd.DataFrame([filter_summary]))
else:
    runs_df["timestamp_dt"] = pd.to_datetime(runs_df["timestamp"], utc=True, errors="coerce")
    runs_df["matched"] = runs_df["matches"].fillna(0).astype(int) > 0
    filtered_runs_df = runs_df.dropna(subset=["timestamp_dt"]).copy()
    if WINDOW_DAYS and WINDOW_DAYS > 0:
        window_start = pd.Timestamp.now(tz="UTC") - pd.Timedelta(days=WINDOW_DAYS)
        filtered_runs_df = filtered_runs_df[filtered_runs_df["timestamp_dt"] >= window_start]
    if PATTERN_FILTER:
        filtered_runs_df = filtered_runs_df[filtered_runs_df["pattern"].fillna("").str.contains(PATTERN_FILTER, case=False, na=False)]
    filter_summary = {
        "total_runs_loaded": len(runs_df),
        "runs_after_filters": len(filtered_runs_df),
        "window_start": window_start.isoformat() if window_start else "—",
        "pattern_filter": PATTERN_FILTER or "—",
    }
    filtered_runs_df = filtered_runs_df.sort_values("timestamp_dt", ascending=False).copy()
    for column in ["duration_ms", "files_scanned", "matches"]:
        filtered_runs_df[column] = pd.to_numeric(filtered_runs_df[column], errors="coerce").fillna(0)
    display(pd.DataFrame([filter_summary]))

Unnamed: 0,total_runs_loaded,runs_after_filters,window_start,pattern_filter
0,1,1,2025-11-03T17:33:50.860086+00:00,—


## Run-level details
Focus on the filtered sweep set to spot hotspots and noisy presets quickly.

In [6]:
if filtered_runs_df.empty:
    display(pd.DataFrame({"message": ["No search telemetry matched the current filters."]}))
else:
    display(filtered_runs_df[["timestamp", "pattern", "preset", "matches", "files_scanned", "duration_ms"]].head(20))

Unnamed: 0,timestamp,pattern,preset,matches,files_scanned,duration_ms
0,2025-11-15T13:50:46.784818+00:00,GraphExecutor,,14,889,977


## Signal summary
Quick health report for the currently scoped sweeps.

In [7]:
if filtered_runs_df.empty:
    print("No telemetry to summarize yet.")
else:
    summary_frame = filtered_runs_df.copy()
    summary_frame["duration_ms"] = summary_frame["duration_ms"].fillna(0).astype(float)
    summary_frame["files_scanned"] = summary_frame["files_scanned"].fillna(0).astype(float)
    summary = {
        "runs_considered": len(summary_frame),
        "runs_with_matches": int(summary_frame["matched"].sum()),
        "match_rate_pct": round(summary_frame["matched"].mean() * 100, 1),
        "total_matches": int(summary_frame["matches"].fillna(0).sum()),
        "avg_duration_ms": round(summary_frame["duration_ms"].mean(), 1),
        "p95_duration_ms": round(summary_frame["duration_ms"].quantile(0.95), 1),
        "avg_files_scanned": round(summary_frame["files_scanned"].mean(), 1),
    }
    display(pd.DataFrame([summary]))

Unnamed: 0,runs_considered,runs_with_matches,match_rate_pct,total_matches,avg_duration_ms,p95_duration_ms,avg_files_scanned
0,1,1,100.0,14,977.0,977.0,889.0


## Preset quality board
Rank presets by how often they find issues vs. how long they take.

In [8]:
if filtered_runs_df.empty:
    print("Nothing to aggregate by preset yet.")
else:
    preset_summary = (
        filtered_runs_df.assign(
            preset=filtered_runs_df["preset"].fillna("custom/adhoc"),
            matches_safe=filtered_runs_df["matches"].fillna(0),
            files_scanned_safe=filtered_runs_df["files_scanned"].fillna(0),
        )
        .groupby("preset", as_index=False)
        .agg(
            runs=("preset", "size"),
            runs_with_matches=("matched", "sum"),
            total_matches=("matches_safe", "sum"),
            avg_duration_ms=("duration_ms", "mean"),
            avg_files_scanned=("files_scanned_safe", "mean"),
        )
    )
    preset_summary["match_rate_pct"] = (
        preset_summary["runs_with_matches"] / preset_summary["runs"] * 100
    ).round(1)
    display(preset_summary.sort_values(["total_matches", "runs"], ascending=False).head(10))
    fig_presets = px.bar(
        preset_summary.sort_values("total_matches", ascending=False).head(10),
        x="preset",
        y=["total_matches", "runs_with_matches"],
        barmode="group",
        title="Top presets by findings vs. successful sweeps",
    )
    fig_presets.update_layout(xaxis_title="Preset", yaxis_title="Count")
    fig_presets.show()

Unnamed: 0,preset,runs,runs_with_matches,total_matches,avg_duration_ms,avg_files_scanned,match_rate_pct
0,,1,1,14,977.0,889.0,100.0


## Preset drift watchlist
Track presets whose recent hit rate diverges from their historical baseline so owners can triage regressions quickly.

In [9]:
drift_rows = telemetry.compute_preset_drift(
    SEARCH_LEDGER_PATH,
    lookback=PRESET_DRIFT_LOOKBACK,
    preset_tags_path=PRESET_TAGS_PATH,
)
drift_df = pd.DataFrame(drift_rows)
if drift_df.empty:
    print("No preset drift stats yet. Ingest more runs or expand the lookback window.")
else:
    drift_df["tags_str"] = drift_df["tags"].apply(lambda tags: ", ".join(tags) if tags else "—")
    drift_df["match_rate_recent_pct"] = (drift_df["match_rate_recent"] * 100).round(1)
    drift_df["match_rate_lifetime_pct"] = (drift_df["match_rate_lifetime"] * 100).round(1)
    drift_df["delta_match_rate_pct"] = (drift_df["delta_match_rate"] * 100).round(1)
    drift_df["status_label"] = drift_df["status"].str.title()
    drift_df["delta_duration_ms"] = drift_df["delta_duration_ms"].round(1)
    watchlist_cols = [
        "preset",
        "status_label",
        "recent_runs",
        "match_rate_recent_pct",
        "match_rate_lifetime_pct",
        "delta_match_rate_pct",
        "delta_duration_ms",
        "tags_str",
    ]
    display(
        drift_df.sort_values("delta_match_rate")
        .head(12)[watchlist_cols]
        .rename(
            columns={
                "status_label": "Status",
                "preset": "Preset",
                "recent_runs": "Recent runs",
                "match_rate_recent_pct": "Recent match %",
                "match_rate_lifetime_pct": "Lifetime match %",
                "delta_match_rate_pct": "Δ match %",
                "delta_duration_ms": "Δ duration (ms)",
                "tags_str": "Tags",
            }
        )
        .reset_index(drop=True)
    )
    fig_drift = px.bar(
        drift_df.sort_values("delta_match_rate"),
        x="preset",
        y="delta_match_rate_pct",
        color="status_label",
        hover_data=["recent_runs", "match_rate_recent_pct", "match_rate_lifetime_pct", "tags_str"],
        text="delta_match_rate_pct",
        title=f"Preset match-rate deltas (last {PRESET_DRIFT_LOOKBACK} runs vs lifetime)",
    )
    fig_drift.update_layout(xaxis_title="Preset", yaxis_title="Δ match rate (%)")
    fig_drift.show()

No preset drift stats yet. Ingest more runs or expand the lookback window.


## Execution performance spotlight
Visualize durations vs. files scanned and list the slowest sweeps to triage instrumentation issues.

In [10]:
if filtered_runs_df.empty:
    print("No runs to visualize yet.")
else:
    recent_sample = filtered_runs_df.sort_values("timestamp_dt").tail(400).copy()
    recent_sample["size_proxy"] = recent_sample["files_scanned"].clip(lower=1)
    fig_duration = px.scatter(
        recent_sample,
        x="timestamp_dt",
        y="duration_ms",
        size="size_proxy",
        color="matches",
        title="Duration vs. files scanned (recent 400 runs)",
        hover_data=["pattern", "preset", "files_scanned", "matches"],
    )
    fig_duration.update_layout(xaxis_title="Timestamp (UTC)", yaxis_title="Duration (ms)")
    fig_duration.show()
    slowest = recent_sample.nlargest(10, "duration_ms")[["timestamp", "pattern", "preset", "duration_ms", "files_scanned", "matches"]]
    display(slowest.reset_index(drop=True))

Unnamed: 0,timestamp,pattern,preset,duration_ms,files_scanned,matches
0,2025-11-15T13:50:46.784818+00:00,GraphExecutor,,977,889,14


## Daily hygiene trends
Plot total sweeps, match counts, and density to highlight regressions or flaky presets.

In [11]:
daily_df = telemetry.load_daily_metrics(SEARCH_LEDGER_PATH)
if daily_df.empty:
    print("No telemetry ingested yet. Run a few searches and re-ingest.")
else:
    daily_df["event_date"] = pd.to_datetime(daily_df["event_date"], utc=True)
    if WINDOW_DAYS and WINDOW_DAYS > 0:
        cutoff = (pd.Timestamp.now(tz="UTC") - pd.Timedelta(days=WINDOW_DAYS)).normalize()
        daily_df = daily_df[daily_df["event_date"] >= cutoff]
    daily_df["event_date"] = daily_df["event_date"].dt.tz_localize(None)
    display(daily_df)
    if not daily_df.empty:
        trend_df = daily_df.sort_values("event_date").copy()
        fig_runs = px.bar(
            trend_df,
            x="event_date",
            y=["runs", "runs_with_matches"],
            title="Daily sweep volume vs. findings",
            barmode="group",
        )
        fig_runs.update_layout(xaxis_title="Date", yaxis_title="Sweeps")
        fig_runs.show()
        trend_df["match_rate_pct"] = (
            trend_df["runs_with_matches"] / trend_df["runs"].where(trend_df["runs"] > 0)
        ).fillna(0) * 100
        trend_df["density_pct"] = trend_df["avg_match_density"].fillna(0) * 100
        fig_density = px.line(
            trend_df,
            x="event_date",
            y=["density_pct", "match_rate_pct"],
            title="Match density & success rate (%)",
        )
        fig_density.update_layout(xaxis_title="Date", yaxis_title="Percentage")
        fig_density.show()

Unnamed: 0,event_date,runs,files_scanned,matches,runs_with_matches,avg_duration_ms,avg_match_density
0,2025-11-15,1,889,14,1,977.0,0.015748
