In [1]:
import os, sys, json, random
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, List, Optional

import pandas as pd
import mlflow
from mlflow import MlflowClient

from openai import OpenAI
from qdrant_client import QdrantClient
from qdrant_client.http import models as qmodels


In [2]:
# Sanity check: confirm the OpenAI credential is visible to this kernel.
# Only presence, length and the first 7 characters are printed, never the full key.
key = os.environ.get("OPENAI_API_KEY")
print(f"OPENAI_API_KEY is set: {bool(key)}")
print(f"Length: {len(key) if key else None}")
print(f"Prefix: {key[:7] + '...' if key else None}")


OPENAI_API_KEY is set: True
Length: 164
Prefix: sk-proj...


In [3]:
from mlflow.genai import make_judge

# Name under which the judge is registered; export_run_traces_full() uses it
# to pick this judge's assessment out of each trace.
JUDGE_NAME = "xr_config_quality_score"

# Grading rubric handed to the judge model. The {{ inputs }}, {{ outputs }} and
# {{ expectations }} placeholders are presumably substituted per example by
# MLflow -- TODO confirm against the make_judge documentation.
_JUDGE_INSTRUCTIONS = (
    "You are grading an IOS XR telemetry configuration.\n\n"
    "User request (inputs): {{ inputs }}\n\n"
    "Candidate config (outputs): {{ outputs }}\n\n"
    "Reference acceptable config (expectations): {{ expectations }}\n\n"
    "Score quality from 0.0 to 1.0.\n"
    "Hard requirements (must match): IP, port, transport (grpc no-tls), encoding.\n"
    "Be lenient about names, ordering, and sample-interval unless requested.\n"
    "Penalize only if sensor-paths are clearly unrelated.\n"
    "Do NOT output telemetry config. Do NOT output code. Keep any explanation extremely short."
)

# LLM judge that scores a candidate config against the reference as a float.
# Temperature 0 and a 300-token cap are passed through as inference parameters.
gt_judge = make_judge(
    name=JUDGE_NAME,
    instructions=_JUDGE_INSTRUCTIONS,
    model="openai:/gpt-4.1-mini",
    feedback_value_type=float,
    inference_params=dict(temperature=0, max_tokens=300),
)


In [4]:
@dataclass(frozen=True)
class RunCfg:
    """One immutable RAG configuration, i.e. a single point in the evaluation grid."""

    # Passed as the Qdrant `collection_name` when the retriever is built.
    vector_db: str
    # Number of chunks to retrieve; forwarded to both the retriever config and naive_rag.
    top_k: int
    # Payload field -> required value; turned into a Qdrant filter by build_qdrant_filter
    # (an empty dict means no filter).
    filter_fields: Dict[str, Any]
    # Sampling temperature forwarded to build_openai_chat_fn.
    temperature: float
    # Model name forwarded to build_openai_chat_fn.
    model_chat: str
    # Model name forwarded to build_openai_embedding_fn.
    model_embed: str

def make_grid(
    vector_dbs: List[str],
    top_ks: List[int],
    filters: List[Dict[str, Any]],
    temps: List[float],
    chat_models: List[str],
    embed_models: List[str],
) -> List[RunCfg]:
    """Build the full Cartesian product of the given hyper-parameter axes.

    The result is ordered with ``vector_dbs`` varying slowest and
    ``embed_models`` varying fastest. Each filter dict is shared by reference
    between every ``RunCfg`` built from it (it is not copied).

    Returns:
        One ``RunCfg`` per combination; empty if any axis is empty.
    """
    grid: List[RunCfg] = []
    for collection in vector_dbs:
        for n_hits in top_ks:
            for filter_fields in filters:
                for temperature in temps:
                    for chat_model in chat_models:
                        for embed_model in embed_models:
                            grid.append(
                                RunCfg(
                                    vector_db=collection,
                                    top_k=n_hits,
                                    filter_fields=filter_fields,
                                    temperature=temperature,
                                    model_chat=chat_model,
                                    model_embed=embed_model,
                                )
                            )
    return grid

# Hyper-parameter axes; the evaluated grid is their full Cartesian product.
VECTOR_DBS = [
    "fixed_window_embeddings",
    "catalog_embeddings_improved",
]
TOP_KS = [5, 10]
FILTERS = [{}]  # a single empty filter; e.g. [{"protocol_tag": "bgp"}]
TEMPS = [0.0, 0.1]
CHAT_MODELS = ["gpt-4.1-nano", "gpt-4.1-mini"]
EMBED_MODELS = ["text-embedding-3-small"]

cfgs = make_grid(
    vector_dbs=VECTOR_DBS,
    top_ks=TOP_KS,
    filters=FILTERS,
    temps=TEMPS,
    chat_models=CHAT_MODELS,
    embed_models=EMBED_MODELS,
)
print(f"n_cfgs = {len(cfgs)}")
print(f"example cfg = {cfgs[0]}")


n_cfgs = 16
example cfg = RunCfg(vector_db='fixed_window_embeddings', top_k=5, filter_fields={}, temperature=0.0, model_chat='gpt-4.1-nano', model_embed='text-embedding-3-small')


In [6]:
# The repo root is taken to be the parent of the kernel's working directory,
# so this cell assumes the notebook is run from <repo>/notebooks.
ROOT = Path.cwd().resolve().parents[0]  # if cwd == repo/notebooks
# Put <repo>/src first on sys.path so the `tracerag` imports below can resolve.
# Note: every re-run of this cell inserts the same entry again.
sys.path.insert(0, str(ROOT / "src"))

from tracerag.rag.naive import naive_rag, build_openai_chat_fn
from tracerag.retrieval.qdrant import (
    QdrantRetrievalConfig,
    build_openai_embedding_fn,
    build_qdrant_retriever,
)

# Reaching this line means both tracerag modules imported without error.
print("Imported tracerag modules OK.")


Imported tracerag modules OK.


In [7]:
def load_text(path: Path) -> str:
    """Return the entire contents of ``path`` decoded as UTF-8."""
    with path.open("r", encoding="utf-8") as handle:
        return handle.read()

def load_jsonl(path: Path) -> List[Dict[str, Any]]:
    """Parse a UTF-8 JSON Lines file into a list, one entry per non-blank line.

    Lines are stripped of surrounding whitespace before parsing; lines that are
    empty after stripping are skipped. A malformed line raises the
    ``json.loads`` error unchanged.
    """
    with path.open("r", encoding="utf-8") as handle:
        stripped_lines = (raw.strip() for raw in handle)
        return [json.loads(text) for text in stripped_lines if text]

# Input files, addressed relative to the kernel's working directory.
SYSTEM_PROMPT_PATH = Path("../data/iosxr_prompt.txt")
DATASET_PATH = Path("../data/judge_dataset.jsonl")

# System prompt used for every generation, and the prompt/completion pairs to evaluate.
SYSTEM_PROMPT = load_text(SYSTEM_PROMPT_PATH)
dataset = load_jsonl(DATASET_PATH)

# Quick look at what was loaded (indexing dataset[0] requires at least one row).
print(f"System prompt length: {len(SYSTEM_PROMPT)}")
print(f"Loaded dataset rows: {len(dataset)}")
print(f"Dataset keys: {dataset[0].keys()}")


System prompt length: 1573
Loaded dataset rows: 100
Dataset keys: dict_keys(['prompt', 'completion'])


In [8]:
def build_qdrant_filter(filter_fields: Dict[str, Any]) -> Optional[qmodels.Filter]:
    """Translate ``{field: value}`` pairs into a Qdrant filter.

    Every pair becomes a ``FieldCondition`` wrapping a ``MatchValue``, and all
    conditions are placed in the filter's ``must`` list.

    Returns:
        ``None`` when ``filter_fields`` is empty or otherwise falsy, else the
        assembled ``qmodels.Filter``.
    """
    if not filter_fields:
        return None

    conditions = []
    for field, wanted in filter_fields.items():
        matcher = qmodels.MatchValue(value=wanted)
        conditions.append(qmodels.FieldCondition(key=field, match=matcher))
    return qmodels.Filter(must=conditions)

def make_retriever(cfg: RunCfg, qdrant: QdrantClient, openai_client: OpenAI):
    """Create a ``(query, k)`` retrieval callable for one configuration.

    The embedding function, collection name, ``top_k`` and payload filter all
    come from ``cfg``. The returned callable forwards ``k`` as ``top_k`` and
    passes the same filter again on every call.
    """
    payload_filter = build_qdrant_filter(cfg.filter_fields)

    base_retriever = build_qdrant_retriever(
        qdrant=qdrant,
        embedding_fn=build_openai_embedding_fn(openai_client, model=cfg.model_embed),
        config=QdrantRetrievalConfig(
            collection_name=cfg.vector_db,
            top_k=cfg.top_k,
            query_filter=payload_filter,
        ),
    )

    def retrieve(query, k):
        return base_retriever(query, top_k=k, query_filter=payload_filter)

    return retrieve

def make_chat_fn(cfg: RunCfg, openai_client: OpenAI):
    """Build the chat function for ``cfg`` using its chat model and temperature."""
    chat_settings = {
        "model": cfg.model_chat,
        "temperature": cfg.temperature,
    }
    return build_openai_chat_fn(openai_client=openai_client, **chat_settings)


In [9]:
def rag_predict_one(prompt: str, cfg: RunCfg, qdrant: QdrantClient, openai_client: OpenAI) -> str:
    """Run ``naive_rag`` once for ``prompt`` under ``cfg`` and return its answer.

    A fresh retriever and chat function are built from ``cfg`` on every call.
    The module-level ``SYSTEM_PROMPT`` is used as the system prompt.
    """
    rag_result = naive_rag(
        user_query=prompt,
        retriever=make_retriever(cfg, qdrant, openai_client),
        chat_fn=make_chat_fn(cfg, openai_client),
        system_prompt=SYSTEM_PROMPT,
        top_k=cfg.top_k,
        answer_instruction="Return only IOS XR telemetry configuration.",
    )
    return rag_result.answer

def build_eval_data(
    dataset_rows: List[Dict[str, Any]],
    cfg: RunCfg,
    qdrant: QdrantClient,
    openai_client: OpenAI,
    max_examples: Optional[int] = None,
) -> List[Dict[str, Any]]:
    """Generate a candidate answer for each dataset row and package it for evaluation.

    Args:
        dataset_rows: Rows carrying ``"prompt"`` and ``"completion"`` keys.
        cfg: Configuration used to produce each candidate.
        qdrant: Qdrant client handed to the retriever.
        openai_client: OpenAI client used for embeddings and chat.
        max_examples: Cap on how many leading rows to process; ``None`` means
            all rows, and a non-positive value yields no rows.

    Returns:
        One dict per processed row with ``inputs``, ``outputs`` and
        ``expectations`` entries.
    """
    limit = len(dataset_rows)
    if max_examples is not None:
        limit = min(limit, max_examples)

    records = []
    # Clamp at 0 so a negative limit selects nothing instead of slicing from the end.
    for row in dataset_rows[: max(limit, 0)]:
        prompt = row["prompt"]
        reference = row["completion"]
        candidate = rag_predict_one(prompt, cfg, qdrant, openai_client)
        records.append(
            {
                "inputs": {"prompt": prompt},
                "outputs": {"response": candidate},
                "expectations": {"expected_response": reference},
            }
        )
    return records


In [11]:
def _safe_json(x):
    if x is None:
        return None
    if isinstance(x, (dict, list)):
        return x
    if isinstance(x, str):
        s = x.strip()
        if not s:
            return None
        try:
            return json.loads(s)
        except Exception:
            return {"_raw": s}
    return {"_raw": str(x)}

def _as_dict(value) -> Dict[str, Any]:
    """Return ``value`` when it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def _assessment_name(assessment) -> Optional[str]:
    """Read an assessment's name from ``name``, falling back to ``assessment_name``."""
    return getattr(assessment, "name", None) or getattr(assessment, "assessment_name", None)


def export_run_traces_full(run_id: str, judge_name: str = JUDGE_NAME) -> pd.DataFrame:
    """Build a per-example table for ONE MLflow run.

    Args:
        run_id: MLflow run whose traces are exported.
        judge_name: Name of the judge assessment to read the score and
            rationale from.

    Returns:
        A DataFrame with one row per trace and the columns ``run_id``,
        ``trace_id``, ``prompt``, ``candidate``, ``expected``, ``score``,
        ``rationale``, plus one ``param.<name>`` column per run parameter.
        ``score`` is coerced to numeric (unparseable values become NaN).
        If the run has no traces the frame has no columns at all.
    """
    client = MlflowClient()
    run_params = dict(client.get_run(run_id).data.params)

    # Look up the trace ids that belong to this run.
    traces_df = mlflow.search_traces(filter_string=f"trace.run_id = '{run_id}'")
    trace_ids = list(traces_df["trace_id"].values) if "trace_id" in traces_df.columns else []

    rows = []
    for trace_id in trace_ids:
        trace = mlflow.get_trace(trace_id)
        # NOTE(review): get_trace appears able to return None for a missing
        # trace -- confirm. Reading via getattr keeps the row (all fields None)
        # instead of raising AttributeError.
        trace_data = getattr(trace, "data", None)
        trace_info = getattr(trace, "info", None)

        req = _safe_json(getattr(trace_data, "request", None))
        resp = _safe_json(getattr(trace_data, "response", None))

        prompt = expected = candidate = None

        # Accept both a nested {"inputs": {...}} payload and a flat {"prompt": ...};
        # non-dict sub-payloads are treated as empty rather than raising.
        if isinstance(req, dict):
            inputs = _as_dict(req.get("inputs"))
            expectations = _as_dict(req.get("expectations"))
            prompt = inputs.get("prompt") or req.get("prompt")
            expected = expectations.get("expected_response") or expectations.get("reference")

        if isinstance(resp, dict):
            outputs = _as_dict(resp.get("outputs"))
            candidate = outputs.get("response") or resp.get("response")

        score = rationale = None
        for assessment in getattr(trace_info, "assessments", None) or []:
            # Match on `name` first: checking only `assessment_name` leaves every
            # score empty if the entity does not expose that attribute.
            name = _assessment_name(assessment)
            if name == judge_name:
                # If several assessments match, the last one wins.
                feedback = getattr(assessment, "feedback", None)
                score = getattr(feedback, "value", None) if feedback is not None else None
                rationale = getattr(assessment, "rationale", None)
            elif expected is None and name in ("expected_response", "reference"):
                # NOTE(review): assumes evaluate() stores expectations as
                # assessments carrying an `expectation.value` payload -- confirm
                # against the installed MLflow version. Only used when the
                # request payload did not provide the expected text.
                expectation = getattr(assessment, "expectation", None)
                expected = getattr(expectation, "value", None) if expectation is not None else None

        rows.append({
            "run_id": run_id,
            "trace_id": trace_id,
            "prompt": prompt,
            "candidate": candidate,
            "expected": expected,
            "score": score,
            "rationale": rationale,
            **{f"param.{k}": v for k, v in run_params.items()},
        })

    df = pd.DataFrame(rows)
    if "score" in df.columns:
        df["score"] = pd.to_numeric(df["score"], errors="coerce")
    return df


In [12]:
def log_cfg_params(cfg: RunCfg):
    """Log every field of ``cfg`` as an MLflow parameter.

    ``filter_fields`` is serialised to JSON with sorted keys so the same filter
    always yields the same parameter string.
    """
    serialized_filter = json.dumps(cfg.filter_fields, sort_keys=True)
    params = {
        "vector_db": cfg.vector_db,
        "top_k": cfg.top_k,
        "filter_fields": serialized_filter,
        "temperature": cfg.temperature,
        "model_chat": cfg.model_chat,
        "model_embed": cfg.model_embed,
    }
    mlflow.log_params(params)

def log_eval_aggregates(df: pd.DataFrame):
    """Log summary judge metrics for the per-example table ``df``.

    Metrics:
      - ``judge_mean`` / ``judge_min``: mean and minimum over the scored rows.
      - ``judge_pass_rate_ge_0.8``: share of ALL rows scoring >= 0.8, so rows
        without a score count as failures.
      - ``n_examples``: number of rows with a numeric score (always logged).

    Args:
        df: Per-example table. A missing ``score`` column (e.g. the column-less
            frame produced when a run has no traces) is treated as "no scores"
            instead of raising ``KeyError``.

    When no row has a score, only ``n_examples`` (0) is logged: the mean and
    minimum would be NaN and a 0.0 pass rate would look like a real result.
    """
    if "score" in df.columns:
        scores = pd.to_numeric(df["score"], errors="coerce")
    else:
        scores = pd.Series(dtype="float64")

    n_scored = int(scores.notna().sum())
    if n_scored > 0:
        mlflow.log_metric("judge_mean", float(scores.mean()))
        mlflow.log_metric("judge_min", float(scores.min()))
        mlflow.log_metric("judge_pass_rate_ge_0.8", float((scores >= 0.8).mean()))
    mlflow.log_metric("n_examples", n_scored)

def run_one_cfg_mlflow(
    cfg: RunCfg,
    dataset_rows: List[Dict[str, Any]],
    qdrant: QdrantClient,
    openai_client: OpenAI,
    max_examples: Optional[int] = None,
):
    """Evaluate one configuration end to end and record it in MLflow.

    Steps, in order: log the config as parameters, generate candidates, score
    them with ``gt_judge``, export the per-example table, log aggregate
    metrics, and attach the table as a CSV artifact.

    Side effects:
        Writes ``per_example_eval.csv`` into the current working directory,
        overwriting any previous copy, before logging it as an artifact.

    Returns:
        The per-example DataFrame from ``export_run_traces_full``.
    """
    log_cfg_params(cfg)

    eval_rows = build_eval_data(
        dataset_rows, cfg, qdrant, openai_client, max_examples=max_examples
    )

    # Called for its side effects (traces + assessments); the result object is not used.
    mlflow.genai.evaluate(data=eval_rows, scorers=[gt_judge])

    active_run_id = mlflow.active_run().info.run_id

    # Pull the full per-example evidence back out of the run's traces.
    per_example_df = export_run_traces_full(active_run_id, judge_name=JUDGE_NAME)

    # Aggregate metrics are logged on the same run.
    log_eval_aggregates(per_example_df)

    # Persist the per-example table as a run artifact.
    csv_name = "per_example_eval.csv"
    per_example_df.to_csv(csv_name, index=False)
    mlflow.log_artifact(csv_name)

    # Nice UI table (if supported by your MLflow version)
    # mlflow.log_table(df, "per_example_eval.json")

    return per_example_df
