In [1]:
# Optional one-time setup. Run it in its own cell; the extra is quoted so the
# shell does not try to glob the square brackets:
# %pip install -U mlflow "mlflow[genai]" openai litellm qdrant-client pandas

import os

# Confirm the OpenAI credential is visible to this kernel. Only presence,
# length and a short prefix are printed -- never the full secret.
key = os.getenv("OPENAI_API_KEY")
print("OPENAI_API_KEY is set:", bool(key))
if key:
    print("Length:", len(key))
    print("Prefix:", f"{key[:7]}...")
else:
    print("Length:", None)
    print("Prefix:", None)


OPENAI_API_KEY is set: True
Length: 164
Prefix: sk-proj...


In [None]:
from pathlib import Path
import mlflow

# Keep every run in a local, file-backed MLflow store under the current
# working directory.
TRACKING_DIR = (Path.cwd() / "mlruns").resolve()

# Path.as_uri() produces a well-formed file URI on every platform: it adds the
# right number of slashes for Windows drive paths and percent-encodes
# characters (spaces, '#', '%') that a hand-built "file://" + str(path) would
# leave ambiguous. On a plain POSIX path the result is unchanged.
mlflow.set_tracking_uri(TRACKING_DIR.as_uri())

# Experiment that all runs created by this notebook are filed under.
EXPERIMENT_NAME = "xr_rag_llm_judge"
mlflow.set_experiment(EXPERIMENT_NAME)

print("Tracking:", mlflow.get_tracking_uri())
print("Experiment:", EXPERIMENT_NAME)


Tracking: file:///home/musel/Documents/github/TRACE/notebooks/mlruns
Experiment: xr_rag_llm_judge_sweep


  return FileStore(store_uri, store_uri)


In [3]:
from mlflow.genai import make_judge

# Name under which the judge is registered; the trace post-processing code
# looks assessments up by this name.
JUDGE_NAME = "xr_config_quality_score"

# Grading prompt handed to the LLM judge. The {{ inputs }}, {{ outputs }} and
# {{ expectations }} markers are left verbatim here -- presumably MLflow fills
# them in per evaluated example (confirm against the make_judge docs).
_JUDGE_INSTRUCTIONS = "".join([
    "You are grading an IOS XR telemetry configuration.\n\n",
    "User request (inputs): {{ inputs }}\n\n",
    "Candidate config (outputs): {{ outputs }}\n\n",
    "Reference acceptable config (expectations): {{ expectations }}\n\n",
    "Score quality from 0.0 to 1.0.\n",
    "Hard requirements (must match): IP, port, transport (grpc no-tls), encoding.\n",
    "Be lenient about names, ordering, and sample-interval unless requested.\n",
    "Penalize only if sensor-paths are clearly unrelated.\n",
    "Do NOT output telemetry config. Do NOT output code. Keep any explanation extremely short.",
])

# Judge returning a float score; temperature 0 and a 300-token cap are passed
# through as inference parameters.
gt_judge = make_judge(
    name=JUDGE_NAME,
    instructions=_JUDGE_INSTRUCTIONS,
    feedback_value_type=float,
    model="openai:/gpt-4.1-mini",
    inference_params={"temperature": 0, "max_tokens": 300},
)


In [4]:
from dataclasses import dataclass
from typing import Any, Dict, List

@dataclass(frozen=True)
class RunCfg:
    """One point of the evaluation sweep: a retrieval + generation configuration.

    Instances are immutable (``frozen=True``). Note that ``filter_fields`` is a
    dict, so although the dataclass is frozen, calling ``hash()`` on an
    instance raises ``TypeError``; do not use instances as dict keys or set
    members.
    """
    vector_db: str  # used as the Qdrant collection name when the retriever is built
    top_k: int  # number of hits requested from the retriever per query
    filter_fields: Dict[str, Any]  # payload key -> required value; empty dict means no filter
    temperature: float  # sampling temperature passed to the chat-function builder
    model_chat: str  # chat model name passed to the chat-function builder
    model_embed: str  # embedding model name passed to the embedding-function builder

def make_grid(
    vector_dbs: List[str],
    top_ks: List[int],
    filters: List[Dict[str, Any]],
    temps: List[float],
    chat_models: List[str],
    embed_models: List[str],
) -> List[RunCfg]:
    """Return the full Cartesian product of the sweep axes as ``RunCfg`` objects.

    The ordering is that of nested loops with ``vector_dbs`` outermost and
    ``embed_models`` innermost, so the last axis varies fastest. If any axis is
    empty the result is an empty list.
    """
    return [
        RunCfg(collection, limit, payload_filter, temp, chat_model, embed_model)
        for collection in vector_dbs
        for limit in top_ks
        for payload_filter in filters
        for temp in temps
        for chat_model in chat_models
        for embed_model in embed_models
    ]

# Sweep axes. Every combination of the values below becomes one RunCfg.
VECTOR_DBS = ["fixed_window_embeddings", "catalog_embeddings_improved"]
TOP_KS = [5]
TEMPS = [0.0]
CHAT_MODELS = ["gpt-4.1-nano", "gpt-4.1-mini"]
EMBED_MODELS = ["text-embedding-3-small"]
FILTERS = [{}]  # add your payload filters later if needed

cfgs = make_grid(
    vector_dbs=VECTOR_DBS,
    top_ks=TOP_KS,
    filters=FILTERS,
    temps=TEMPS,
    chat_models=CHAT_MODELS,
    embed_models=EMBED_MODELS,
)
# Display the grid size and the first configuration as a quick sanity check.
len(cfgs), cfgs[0]


(4,
 RunCfg(vector_db='fixed_window_embeddings', top_k=5, filter_fields={}, temperature=0.0, model_chat='gpt-4.1-nano', model_embed='text-embedding-3-small'))

In [5]:
import sys
import inspect
from pathlib import Path

ROOT = Path.cwd().resolve().parents[0]  # adjust if needed
sys.path.insert(0, str(ROOT / "src"))

from openai import OpenAI
from qdrant_client import QdrantClient
from qdrant_client.http import models as qmodels

from tracerag.rag.naive import naive_rag, build_openai_chat_fn
from tracerag.retrieval.qdrant import (
    QdrantRetrievalConfig,
    build_openai_embedding_fn,
    build_qdrant_retriever,
)

def build_qdrant_filter(filter_fields: Dict[str, Any]) -> qmodels.Filter | None:
    """Translate a ``{payload_key: value}`` mapping into a Qdrant filter.

    Each entry becomes one exact-match ``FieldCondition``; all conditions are
    placed in the filter's ``must`` list. An empty (or otherwise falsy)
    mapping yields ``None``, i.e. no filtering.
    """
    if not filter_fields:
        return None
    conditions = [
        qmodels.FieldCondition(key=field, match=qmodels.MatchValue(value=wanted))
        for field, wanted in filter_fields.items()
    ]
    return qmodels.Filter(must=conditions)

def make_retriever(*, cfg: RunCfg, qdrant: QdrantClient, openai_client: OpenAI):
    """Build a ``(query, k)`` retriever callable for one sweep configuration.

    The embedding function uses ``cfg.model_embed``, the collection is
    ``cfg.vector_db`` and the payload filter is derived from
    ``cfg.filter_fields``. The returned callable forwards ``k`` as ``top_k``
    and re-sends the same filter on every call.
    """
    embed_fn = build_openai_embedding_fn(openai_client, model=cfg.model_embed)
    payload_filter = build_qdrant_filter(cfg.filter_fields)

    base_retriever = build_qdrant_retriever(
        qdrant=qdrant,
        embedding_fn=embed_fn,
        config=QdrantRetrievalConfig(
            collection_name=cfg.vector_db,
            top_k=cfg.top_k,
            query_filter=payload_filter,
        ),
    )

    def retrieve(query, k):
        # Per-call values are passed explicitly alongside the config built above.
        return base_retriever(query, top_k=k, query_filter=payload_filter)

    return retrieve

def make_chat_fn(*, cfg: RunCfg, openai_client: OpenAI):
    """Build the chat callable for ``cfg``, adapting to the builder's signature.

    ``build_openai_chat_fn`` is inspected at call time: the client is passed as
    ``openai_client=`` if that parameter exists, otherwise as ``client=``, and
    as the first positional argument when neither name is present.
    """
    accepted = inspect.signature(build_openai_chat_fn).parameters
    chat_kwargs = dict(model=cfg.model_chat, temperature=cfg.temperature)

    for client_kw in ("openai_client", "client"):
        if client_kw in accepted:
            return build_openai_chat_fn(**{client_kw: openai_client}, **chat_kwargs)

    # last resort: positional client + kwargs
    return build_openai_chat_fn(openai_client, **chat_kwargs)

def rag_predict_one(
    *,
    prompt: str,
    cfg: RunCfg,
    qdrant,
    openai_client,
    system_prompt: str,
) -> str:
    """Run the naive RAG pipeline for a single prompt and return its answer.

    A retriever and a chat function are built from ``cfg`` on every call, then
    handed to ``naive_rag`` together with ``system_prompt`` and ``cfg.top_k``.
    Only the ``answer`` attribute of the pipeline response is returned.
    """
    rag_response = naive_rag(
        user_query=prompt,
        retriever=make_retriever(cfg=cfg, qdrant=qdrant, openai_client=openai_client),
        chat_fn=make_chat_fn(cfg=cfg, openai_client=openai_client),
        system_prompt=system_prompt,
        top_k=cfg.top_k,
        answer_instruction="Return only IOS XR telemetry configuration.",
    )
    return rag_response.answer



In [6]:
import json
from typing import Any, Dict, List

def load_jsonl(path: Path) -> List[Dict[str, Any]]:
    rows: List[Dict[str, Any]] = []
    with path.open("r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            rows.append(json.loads(line))
    return rows

# Evaluation set: one JSON object per line; the code below reads the "prompt"
# and "completion" keys of each record.
DATASET_PATH = Path("../data/judge_dataset.jsonl")  # <- change if needed
dataset_rows = load_jsonl(DATASET_PATH)
print(f"Loaded: {len(dataset_rows)} rows")
print(f"Keys: {dataset_rows[0].keys()}")

# System-prompt variants compared in the sweep, as (name, system_prompt) pairs.
# "base" sends an empty system prompt; "strict" appends the engineered prompt
# read from disk.
BASE_SYSTEM_PROMPT = ""
PROMPT_ENGINEERING = Path("../data/iosxr_prompt.txt").read_text(encoding="utf-8")
PROMPT_VARIANTS = [
    ("base", BASE_SYSTEM_PROMPT),
    # Further variants (different style, stricter rules, shorter, ...) go here.
    ("strict", BASE_SYSTEM_PROMPT + PROMPT_ENGINEERING),
]


Loaded: 100 rows
Keys: dict_keys(['prompt', 'completion'])


In [7]:
from typing import Optional, List, Dict, Any

def build_eval_data(
    dataset_rows: List[Dict[str, Any]],
    cfg: RunCfg,
    qdrant,
    openai_client,
    system_prompt: str,
    max_examples: Optional[int] = None,
):
    """Generate a candidate answer for each dataset row and package it for evaluation.

    Only the first ``max_examples`` rows are used (all rows when it is
    ``None``). Each returned record has three keys:

    * ``inputs``       -- ``{"prompt": <row prompt>}``
    * ``outputs``      -- the RAG answer as a plain string
    * ``expectations`` -- ``{"expected_response": <row completion>}``
    """
    if max_examples is None:
        limit = len(dataset_rows)
    else:
        limit = min(len(dataset_rows), max_examples)

    def to_record(row: Dict[str, Any]) -> Dict[str, Any]:
        # Read both fields up front so a malformed row fails before any
        # model call is made.
        question = row["prompt"]
        gold = row["completion"]
        answer = rag_predict_one(
            prompt=question,
            cfg=cfg,
            qdrant=qdrant,
            openai_client=openai_client,
            system_prompt=system_prompt,
        )
        return {
            "inputs": {"prompt": question},
            "outputs": answer,  # kept as a plain string
            "expectations": {"expected_response": gold},  # must be a dict
        }

    return [to_record(dataset_rows[idx]) for idx in range(limit)]


In [8]:
import json
import pandas as pd
from typing import Any, Dict, List

# Keys under which a reference answer may be stored, in priority order.
_EXPECTED_KEYS = ("expected_response", "reference", "expected")


def _maybe_json(x: Any) -> Any:
    """Decode ``x`` if it is a string that looks like a JSON object or array.

    Anything else -- non-strings, strings not wrapped in ``{}``/``[]``, or
    strings that fail to parse -- is returned unchanged.
    """
    if not isinstance(x, str):
        return x
    s = x.strip()
    wrapped = (s.startswith("{") and s.endswith("}")) or (s.startswith("[") and s.endswith("]"))
    if not wrapped:
        return x
    try:
        return json.loads(s)
    except (ValueError, RecursionError):
        # Best effort: keep the raw text when it only resembles JSON.
        return x


def _is_missing(value: Any) -> bool:
    """True for ``None`` and float NaN (what pandas puts in absent cells)."""
    return value is None or (isinstance(value, float) and value != value)


def _field(obj: Any, *names: str) -> Any:
    """Return the first non-missing value among ``names`` on ``obj``.

    Works on both dicts (key lookup) and arbitrary objects (attribute lookup),
    so assessment entries can be handled whether they arrive as plain dicts or
    as entity objects.
    """
    for name in names:
        value = obj.get(name) if isinstance(obj, dict) else getattr(obj, name, None)
        if not _is_missing(value):
            return value
    return None


def _pick_expected(mapping: Dict[str, Any]) -> Any:
    """Look a reference answer up under the known expectation keys."""
    return mapping.get("expected_response") or mapping.get("reference") or mapping.get("expected")


def _feedback_value(assessment: Any) -> Any:
    """Extract the judge's value from one assessment entry.

    Accepts a nested payload (``feedback.value``), a payload that is itself a
    scalar, or a flat ``value`` field directly on the entry.
    """
    payload = _field(assessment, "feedback")
    if payload is None:
        return _field(assessment, "value")
    if isinstance(payload, (str, int, float, bool)):
        return payload
    return _field(payload, "value")


def _expected_from_assessments(assessments: Any) -> Any:
    """Find a reference answer stored as an expectation-type assessment."""
    for entry in assessments:
        if _field(entry, "assessment_name", "name") not in _EXPECTED_KEYS:
            continue
        payload = _field(entry, "expectation")
        if payload is None:
            continue
        value = payload if isinstance(payload, str) else _field(payload, "value")
        if value is not None:
            return value
    return None


def traces_to_eval_df(traces: Any, judge_name: str) -> pd.DataFrame:
    """Flatten MLflow trace rows into one tidy row per evaluated example.

    Args:
        traces: A DataFrame of traces, or anything ``pd.DataFrame`` accepts.
            The columns read are ``trace_id``, ``request``, ``response``,
            ``expectations``, ``expected_response`` and ``assessments``; any
            of them may be absent.
        judge_name: Name of the assessment whose value becomes ``score``.

    Returns:
        DataFrame with columns ``trace_id, prompt, expected, candidate, score,
        rationale``. ``score`` is numeric; rows without a usable judge value
        hold NaN.

    NOTE(review): the assessment shapes handled here (dicts with a nested
    ``feedback``/``expectation`` payload, flat dicts, attribute-style objects)
    are meant to cover the variants MLflow may hand back -- confirm against
    the installed MLflow version.
    """
    tdf = traces if isinstance(traces, pd.DataFrame) else pd.DataFrame(traces)
    out_rows: List[Dict[str, Any]] = []

    for _, r in tdf.iterrows():
        trace_id = r.get("trace_id")

        req = _maybe_json(r.get("request"))
        resp = _maybe_json(r.get("response"))
        exps = _maybe_json(r.get("expectations"))
        assessments = _maybe_json(r.get("assessments"))
        if not isinstance(assessments, (list, tuple)):
            assessments = []  # absent / NaN cell: nothing to scan

        # ---- prompt ----
        prompt = None
        if isinstance(req, dict):
            inputs = req.get("inputs")
            # Guard against a non-dict "inputs" value instead of raising.
            nested_prompt = inputs.get("prompt") if isinstance(inputs, dict) else None
            prompt = req.get("prompt") or nested_prompt

        # ---- expected ----
        expected = None
        # 1) best: dedicated expectations column
        if isinstance(exps, dict):
            expected = _pick_expected(exps)
        elif isinstance(exps, str):
            expected = exps

        # 2) fallback: expectations tucked into the request payload
        if expected is None and isinstance(req, dict):
            exp_obj = req.get("expectations")
            if isinstance(exp_obj, dict):
                expected = _pick_expected(exp_obj)

        # 3) fallback: flattened column (NaN filler counts as absent)
        if expected is None:
            flat = r.get("expected_response")
            expected = None if _is_missing(flat) else flat

        # 4) fallback: expectation recorded as an assessment on the trace
        if expected is None:
            expected = _expected_from_assessments(assessments)

        # ---- candidate ----
        candidate = None
        if isinstance(resp, str):
            candidate = resp
        elif isinstance(resp, dict):
            candidate = resp.get("outputs") or resp.get("response") or resp.get("output") or resp.get("text")
            if isinstance(candidate, dict):
                candidate = candidate.get("response") or candidate.get("text") or str(candidate)

        # ---- judge score + rationale ----
        score = None
        rationale = None
        for entry in assessments:
            if _field(entry, "assessment_name", "name") != judge_name:
                continue
            score = _feedback_value(entry)
            rationale = _field(entry, "rationale", "explanation")
            break  # first assessment with the judge's name wins

        out_rows.append({
            "trace_id": trace_id,
            "prompt": prompt,
            "expected": expected,
            "candidate": candidate,
            "score": score,
            "rationale": rationale,
        })

    df = pd.DataFrame(out_rows, columns=["trace_id", "prompt", "expected", "candidate", "score", "rationale"])
    # Non-numeric or missing values become NaN rather than raising.
    df["score"] = pd.to_numeric(df["score"], errors="coerce")
    return df


In [9]:
import json
import tempfile
from pathlib import Path
import mlflow

def run_one_cfg_mlflow(
    *,
    cfg,
    dataset_rows,
    qdrant,
    openai_client,
    system_prompt_name: str,
    system_prompt: str,
    max_examples=None,
):
    """Evaluate one sweep configuration and log everything to the active MLflow run.

    Must be called while an MLflow run is active. Steps:

    1. log the configuration as run parameters;
    2. generate candidates via ``build_eval_data`` and score them with the
       module-level ``gt_judge``;
    3. read the resulting traces back and flatten them to one row per example;
    4. log aggregate metrics and the per-example table as artifacts.

    Args:
        cfg: Sweep configuration (``RunCfg``).
        dataset_rows: Records with ``prompt`` and ``completion`` keys.
        qdrant: Qdrant client used for retrieval.
        openai_client: OpenAI client used for embedding and chat.
        system_prompt_name: Label of the prompt variant (logged as a param).
        system_prompt: System prompt text given to the generator.
        max_examples: Optional cap on how many rows are evaluated.

    Returns:
        ``(df, results)``: the per-example DataFrame and the object returned
        by ``mlflow.genai.evaluate``.

    Metrics logged:
        ``n_examples`` (rows evaluated), ``n_scored`` (rows with a judge
        score), and -- only when at least one row was scored --
        ``judge_mean``, ``judge_min`` and ``judge_pass_rate_ge_0.8``, all
        computed over scored rows only. When nothing was scored the run is
        tagged ``judge_scores_missing=true`` and a warning is emitted instead
        of logging a made-up mean.
    """
    import warnings  # local import keeps this cell self-contained

    mlflow.log_params({
        "vector_db": cfg.vector_db,
        "top_k": cfg.top_k,
        "filter_fields": json.dumps(cfg.filter_fields, sort_keys=True),
        "temperature": cfg.temperature,
        "model_chat": cfg.model_chat,
        "model_embed": cfg.model_embed,
        "system_prompt_variant": system_prompt_name,
    })

    eval_data = build_eval_data(
        dataset_rows=dataset_rows,
        cfg=cfg,
        qdrant=qdrant,
        openai_client=openai_client,
        system_prompt=system_prompt,
        max_examples=max_examples,
    )

    results = mlflow.genai.evaluate(data=eval_data, scorers=[gt_judge])

    # Remember which run holds the traces so they can be re-read later.
    eval_run_id = results.run_id
    mlflow.set_tag("eval_run_id", eval_run_id)

    traces = mlflow.search_traces(run_id=eval_run_id)
    df = traces_to_eval_df(traces, judge_name=JUDGE_NAME)

    # attach run context columns
    df["system_prompt_variant"] = system_prompt_name
    df["vector_db"] = cfg.vector_db
    df["top_k"] = cfg.top_k
    df["temperature"] = cfg.temperature
    df["model_chat"] = cfg.model_chat
    df["model_embed"] = cfg.model_embed

    # --- aggregate metrics ---
    n_scored = int(df["score"].notna().sum()) if "score" in df.columns else 0
    mlflow.log_metric("n_examples", int(len(df)))
    mlflow.log_metric("n_scored", n_scored)

    if n_scored > 0:
        # Every aggregate uses the same denominator: rows that actually have a
        # score. Unscored rows must not count as failures in the pass rate.
        scored = df["score"].dropna()
        mlflow.log_metric("judge_mean", float(scored.mean()))
        mlflow.log_metric("judge_min", float(scored.min()))
        mlflow.log_metric("judge_pass_rate_ge_0.8", float((scored >= 0.8).mean()))
    else:
        # A fabricated 0.0 would be indistinguishable from a genuinely bad
        # configuration, so flag the problem explicitly instead.
        mlflow.set_tag("judge_scores_missing", "true")
        warnings.warn(
            f"No judge scores named {JUDGE_NAME!r} were found for eval run {eval_run_id}; "
            "judge_* metrics were not logged.",
            RuntimeWarning,
            stacklevel=2,
        )

    # --- export the per-example table as run artifacts ---
    with tempfile.TemporaryDirectory() as td:
        p = Path(td) / "per_example_eval.csv"
        df.to_csv(p, index=False)
        mlflow.log_artifact(str(p))
        if hasattr(mlflow, "log_table"):
            mlflow.log_table(df, "per_example_eval_table.json")

    return df, results


In [10]:
# Optional: small sample for quick test first.
# NOTE: this rebinds `dataset_rows`; re-run the loading cell to get the full
# dataset back before a real sweep.
import random

random.seed(42)
# Cap the sample size at the dataset size: random.sample raises ValueError
# when asked for more items than the population holds.
dataset_rows = random.sample(dataset_rows, min(2, len(dataset_rows)))

In [11]:
# Create the service clients once and share them across every run.
qdrant = QdrantClient(host="localhost", port=6333)
openai_client = OpenAI()

# One parent run groups the sweep; each (prompt variant, config) pair gets its
# own nested child run. Prompt variants are the outer axis, configs the inner.
with mlflow.start_run(run_name="parent_sweep") as parent:
    sweep = [(pname, psys, cfg) for (pname, psys) in PROMPT_VARIANTS for cfg in cfgs]
    for pname, psys, cfg in sweep:
        run_name = "|".join((
            f"{pname}",
            f"{cfg.vector_db}",
            f"k={cfg.top_k}",
            f"t={cfg.temperature}",
            f"chat={cfg.model_chat}",
            f"emb={cfg.model_embed}",
        ))
        with mlflow.start_run(run_name=run_name, nested=True):
            df, _ = run_one_cfg_mlflow(
                cfg=cfg,
                dataset_rows=dataset_rows,
                qdrant=qdrant,
                openai_client=openai_client,
                system_prompt_name=pname,
                system_prompt=psys,
                max_examples=50,  # bump/remove later
            )
            print("DONE:", run_name, "mean=", df["score"].mean())


  from .autonotebook import tqdm as notebook_tqdm
2026/01/12 17:24:14 INFO mlflow.models.evaluation.utils.trace: Auto tracing is temporarily enabled during the model evaluation for computing some metrics and debugging. To disable tracing, call `mlflow.autolog(disable=True)`.
  PydanticSerializationUnexpectedValue(Expected 10 fields but got 6: Expected `Message` - serialized value may not be as expected [field_name='message', input_value=Message(content='{"result...: None}, annotations=[]), input_type=Message])
  PydanticSerializationUnexpectedValue(Expected `StreamingChoices` - serialized value may not be as expected [field_name='choices', input_value=Choices(finish_reason='st...ider_specific_fields={}), input_type=Choices])
  return self.__pydantic_serializer__.to_python(
Evaluating: 100%|██████████| 2/2 [Elapsed: 00:05, Remaining: 00:00] 



✨ Evaluation completed.

Metrics and evaluation results are logged to the MLflow run:
  Run name: [94mbase|fixed_window_embeddings|k=5|t=0.0|chat=gpt-4.1-nano|emb=text-embedding-3-small[0m
  Run ID: [94m63199c3fbc504d10b7029dc551bfb8b8[0m

To view the detailed evaluation results with sample-wise scores,
open the [93m[1mTraces[0m tab in the Run page in the MLflow UI.

DONE: base|fixed_window_embeddings|k=5|t=0.0|chat=gpt-4.1-nano|emb=text-embedding-3-small mean= nan


Evaluating: 100%|██████████| 2/2 [Elapsed: 00:04, Remaining: 00:00] 



✨ Evaluation completed.

Metrics and evaluation results are logged to the MLflow run:
  Run name: [94mbase|fixed_window_embeddings|k=5|t=0.0|chat=gpt-4.1-mini|emb=text-embedding-3-small[0m
  Run ID: [94m69b4515bfd5a4bfbbcf141b297c80e8d[0m

To view the detailed evaluation results with sample-wise scores,
open the [93m[1mTraces[0m tab in the Run page in the MLflow UI.

DONE: base|fixed_window_embeddings|k=5|t=0.0|chat=gpt-4.1-mini|emb=text-embedding-3-small mean= nan


Evaluating: 100%|██████████| 2/2 [Elapsed: 00:02, Remaining: 00:00] 



✨ Evaluation completed.

Metrics and evaluation results are logged to the MLflow run:
  Run name: [94mbase|catalog_embeddings_improved|k=5|t=0.0|chat=gpt-4.1-nano|emb=text-embedding-3-small[0m
  Run ID: [94m435d6efb7cdf4090965c123817fa85f8[0m

To view the detailed evaluation results with sample-wise scores,
open the [93m[1mTraces[0m tab in the Run page in the MLflow UI.

DONE: base|catalog_embeddings_improved|k=5|t=0.0|chat=gpt-4.1-nano|emb=text-embedding-3-small mean= nan


Evaluating: 100%|██████████| 2/2 [Elapsed: 00:03, Remaining: 00:00] 



✨ Evaluation completed.

Metrics and evaluation results are logged to the MLflow run:
  Run name: [94mbase|catalog_embeddings_improved|k=5|t=0.0|chat=gpt-4.1-mini|emb=text-embedding-3-small[0m
  Run ID: [94m197dd2a4db1f48e6afad445f47ce55a1[0m

To view the detailed evaluation results with sample-wise scores,
open the [93m[1mTraces[0m tab in the Run page in the MLflow UI.

DONE: base|catalog_embeddings_improved|k=5|t=0.0|chat=gpt-4.1-mini|emb=text-embedding-3-small mean= nan


Evaluating: 100%|██████████| 2/2 [Elapsed: 00:03, Remaining: 00:00] 



✨ Evaluation completed.

Metrics and evaluation results are logged to the MLflow run:
  Run name: [94mstrict|fixed_window_embeddings|k=5|t=0.0|chat=gpt-4.1-nano|emb=text-embedding-3-small[0m
  Run ID: [94m6dfb9aa9bde24523965b569b80be93e7[0m

To view the detailed evaluation results with sample-wise scores,
open the [93m[1mTraces[0m tab in the Run page in the MLflow UI.

DONE: strict|fixed_window_embeddings|k=5|t=0.0|chat=gpt-4.1-nano|emb=text-embedding-3-small mean= nan


Evaluating: 100%|██████████| 2/2 [Elapsed: 00:06, Remaining: 00:00] 



✨ Evaluation completed.

Metrics and evaluation results are logged to the MLflow run:
  Run name: [94mstrict|fixed_window_embeddings|k=5|t=0.0|chat=gpt-4.1-mini|emb=text-embedding-3-small[0m
  Run ID: [94m2c74e28ef3ae4d2a8689fa691704d83a[0m

To view the detailed evaluation results with sample-wise scores,
open the [93m[1mTraces[0m tab in the Run page in the MLflow UI.

DONE: strict|fixed_window_embeddings|k=5|t=0.0|chat=gpt-4.1-mini|emb=text-embedding-3-small mean= nan


Evaluating: 100%|██████████| 2/2 [Elapsed: 00:05, Remaining: 00:00] 



✨ Evaluation completed.

Metrics and evaluation results are logged to the MLflow run:
  Run name: [94mstrict|catalog_embeddings_improved|k=5|t=0.0|chat=gpt-4.1-nano|emb=text-embedding-3-small[0m
  Run ID: [94m1ef3ff5615ab481fa383502b665bdd99[0m

To view the detailed evaluation results with sample-wise scores,
open the [93m[1mTraces[0m tab in the Run page in the MLflow UI.

DONE: strict|catalog_embeddings_improved|k=5|t=0.0|chat=gpt-4.1-nano|emb=text-embedding-3-small mean= nan


Evaluating: 100%|██████████| 2/2 [Elapsed: 00:03, Remaining: 00:00] 


✨ Evaluation completed.

Metrics and evaluation results are logged to the MLflow run:
  Run name: [94mstrict|catalog_embeddings_improved|k=5|t=0.0|chat=gpt-4.1-mini|emb=text-embedding-3-small[0m
  Run ID: [94m75c587858989407e9729f606cd4bc99c[0m

To view the detailed evaluation results with sample-wise scores,
open the [93m[1mTraces[0m tab in the Run page in the MLflow UI.

DONE: strict|catalog_embeddings_improved|k=5|t=0.0|chat=gpt-4.1-mini|emb=text-embedding-3-small mean= nan





In [12]:
import pandas as pd
import mlflow

# Rebuild one combined per-example table from the traces of every child run
# in the experiment and write it to CSV.
exp = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if exp is None:
    # get_experiment_by_name returns None for an unknown name; fail with a
    # clear message instead of an AttributeError on the next line.
    raise RuntimeError(
        f"MLflow experiment {EXPERIMENT_NAME!r} was not found; run the sweep cell first."
    )

runs_df = mlflow.search_runs(experiment_ids=[exp.experiment_id], output_format="pandas")

# Everything except the grouping parent run is treated as a child run. The
# run-name column may be absent (e.g. when the experiment has no runs yet), in
# which case no run can be the parent.
if "tags.mlflow.runName" in runs_df.columns:
    child = runs_df[runs_df["tags.mlflow.runName"] != "parent_sweep"].copy()
else:
    child = runs_df.copy()

all_dfs = []
for _, rr in child.iterrows():
    eval_run_id = rr.get("tags.eval_run_id")
    if pd.isna(eval_run_id):  # also true for None
        # fallback if tag missing
        eval_run_id = rr["run_id"]

    traces = mlflow.search_traces(run_id=str(eval_run_id))
    run_df = traces_to_eval_df(traces, judge_name=JUDGE_NAME)

    # Attach run metadata. Values under "params.*" are read back from MLflow
    # as logged parameters, so numeric settings may arrive as strings.
    run_df["run_id"] = rr["run_id"]
    run_df["run_name"] = rr.get("tags.mlflow.runName")
    run_df["vector_db"] = rr.get("params.vector_db")
    run_df["top_k"] = rr.get("params.top_k")
    run_df["temperature"] = rr.get("params.temperature")
    run_df["model_chat"] = rr.get("params.model_chat")
    run_df["model_embed"] = rr.get("params.model_embed")
    run_df["system_prompt_variant"] = rr.get("params.system_prompt_variant")

    all_dfs.append(run_df)

per_example_all = pd.concat(all_dfs, ignore_index=True) if all_dfs else pd.DataFrame()
per_example_all.to_csv("per_example_all_runs_from_traces.csv", index=False)
print("Wrote per_example_all_runs_from_traces.csv rows:", len(per_example_all))


Wrote per_example_all_runs_from_traces.csv rows: 28
