In [3]:
import os
import sys
import json
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, List, Optional, Sequence, Tuple

import mlflow

from openai import OpenAI
from qdrant_client import QdrantClient
from qdrant_client.http import models as qmodels

# TRL judge
from trl.experimental.judges import HfPairwiseJudge

# ---- Make sure we can import your local project modules ----
ROOT = Path.cwd()
# if your notebook is in repo root, ROOT is correct; otherwise set ROOT manually
sys.path.insert(0, str(ROOT / "src"))

print("ROOT:", ROOT)
print("src exists:", (ROOT / "src").exists())


  from .autonotebook import tqdm as notebook_tqdm


ROOT: /home/musel/Documents/github/TRACE/notebooks
src exists: False


  from trl.experimental.judges import HfPairwiseJudge


In [None]:
import sys
from pathlib import Path

ROOT = Path.cwd().resolve().parents[0]  # repo root if cwd==repo/notebooks
sys.path.insert(0, str(ROOT / "src"))

In [6]:
from tracerag.rag.naive import naive_rag, build_openai_chat_fn
from tracerag.retrieval.qdrant import (
    QdrantRetrievalConfig,
    build_openai_embedding_fn,
    build_qdrant_retriever,
)

print("Imported tracerag modules OK.")


Imported tracerag modules OK.


In [10]:
DATASET_PATH = Path("../data/judge_dataset.jsonl")  # <-- change if needed

def load_jsonl(path: Path) -> List[Dict[str, Any]]:
    rows = []
    with path.open("r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            rows.append(json.loads(line))
    return rows

dataset = load_jsonl(DATASET_PATH)
print("Loaded rows:", len(dataset))
print("Example keys:", dataset[0].keys())
print("Prompt preview:\n", dataset[0]["prompt"][:200])
print("Completion preview:\n", dataset[0]["completion"][:200])


Loaded rows: 2500
Example keys: dict_keys(['prompt', 'completion'])
Prompt preview:
 Can you generate telemetry configuration for cisco ios xr about border gateway protocol (BGP)? Use grpc with no tls, the telemetry server address is 192.0.2.100 with port 57500. Choose relevant sensor
Completion preview:
 telemetry model-driven
 sensor-group BGP-STATS
  sensor-path Cisco-IOS-XR-ipv4-bgp-oper:bgp/instances/instance/instance-active/default-vrf/sessions
  sensor-path Cisco-IOS-XR-ipv4-bgp-oper:bgp/instanc


In [23]:
import random

random.seed(42)
dataset = random.sample(dataset, 10)

In [24]:
SYSTEM_PROMPT_PATH = Path("../data/iosxr_prompt.txt")  # <-- your file
SYSTEM_PROMPT = SYSTEM_PROMPT_PATH.read_text(encoding="utf-8")

print("System prompt length:", len(SYSTEM_PROMPT))
print(SYSTEM_PROMPT[:300])


System prompt length: 1573
You are a Cisco IOS XR network engineer generating IOS XR 7.x model-driven telemetry configuration.

INPUTS:
- USER_REQUEST: describes the intent (e.g., “BGP telemetry”), destination IP/port, and optionally interval.
- CONTEXT: a list of valid YANG sensor-path candidates.

HARD RULES:
- Output ONLY 


In [25]:
@dataclass(frozen=True)
class RunCfg:
    vector_db: str                 # we'll map this to a Qdrant collection (or backend)
    top_k: int
    filter_fields: Dict[str, Any]  # e.g., {"domain":"bgp"} or {}
    temperature: float
    model_chat: str
    model_embed: str


In [26]:
def build_qdrant_filter(filter_fields: Dict[str, Any]) -> Optional[qmodels.Filter]:
    if not filter_fields:
        return None

    must: List[qmodels.FieldCondition] = []
    for key, value in filter_fields.items():
        must.append(
            qmodels.FieldCondition(
                key=key,
                match=qmodels.MatchValue(value=value),
            )
        )
    return qmodels.Filter(must=must)


In [27]:
def make_retriever(
    *,
    cfg: RunCfg,
    qdrant: QdrantClient,
    openai_client: OpenAI,
):
    embed_fn = build_openai_embedding_fn(openai_client, model=cfg.model_embed)

    q_filter = build_qdrant_filter(cfg.filter_fields)

    config = QdrantRetrievalConfig(
        collection_name=cfg.vector_db,  # <-- mapping: vector_db -> collection
        top_k=cfg.top_k,
        query_filter=q_filter,
    )

    qdrant_retriever = build_qdrant_retriever(
        qdrant=qdrant,
        embedding_fn=embed_fn,
        config=config,
    )

    # Return a function: retriever(query, k) -> List[Chunk]
    # Keep filter fixed per cfg (since filter_fields is a variation)
    return lambda query, k: qdrant_retriever(query, top_k=k, query_filter=q_filter)


def make_chat_fn(cfg: RunCfg, openai_client: OpenAI):
    return build_openai_chat_fn(
        openai_client,
        model=cfg.model_chat,
        temperature=cfg.temperature,
    )


In [28]:
ANSWER_INSTRUCTION = "Return only IOS XR telemetry configuration."

def generate_completion(
    *,
    prompt: str,
    cfg: RunCfg,
    qdrant: QdrantClient,
    openai_client: OpenAI,
) -> Dict[str, Any]:
    retriever = make_retriever(cfg=cfg, qdrant=qdrant, openai_client=openai_client)
    chat_fn = make_chat_fn(cfg, openai_client=openai_client)

    resp = naive_rag(
        user_query=prompt,
        retriever=retriever,
        chat_fn=chat_fn,
        system_prompt=SYSTEM_PROMPT,
        top_k=cfg.top_k,
        answer_instruction=ANSWER_INSTRUCTION,
    )

    # Return full trace (useful for MLflow artifacts)
    return {
        "answer": resp.answer,
        "context_preview": (resp.context or "")[:1200],
        "retrieved": [
            {
                "id": ch.id,
                "score": ch.score,
                "file_path": ch.file_path,
                "chunk_index": ch.chunk_index,
                # if catalog payload exists, keep these fields
                "module": (ch.payload or {}).get("module"),
                "path": (ch.payload or {}).get("path"),
                "domain": (ch.payload or {}).get("domain") or (ch.payload or {}).get("protocol_tag"),
                "category": (ch.payload or {}).get("category"),
            }
            for ch in (resp.chunks or [])
        ],
    }


In [29]:
judge = HfPairwiseJudge()

def judge_binary_score(prompt: str, candidate: str, reference: str) -> int:
    # returns 0 if candidate wins, 1 if reference wins
    winner_idx = judge.judge(
        prompts=[prompt],
        completions=[[candidate, reference]],
    )[0]

    # Convert to {0,1}: 1 means candidate wins
    return 1 if winner_idx == 0 else 0


In [30]:
def eval_cfg_and_log_mlflow(
    *,
    cfg: RunCfg,
    dataset: Sequence[Dict[str, Any]],
    qdrant: QdrantClient,
    openai_client: OpenAI,
    max_examples: Optional[int] = None,
):
    # --- log only your variations as params ---
    mlflow.log_params({
        "vector_db": cfg.vector_db,
        "top_k": cfg.top_k,
        "filter_fields": json.dumps(cfg.filter_fields, sort_keys=True),
        "temperature": cfg.temperature,
        "model_chat": cfg.model_chat,
        "model_embed": cfg.model_embed,
    })

    rows = []
    scores = []

    n = len(dataset) if max_examples is None else min(len(dataset), max_examples)

    for i in range(n):
        prompt = dataset[i]["prompt"]
        reference = dataset[i]["completion"]

        gen = generate_completion(
            prompt=prompt,
            cfg=cfg,
            qdrant=qdrant,
            openai_client=openai_client,
        )

        candidate = gen["answer"]
        score = judge_binary_score(prompt, candidate, reference)

        scores.append(score)
        rows.append({
            "i": i,
            "prompt": prompt,
            "reference": reference,
            "candidate": candidate,
            "judge_score": score,
            "retrieved": gen["retrieved"],
            "context_preview": gen["context_preview"],
        })

    mean_score = sum(scores) / max(1, len(scores))
    mlflow.log_metric("judge_mean", mean_score)
    mlflow.log_metric("n_examples", len(scores))

    # artifact
    out_dir = Path("mlflow_artifacts")
    out_dir.mkdir(exist_ok=True)
    out_path = out_dir / "traces.jsonl"
    with out_path.open("w", encoding="utf-8") as f:
        for r in rows:
            f.write(json.dumps(r, ensure_ascii=False) + "\n")

    mlflow.log_artifact(str(out_path))

    return mean_score


In [31]:
def make_grid(
    *,
    vector_dbs: Sequence[str],
    top_ks: Sequence[int],
    filter_fields_list: Sequence[Dict[str, Any]],
    temperatures: Sequence[float],
    model_chats: Sequence[str],
    model_embeds: Sequence[str],
) -> List[RunCfg]:
    cfgs = []
    for vdb in vector_dbs:
        for k in top_ks:
            for ff in filter_fields_list:
                for temp in temperatures:
                    for chat in model_chats:
                        for emb in model_embeds:
                            cfgs.append(RunCfg(
                                vector_db=vdb,
                                top_k=k,
                                filter_fields=ff,
                                temperature=temp,
                                model_chat=chat,
                                model_embed=emb,
                            ))
    return cfgs


# Example: fill these with YOUR options
VECTOR_DBS = [
    "catalog_embeddings",
    "fixed_window_embeddings",
    # "your_third_collection_or_db",
]

TOP_KS = [5, 10, 15]

FILTERS = [
    {},  # no filter
    {"protocol_tag": "bgp"},  # if that's what you have today
    # {"domain": "bgp"},  # after you add domain to catalog
    # {"module_family": "ipv4-bgp-oper"},  # if you store it
]

TEMPS = [0.0, 0.1, 0.2]

MODEL_CHATS = ["gpt-4.1-mini"]
MODEL_EMBEDS = ["text-embedding-3-small"]

cfgs = make_grid(
    vector_dbs=VECTOR_DBS,
    top_ks=TOP_KS,
    filter_fields_list=FILTERS,
    temperatures=TEMPS,
    model_chats=MODEL_CHATS,
    model_embeds=MODEL_EMBEDS,
)

len(cfgs), cfgs[0]


(36,
 RunCfg(vector_db='catalog_embeddings', top_k=5, filter_fields={}, temperature=0.0, model_chat='gpt-4.1-mini', model_embed='text-embedding-3-small'))

In [35]:
# Configure MLflow location (local)
mlflow.set_tracking_uri("file://" + str((Path.cwd() / "../mlruns").resolve()))
mlflow.set_experiment("xr_rag_judge_sweep")

# Clients
qdrant = QdrantClient(host="localhost", port=6333)  # adjust if cloud
openai_client = OpenAI()

with mlflow.start_run(run_name="parent_sweep") as parent:
    mlflow.set_tag("type", "parent")

    for cfg in cfgs:
        run_name = (
            f"{cfg.vector_db}|k={cfg.top_k}|t={cfg.temperature}|"
            f"chat={cfg.model_chat}|emb={cfg.model_embed}|f={json.dumps(cfg.filter_fields, sort_keys=True)}"
        )

        with mlflow.start_run(run_name=run_name, nested=True):
            mean_score = eval_cfg_and_log_mlflow(
                cfg=cfg,
                dataset=dataset,
                qdrant=qdrant,
                openai_client=openai_client,
                max_examples=None,  # or set e.g. 50 for faster iteration
            )
            print(run_name, "judge_mean =", mean_score)


  return FileStore(store_uri, store_uri)
2026/01/11 22:32:15 INFO mlflow.tracking.fluent: Experiment with name 'xr_rag_judge_sweep' does not exist. Creating a new experiment.


catalog_embeddings|k=5|t=0.0|chat=gpt-4.1-mini|emb=text-embedding-3-small|f={} judge_mean = 0.5
catalog_embeddings|k=5|t=0.1|chat=gpt-4.1-mini|emb=text-embedding-3-small|f={} judge_mean = 0.2
catalog_embeddings|k=5|t=0.2|chat=gpt-4.1-mini|emb=text-embedding-3-small|f={} judge_mean = 0.4
catalog_embeddings|k=5|t=0.0|chat=gpt-4.1-mini|emb=text-embedding-3-small|f={"protocol_tag": "bgp"} judge_mean = 0.3
catalog_embeddings|k=5|t=0.1|chat=gpt-4.1-mini|emb=text-embedding-3-small|f={"protocol_tag": "bgp"} judge_mean = 0.4
catalog_embeddings|k=5|t=0.2|chat=gpt-4.1-mini|emb=text-embedding-3-small|f={"protocol_tag": "bgp"} judge_mean = 0.2


HfHubHTTPError: 402 Client Error: Payment Required for url: https://router.huggingface.co/novita/v3/openai/chat/completions (Request ID: Root=1-696417be-6652319b03bb1c4f02ef29bf;c0614f17-8bbd-4803-a67c-782f5ddb5fca)

You have reached the free monthly usage limit for novita. Subscribe to PRO to get 20x more included usage, or add pre-paid credits to your account.

mlflow ui --backend-store-uri file:./mlruns
