In [20]:
import os

# Sanity-check that the OpenAI credential is visible to this kernel without
# echoing the whole secret: only presence, length and a short prefix are shown.
key = os.getenv("OPENAI_API_KEY")
print("OPENAI_API_KEY is set:", bool(key))
if key:
    print("Length:", len(key))
    print("Prefix:", key[:7] + "...")  # optional: shows only first chars
else:
    # Variable missing or empty: report None for both derived values.
    print("Length:", None)
    print("Prefix:", None)


OPENAI_API_KEY is set: True
Length: 164
Prefix: sk-proj...


In [39]:
from mlflow.genai.judges import make_judge

# LLM judge used as the scorer for every sweep configuration.
# The instruction template references {{ inputs }}, {{ outputs }} and
# {{ expectations }}; presumably MLflow fills these from the matching keys of
# each evaluation row — TODO confirm against the make_judge documentation.
# NOTE(review): the prompt asks for a bare number ("no JSON") and the response
# is capped at 10 tokens; confirm this is compatible with how make_judge parses
# the model reply for feedback_value_type=float, otherwise scores may fail to parse.
gt_judge = make_judge(
    name="xr_config_quality_score",
    instructions=(
        "You are evaluating IOS XR telemetry config generation.\n\n"
        "inputs:\n{{ inputs }}\n\n"
        "output:\n{{ outputs }}\n\n"
        "expected:\n{{ expectations }}\n\n"
        "Return ONLY a single floating point number between 0.0 and 1.0.\n"
        "No words, no labels, no JSON, no explanation.\n\n"
        "Scoring rubric (lenient):\n"
        "- Ignore group/subscription names and ordering.\n"
        "- Ignore sample-interval unless user explicitly requested it.\n"
        "- Be strict about user-specified IP/port/protocol/encoding.\n"
        "- Penalize sensor-paths only if clearly unrelated.\n"
    ),
    feedback_value_type=float,
    model="openai:/gpt-4.1-mini",
    # temperature 0 for repeatable scoring; max_tokens kept tiny because only a number is requested.
    inference_params={"temperature": 0, "max_tokens": 10},
)


In [40]:
from dataclasses import dataclass
from typing import Any, Dict

@dataclass(frozen=True)
class RunCfg:
    """One point in the RAG sweep grid; fields cannot be reassigned (frozen).

    NOTE(review): a class with the same name and the same fields is declared
    again further down in this notebook; the later declaration shadows this
    one, so keep the two in sync or remove one of them.
    """

    # Name of the vector store to query; used as the Qdrant collection name.
    vector_db: str
    # Number of chunks to retrieve per query.
    top_k: int
    # Payload key -> value pairs turned into Qdrant filter conditions ({} = no filter).
    # A dict value makes instances unhashable even though the dataclass is frozen.
    filter_fields: Dict[str, Any]
    # Sampling temperature handed to the chat model.
    temperature: float
    # Chat-completion model name.
    model_chat: str
    # Embedding model name used to embed queries.
    model_embed: str


In [41]:
def make_grid(vector_dbs, top_ks, filters, temps, chat_models, embed_models):
    """Build every combination of the sweep settings as a list of ``RunCfg``.

    Args:
        vector_dbs: iterable of vector store / collection names.
        top_ks: iterable of retrieval depths.
        filters: iterable of filter-field dicts (``{}`` for "no filter").
        temps: iterable of chat temperatures.
        chat_models: iterable of chat model names.
        embed_models: iterable of embedding model names.

    Returns:
        A list with one ``RunCfg`` per element of the Cartesian product, in the
        same order nested ``for`` loops would produce: ``vector_dbs`` varies
        slowest and ``embed_models`` fastest. Any empty input yields ``[]``.
    """
    # Imported locally so the function does not rely on another cell's imports.
    from itertools import product

    # product() snapshots each input once, so one-shot iterators (generators)
    # are handled correctly instead of being exhausted after the first pass.
    return [
        RunCfg(*combo)
        for combo in product(vector_dbs, top_ks, filters, temps, chat_models, embed_models)
    ]

# Sweep axes: each list is one dimension of the grid, and every combination
# of their values becomes a single RunCfg.
VECTOR_DBS = ["catalog_embeddings", "fixed_window_embeddings"]  # add your 3rd
TOP_KS = [5, 10]
FILTERS = [{}]  # adjust to your payload schema,  {"protocol_tag": "bgp"}
TEMPS = [0.1]
CHAT_MODELS = ["gpt-4.1-mini"]
EMBED_MODELS = ["text-embedding-3-small"]

# Expand the axes into the concrete list of configurations to evaluate.
cfgs = make_grid(
    vector_dbs=VECTOR_DBS,
    top_ks=TOP_KS,
    filters=FILTERS,
    temps=TEMPS,
    chat_models=CHAT_MODELS,
    embed_models=EMBED_MODELS,
)
# Cell output: grid size and the first configuration as a quick check.
len(cfgs), cfgs[0]


(4,
 RunCfg(vector_db='catalog_embeddings', top_k=5, filter_fields={}, temperature=0.1, model_chat='gpt-4.1-mini', model_embed='text-embedding-3-small'))

In [42]:
import sys
from pathlib import Path

# Take the parent of the current working directory as the repository root.
# This only holds when the kernel was started from a direct subdirectory of
# the repo (e.g. repo/notebooks); any other cwd points ROOT somewhere else.
ROOT = Path.cwd().resolve().parents[0]  # repo root if cwd==repo/notebooks
# Put <repo>/src first on the import path so the local tracerag package is importable.
sys.path.insert(0, str(ROOT / "src"))

In [43]:
from tracerag.rag.naive import naive_rag, build_openai_chat_fn
from tracerag.retrieval.qdrant import (
    QdrantRetrievalConfig,
    build_openai_embedding_fn,
    build_qdrant_retriever,
)

print("Imported tracerag modules OK.")


Imported tracerag modules OK.


In [44]:
@dataclass(frozen=True)
class RunCfg:
    """One point in the RAG sweep grid; fields cannot be reassigned (frozen).

    NOTE(review): this re-declares a class that already exists earlier in the
    notebook with identical fields. Objects built before this cell runs (such
    as the ``cfgs`` grid) are instances of the earlier class object, not this
    one; attribute access works the same, but ``isinstance`` checks against
    this class would fail for them.
    """

    vector_db: str                 # we'll map this to a Qdrant collection (or backend)
    top_k: int                     # number of chunks to retrieve per query
    filter_fields: Dict[str, Any]  # e.g., {"domain":"bgp"} or {}
    temperature: float             # sampling temperature for the chat model
    model_chat: str                # chat-completion model name
    model_embed: str               # embedding model name for query vectors


In [45]:

from openai import OpenAI
from qdrant_client import QdrantClient
from qdrant_client.http import models as qmodels

from typing import Any, Dict, List, Optional, Sequence, Tuple


In [46]:
def build_qdrant_filter(filter_fields: Dict[str, Any]) -> Optional[qmodels.Filter]:
    """Translate a flat ``{payload_key: value}`` mapping into a Qdrant filter.

    Args:
        filter_fields: payload keys mapped to the exact value each must match.

    Returns:
        ``None`` when ``filter_fields`` is empty (or otherwise falsy);
        otherwise a ``qmodels.Filter`` whose ``must`` list holds one
        ``FieldCondition``/``MatchValue`` pair per entry, so all of them
        have to match.
    """
    if filter_fields:
        conditions: List[qmodels.FieldCondition] = [
            qmodels.FieldCondition(
                key=field_name,
                match=qmodels.MatchValue(value=field_value),
            )
            for field_name, field_value in filter_fields.items()
        ]
        return qmodels.Filter(must=conditions)

    # Nothing to filter on.
    return None


In [47]:
def make_retriever(
    *,
    cfg: RunCfg,
    qdrant: QdrantClient,
    openai_client: OpenAI,
):
    """Build a ``retriever(query, k)`` callable for one sweep configuration.

    Args:
        cfg: sweep point; supplies the embedding model, the collection name
            (``cfg.vector_db``), the default ``top_k`` and the filter fields.
        qdrant: client used to query the collection.
        openai_client: client used to embed queries.

    Returns:
        A two-argument callable that forwards ``query`` and ``k`` to the
        underlying Qdrant retriever, always applying the filter derived from
        ``cfg.filter_fields``.
    """
    query_embedder = build_openai_embedding_fn(openai_client, model=cfg.model_embed)
    payload_filter = build_qdrant_filter(cfg.filter_fields)

    base_retriever = build_qdrant_retriever(
        qdrant=qdrant,
        embedding_fn=query_embedder,
        config=QdrantRetrievalConfig(
            collection_name=cfg.vector_db,  # vector_db doubles as the collection name
            top_k=cfg.top_k,
            query_filter=payload_filter,
        ),
    )

    def retrieve(query, k):
        # The filter is fixed per configuration; only query and depth vary per call.
        return base_retriever(query, top_k=k, query_filter=payload_filter)

    return retrieve


def make_chat_fn(cfg: RunCfg, openai_client: OpenAI):
    """Return the chat callable for ``cfg``'s chat model and temperature.

    Thin wrapper around ``build_openai_chat_fn``: the client is passed
    positionally, the model and temperature come from the configuration.
    """
    chat_settings = {
        "model": cfg.model_chat,
        "temperature": cfg.temperature,
    }
    return build_openai_chat_fn(openai_client, **chat_settings)


In [48]:
# System prompt shared by every RAG call; the path is relative to the
# notebook's working directory.
SYSTEM_PROMPT_PATH = Path("../data/iosxr_prompt.txt")  # <-- your file
with SYSTEM_PROMPT_PATH.open("r", encoding="utf-8") as _prompt_file:
    SYSTEM_PROMPT = _prompt_file.read()

# Show the size and the first 300 characters as a quick sanity check.
print("System prompt length:", len(SYSTEM_PROMPT))
print(SYSTEM_PROMPT[:300])


System prompt length: 1573
You are a Cisco IOS XR network engineer generating IOS XR 7.x model-driven telemetry configuration.

INPUTS:
- USER_REQUEST: describes the intent (e.g., “BGP telemetry”), destination IP/port, and optionally interval.
- CONTEXT: a list of valid YANG sensor-path candidates.

HARD RULES:
- Output ONLY 


In [49]:
def rag_predict_one(prompt: str, cfg: RunCfg, qdrant, openai_client) -> str:
    """Run the naive RAG pipeline for one prompt and return the answer text.

    A retriever and a chat function are built from ``cfg`` on every call and
    passed to ``naive_rag`` together with the module-level ``SYSTEM_PROMPT``.

    Args:
        prompt: the user request to answer.
        cfg: sweep configuration that selects models, collection and ``top_k``.
        qdrant: Qdrant client handed to the retriever.
        openai_client: OpenAI client used for embedding and chat.

    Returns:
        The ``answer`` attribute of the ``naive_rag`` response.
    """
    rag_kwargs = dict(
        user_query=prompt,
        retriever=make_retriever(cfg=cfg, qdrant=qdrant, openai_client=openai_client),
        chat_fn=make_chat_fn(cfg, openai_client=openai_client),
        system_prompt=SYSTEM_PROMPT,
        top_k=cfg.top_k,
        answer_instruction="Return only IOS XR telemetry configuration.",
    )
    return naive_rag(**rag_kwargs).answer


In [50]:
def build_eval_data(dataset_rows, cfg: RunCfg, qdrant, openai_client, max_examples=None):
    """Generate a candidate answer per dataset row and shape it for evaluation.

    Args:
        dataset_rows: indexable sequence of dicts with ``"prompt"`` and
            ``"completion"`` keys.
        cfg: sweep configuration used to produce the candidates.
        qdrant: Qdrant client forwarded to ``rag_predict_one``.
        openai_client: OpenAI client forwarded to ``rag_predict_one``.
        max_examples: optional cap on how many leading rows are processed;
            ``None`` processes all rows.

    Returns:
        A list of dicts with ``"inputs"``, ``"outputs"`` and ``"expectations"``
        keys, one per processed row, in dataset order.
    """
    total = len(dataset_rows)
    limit = total if max_examples is None else min(total, max_examples)

    def _to_record(row_index):
        # Read both fields first so a malformed row fails before any model call.
        prompt = dataset_rows[row_index]["prompt"]
        reference = dataset_rows[row_index]["completion"]
        return {
            "inputs": {"prompt": prompt},
            "outputs": rag_predict_one(prompt, cfg, qdrant, openai_client),
            "expectations": {"expected_response": reference},
        }

    return [_to_record(row_index) for row_index in range(limit)]


In [51]:
import json
import mlflow

def run_one_cfg_mlflow(cfg: RunCfg, dataset_rows, qdrant, openai_client, max_examples=None):
    """Log one configuration's parameters, generate answers and judge them.

    The six sweep parameters are logged through ``mlflow.log_params`` (the
    sweep driver in this notebook calls this inside a nested MLflow run),
    candidates are produced with ``build_eval_data`` and then scored by
    ``gt_judge`` via ``mlflow.genai.evaluate``.

    Args:
        cfg: sweep configuration to evaluate.
        dataset_rows: rows with ``"prompt"`` / ``"completion"`` keys.
        qdrant: Qdrant client forwarded to the RAG pipeline.
        openai_client: OpenAI client forwarded to the RAG pipeline.
        max_examples: optional cap on the number of rows evaluated.

    Returns:
        Whatever ``mlflow.genai.evaluate`` returns, unmodified; no extra
        aggregation is computed here.
    """
    sweep_params = {
        "vector_db": cfg.vector_db,
        "top_k": cfg.top_k,
        # Serialised with sorted keys so equal filters always log the same string.
        "filter_fields": json.dumps(cfg.filter_fields, sort_keys=True),
        "temperature": cfg.temperature,
        "model_chat": cfg.model_chat,
        "model_embed": cfg.model_embed,
    }
    mlflow.log_params(sweep_params)

    eval_rows = build_eval_data(
        dataset_rows, cfg, qdrant, openai_client, max_examples=max_examples
    )

    # Score every generated answer with the LLM judge.
    return mlflow.genai.evaluate(
        data=eval_rows,
        scorers=[gt_judge],
    )


In [52]:
# JSONL file with one {"prompt": ..., "completion": ...} object per line;
# the path is relative to the notebook's working directory.
DATASET_PATH = Path("../data/judge_dataset.jsonl")  # <-- change if needed

def load_jsonl(path: Path) -> List[Dict[str, Any]]:
    """Read a JSON Lines file into a list of parsed objects.

    Args:
        path: location of the file; a ``Path`` or a plain string path.

    Returns:
        One parsed JSON value per non-blank line, in file order. Blank and
        whitespace-only lines are skipped.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if a line is not valid JSON; the message names
            the file and the 1-based line number of the offending line.
    """
    rows: List[Dict[str, Any]] = []
    # Path(...) lets callers pass a str as well as a Path.
    with Path(path).open("r", encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                continue
            try:
                rows.append(json.loads(line))
            except json.JSONDecodeError as exc:
                # Same exception type, but with enough context to find the bad row.
                raise json.JSONDecodeError(
                    f"{path} (file line {line_no}): {exc.msg}", exc.doc, exc.pos
                ) from exc
    return rows

# Load the judge dataset and print a short preview of the first row.
# The previews index dataset[0], so an empty file fails here with IndexError.
dataset = load_jsonl(DATASET_PATH)
print("Loaded rows:", len(dataset))
print("Example keys:", dataset[0].keys())
# Only the first 200 characters of each field are shown to keep the output short.
print("Prompt preview:\n", dataset[0]["prompt"][:200])
print("Completion preview:\n", dataset[0]["completion"][:200])


Loaded rows: 10
Example keys: dict_keys(['prompt', 'completion'])
Prompt preview:
 Generate Cisco IOS XR 7.0.1 telemetry configuration to monitor L2VPN xconnect/pseudowire operational state. Use gRPC with no TLS. Telemetry server address is 192.0.2.110 with port 57500. Choose releva
Completion preview:
 telemetry model-driven
 sensor-group L2VPN-XCONNECT-OPER
 sensor-path Cisco-IOS-XR-l2vpn-oper:l2vpnv2/active/xconnects
 sensor-path Cisco-IOS-XR-l2vpn-oper:l2vpnv2/active/pseudowires
 sensor-path Cisc


In [53]:
import random

# Fixed seed so the same subset (and order) is drawn on a fresh kernel run.
random.seed(42)
# Cap the sample size at the number of loaded rows: random.sample raises
# ValueError when asked for more items than the population holds.
# NOTE: this rebinds `dataset`, so re-running the cell samples from the
# already-shuffled list and yields a different order than the first run.
dataset = random.sample(dataset, min(10, len(dataset)))

In [54]:
# Display the sampled rows in full for a visual check.
dataset

[{'prompt': 'Create telemetry model-driven configuration for Cisco IOS XR 7.0.1 to stream routing information base (RIB) summary and route counters (IPv4/IPv6). Use gRPC with no TLS. Telemetry server is 192.0.2.111 port 57500. Choose relevant sensor-paths.',
  'completion': 'telemetry model-driven\n sensor-group RIB-OPER\n sensor-path Cisco-IOS-XR-rib-oper:rib/vrfs/vrf/afi-safi/ipv4-unicast\n sensor-path Cisco-IOS-XR-rib-oper:rib/vrfs/vrf/afi-safi/ipv6-unicast\n sensor-path Cisco-IOS-XR-rib-oper:rib/summary\n !\n destination-group DG-RIB\n address-family ipv4\n destination 192.0.2.111\n port 57500\n encoding self-describing-gpb\n protocol grpc no-tls\n !\n subscription SUB-RIB\n sensor-group-id RIB-OPER sample-interval 60000\n destination-group-id DG-RIB\n !\n!'},
 {'prompt': 'Generate Cisco IOS XR 7.0.1 telemetry configuration to monitor L2VPN xconnect/pseudowire operational state. Use gRPC with no TLS. Telemetry server address is 192.0.2.110 with port 57500. Choose relevant sensor-pa

In [55]:
from qdrant_client import QdrantClient
from openai import OpenAI

# Store runs in <parent of cwd>/mlruns. Path.as_uri() yields a well-formed,
# percent-encoded file URI on every platform, whereas "file://" + path is
# malformed for Windows drive-letter paths.
mlflow.set_tracking_uri((Path.cwd() / "../mlruns").resolve().as_uri())
mlflow.set_experiment("xr_rag_variations_judged")

# Local Qdrant instance and an OpenAI client (reads OPENAI_API_KEY from the environment).
qdrant = QdrantClient(host="localhost", port=6333)
openai_client = OpenAI()

# Evaluation result of every configuration, keyed by its run name, so the
# scores can be compared after the sweep instead of only keeping the last one.
sweep_results = {}

# One parent run groups the sweep; each configuration gets its own nested run.
with mlflow.start_run(run_name="parent_sweep") as parent:
    for cfg in cfgs:
        run_name = f"{cfg.vector_db}|k={cfg.top_k}|t={cfg.temperature}|chat={cfg.model_chat}|emb={cfg.model_embed}|f={cfg.filter_fields}"
        with mlflow.start_run(run_name=run_name, nested=True):
            results = run_one_cfg_mlflow(
                cfg=cfg,
                dataset_rows=dataset,
                qdrant=qdrant,
                openai_client=openai_client,
                max_examples=50,   # start small for iteration; remove later
            )
            sweep_results[run_name] = results
            print("DONE:", run_name)


Evaluating: 100%|██████████| 10/10 [Elapsed: 00:00, Remaining: 00:00] 



✨ Evaluation completed.

Metrics and evaluation results are logged to the MLflow run:
  Run name: [94mcatalog_embeddings|k=5|t=0.1|chat=gpt-4.1-mini|emb=text-embedding-3-small|f={}[0m
  Run ID: [94m290b9c4438ea4f31916d95a4a9834a79[0m

To view the detailed evaluation results with sample-wise scores,
open the [93m[1mTraces[0m tab in the Run page in the MLflow UI.

DONE: catalog_embeddings|k=5|t=0.1|chat=gpt-4.1-mini|emb=text-embedding-3-small|f={}


Evaluating: 100%|██████████| 10/10 [Elapsed: 00:00, Remaining: 00:00] 



✨ Evaluation completed.

Metrics and evaluation results are logged to the MLflow run:
  Run name: [94mcatalog_embeddings|k=10|t=0.1|chat=gpt-4.1-mini|emb=text-embedding-3-small|f={}[0m
  Run ID: [94mea0571e2c29f4862b1cfea04aa7516a9[0m

To view the detailed evaluation results with sample-wise scores,
open the [93m[1mTraces[0m tab in the Run page in the MLflow UI.

DONE: catalog_embeddings|k=10|t=0.1|chat=gpt-4.1-mini|emb=text-embedding-3-small|f={}


Evaluating: 100%|██████████| 10/10 [Elapsed: 00:00, Remaining: 00:00] 



✨ Evaluation completed.

Metrics and evaluation results are logged to the MLflow run:
  Run name: [94mfixed_window_embeddings|k=5|t=0.1|chat=gpt-4.1-mini|emb=text-embedding-3-small|f={}[0m
  Run ID: [94m9281d1d9c1ea46efbf3c8eefaba91f08[0m

To view the detailed evaluation results with sample-wise scores,
open the [93m[1mTraces[0m tab in the Run page in the MLflow UI.

DONE: fixed_window_embeddings|k=5|t=0.1|chat=gpt-4.1-mini|emb=text-embedding-3-small|f={}


Evaluating: 100%|██████████| 10/10 [Elapsed: 00:00, Remaining: 00:00] 



✨ Evaluation completed.

Metrics and evaluation results are logged to the MLflow run:
  Run name: [94mfixed_window_embeddings|k=10|t=0.1|chat=gpt-4.1-mini|emb=text-embedding-3-small|f={}[0m
  Run ID: [94m902c62ed69144bef8d01d4e7ae7d486d[0m

To view the detailed evaluation results with sample-wise scores,
open the [93m[1mTraces[0m tab in the Run page in the MLflow UI.

DONE: fixed_window_embeddings|k=10|t=0.1|chat=gpt-4.1-mini|emb=text-embedding-3-small|f={}


In [56]:
# from trl.experimental.judges import HfPairwiseJudge

# judge = HfPairwiseJudge()
# judge.judge(
#     prompts=["What is the capital of France?", "What is the biggest planet in the solar system?"],
#     completions=[["Paris", "Lyon"], ["Saturn", "Jupiter"]],
# )  # Outputs: [0, 1]

# mlflow ui --backend-store-uri file:./mlruns
