In [1]:
# Cell 1 — MLflow tracking + judge (same idea as your RAG code)

import os
from pathlib import Path
import mlflow
from mlflow.genai import make_judge

# Sanity-check that the OpenAI credential is visible to this kernel.
# Only its presence, its length and a short leading slice are echoed.
key = os.getenv("OPENAI_API_KEY")
print("OPENAI_API_KEY is set:", bool(key))
if key:
    print("Length:", len(key))
    print("Prefix:", key[:7] + "...")
else:
    print("Length:", None)
    print("Prefix:", None)

# Keep MLflow runs in an `mlruns` directory under the current working directory.
TRACKING_DIR = (Path.cwd() / "mlruns").resolve()
# `Path.as_uri()` produces a well-formed `file://` URI on every platform
# (drive letters, backslashes and characters needing percent-escapes included),
# which plain string concatenation does not guarantee.
mlflow.set_tracking_uri(TRACKING_DIR.as_uri())

EXPERIMENT_NAME = "xr_hf_llm_judge_sweep"
mlflow.set_experiment(EXPERIMENT_NAME)

print("Tracking:", mlflow.get_tracking_uri())
print("Experiment:", EXPERIMENT_NAME)

JUDGE_NAME = "xr_config_quality_score"

# Grading prompt handed to the judge. The `{{ inputs }}`, `{{ outputs }}` and
# `{{ expectations }}` placeholders are left verbatim in the template text.
JUDGE_INSTRUCTIONS = (
    "You are grading an IOS XR telemetry configuration.\n\n"
    "User request (inputs): {{ inputs }}\n\n"
    "Candidate config (outputs): {{ outputs }}\n\n"
    "Reference acceptable config (expectations): {{ expectations }}\n\n"
    "Score quality from 0.0 to 1.0.\n"
    "Hard requirements (must match): IP, port, transport (grpc no-tls), encoding.\n"
    "Be lenient about names, ordering, and sample-interval unless requested.\n"
    "Penalize only if sensor-paths are clearly unrelated.\n"
    "Do NOT output telemetry config. Do NOT output code. Keep any explanation extremely short."
)

# Judge that returns a float score; temperature 0 and a 300-token cap are
# passed through as inference parameters.
JUDGE_INFERENCE_PARAMS = {"temperature": 0, "max_tokens": 300}

gt_judge = make_judge(
    name=JUDGE_NAME,
    instructions=JUDGE_INSTRUCTIONS,
    feedback_value_type=float,
    model="openai:/gpt-4.1-mini",
    inference_params=JUDGE_INFERENCE_PARAMS,
)


OPENAI_API_KEY is set: True
Length: 164
Prefix: sk-proj...
Tracking: file:///home/musel/Documents/github/TRACE/notebooks/mlruns
Experiment: xr_hf_llm_judge_sweep


  return FileStore(store_uri, store_uri)


In [2]:
# Cell 2 — load your dataset JSONL (prompt + completion)

import json
from typing import Any, Dict, List
from pathlib import Path

def load_jsonl(path: Path) -> List[Dict[str, Any]]:
    """Read a JSON Lines file into a list of parsed records.

    Args:
        path: Location of the file. A ``Path`` is expected, but a plain string
            (or any ``os.PathLike``) is accepted as well.

    Returns:
        One parsed JSON value per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON. The message
            is prefixed with ``<file>:<line number>`` so the offending row can
            be located.
        OSError: If the file cannot be opened.
    """
    source = Path(path)
    rows: List[Dict[str, Any]] = []
    # "utf-8-sig" reads plain UTF-8 unchanged and additionally drops a leading
    # byte-order mark, which would otherwise make the first line unparsable.
    with source.open("r", encoding="utf-8-sig") as f:
        for lineno, line in enumerate(f, start=1):
            s = line.strip()
            if not s:
                continue  # tolerate blank / whitespace-only lines
            try:
                rows.append(json.loads(s))
            except json.JSONDecodeError as exc:
                # Re-raise the same exception type, enriched with file and line.
                raise json.JSONDecodeError(
                    f"{source}:{lineno}: {exc.msg}", exc.doc, exc.pos
                ) from exc
    return rows

DATASET_PATH = Path("../data/judge_dataset.jsonl")  # adjust
dataset_rows = load_jsonl(DATASET_PATH)
print("Loaded:", len(dataset_rows), "rows")
# Peek at the first record's keys; guarded so an empty file is reported
# plainly instead of failing with IndexError.
if dataset_rows:
    print("Keys:", dataset_rows[0].keys())
else:
    print("Keys: <none - dataset is empty>")


Loaded: 100 rows
Keys: dict_keys(['prompt', 'completion'])


In [3]:
# Cell 3 — system prompt variants (sweep these)

BASE_SYSTEM_PROMPT = ""
PROMPT_ENGINEERING_PATH = Path("../data/iosxr_prompt.txt")  # adjust

# Optional extra system-prompt text. `is_file()` (rather than `exists()`) keeps
# a directory with that name from being passed to `read_text`.
PROMPT_ENGINEERING = (
    PROMPT_ENGINEERING_PATH.read_text(encoding="utf-8")
    if PROMPT_ENGINEERING_PATH.is_file()
    else ""
)

STRICT_SYSTEM_PROMPT = (BASE_SYSTEM_PROMPT + "\n" + PROMPT_ENGINEERING).strip()
if STRICT_SYSTEM_PROMPT == BASE_SYSTEM_PROMPT.strip():
    # Without the extra text both variants send the same system prompt, so a
    # sweep over them would compare a configuration against itself.
    print(
        f"WARNING: no prompt text loaded from {PROMPT_ENGINEERING_PATH}; "
        "the 'strict' variant is identical to 'base'."
    )

# (variant name, system prompt) pairs to sweep over.
PROMPT_VARIANTS = [
    ("base", BASE_SYSTEM_PROMPT),
    ("strict", STRICT_SYSTEM_PROMPT),
]

# Appended to every user prompt by the generation helper.
ANSWER_INSTRUCTION = "Return only IOS XR telemetry configuration. No markdown. No explanation."


In [4]:
# Cell 4 (REPLACE) — PEFT adapter loading + generation (with caching)

import json
from typing import Any, Dict, Tuple
import torch

from huggingface_hub import hf_hub_download
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

# Module-level cache of loaded adapters, filled by `load_hf_adapter_model` so
# repeated calls do not reload weights. Values are (tokenizer, model, base_id).
# NOTE(review): a later cell assigns a fresh dict to this same name (annotated
# there with (tokenizer, model) values), which discards whatever is stored here.
_MODEL_CACHE: Dict[str, Tuple[Any, Any, str]] = {}

def read_adapter_config(adapter_id: str) -> Dict[str, Any]:
    """Fetch `adapter_config.json` from the given Hub repo and return it parsed.

    Args:
        adapter_id: Repo id passed to `hf_hub_download` as `repo_id`.

    Returns:
        The decoded JSON content of the downloaded file.
    """
    config_path = hf_hub_download(repo_id=adapter_id, filename="adapter_config.json")
    with open(config_path, "r", encoding="utf-8") as handle:
        raw = handle.read()
    return json.loads(raw)

def load_hf_adapter_model(
    adapter_id: str,
    *,
    use_4bit: bool = False,
) -> Tuple[Any, Any, str]:
    """Load a PEFT adapter repo together with its base model (cached).

    Steps:
      1) read adapter_config.json for `base_model_name_or_path`
      2) load the tokenizer (adapter repo first, borrowing the base model's
         chat template when the adapter tokenizer has none)
      3) load the base model and attach the adapter via PeftModel.from_pretrained

    Args:
        adapter_id: Hub repo id of the adapter.
        use_4bit: Load the BASE model with 4-bit quantization (requires
            bitsandbytes).

    Returns:
        `(tokenizer, model, base_id)`; the model has been switched to eval mode.

    Raises:
        KeyError: If the adapter config has no `base_model_name_or_path`.
    """
    # The key carries a loader prefix and the quantization flag so that
    #  - asking for 4-bit after a full-precision load (or vice versa) does not
    #    hand back the wrong variant, and
    #  - entries never clash with other loaders writing plain model ids (and
    #    differently shaped tuples) into the same `_MODEL_CACHE` dict.
    cache_key = f"peft::{adapter_id}::4bit={bool(use_4bit)}"
    cached = _MODEL_CACHE.get(cache_key)
    if cached is not None:
        return cached

    acfg = read_adapter_config(adapter_id)
    base_id = acfg["base_model_name_or_path"]

    # Prefer adapter tokenizer (often includes custom special tokens / chat template files).
    tok = AutoTokenizer.from_pretrained(adapter_id, use_fast=True)

    # If adapter tokenizer lacks chat template but base has it, borrow it.
    if not getattr(tok, "chat_template", None):
        base_tok = AutoTokenizer.from_pretrained(base_id, use_fast=True)
        if getattr(base_tok, "chat_template", None):
            tok.chat_template = base_tok.chat_template

    base_kwargs: Dict[str, Any] = dict(
        device_map="auto",
        torch_dtype="auto",
    )
    if use_4bit:
        # Requires bitsandbytes. Quantization is requested through a
        # BitsAndBytesConfig passed as `quantization_config`, the supported
        # route, instead of the bare `load_in_4bit=True` keyword.
        from transformers import BitsAndBytesConfig

        base_kwargs["quantization_config"] = BitsAndBytesConfig(load_in_4bit=True)

    base = AutoModelForCausalLM.from_pretrained(base_id, **base_kwargs)
    model = PeftModel.from_pretrained(base, adapter_id)  # attach LoRA
    model.eval()

    # Safety: ensure pad token exists
    if tok.pad_token_id is None and tok.eos_token_id is not None:
        tok.pad_token = tok.eos_token

    entry = (tok, model, base_id)
    _MODEL_CACHE[cache_key] = entry
    return entry


def format_prompt(tokenizer, system_prompt: str, user_prompt: str) -> str:
    """Render a system + user prompt as one string ready for tokenization.

    When the tokenizer has both an `apply_chat_template` attribute and a
    non-empty `chat_template`, that template is used (untokenized, with the
    generation prompt appended). Otherwise a plain SYSTEM/USER/ASSISTANT layout
    is produced. An empty or missing system prompt is left out in both cases.
    """
    system_text = (system_prompt or "").strip()
    user_text = user_prompt.strip()

    has_template = hasattr(tokenizer, "apply_chat_template") and getattr(tokenizer, "chat_template", None)
    if not has_template:
        # Plain-text fallback layout.
        system_block = f"SYSTEM:\n{system_text}\n\n" if system_text else ""
        return f"{system_block}USER:\n{user_text}\n\nASSISTANT:\n"

    conversation = [{"role": "user", "content": user_text}]
    if system_text:
        conversation.insert(0, {"role": "system", "content": system_text})
    return tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)


def generation_kwargs_from_temp(temp: float) -> Dict[str, Any]:
    """Map a temperature to `generate` keyword arguments.

    A temperature of zero or below selects greedy decoding (`do_sample=False`);
    anything else enables sampling with that temperature and `top_p=0.95`.
    """
    greedy = temp <= 0.0
    if greedy:
        return {"do_sample": False}
    return {"do_sample": True, "temperature": float(temp), "top_p": 0.95}


@torch.inference_mode()
def hf_predict_one(
    *,
    model_id: str,            # <- THIS IS YOUR ADAPTER ID (pesimachete/...)
    prompt: str,
    system_prompt: str,
    temperature: float,
    max_new_tokens: int = 512,
    use_4bit: bool = False,   # <- applies to BASE model load
) -> str:
    """Generate one completion with the adapter model and return only the new text.

    Args:
        model_id: Adapter repo id, forwarded to `load_hf_adapter_model`.
        prompt: User request; `ANSWER_INSTRUCTION` is appended to it.
        system_prompt: Optional system prompt (may be empty).
        temperature: Forwarded to `generation_kwargs_from_temp`.
        max_new_tokens: Upper bound on generated tokens.
        use_4bit: Forwarded to the loader.

    Returns:
        The decoded continuation with special tokens removed and surrounding
        whitespace stripped. The prompt is never part of the result.
    """
    tok, model, _base_id = load_hf_adapter_model(model_id, use_4bit=use_4bit)

    full_user = (prompt.strip() + "\n\n" + ANSWER_INSTRUCTION).strip()
    text = format_prompt(tok, system_prompt=system_prompt, user_prompt=full_user)

    # Same condition `format_prompt` uses to choose the chat template. Templated
    # text already contains the special tokens, so the tokenizer must not add
    # them a second time (which would e.g. duplicate the BOS token).
    uses_chat_template = bool(
        hasattr(tok, "apply_chat_template") and getattr(tok, "chat_template", None)
    )
    inputs = tok(text, return_tensors="pt", add_special_tokens=not uses_chat_template)
    dev = next(model.parameters()).device
    inputs = {k: v.to(dev) for k, v in inputs.items()}
    prompt_len = inputs["input_ids"].shape[-1]

    gen_kwargs = dict(
        max_new_tokens=max_new_tokens,
        eos_token_id=tok.eos_token_id,
        pad_token_id=tok.eos_token_id,
        **generation_kwargs_from_temp(temperature),
    )

    out = model.generate(**inputs, **gen_kwargs)

    # The returned sequence starts with the prompt tokens. Slice them off by
    # length instead of comparing decoded strings: once special tokens are
    # skipped the decoded text no longer starts with `text`, so a string-prefix
    # check would leave the whole prompt inside the returned candidate.
    completion_ids = out[0][prompt_len:]
    return tok.decode(completion_ids, skip_special_tokens=True).strip()


  from .autonotebook import tqdm as notebook_tqdm


In [5]:
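# Cell 4 (non-PEFT variant) — Hugging Face model loading + generation (with caching)
# NOTE: this cell re-defines `_MODEL_CACHE`, `format_prompt`, `generation_kwargs_from_temp` and
# `hf_predict_one` from the PEFT cell above; when both cells run top to bottom, the definitions
# in this cell are the ones in effect.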

from dataclasses import dataclass
from typing import Optional, Tuple, Any, Dict
import torch

from transformers import AutoTokenizer, AutoModelForCausalLM

# Cache to avoid re-loading per run; filled by `load_hf_model`, values are (tokenizer, model).
# NOTE(review): this assignment rebinds a name that the PEFT cell above also defines
# (there with (tokenizer, model, base_id) values), so running this cell empties that cache.
_MODEL_CACHE: Dict[str, Tuple[Any, Any]] = {}

def load_hf_model(
    model_id: str,
    *,
    use_4bit: bool = False,
) -> Tuple[Any, Any]:
    """Load `(tokenizer, model)` for a Hub model id, reusing a global cache.

    Args:
        model_id: Repo id handed to `AutoTokenizer` / `AutoModelForCausalLM`.
        use_4bit: Load with 4-bit quantization (requires bitsandbytes).

    Returns:
        `(tokenizer, model)`; the model has been switched to eval mode.
    """
    # The key carries a loader prefix and the quantization flag so that
    #  - a 4-bit request is not served by a previously cached full-precision
    #    model (or vice versa), and
    #  - entries never clash with other loaders that store differently shaped
    #    tuples in the same `_MODEL_CACHE` dict.
    cache_key = f"full::{model_id}::4bit={bool(use_4bit)}"
    cached = _MODEL_CACHE.get(cache_key)
    if cached is not None:
        return cached

    tok = AutoTokenizer.from_pretrained(model_id, use_fast=True)

    model_kwargs: Dict[str, Any] = dict(
        torch_dtype="auto",
        device_map="auto",
    )
    if use_4bit:
        # Requires bitsandbytes installed. Quantization is requested through a
        # BitsAndBytesConfig passed as `quantization_config`, the supported
        # route, instead of the bare `load_in_4bit=True` keyword.
        from transformers import BitsAndBytesConfig

        model_kwargs["quantization_config"] = BitsAndBytesConfig(load_in_4bit=True)

    model = AutoModelForCausalLM.from_pretrained(model_id, **model_kwargs)
    model.eval()

    entry = (tok, model)
    _MODEL_CACHE[cache_key] = entry
    return entry


def format_prompt(tokenizer, system_prompt: str, user_prompt: str) -> str:
    """
    Build the text fed to the model.

    Uses the tokenizer's chat template when one is defined; otherwise emits a
    simple SYSTEM / USER / ASSISTANT layout. A blank system prompt is omitted.
    """
    turns = (
        ("system", (system_prompt or "").strip()),
        ("user", user_prompt.strip()),
    )

    if hasattr(tokenizer, "apply_chat_template") and getattr(tokenizer, "chat_template", None):
        # The user turn is always sent; the system turn only when non-empty.
        messages = [
            {"role": role, "content": content}
            for role, content in turns
            if role == "user" or content
        ]
        return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # No chat template available: plain structured prompt.
    (_, sys_txt), (_, usr_txt) = turns
    pieces = []
    if sys_txt:
        pieces.append(f"SYSTEM:\n{sys_txt}\n\n")
    pieces.append(f"USER:\n{usr_txt}\n\nASSISTANT:\n")
    return "".join(pieces)


def generation_kwargs_from_temp(temp: float) -> Dict[str, Any]:
    """
    Translate a temperature into decoding options.

    temp <= 0  => greedy decoding (no sampling)
    otherwise  => sampling at that temperature with top_p=0.95
    """
    return (
        dict(do_sample=False)
        if temp <= 0.0
        else dict(do_sample=True, temperature=float(temp), top_p=0.95)
    )


@torch.inference_mode()
def hf_predict_one(
    *,
    model_id: str,
    prompt: str,
    system_prompt: str,
    temperature: float,
    max_new_tokens: int = 512,
    use_4bit: bool = False,
) -> str:
    """Generate one completion with `load_hf_model` and return only the new text.

    NOTE(review): an earlier cell defines a function with this same name that
    loads the model through `load_hf_adapter_model`; in a top-to-bottom run this
    later definition is the one that stays bound.

    Args:
        model_id: Repo id forwarded to `load_hf_model`.
        prompt: User request; `ANSWER_INSTRUCTION` is appended to it.
        system_prompt: Optional system prompt (may be empty).
        temperature: Forwarded to `generation_kwargs_from_temp`.
        max_new_tokens: Upper bound on generated tokens.
        use_4bit: Forwarded to the loader.

    Returns:
        The decoded continuation with special tokens removed and surrounding
        whitespace stripped. The prompt is never part of the result.
    """
    tok, model = load_hf_model(model_id, use_4bit=use_4bit)

    full_user = (prompt.strip() + "\n\n" + ANSWER_INSTRUCTION).strip()
    text = format_prompt(tok, system_prompt=system_prompt, user_prompt=full_user)

    # Same condition `format_prompt` uses to choose the chat template. Templated
    # text already contains the special tokens, so the tokenizer must not add
    # them a second time (which would e.g. duplicate the BOS token).
    uses_chat_template = bool(
        hasattr(tok, "apply_chat_template") and getattr(tok, "chat_template", None)
    )
    inputs = tok(text, return_tensors="pt", add_special_tokens=not uses_chat_template)
    # move inputs to the device of the model's first parameter
    dev = next(model.parameters()).device
    inputs = {k: v.to(dev) for k, v in inputs.items()}
    prompt_len = inputs["input_ids"].shape[-1]

    gen_kwargs = {
        "max_new_tokens": max_new_tokens,
        "eos_token_id": tok.eos_token_id,
        "pad_token_id": tok.eos_token_id,
        **generation_kwargs_from_temp(temperature),
    }

    out = model.generate(**inputs, **gen_kwargs)

    # The returned sequence starts with the prompt tokens. Slice them off by
    # length instead of comparing decoded strings: once special tokens are
    # skipped the decoded text no longer starts with `text`, so a string-prefix
    # check would leave the whole prompt inside the returned candidate.
    completion_ids = out[0][prompt_len:]
    return tok.decode(completion_ids, skip_special_tokens=True).strip()


In [6]:
# Cell 5 — eval data builder (same shape as your RAG one)

from typing import Optional, List, Dict, Any

def build_eval_data_hf(
    *,
    dataset_rows: List[Dict[str, Any]],
    model_id: str,
    system_prompt: str,
    temperature: float,
    max_examples: Optional[int] = None,
    max_new_tokens: int = 512,
    use_4bit: bool = False,
) -> List[Dict[str, Any]]:
    """Generate a candidate for each dataset row and package it for evaluation.

    Rows are taken from the start of `dataset_rows`; at most `max_examples` of
    them when that is given. Each row must provide "prompt" and "completion".

    Returns:
        One dict per processed row with keys "inputs" (the prompt), "outputs"
        (text returned by `hf_predict_one`) and "expectations" (the row's
        completion under "expected_response").
    """
    limit = len(dataset_rows)
    if max_examples is not None:
        limit = min(limit, max_examples)

    def _evaluate(row: Dict[str, Any]) -> Dict[str, Any]:
        # Read both fields up front, then generate the candidate.
        prompt, reference = row["prompt"], row["completion"]
        generated = hf_predict_one(
            model_id=model_id,
            prompt=prompt,
            system_prompt=system_prompt,
            temperature=temperature,
            max_new_tokens=max_new_tokens,
            use_4bit=use_4bit,
        )
        return {
            "inputs": {"prompt": prompt},
            "outputs": generated,
            "expectations": {"expected_response": reference},
        }

    return [_evaluate(dataset_rows[idx]) for idx in range(limit)]


In [7]:
# Cell 6 — traces -> per-example dataframe (reuse your proven helper)

import pandas as pd
import json
from typing import Any, Dict, List

def _maybe_json(x: Any) -> Any:
    """Parse `x` as JSON when it is a string that looks like an object or array.

    Non-strings, strings without matching `{}` / `[]` delimiters, and strings
    that fail to parse are all returned unchanged.
    """
    if not isinstance(x, str):
        return x

    text = x.strip()
    looks_like_object = text.startswith("{") and text.endswith("}")
    looks_like_array = text.startswith("[") and text.endswith("]")
    if not (looks_like_object or looks_like_array):
        return x

    try:
        return json.loads(text)
    except Exception:
        # Best effort: hand back the original value when parsing fails.
        return x

def traces_to_eval_df(traces: Any, judge_name: str) -> pd.DataFrame:
    """Flatten traces into one row per example with the judge's score.

    Args:
        traces: A DataFrame, or anything `pd.DataFrame(...)` accepts. The columns
            read (when present) are `trace_id`, `request`, `response`,
            `expectations`, `assessments` and `expected_response`; JSON-looking
            strings in them are decoded first.
        judge_name: Assessment name whose feedback value becomes `score`.

    Returns:
        DataFrame with columns `trace_id`, `prompt`, `expected`, `candidate`,
        `score`, `rationale`. `score` is coerced to numeric (unparsable -> NaN).

    The reference answer is looked up, in order, in the `expectations` column,
    in `request["expectations"]`, in an `expected_response` column and finally
    in the trace's assessments.
    """
    # Accepted spellings for the reference answer.
    expected_keys = ("expected_response", "reference", "expected")

    def _pick_expected(mapping: Dict[str, Any]) -> Any:
        # First truthy value among the accepted spellings.
        return mapping.get("expected_response") or mapping.get("reference") or mapping.get("expected")

    tdf = traces if isinstance(traces, pd.DataFrame) else pd.DataFrame(traces)
    out_rows: List[Dict[str, Any]] = []

    for _, r in tdf.iterrows():
        trace_id = r.get("trace_id")

        req = _maybe_json(r.get("request"))
        resp = _maybe_json(r.get("response"))
        exps = _maybe_json(r.get("expectations"))
        assessments = _maybe_json(r.get("assessments"))

        prompt = None
        if isinstance(req, dict):
            prompt = req.get("prompt") or (req.get("inputs") or {}).get("prompt")

        expected = None
        if isinstance(exps, dict):
            expected = _pick_expected(exps)
        elif isinstance(exps, str):
            expected = exps

        if expected is None and isinstance(req, dict):
            exp_obj = req.get("expectations")
            if isinstance(exp_obj, dict):
                expected = _pick_expected(exp_obj)

        if expected is None:
            expected = r.get("expected_response")

        candidate = None
        if isinstance(resp, str):
            candidate = resp
        elif isinstance(resp, dict):
            candidate = resp.get("outputs") or resp.get("response") or resp.get("output") or resp.get("text")
            if isinstance(candidate, dict):
                candidate = candidate.get("response") or candidate.get("text") or str(candidate)

        score = None
        rationale = None
        expected_from_assessments = None
        if isinstance(assessments, list):
            judge_seen = False
            for a in assessments:
                if not isinstance(a, dict):
                    continue
                name = a.get("assessment_name") or a.get("name")
                if name == judge_name:
                    # Only the first assessment carrying the judge's name counts.
                    if not judge_seen:
                        fb = a.get("feedback") or {}
                        score = fb.get("value")
                        rationale = a.get("rationale") or a.get("explanation")
                        judge_seen = True
                elif expected_from_assessments is None and name in expected_keys:
                    # NOTE(review): assumes expectation assessments are serialized
                    # as {"expectation": {"value": ...}}, mirroring the
                    # {"feedback": {"value": ...}} layout read above - confirm
                    # against the installed MLflow version.
                    payload = a.get("expectation")
                    if isinstance(payload, dict):
                        expected_from_assessments = payload.get("value")

        # Last resort: reference answer attached to the trace as an assessment.
        # Without this the `expected` column stays empty whenever none of the
        # column-based lookups above finds a value.
        if expected is None:
            expected = expected_from_assessments

        out_rows.append({
            "trace_id": trace_id,
            "prompt": prompt,
            "expected": expected,
            "candidate": candidate,
            "score": score,
            "rationale": rationale,
        })

    df = pd.DataFrame(out_rows, columns=["trace_id","prompt","expected","candidate","score","rationale"])
    df["score"] = pd.to_numeric(df["score"], errors="coerce")
    return df


In [8]:
# Cell 7 — run one HF config in MLflow (atomic)

import tempfile
from pathlib import Path
import mlflow

def run_one_hf_cfg_mlflow(
    *,
    model_id: str,
    temperature: float,
    system_prompt_name: str,
    system_prompt: str,
    dataset_rows: List[Dict[str, Any]],
    max_examples: int = 50,
    max_new_tokens: int = 512,
    use_4bit: bool = False,
):
    """Evaluate one (model, temperature, system prompt) configuration.

    Logs to whatever MLflow run is active when called: the configuration as
    params, the judge metrics, and the per-example table as artifacts.

    Args:
        model_id: Hub id handed to the generation helper.
        temperature: Sampling temperature for generation.
        system_prompt_name: Label logged as `system_prompt_variant`.
        system_prompt: System prompt text used for generation.
        dataset_rows: Records with "prompt" / "completion".
        max_examples: Upper bound on rows evaluated.
        max_new_tokens: Generation length cap.
        use_4bit: Forwarded to the model loader.

    Returns:
        `(df, results)`: the per-example DataFrame and the object returned by
        `mlflow.genai.evaluate`.
    """
    mlflow.log_params({
        "hf_model_id": model_id,
        "temperature": float(temperature),
        "system_prompt_variant": system_prompt_name,
        "max_new_tokens": int(max_new_tokens),
        "use_4bit": bool(use_4bit),
    })

    eval_data = build_eval_data_hf(
        dataset_rows=dataset_rows,
        model_id=model_id,
        system_prompt=system_prompt,
        temperature=temperature,
        max_examples=max_examples,
        max_new_tokens=max_new_tokens,
        use_4bit=use_4bit,
    )

    results = mlflow.genai.evaluate(data=eval_data, scorers=[gt_judge])

    # Remember which run holds the traces so they can be found again later.
    eval_run_id = results.run_id
    mlflow.set_tag("eval_run_id", eval_run_id)

    traces = mlflow.search_traces(run_id=eval_run_id)
    df = traces_to_eval_df(traces, judge_name=JUDGE_NAME)

    # metrics
    # All aggregates use the scored rows only. Computing the pass rate over the
    # raw column would count unscored (NaN) rows as failures, while the mean
    # and `n_examples` ignore them - the metrics would disagree on the denominator.
    scores = df["score"].dropna()
    if not scores.empty:
        mlflow.log_metric("judge_mean", float(scores.mean()))
        mlflow.log_metric("judge_min", float(scores.min()))
        mlflow.log_metric("judge_pass_rate_ge_0.8", float((scores >= 0.8).mean()))
        mlflow.log_metric("n_examples", int(len(scores)))
    else:
        mlflow.log_metric("judge_mean", 0.0)
        mlflow.log_metric("n_examples", int(len(df)))
    # Lets a reader tell a genuine 0.0 mean apart from "nothing was scored".
    mlflow.log_metric("n_unscored", int(df["score"].isna().sum()))

    # artifacts
    with tempfile.TemporaryDirectory() as td:
        p = Path(td) / "per_example_eval.csv"
        df.to_csv(p, index=False)
        # Must be logged before the temporary directory is removed.
        mlflow.log_artifact(str(p))
        if hasattr(mlflow, "log_table"):
            mlflow.log_table(df, "per_example_eval_table.json")

    return df, results


In [9]:
# Cell 8 — sweep the two HF models × temps × system prompts

import random

# Models to sweep. Every entry ends with a comma: without it, re-enabling the
# 8B line would make Python join the two adjacent string literals into a single
# bogus model id instead of producing a two-element list.
HF_MODELS = [
    "pesimachete/cisco-iosxr-telemetry-model_3B",
    # "pesimachete/cisco-iosxr-telemetry-model_8B",
]
TEMPS = [0.0]

# Quick smoke test sample (optional); seeded so the same row is picked each time.
random.seed(42)
dataset_small = random.sample(dataset_rows, 1)

# For 8B: if you might be tight on VRAM, set use_4bit=True (and install bitsandbytes)
USE_4BIT_FOR_8B = False

# One parent run for the whole sweep, one nested child run per
# (prompt variant, model, temperature) combination.
with mlflow.start_run(run_name="parent_sweep_hf") as parent:
    for (pname, psys) in PROMPT_VARIANTS:
        for model_id in HF_MODELS:
            for t in TEMPS:
                # 4-bit loading only applies to model ids ending in "_8B".
                use_4bit = (USE_4BIT_FOR_8B and model_id.endswith("_8B"))

                run_name = f"{pname}|{model_id.split('/')[-1]}|t={t}|4bit={use_4bit}"
                with mlflow.start_run(run_name=run_name, nested=True):
                    df, _ = run_one_hf_cfg_mlflow(
                        model_id=model_id,
                        temperature=t,
                        system_prompt_name=pname,
                        system_prompt=psys,
                        dataset_rows=dataset_small,   # change to dataset_rows when ready
                        max_examples=50,              # keep aligned with dataset_rows passed
                        max_new_tokens=512,
                        use_4bit=use_4bit,
                    )
                    print("DONE:", run_name, "mean=", df["score"].mean())


`torch_dtype` is deprecated! Use `dtype` instead!
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


KeyboardInterrupt: 

In [None]:
# Cell 9 — collect per-example results from all child runs into one CSV (like your RAG version)

import pandas as pd
import mlflow

exp = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if exp is None:
    # Fail with an explicit message instead of an AttributeError on `exp.experiment_id`.
    raise RuntimeError(
        f"MLflow experiment {EXPERIMENT_NAME!r} was not found at {mlflow.get_tracking_uri()}"
    )
runs_df = mlflow.search_runs(experiment_ids=[exp.experiment_id], output_format="pandas")

# Everything except the sweep's parent run is treated as a child run. The
# run-name column is checked first so a result without it does not raise KeyError.
if "tags.mlflow.runName" in runs_df.columns:
    child = runs_df[runs_df["tags.mlflow.runName"] != "parent_sweep_hf"].copy()
else:
    child = runs_df.copy()

all_dfs = []
for _, rr in child.iterrows():
    # Prefer the run id recorded in the `eval_run_id` tag; fall back to the
    # child run's own id when the tag is absent.
    eval_run_id = rr.get("tags.eval_run_id")
    if pd.isna(eval_run_id) or eval_run_id is None:
        eval_run_id = rr["run_id"]

    traces = mlflow.search_traces(run_id=str(eval_run_id))
    df = traces_to_eval_df(traces, judge_name=JUDGE_NAME)

    # Attach the run's configuration to every per-example row.
    df["run_id"] = rr["run_id"]
    df["run_name"] = rr.get("tags.mlflow.runName")
    df["hf_model_id"] = rr.get("params.hf_model_id")
    df["temperature"] = rr.get("params.temperature")
    df["system_prompt_variant"] = rr.get("params.system_prompt_variant")
    df["use_4bit"] = rr.get("params.use_4bit")

    all_dfs.append(df)

per_example_all = pd.concat(all_dfs, ignore_index=True) if all_dfs else pd.DataFrame()
per_example_all.to_csv("per_example_all_hf_runs_from_traces.csv", index=False)
print("Wrote per_example_all_hf_runs_from_traces.csv rows:", len(per_example_all))


Wrote per_example_all_hf_runs_from_traces.csv rows: 0
