In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from __future__ import annotations

import datetime
import os
import pathlib
from typing import Any, Dict, List
import sys
from dotenv import load_dotenv

from openai import AsyncOpenAI
from openai.types.graders import PythonGraderParam, ScoreModelGrader, MultiGraderParam

In [None]:
HERE = pathlib.Path().resolve()

# ---------------------------------------------------------------------------
# Locate the shared root: the working directory itself, or its nearest
# ancestor, that contains a `utils` entry. `Path.parents` ends at the
# filesystem root, so the search covers the same candidates as a manual climb.
# ---------------------------------------------------------------------------
ROOT = next(
    (candidate for candidate in (HERE, *HERE.parents) if (candidate / "utils").exists()),
    None,
)

if ROOT is None:
    raise RuntimeError(
        f"Could not find 'utils' directory above {HERE}. "
        "Check your project structure or adjust the path resolution logic."
    )

# Report what actually happened: on a re-run the entry is already present and
# nothing is inserted, so do not claim that it was added.
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))
    print(f"✅ Added to sys.path: {ROOT}")
else:
    print(f"✅ Already on sys.path: {ROOT}")

# ---------------------------------------------------------------------------
# Project name: the name of the directory one level above the working directory
# e.g., .../projects/<project>/notebooks/... -> <project>
# ---------------------------------------------------------------------------
_project_dir = HERE.parent
project_name = _project_dir.name
# Only fill in PROJECT when the environment does not already define it.
if "PROJECT" not in os.environ:
    os.environ["PROJECT"] = project_name
print(f"✅ Project name set to: {project_name}")

# ---------------------------------------------------------------------------
# Project-specific environment variables live in <project>/.env (optional).
# Values from the file take precedence over already-exported ones.
# ---------------------------------------------------------------------------
env_path = _project_dir / ".env"
if not env_path.exists():
    print("⚠️ No .env file found, relying on existing environment variables.")
else:
    load_dotenv(env_path, override=True)
    print(f"✅ Loaded .env from: {env_path}")

# ---------------------------------------------------------------------------
# Fail fast when no (non-empty) OpenAI API key is configured
# ---------------------------------------------------------------------------
api_key = os.environ.get("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError(
        "OPENAI_API_KEY not found. Add it to your .env file or export it before running."
    )
print("✅ OpenAI API key detected.")

In [None]:
# ---------------------------------------------------------------------------
# Project Imports (now should work everywhere)
# ---------------------------------------------------------------------------
from utils import (
    infer_item_schema,
    build_data_source,
    wait_until_finished,
    fetch_all_output_items,
    extract_items,
    save_run,
    save_grader,
    RunRecord,
    load_prompt,
    get_or_upload_file,
)
from utils.project_paths import datasets_root, project_root
from utils.plot_eval_runs import (
    load_scores_by_item,
    compute_score_stats,
    plot_score_stats,
    plot_score_stats_plotly,
)

# Put the project root (as reported by `project_root()`) at the end of
# sys.path, unless it is already listed, so `structured_outputs` can be imported.
_cust_root = project_root()
if sys.path.count(str(_cust_root)) == 0:
    sys.path.append(str(_cust_root))


print("✅ Utils imported successfully.")


In [None]:
# Configuration
# ---------------------------------------------------------------------------
# Dataset auto-discovery: pick the first *_{SPLIT}.jsonl file (in sorted
# order) directly under the directory returned by `datasets_root()`.
# ---------------------------------------------------------------------------

# Select data split ---------------------------------------------------------
SPLIT = "train"  # choose "train", "val", "test" etc.

# Sort the matches: glob yields entries in filesystem-dependent order, so
# without sorting the chosen dataset could vary between machines when several
# files match the pattern.
_dataset_pattern = f"*_{SPLIT}.jsonl"
_dataset_candidates = sorted(datasets_root().glob(_dataset_pattern))
if not _dataset_candidates:
    # Name the pattern that was actually searched for, not a hard-coded split.
    raise FileNotFoundError(f"No {_dataset_pattern} dataset found in data/ folder")
DATA_PATH = _dataset_candidates[0]

DATASET_NAME = project_name  # use folder name as dataset identifier


In [None]:
# Response JSON schema -------------------------------------------------------
from pprint import pprint

# NOTE(review): `openai.lib._pydantic` is an underscore-prefixed (private by
# convention) module; this import may need adjusting across SDK versions.
from openai.lib._pydantic import to_strict_json_schema
from structured_outputs.base_models import Level1Codes

# Strict JSON schema derived from the Level1Codes model, wrapped in the
# `json_schema` response-format envelope used for the model's text output.
schema = to_strict_json_schema(Level1Codes)
RESPONSE_FORMAT: Dict[str, Any] = dict(
    type="json_schema",
    name=Level1Codes.__name__,
    schema=schema,
    strict=True,
)

# A small hand-written instance, shown so the expected output shape is visible.
example = Level1Codes(
    level1=[
        {"code": label}
        for label in ("environment", "agriculture, forestry and fisheries")
    ]
)

_rule = "-" * 40
print("Example Level1Codes output:\n" + _rule)
pprint(example.model_dump(), sort_dicts=False)
print(_rule)

In [None]:
# Grader --------------------------------------------------------------------
# ---------------------------------------------------------------------------
# Precision, Recall, and F1 graders (F1 both as Python & Multi)

# -- Grader source strings --------------------------------------------------

# Precision = |predicted ∩ reference| / |predicted|; 0.0 for malformed or empty predictions.
precision_grader_source = """
def grade(sample, item) -> float:
    # Guard against a missing, None, or non-dict `output_json` instead of
    # raising AttributeError on `.get`.
    output = (sample or {}).get("output_json")
    if not isinstance(output, dict):
        return 0.0

    pred_list = output.get("level1", [])
    if not isinstance(pred_list, list):
        return 0.0

    try:
        pred_set = {d.get("code") for d in pred_list if isinstance(d, dict) and d.get("code")}
    except TypeError:
        # An unhashable "code" value (e.g. a list) means the prediction is malformed.
        return 0.0

    ref_set = set(item["reference_answer"])
    if not pred_set:
        return 0.0
    inter = len(pred_set & ref_set)
    return inter / len(pred_set)
"""

# Recall = |predicted ∩ reference| / |reference|; 0.0 for malformed predictions or an empty reference.
recall_grader_source = """
def grade(sample, item) -> float:
    # Guard against a missing, None, or non-dict `output_json` instead of
    # raising AttributeError on `.get`.
    output = (sample or {}).get("output_json")
    if not isinstance(output, dict):
        return 0.0

    pred_list = output.get("level1", [])
    if not isinstance(pred_list, list):
        return 0.0

    try:
        pred_set = {d.get("code") for d in pred_list if isinstance(d, dict) and d.get("code")}
    except TypeError:
        # An unhashable "code" value (e.g. a list) means the prediction is malformed.
        return 0.0

    ref_set = set(item["reference_answer"])
    if not ref_set:
        return 0.0
    inter = len(pred_set & ref_set)
    return inter / len(ref_set)
"""

# F1 = harmonic mean of precision and recall over the predicted vs. reference code sets.
f1_grader_source = """
def grade(sample, item) -> float:
    # `sample or {}` keeps a None sample from raising AttributeError, matching
    # the precision and recall graders.
    output = (sample or {}).get("output_json")
    if not output or not isinstance(output, dict):
        return 0.0

    pred_list = output.get("level1", [])
    if not isinstance(pred_list, list):
        return 0.0

    try:
        pred_set = {d["code"] for d in pred_list if isinstance(d, dict) and "code" in d}
    except TypeError:
        # An unhashable "code" value (e.g. a list) means the prediction is malformed.
        return 0.0

    ref_set = set(item["reference_answer"])
    if not pred_set or not ref_set:
        return 0.0

    inter = len(pred_set & ref_set)
    precision = inter / len(pred_set)
    recall = inter / len(ref_set)
    denom = precision + recall
    # denom is 0 only when there is no overlap at all.
    return (2 * precision * recall / denom) if denom else 0.0
"""

In [None]:
# -- Instantiate graders ----------------------------------------------------

def _python_grader(name: str, source: str) -> PythonGraderParam:
    """Wrap grader source code as a Python grader.

    All Python graders in this notebook share the same image tag and pass
    threshold; only the name and the source differ.
    """
    return PythonGraderParam(
        type="python",
        name=name,
        source=source,
        image_tag="2025-05-08",
        pass_threshold=0.8,
    )


precision_grader = _python_grader("level1_precision", precision_grader_source)
recall_grader = _python_grader("level1_recall", recall_grader_source)
f1_python_grader = _python_grader("level1_f1_python", f1_grader_source)

# F1 expressed as a formula over the precision and recall sub-graders. The
# max(..., 1e-9) floor keeps the denominator non-zero when both are 0.
f1_multi_grader = MultiGraderParam(
    type="multi",
    name="level1_f1_multi",
    graders=dict(precision=precision_grader, recall=recall_grader),
    calculate_output="2 * precision * recall / max(precision + recall, 1e-9)",
)

In [None]:
# Persist all four grader definitions (including the multi-grader) via save_grader.
for grader_def in (precision_grader, recall_grader, f1_python_grader, f1_multi_grader):
    print('Saving', grader_def["name"])
    save_grader(grader_def)

# Graders attached to the eval, in this order: the Python F1 grader comes first.
# `f1_multi_grader` is saved above but is not part of this list.
GRADERS = [f1_python_grader, precision_grader, recall_grader]

In [None]:
# Prompt: load the named developer prompt for this dataset; stop if it is missing.
PROMPT_NAME = "v7"
prompt_obj = load_prompt(DATASET_NAME, PROMPT_NAME, prompt_type="developer")
if prompt_obj is not None:
    prompt = prompt_obj
else:
    raise RuntimeError(f"Prompt {PROMPT_NAME} not found – create it under prompts/{DATASET_NAME}/")

In [None]:
# Upload dataset & create eval
client = AsyncOpenAI(
    api_key=api_key,
    project=os.getenv("OPENAI_PROJECT_ID"),
)
file_id = await get_or_upload_file(client, DATA_PATH)
item_schema = infer_item_schema(DATA_PATH)

eval_obj = await client.evals.create(
    name=f"law-codes-{prompt.name}",
    metadata={"description": f"Live eval – {prompt.name}"},
    data_source_config={
        "type": "custom",
        "item_schema": item_schema,
        "include_sample_schema": True,
    },
    testing_criteria=GRADERS,
)
eval_id = eval_obj.id
print("Eval created:", eval_id)

In [None]:
MODEL_NAME = "o4-mini"

# Full parameter template. A value of None means "do not send this parameter".
# NOTE(review): the key is spelled `max_completions_tokens`; confirm this is the
# spelling `build_data_source` / the API expects before giving it a value.
_model_params_template: Dict[str, Any] = {
    # Standard response params
    "seed": 42,
    "temperature": None,
    "top_p": None,
    "max_completions_tokens": None,
    # Structured output format; use None instead of the dict to disable it
    "text": {"format": RESPONSE_FORMAT},
    # Reasoning-specific params (responses models): None or "low"/"medium"/"high"
    "reasoning_effort": "low",
    # Tools / function calling
    "tools": None,
}

# Keep only the parameters that were given a value, preserving template order.
MODEL_PARAMS: Dict[str, Any] = {}
for _param, _value in _model_params_template.items():
    if _value is not None:
        MODEL_PARAMS[_param] = _value

In [None]:
# Build data_source config
# Assembles the run's data source from the loaded prompt, the uploaded file id,
# the model name and the filtered model parameters, using the "responses"
# datasource type.
# NOTE(review): USER_FIELD is presumably the dataset column that supplies the
# user message – confirm against `build_data_source`.
USER_FIELD = "text_input"
data_source = build_data_source(
    prompt,
    file_id,
    USER_FIELD,
    model=MODEL_NAME,
    model_params=MODEL_PARAMS,
    datasource_type="responses",
)

In [None]:
# Run loop
N_RUNS = 1
for i in range(N_RUNS):
    print(f"\n=== Run {i + 1}/{N_RUNS} ===")
    run = await client.evals.runs.create(
        eval_id=eval_id,
        name=f"variance-{prompt.name}-run{i + 1}",
        data_source=data_source,
    )
    print("Run URL:", getattr(run, "report_url", "<no url>"))

    run_fin = await wait_until_finished(client, eval_id, run.id)
    if run_fin.status == "failed":
        print("Run failed. Error details:")
        print(getattr(run_fin, "error", "<no error field>"))
        print(getattr(run_fin, "error_message", "<no error_message field>"))
        continue
    # If run succeeded, proceed to fetch outputs
    items_raw = await fetch_all_output_items(client, eval_id, run.id)
    items = extract_items(items_raw)

    # Quick peek ---------------------------------------------------------
    if items_raw:
        content_preview = items_raw[0].sample.output[0].content if items_raw[0].sample and items_raw[0].sample.output else "<no output>"
        print("First assistant output:", content_preview)

    # Save run metadata once with all graders
    record = RunRecord(
        dataset=DATASET_NAME,
        prompt=vars(prompt),
        eval_id=eval_id,
        run_id=run.id,
        model=MODEL_NAME,
        grader_name=None,  # deprecated – use grader_names instead
        grader_names=[g["name"] if isinstance(g, dict) else getattr(g, "name", str(g)) for g in GRADERS],
        timestamp=datetime.datetime.now(datetime.timezone.utc).isoformat(),
        reasoning_effort=MODEL_PARAMS.get("reasoning_effort", None),
        split=SPLIT,
        items=items,
    )
    save_run(record)

    # Accuracy per-grader using the new score dict structure
    for grader in GRADERS:
        scores: List[float] = [
            it["score"].get(grader["name"] if isinstance(grader, dict) else getattr(grader, "name", None))
            for it in items
            if it["score"].get(grader["name"] if isinstance(grader, dict) else getattr(grader, "name", None)) is not None
        ]
        if scores:
            gname = grader["name"] if isinstance(grader, dict) else getattr(grader, "name", "<unknown>")
            print(f"{gname} Accuracy = {sum(scores)/len(scores):.3f}")


In [None]:
# %%
# For each grader: load the stored per-item scores for this dataset / prompt /
# model / reasoning effort / split, compute statistics, and plot them.
for grader in GRADERS:
    if isinstance(grader, dict):
        lookup_name = grader["name"]
    else:
        lookup_name = getattr(grader, "name", None)

    scores_by_item, runs = load_scores_by_item(
        DATASET_NAME,
        prompt_id=prompt.id,
        model=MODEL_NAME,
        grader_name=lookup_name,
        reasoning_effort=MODEL_PARAMS.get("reasoning_effort", None),
        split=SPLIT,
    )

    stats = compute_score_stats(scores_by_item)

    # Display label falls back to "<unknown>" for a non-dict grader without a name.
    if isinstance(grader, dict):
        gname = grader["name"]
    else:
        gname = getattr(grader, "name", "<unknown>")
    print(f"Plotting results for {gname} – {len(runs)} runs")
    plot_score_stats_plotly(
        stats,
        n_runs=len(runs),
        context=gname,
        width=1000,
        height=600,
        dark_mode=False,
    )