In [1]:
# Imports
# ---------------------------------------------------------------------------
import pathlib, sys, os
import numpy as np

# ---------------------------------------------------------------------------
# Determine the directory this code runs from.
# As a script, `__file__` names the script itself, so its parent folder is
# taken; in a notebook `__file__` is undefined and the current working
# directory is used instead. Either way HERE is a *directory*, which the
# project-name inference below relies on (HERE.parent must be the project).
# ---------------------------------------------------------------------------
try:
    HERE = pathlib.Path(__file__).resolve().parent
except NameError:  # __file__ is not defined in notebooks
    HERE = pathlib.Path().resolve()

# ---------------------------------------------------------------------------
# Locate shared root (the folder that contains both `utils` and your project)
# It climbs up until it finds a `utils/` directory or stops at filesystem root.
# ---------------------------------------------------------------------------
ROOT = HERE
while ROOT != ROOT.parent and not (ROOT / "utils").exists():
    ROOT = ROOT.parent

# The loop also terminates at the filesystem root, so re-check before trusting ROOT.
if not (ROOT / "utils").exists():
    raise RuntimeError(
        f"Could not find 'utils' directory above {HERE}. "
        "Check your project structure or adjust the path resolution logic."
    )

# Prepend (rather than append) so the repo's `utils` wins over any other
# importable package of the same name. Skipped when ROOT is already listed.
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))
# Printed on every run, including re-runs where ROOT was already on sys.path.
print(f"✅ Added to sys.path: {ROOT}")

# ---------------------------------------------------------------------------
# Infer project name (parent of pipelines/ or notebooks/)
# e.g., .../projects/<project>/notebooks/... -> <project>
# ---------------------------------------------------------------------------
project_name = HERE.parent.name
# setdefault leaves an already-exported PROJECT untouched, so report the value
# that is actually in effect rather than the inferred one.
effective_project = os.environ.setdefault("PROJECT", project_name)
print(f"✅ Project name set to: {effective_project}")

# These imports only resolve once ROOT is on sys.path, hence their position here.
from utils.plot_eval_runs import load_scores_by_item, compute_score_stats
from utils import load_prompt

✅ Added to sys.path: /Users/theophile/Documents/repos/build-hours/15-reinforcement-fine-tuning
✅ Project name set to: build_hour


In [2]:
# Configuration – edit as needed
# ---------------------------------------------------------------------------
# Project / dataset identifier.
DATASET = "build_hour"

# Which split to report on: "train" or "val".
SPLIT = "val"

# One entry per (model, prompt, reasoning effort) combination to compare.
# `reasoning_effort=None` marks a combination evaluated without that setting.
EXPERIMENTS = [
    dict(model="gpt-4.1", prompt_name="v7", reasoning_effort=None),
    dict(model="o4-mini", prompt_name="v7", reasoning_effort="low"),
    dict(
        model="ft:o4-mini-2025-04-16:agi-final-final-docx:rft-build-hour-prompt-v7:BsZsVoqh:ckpt-step-90",
        prompt_name="v7",
        reasoning_effort="low",
    ),
]


In [3]:
# Compute metrics
# ---------------------------------------------------------------------------
# prompt_name -> prompt_id, so every prompt is resolved at most once
_PROMPT_CACHE: dict[str, str] = {}

# Graders to include ---------------------------------------------------------
GRADERS = [
    "level1_precision",
    "level1_recall",
    "level1_f1_python",
]


def _std_across_runs(runs, grader):
    """Sample standard deviation (ddof=1) of the per-run mean score for ``grader``.

    A run's mean is taken over the scores of its ``"items"``. An item's
    ``"score"`` is read as a dict keyed by grader name when it is a dict, and as
    the score itself otherwise; ``None`` scores are skipped. Runs without any
    usable score contribute nothing. Returns 0.0 when fewer than two run means
    are available.
    """
    per_run_means = []
    for run in runs:
        usable = []
        for entry in run["items"]:
            raw = entry.get("score")
            value = raw.get(grader) if isinstance(raw, dict) else raw
            if value is not None:
                usable.append(value)
        if usable:
            per_run_means.append(np.mean(usable))
    if len(per_run_means) <= 1:
        return 0.0
    return float(np.std(per_run_means, ddof=1))


# ---------------------------------------------------------------------------
# Compute metrics across combinations
# ---------------------------------------------------------------------------

records = []  # one dict per (experiment, grader) pair that has data

for exp in EXPERIMENTS:
    model, pname = exp["model"], exp["prompt_name"]
    r_eff = exp.get("reasoning_effort")

    # Resolve the prompt id, calling load_prompt only on a cache miss
    if pname in _PROMPT_CACHE:
        pid = _PROMPT_CACHE[pname]
    else:
        pobj = load_prompt(DATASET, pname, prompt_type="developer")
        if pobj is None:
            raise RuntimeError(f"Prompt {pname} not found – create it under prompts/{DATASET}/")
        pid = _PROMPT_CACHE[pname] = pobj.id

    combo_label = f"{model} | prompt={pname} | re={r_eff or 'None'}"
    print(f"\n=== {combo_label} ===")

    # Every grader with data appends exactly one record, so an unchanged
    # length after the loop means nothing was found for this combination.
    n_before = len(records)

    for grader in GRADERS:
        scores_by_item, runs = load_scores_by_item(
            dataset=DATASET,
            prompt_id=pid,
            model=model,
            grader_name=grader,
            reasoning_effort=r_eff,
            split=SPLIT,
        )
        if scores_by_item:
            stats = compute_score_stats(scores_by_item)
            item_stats = list(stats.values())

            # Item-level aggregates: average of each item's mean / max
            overall_mean = np.mean([s["mean"] for s in item_stats])
            overall_max = np.mean([s["max"] for s in item_stats])
            # Run-level spread: std across RUN means (variance of experimental repeats)
            overall_std = _std_across_runs(runs, grader)

            records.append(
                {
                    "model": model,
                    "prompt": pname,
                    "reasoning_effort": r_eff,
                    "grader": grader,
                    "mean": overall_mean,
                    "std": overall_std,
                    "mean_max": overall_max,
                    "n_items": len(stats),
                    "n_runs": len(runs),
                }
            )

    if len(records) == n_before:
        print("(no data)")

[prompt] looking for prompts/developer/v7.md
/Users/theophile/Documents/repos/build-hours/15-reinforcement-fine-tuning/projects/build_hour

=== gpt-4.1 | prompt=v7 | re=None ===

=== o4-mini | prompt=v7 | re=low ===

=== ft:o4-mini-2025-04-16:agi-final-final-docx:rft-build-hour-prompt-v7:BsZsVoqh:ckpt-step-90 | prompt=v7 | re=low ===


In [4]:
# Display summary table ------------------------------------------------------
# Builds one row per (model, prompt, reasoning effort) combination with a
# `<grader>_mean` / `<grader>_std` column pair per grader. With pandas available
# the table is stored in `df` and shown via `display`; otherwise it is printed
# as tab-separated text and `df` is not created.
if not records:
    print("No matching runs found – adjust configuration and retry.")
else:
    from collections import defaultdict

    # Pivot so that each (model, prompt, reasoning) combination has grader metrics as columns
    pivot: dict[tuple[str, str, str | None], dict[str, dict]] = defaultdict(dict)
    for rec in records:
        key = (rec["model"], rec["prompt"], rec["reasoning_effort"])
        pivot[key][rec["grader"]] = rec

    # Column order follows GRADERS – the same list the metrics were computed
    # for – instead of a second hard-coded copy that could drift out of sync.
    grader_cols = list(GRADERS)
    # Graders that only show up in the records are appended after those.
    for rec in records:
        if rec["grader"] not in grader_cols:
            grader_cols.append(rec["grader"])

    # Only a missing pandas should trigger the plain-text fallback; keeping the
    # try limited to the import stops unrelated ModuleNotFoundErrors raised
    # while building or displaying the table from being silently swallowed.
    try:
        import pandas as pd  # type: ignore
    except ModuleNotFoundError:
        have_pandas = False
    else:
        have_pandas = True

    if have_pandas:
        rows = []
        for (model, prompt, reff), gdict in pivot.items():
            # `row` (not `rec`) so the records loop variable is not rebound
            # to a differently shaped dict.
            row = {"Model": model, "Prompt": prompt, "Reasoning": reff or "None"}
            for g in grader_cols:
                if g in gdict:
                    row[f"{g}_mean"] = round(gdict[g]["mean"], 3)
                    row[f"{g}_std"] = round(gdict[g]["std"], 3)
                else:
                    # Grader produced no data for this combination
                    row[f"{g}_mean"] = None
                    row[f"{g}_std"] = None
            rows.append(row)

        df = pd.DataFrame(rows)
        display(df)  # noqa: F821 – works in notebooks
    else:
        # Fallback to simple tab-separated print
        hdr = ["Model", "Prompt", "Reasoning"]
        for g in grader_cols:
            hdr.extend([f"{g}_mean", f"{g}_std"])
        print("\t".join(hdr))
        for (model, prompt, reff), gdict in pivot.items():
            cells = [model, prompt, reff or "None"]
            for g in grader_cols:
                if g in gdict:
                    cells.append(f"{gdict[g]['mean']:.3f}")
                    cells.append(f"{gdict[g]['std']:.3f}")
                else:
                    cells.extend(["-", "-"])
            print("\t".join(cells))


Unnamed: 0,Model,Prompt,Reasoning,level1_precision_mean,level1_precision_std,level1_recall_mean,level1_recall_std,level1_f1_python_mean,level1_f1_python_std
0,gpt-4.1,v7,,0.39,0.007,0.593,0.004,0.449,0.006
1,o4-mini,v7,low,0.754,0.018,0.474,0.013,0.537,0.02
2,ft:o4-mini-2025-04-16:agi-final-final-docx:rft...,v7,low,0.774,0.019,0.628,0.016,0.671,0.003


In [5]:
import numpy as np
import plotly.graph_objects as go

# ----------------------------------------------------------
# 1 – Colour palette + Plotly template
# ----------------------------------------------------------

DARK_MODE = True  # set to False for light theme

PLOT_TEMPLATE = "plotly_dark" if DARK_MODE else "plotly_white"
# Pastel palette supplied by user
PASTEL_COLORS = [
    "#e7efff",  # light blue
    "#ffefa4",  # pastel yellow
    "#ffbfa4",  # pastel salmon
    "#ebf5de",  # pastel green
    "#fdedeb",  # pastel pink
]

# Single source for foreground text so it always contrasts with the page
# background: white on the dark theme, black on the light one.
TEXT_COLOR = "#FFFFFF" if DARK_MODE else "#000000"

# ----------------------------------------------------------
# 2 – Plotting
# Grouped bar chart: one group per metric, one bar per variant
# (model | prompt | reasoning effort), with the std column as error bar.
# ----------------------------------------------------------
if "df" not in globals():
    print("Run the summary cell first to generate 'df' before plotting.")
elif df.empty:
    # Without rows the bar width below would divide by zero.
    print("Summary table 'df' has no rows – nothing to plot.")
else:
    # mean column -> (axis label, matching std column)
    METRIC_MAP = {
        "level1_precision_mean": ("Precision", "level1_precision_std"),
        "level1_recall_mean":    ("Recall",    "level1_recall_std"),
        "level1_f1_python_mean": ("F1",        "level1_f1_python_std"),
    }
    metric_cols  = list(METRIC_MAP.keys())
    metric_names = [label for label, _ in METRIC_MAP.values()]
    # Take the std columns from METRIC_MAP rather than rebuilding their names
    # by string replacement on the mean column.
    std_cols     = [std_col for _, std_col in METRIC_MAP.values()]

    metric_idx = np.arange(len(metric_cols))  # numeric positions for categories

    # Treat prompt as part of the variant label so everything appears in one chart
    sub = df.copy()
    sub["Variant"] = sub.apply(
        lambda r: f"{r['Model']} | {r['Prompt']} | {r['Reasoning']}", axis=1
    )

    n_var = len(sub)

    fig = go.Figure()

    bar_width = 0.8 / n_var  # width per bar so they fit in group
    err_col = "#FFA500"  # orange for visibility across themes

    # y-axis is fixed to [0, 1], so there is no need to track the tallest bar.
    # Rows are walked directly (instead of looked up by label) so two rows with
    # the same label would still each plot their own values.
    for idx, (_, row) in enumerate(sub.iterrows()):
        variant = row["Variant"]
        means = [row[c] for c in metric_cols]
        errs  = [row[c] for c in std_cols]

        # Centre the group of bars on the metric's tick: offsets are symmetric around 0
        offset = (idx - (n_var - 1) / 2) * bar_width
        x_pos = metric_idx + offset

        fig.add_bar(
            x=x_pos,
            y=means,
            name=variant,
            marker_color=PASTEL_COLORS[idx % len(PASTEL_COLORS)],  # cycle the palette
            opacity=1.0,
            error_y=dict(
                type="data",
                array=errs,
                visible=True,
                color=err_col,
                thickness=2,
                width=3,
            ),
            # No direct text here; we'll use a separate scatter for labels
            text=None,
            width=bar_width,
        )

        # Add labels via separate scatter trace, positioning slightly above error bar but capped at 0.98
        label_y = [min(0.98, m + e + 0.005) for m, e in zip(means, errs)]
        fig.add_scatter(
            x=x_pos,
            y=label_y,
            mode="text",
            text=[f"{m:.2f}" for m in means],
            textposition="top center",
            textfont=dict(color=TEXT_COLOR, size=12),
            showlegend=False,
            hoverinfo="skip",
        )

    bg_color = "#000000" if DARK_MODE else "#FFFFFF"
    grid_color = "#333333" if DARK_MODE else "rgba(0,0,0,0.08)"

    upper_y = 1.0  # keep axis within [0,1]

    fig.update_layout(
        template=PLOT_TEMPLATE,
        barmode="group",
        bargap=0.25,
        title="Metrics per variant (Model | Prompt | Reasoning)",
        xaxis_title="",
        yaxis_title="Mean score",
        yaxis_range=[0, upper_y],
        yaxis=dict(gridcolor=grid_color),
        # Bars sit at numeric x positions, so map the group centres back to metric names
        xaxis=dict(tickmode="array", tickvals=metric_idx, ticktext=metric_names),
        legend=dict(
            orientation="h",
            y=-0.25,
            x=0.5,
            xanchor="center",
        ),
        margin=dict(l=60, r=60, t=80, b=140),
        height=600,
        width=1000,
        plot_bgcolor=bg_color,
        paper_bgcolor=bg_color,
        # Was hard-coded white, which made all text invisible on the white
        # light-theme background.
        font=dict(family="Inter, sans-serif", color=TEXT_COLOR),
    )

    fig.show()


Variances

In [6]:
# Variance
# For every (fine-tuned model, grader) record, reload the per-item scores and
# hand their statistics to plot_score_stats_plotly.
# NOTE(review): presumably this renders one per-item spread chart per record –
# confirm against utils.plot_eval_runs.
from utils.plot_eval_runs import plot_score_stats_plotly

for rec in records:
    # Fine-tuned model ids start with "ft:" (see the entry in EXPERIMENTS).
    # A prefix check avoids the false positives of a bare substring test,
    # which would also match any model id that merely contains "ft".
    if not rec["model"].startswith("ft:"):
        continue

    scores_by_item, runs = load_scores_by_item(
        dataset=DATASET,
        prompt_id=_PROMPT_CACHE[rec["prompt"]],
        model=rec["model"],
        grader_name=rec["grader"],
        reasoning_effort=rec["reasoning_effort"],
        split=SPLIT,
    )
    if not scores_by_item:
        continue
    stats = compute_score_stats(scores_by_item)
    context_label = (
        f"ft-model | {rec['prompt']} | {rec['grader']} | re={rec['reasoning_effort'] or 'None'}"
    )
    # Assign to `_` so the return value is not echoed as cell output.
    _ = plot_score_stats_plotly(stats, n_runs=rec["n_runs"], context=context_label)
