In [None]:
from pathlib import Path
import json
from collections import defaultdict
from pathlib import Path
from typing import Dict, List, Tuple

import matplotlib.pyplot as plt
import numpy as np

In [None]:
def load_scores_by_item_index(jsonl_path: Path) -> Dict[int, List[float]]:
    """Group the scores found in a JSONL results file by their ``item_index``.

    Parsing is best-effort: a line contributes a score only when it decodes to
    a JSON object that carries both ``item_index`` and ``score``, the index
    converts with ``int()``, and the score converts with ``float()`` to a
    finite number. Every other line (blank, invalid JSON, non-object JSON,
    missing or unconvertible fields, NaN/inf scores) is skipped silently.

    A score of exactly ``-1.0`` is stored as ``0.0``.
    NOTE(review): -1.0 is presumably a "failed run" sentinel -- confirm.

    Args:
        jsonl_path: Path of the JSONL file; it is opened as UTF-8 text.

    Returns:
        Mapping from item index to its scores in file order. The mapping is a
        ``defaultdict(list)``, so looking up an absent key inserts an empty
        list.

    Raises:
        OSError: If the file cannot be opened.
    """
    scores_by_item_index: Dict[int, List[float]] = defaultdict(list)
    with jsonl_path.open("r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                obj = json.loads(line)
            except json.JSONDecodeError:
                continue
            # Valid JSON is not necessarily an object: numbers, strings and
            # lists have no .get(), so treat them like any other bad line.
            if not isinstance(obj, dict):
                continue

            # Required fields in baseline result lines
            item_index = obj.get("item_index")
            score = obj.get("score")
            if item_index is None or score is None:
                continue
            try:
                item_index = int(item_index)
                score = float(score)
            except (TypeError, ValueError, OverflowError):
                # OverflowError: int(Infinity) or float() of an integer too
                # large for a double -- both can come out of json.loads.
                continue
            if not np.isfinite(score):
                continue
            if score == -1.0:
                score = 0.0
            scores_by_item_index[item_index].append(score)
    return scores_by_item_index

In [None]:
def compute_stats(scores_by_item_index: Dict[int, List[float]]) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Summarise each item's scores as mean, standard deviation and maximum.

    Items are processed in ascending key order. The standard deviation is
    NumPy's default ``np.std`` (``ddof=0``). An item with no scores gets
    ``nan`` for all three statistics.

    Args:
        scores_by_item_index: Mapping from item index to its list of scores.

    Returns:
        ``(item_indices, means, stds, maxes)`` -- four 1-D arrays of equal
        length; the first has integer dtype, the others float dtype.
    """
    ordered_keys = sorted(scores_by_item_index)
    per_item = [np.asarray(scores_by_item_index[key], dtype=float) for key in ordered_keys]

    def _reduce(statistic) -> np.ndarray:
        # Apply one reducer to every item, substituting nan where there is no data.
        return np.asarray(
            [float(statistic(values)) if values.size else np.nan for values in per_item],
            dtype=float,
        )

    return (
        np.asarray(ordered_keys, dtype=int),
        _reduce(np.mean),
        _reduce(np.std),
        _reduce(np.max),
    )

In [None]:
def sort_by_mean(item_indices: np.ndarray, means: np.ndarray, stds: np.ndarray, maxes: np.ndarray):
    """Reorder the four parallel arrays by ascending mean.

    The permutation comes from ``np.argsort(means)`` and is applied to every
    array, so entries that belonged together stay aligned.

    Returns:
        A 4-tuple ``(item_indices, means, stds, maxes)`` in the new order.
    """
    ranking = np.argsort(means)
    return tuple(column[ranking] for column in (item_indices, means, stds, maxes))

In [None]:
def plot_stats(
    means: np.ndarray,
    stds: np.ndarray,
    maxes: np.ndarray,
    *,
    title: str = "Mean ± Std (bars) and Max per Sample (sorted)",
    output_path: Path,
):
    """Draw per-sample mean bars, ±1 std lines and max markers; save and show.

    The three inputs are parallel sequences, one entry per sample, plotted in
    the order given. The y-axis is fixed to ``[0.0, 1.02]``, so anything
    outside that range is not visible.
    NOTE(review): the fixed range presumably reflects scores in [0, 1] -- confirm.

    Args:
        means: Per-sample mean score (bar height).
        stds: Per-sample standard deviation (vertical line half-length).
        maxes: Per-sample maximum score (``x`` marker).
        title: Figure title.
        output_path: Destination image file (``Path`` or ``str``). Missing
            parent directories are created.

    Raises:
        OSError: If the output directory or file cannot be written.
    """
    # Accept plain sequences as well as arrays; `means - stds` needs arrays.
    means = np.asarray(means, dtype=float)
    stds = np.asarray(stds, dtype=float)
    maxes = np.asarray(maxes, dtype=float)
    output_path = Path(output_path)

    x = np.arange(len(means))
    fig, ax = plt.subplots(figsize=(12, 6))
    try:
        # Bars for means
        ax.bar(x, means, color="#cfe7ff", edgecolor="#9ec5fe", linewidth=1.0, alpha=0.7, label="Mean")

        # Error bars for ±1 std
        ax.vlines(x, means - stds, means + stds, color="#2f4b7c", linewidth=2.0, label="± 1 σ")

        # Max markers
        ax.scatter(x, maxes, marker="x", color="#d62728", s=30, linewidths=1.5, label="Max")

        ax.set_title(title)
        ax.set_xlabel("Sample (sorted by mean)")
        ax.set_ylabel("Score")
        ax.set_ylim(0.0, 1.02)
        ax.grid(axis="y", linestyle=":", alpha=0.4)
        ax.legend(loc="upper left")

        fig.tight_layout()
        output_path.parent.mkdir(parents=True, exist_ok=True)
        fig.savefig(output_path, dpi=160)
        plt.show()
    finally:
        # pyplot keeps every figure alive until it is closed; release it even
        # when drawing or saving fails so repeated calls do not pile up figures.
        plt.close(fig)

In [None]:
def create_variance_plot(results_file_path, output_plot_path):
    """Load a JSONL results file and save a per-sample score variance plot.

    Pipeline: load scores grouped by item index, compute per-item
    mean/std/max, sort by mean, then plot and save. The title names the
    "variant", taken as the text after the last underscore of the input file
    stem, and the number of repeats, taken from how many scores each item
    actually has (a range is shown when items differ).

    Args:
        results_file_path: JSONL results file (``Path`` or ``str``).
        output_plot_path: Destination image file (``Path`` or ``str``).

    Raises:
        SystemExit: If the input yields no usable (item_index, score) pair.
    """
    results_file_path = Path(results_file_path)
    output_plot_path = Path(output_plot_path)

    scores_by_item = load_scores_by_item_index(results_file_path)
    if not scores_by_item:
        raise SystemExit("No (item_index, score) pairs found in input.")

    item_idx, means, stds, maxes = compute_stats(scores_by_item)
    _item_idx_s, means_s, stds_s, maxes_s = sort_by_mean(item_idx, means, stds, maxes)

    # Report the repeat count the data actually contains instead of assuming 3.
    repeat_counts = {len(scores) for scores in scores_by_item.values()}
    fewest, most = min(repeat_counts), max(repeat_counts)
    if fewest == most:
        repeats_label = f"{most} repeat" if most == 1 else f"{most} repeats"
    else:
        repeats_label = f"{fewest}–{most} repeats"

    # Title derived simply from filename: take the token after the last underscore
    variant = results_file_path.stem.split("_")[-1]
    plot_title = f"Reward variance plot over {repeats_label} (variant: {variant})"

    plot_stats(means_s, stds_s, maxes_s, title=plot_title, output_path=output_plot_path)
    print(f"Saved plot to: {output_plot_path}")

In [None]:
# Plot the baseline evaluation results.
# NOTE: machine-specific absolute directory; point it at your own checkout before running.
TOOL_EVALS_DIR = Path("/Users/theophile/Documents/repos/build-hours/20-agent-rft/build_hour/finqa_model_text/tool_evals")
file_name = "rft_eval_results_baseline.jsonl"
results_file_path = TOOL_EVALS_DIR / file_name
# Derive the image name from the real file stem; str.replace would rewrite
# every ".jsonl" occurrence in the name, not just the suffix.
output_plot_path = results_file_path.with_name(f"{results_file_path.stem}_plot.png")
create_variance_plot(results_file_path, output_plot_path)

In [None]:
# Plot the evaluation results of the step-10 checkpoint.
# NOTE: machine-specific absolute directory; point it at your own checkout before running.
TOOL_EVALS_DIR = Path("/Users/theophile/Documents/repos/build-hours/20-agent-rft/build_hour/finqa_model_text/tool_evals")
file_name = "rft_eval_results_step10.jsonl"
results_file_path = TOOL_EVALS_DIR / file_name
# Derive the image name from the real file stem; str.replace would rewrite
# every ".jsonl" occurrence in the name, not just the suffix.
output_plot_path = results_file_path.with_name(f"{results_file_path.stem}_plot.png")
create_variance_plot(results_file_path, output_plot_path)