# maths-prompt: Progress Dashboard

Visualise prompt optimisation progress across sessions.

In [None]:
import json
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

# Locations of the JSONL logs this notebook reads. The paths are relative,
# so they resolve against the current working directory.
LOGS_DIR = Path("..") / "logs"
EVAL_LOG = LOGS_DIR.joinpath("evaluations.jsonl")  # loaded into df_train
TEST_LOG = LOGS_DIR.joinpath("test_results.jsonl")  # loaded into df_test

def load_jsonl(path):
    """Load a JSON Lines file into a DataFrame, one row per record.

    Args:
        path: Location of the ``.jsonl`` file, as a ``str`` or ``Path``.

    Returns:
        A DataFrame built from the parsed records. An empty DataFrame is
        returned when the file does not exist or contains no records.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    path = Path(path)
    if not path.exists():
        return pd.DataFrame()
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default encoding when the logs contain non-ASCII text.
    with open(path, encoding="utf-8") as f:
        # Blank / whitespace-only lines are skipped rather than parsed.
        records = [json.loads(line) for line in f if line.strip()]
    return pd.DataFrame(records)

# Read both logs once; the cells below work off these two frames.
df_train, df_test = load_jsonl(EVAL_LOG), load_jsonl(TEST_LOG)
print(
    f"Training evaluations: {len(df_train)}",
    f"Test evaluations: {len(df_test)}",
    sep="\n",
)

In [None]:
# Training accuracy per iteration, drawn as one line per session
if df_train.empty:
    print("No training data yet.")
else:
    fig, ax = plt.subplots(figsize=(12, 5))
    # groupby yields (session value, rows for that session) pairs.
    for session_id, session_rows in df_train.groupby("session"):
        ax.plot(
            session_rows["iteration"],
            session_rows["accuracy"],
            marker="o",
            label=f"Session {session_id}",
        )
    ax.set(
        xlabel="Iteration",
        ylabel="Accuracy",
        title="Training Accuracy Over Iterations",
        ylim=(0, 1),  # fixed 0-1 y-range
    )
    ax.grid(True, alpha=0.3)
    ax.legend()
    plt.tight_layout()
    plt.show()

In [None]:
# Prompt log — each attempt with its score
if not df_train.empty:
    PREVIEW_CHARS = 100  # maximum number of prompt characters shown per row
    prompt_log = df_train[["iteration", "session", "accuracy", "prompt"]].copy()
    preview = prompt_log["prompt"].str[:PREVIEW_CHARS]
    fits = prompt_log["prompt"].str.len() <= PREVIEW_CHARS
    # Append "..." only when text was actually cut off, so a short prompt
    # is not made to look truncated.
    prompt_log["prompt_preview"] = preview.where(fits, preview + "...")
    display(prompt_log[["iteration", "session", "accuracy", "prompt_preview"]])
else:
    print("No training data yet.")

In [None]:
# Training and test accuracy on shared axes
if df_train.empty:
    print("No data yet.")
else:
    fig, ax = plt.subplots(figsize=(12, 5))
    # NOTE(review): all training rows are drawn as a single line here,
    # regardless of session — confirm that is the intended view.
    ax.plot(
        df_train["iteration"],
        df_train["accuracy"],
        marker="o",
        label="Training",
        alpha=0.7,
    )
    if not df_test.empty:
        # Test records are placed at their 1-based position in the test log,
        # not at a training iteration number.
        test_positions = range(1, len(df_test) + 1)
        ax.scatter(
            test_positions,
            df_test["accuracy"],
            marker="s",
            s=100,
            c="red",
            label="Test",
            zorder=5,  # draw the test markers above the training line
        )
    ax.set(
        xlabel="Iteration / Eval",
        ylabel="Accuracy",
        title="Training vs Test Accuracy",
        ylim=(0, 1),
    )
    ax.grid(True, alpha=0.3)
    ax.legend()
    plt.tight_layout()
    plt.show()

In [None]:
# Category breakdown heatmap (from test results)
# Flatten every test record's "problems" list into one row per problem.
rows = []
if not df_test.empty and "problems" in df_test.columns:
    for idx, row in df_test.iterrows():
        problems = row["problems"]
        # A record logged without a "problems" list shows up as NaN in the
        # DataFrame; skip it instead of failing while iterating a float.
        if not isinstance(problems, list):
            continue
        for p in problems:
            rows.append({"eval": idx + 1, "category": p["category"], "correct": p["correct"]})

if rows:
    df_cat = pd.DataFrame(rows)
    # Mean of "correct" per (evaluation, category). A category that is absent
    # from an evaluation is filled with 0 and therefore drawn as 0 accuracy.
    pivot = df_cat.groupby(["eval", "category"])["correct"].mean().unstack(fill_value=0)
    fig, ax = plt.subplots(figsize=(10, 4))
    # Transposed so categories run down the y-axis and evaluations along x.
    im = ax.imshow(pivot.values.T, aspect="auto", cmap="RdYlGn", vmin=0, vmax=1)
    ax.set_xticks(range(len(pivot.index)))
    ax.set_xticklabels(pivot.index)
    ax.set_yticks(range(len(pivot.columns)))
    ax.set_yticklabels(pivot.columns)
    ax.set_xlabel("Test Evaluation")
    ax.set_title("Accuracy by Category")
    plt.colorbar(im, ax=ax)
    plt.tight_layout()
    plt.show()
else:
    # Covers: no test log, no "problems" column, or no usable problem entries.
    print("No test data with category breakdown yet.")

In [None]:
# Report the highest-accuracy training record and print its full prompt
if df_train.empty:
    print("No training data yet.")
else:
    # idxmax gives the index label of the first row with the top accuracy.
    best = df_train.loc[df_train["accuracy"].idxmax()]
    print(f"Best accuracy: {best['accuracy']:.1%} (iteration {best['iteration']}, session {best['session']})")
    # The prompt is framed by 60-character "=" rules above and below.
    print(f"\nPrompt:\n{'='*60}", best["prompt"], "=" * 60, sep="\n")