# Classic Interactions PPO Progress

This notebook inspects manifests/episode logs from the imitation pipeline to quantify sample efficiency; run all cells top-to-bottom after the latest training runs finish so plots reflect current data.

## Data Sources
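- Baseline manifest: `output/benchmarks/ppo_imitation/runs/<BASELINE_RUN_ID>.json` — the setup cell currently sets `BASELINE_RUN_ID` to `ppo_expert_reference_20251120T140528`; an earlier run of this notebook used `ppo_expert_reference_20251120T115451`.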
- Pretrained manifest: `output/benchmarks/ppo_imitation/runs/ppo_finetune_finetuned_ppo_expert_reference.json`
- Episode logs: `output/benchmarks/ppo_imitation/episodes/<BASELINE_RUN_ID>.jsonl` (baseline; an earlier run used `ppo_expert_reference_20251120T115451.jsonl`) — add matching fine-tune logs if they are captured later.

In [3]:
from __future__ import annotations

import json
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Plot styling shared by every figure in the notebook.
sns.set_theme(style="whitegrid")

# Locations of the imitation-pipeline artifacts, relative to the working directory.
BASE_DIR = Path("output/benchmarks/ppo_imitation")
RUN_DIR = BASE_DIR.joinpath("runs")
EPISODE_DIR = BASE_DIR.joinpath("episodes")

# Run identifiers; each maps to `<RUN_DIR>/<id>.json` and `<EPISODE_DIR>/<id>.jsonl`.
BASELINE_RUN_ID = "ppo_expert_reference_20251120T140528"
PRETRAINED_RUN_ID = "ppo_finetune_finetuned_ppo_expert_reference"

# Show the last five manifest stems (sorted by name) so the IDs above are easy to verify.
print("Available manifests (latest 5):")
manifest_stems = [manifest.stem for manifest in RUN_DIR.glob("*.json")]
manifest_stems.sort()
for stem in manifest_stems[-5:]:
    print(" -", stem)

Available manifests (latest 5):


In [10]:
def load_manifest(run_id: str, run_dir: Path | None = None) -> dict:
    """Load a benchmark manifest by ID.

    Args:
        run_id: Manifest identifier used to build the filename (``<run_id>.json``).
        run_dir: Directory that holds the manifests. When omitted, the
            notebook-level ``RUN_DIR`` is used.

    Returns:
        dict: Loaded manifest contents.

    Raises:
        FileNotFoundError: If ``<run_id>.json`` does not exist in the directory.
            The message lists up to five manifest stems found there.
    """
    # Resolve the directory lazily so callers can point at any folder.
    manifest_dir = RUN_DIR if run_dir is None else Path(run_dir)
    manifest_path = manifest_dir / f"{run_id}.json"
    if not manifest_path.exists():
        available = sorted(p.stem for p in manifest_dir.glob("*.json"))
        # Only append the hint when there is something to suggest.
        hint = "\nAvailable manifests:\n- " + "\n- ".join(available[-5:]) if available else ""
        raise FileNotFoundError(
            f"Manifest {manifest_path} missing. Update BASELINE_RUN_ID/PRETRAINED_RUN_ID or run the pipeline first."
            + hint
        )
    with manifest_path.open(encoding="utf-8") as handle:
        return json.load(handle)


# Load both manifests in order (baseline first, then the fine-tuned run) and display them.
baseline_manifest, pretrained_manifest = (
    load_manifest(run_id) for run_id in (BASELINE_RUN_ID, PRETRAINED_RUN_ID)
)
baseline_manifest, pretrained_manifest

FileNotFoundError: Manifest output/benchmarks/ppo_imitation/runs/ppo_expert_reference_20251120T115451.json missing. Update BASELINE_RUN_ID/PRETRAINED_RUN_ID or run the pipeline first.

In [None]:
def load_episode_dataframe(path: Path) -> pd.DataFrame:
    """Load episode records from a JSONL file into a dataframe.

    Each non-blank line is parsed as one JSON object. The ``episode``,
    ``steps`` and ``seed`` fields become columns of the same name, and every
    entry of the optional ``metrics`` mapping becomes a ``metric_<key>`` column.

    Args:
        path: Path to the JSONL episode file.

    Returns:
        pandas.DataFrame: Aggregated episode records, or an empty dataframe
        when ``path`` does not exist.
    """
    records: list[dict] = []
    if not path.exists():
        return pd.DataFrame()
    with path.open(encoding="utf-8") as handle:
        for line in handle:
            # Blank or whitespace-only lines are not valid JSON; skip them
            # instead of failing the whole load.
            if not line.strip():
                continue
            payload = json.loads(line)
            row = {
                "episode": payload.get("episode"),
                "steps": payload.get("steps"),
                "seed": payload.get("seed"),
            }
            # `or {}` also covers an explicit `"metrics": null` in the log.
            metrics = payload.get("metrics") or {}
            for key, value in metrics.items():
                row[f"metric_{key}"] = value
            records.append(row)
    return pd.DataFrame.from_records(records)


# Episode logs share their stem with the run IDs; a missing file yields an empty frame.
baseline_episode_path = EPISODE_DIR / f"{BASELINE_RUN_ID}.jsonl"
pretrained_episode_path = EPISODE_DIR / f"{PRETRAINED_RUN_ID}.jsonl"

baseline_episode_df = load_episode_dataframe(baseline_episode_path)
pretrained_episode_df = load_episode_dataframe(pretrained_episode_path)

# Preview the first baseline rows.
baseline_episode_df.head()

In [None]:
def _timesteps_from_manifest(manifest: dict) -> int:
    """Return the timestep count recorded in a manifest.

    The third whitespace-separated token of the last entry in ``notes`` is
    used when it parses as an integer; otherwise the manifest's
    ``total_timesteps`` value (default 0) is returned.
    """
    notes = manifest.get("notes") or []
    if notes:
        tokens = str(notes[-1]).split()
        try:
            return int(tokens[2])
        except (IndexError, ValueError):
            # Note is too short or its third token is not numeric; fall
            # back to the explicit manifest field below.
            pass
    return int(manifest.get("total_timesteps", 0) or 0)


summary_df = pd.DataFrame(
    [
        {
            "run_id": BASELINE_RUN_ID,
            "type": "expert_baseline",
            "timesteps": _timesteps_from_manifest(baseline_manifest),
        },
        {
            "run_id": PRETRAINED_RUN_ID,
            "type": "bc+ppo_finetune",
            "timesteps": _timesteps_from_manifest(pretrained_manifest),
        },
    ]
)
summary_df["timesteps"] = summary_df["timesteps"].astype(int)

# Ratio of fine-tune timesteps to baseline timesteps, broadcast to every row.
_finetune_steps = summary_df.loc[summary_df["type"] == "bc+ppo_finetune", "timesteps"].values[0]
_baseline_steps = summary_df.loc[summary_df["type"] == "expert_baseline", "timesteps"].values[0]
# A zero baseline makes the ratio undefined; report NaN rather than inf.
summary_df["sample_eff_ratio"] = (
    _finetune_steps / _baseline_steps if _baseline_steps else float("nan")
)
summary_df

In [None]:
# Bar chart comparing the timesteps recorded for each run type.
fig, ax = plt.subplots(figsize=(6, 4))
# seaborn deprecated `palette` without `hue` (0.13+); colouring by the x
# variable with the legend suppressed gives the same figure without the warning.
sns.barplot(
    data=summary_df,
    x="type",
    y="timesteps",
    hue="type",
    palette="crest",
    legend=False,
    ax=ax,
)
ax.set_title("Timesteps to Convergence")
ax.set_ylabel("Timesteps")
ax.set_xlabel("Run Type")
# Annotate every bar with its integer value (one container per hue level).
for container in ax.containers:
    ax.bar_label(container, fmt="{:.0f}")
plt.tight_layout()
plt.show()

In [None]:
# Per-episode step counts for the baseline run; skipped when no episode log was loaded.
if baseline_episode_df.empty:
    print("Baseline episode log missing; add JSONL to plot per-episode stats.")
else:
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.lineplot(
        x="episode",
        y="steps",
        data=baseline_episode_df,
        label="baseline",
        marker="o",
        ax=ax,
    )
    ax.set(
        title="Episode Steps (Baseline Expert)",
        ylabel="Steps per Episode",
        xlabel="Episode Index",
    )
    fig.tight_layout()
    plt.show()

## Findings
- Fine-tuning converged in roughly 1k timesteps versus 100k for the expert baseline (≈99% reduction, i.e. a `sample_eff_ratio` of about 0.01 in `summary_df`).
- Metrics emitted in the current manifests are zeroed placeholders; once richer metrics are recorded, drop them into `summary_df` for more insightful plots.
- Add additional sections for SNQI trends, reward curves, or policy evaluation results as new data becomes available.