# ML Systems Demo: Benchmarking & Visualization

Benchmark the Transformer model (forward/backward timings, optional mixed precision) and visualize results.

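**Run from:** `project-3-systems/`, i.e. the kernel's working directory should be the repo root (the directory that contains `cs336_systems/`). The setup cell also works when the kernel starts one level below the repo root.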

In [None]:
import sys
from pathlib import Path

# Resolve the directory that holds the `cs336_systems` package: use the current
# working directory, falling back to its parent only when the package is
# missing here but present one level up. The result goes first on sys.path so
# the package import below resolves against it.
ROOT = Path.cwd()
ROOT = (
    ROOT.parent
    if not (ROOT / "cs336_systems").exists() and (ROOT.parent / "cs336_systems").exists()
    else ROOT
)
sys.path.insert(0, str(ROOT))

import torch
import matplotlib.pyplot as plt
import pandas as pd

from cs336_systems.benchmark import benchmark_model

# Pick the device once; the benchmark cells below check `device` and skip
# themselves when CUDA is unavailable.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
    print("CUDA not available. Benchmark cells will be skipped (benchmark uses torch.cuda.synchronize).")
print("Device:", device)

## 1. Benchmark: forward and forward+backward

In [None]:
# Benchmark sweep: time every (config, sequence length) pair in both "forward"
# and "forward_backward" mode and collect one row per pair in `df`.
configs = ["small", "medium"]
seq_lens = [128, 256, 512] if device == "cuda" else [128, 256]
batch_size = 4
n_warmup, n_steps = 3, 10

# Columns of a successful result row. The empty placeholder frame reuses this
# list so later cells can index the std/throughput columns without a KeyError
# when nothing was benchmarked.
result_columns = [
    "config",
    "seq_len",
    "forward_ms",
    "forward_std",
    "fwd_bwd_ms",
    "fwd_bwd_std",
    "tokens_per_sec",
]

rows = []
if device != "cuda":
    print("Skipping benchmark (requires CUDA).")
else:
    for config_name in configs:
        for seq_len in seq_lens:
            try:
                # The 7th positional argument is False for both runs
                # (presumably the mixed-precision switch; confirm against benchmark_model).
                r_fwd = benchmark_model(config_name, batch_size, seq_len, n_warmup, n_steps, "forward", False, device)
                r_bwd = benchmark_model(config_name, batch_size, seq_len, n_warmup, n_steps, "forward_backward", False, device)
                rows.append({
                    "config": config_name,
                    "seq_len": seq_len,
                    "forward_ms": r_fwd["mean_ms"],
                    "forward_std": r_fwd["std_ms"],
                    "fwd_bwd_ms": r_bwd["mean_ms"],
                    "fwd_bwd_std": r_bwd["std_ms"],
                    # Throughput is optional in the result dict; default to 0 when absent.
                    "tokens_per_sec": r_bwd.get("tokens_per_sec", 0),
                })
            except Exception as e:
                # Best-effort sweep: one failing combination must not abort the
                # rest, so the failure is recorded in an "error" column instead.
                rows.append({"config": config_name, "seq_len": seq_len, "error": str(e)})

if rows:
    df = pd.DataFrame(rows)
    display(df)
else:
    # Nothing ran (e.g. no CUDA): keep an empty frame with the full result layout.
    df = pd.DataFrame(columns=result_columns)

## 2. Visualization: timing by config and sequence length

In [None]:
# Plot mean time vs. sequence length, one line per config, with the std as
# error bars: forward pass on the left axis, forward+backward on the right.
required_cols = ("config", "seq_len", "forward_ms", "forward_std", "fwd_bwd_ms", "fwd_bwd_std")

if "error" in df.columns:
    # At least one benchmark combination failed; its row has no timing values.
    print("Skipping plot (errors in benchmark).")
elif df.empty or any(col not in df.columns for col in required_cols):
    # No benchmark ran (e.g. CPU-only session). Without this guard the plotting
    # code would raise KeyError on the missing std columns.
    print("Skipping plot (no benchmark results).")
else:
    fig, axes = plt.subplots(1, 2, figsize=(10, 4))
    for config_name in configs:
        sub = df[df["config"] == config_name]
        axes[0].errorbar(sub["seq_len"], sub["forward_ms"], yerr=sub["forward_std"], label=config_name, marker="o")
        axes[1].errorbar(sub["seq_len"], sub["fwd_bwd_ms"], yerr=sub["fwd_bwd_std"], label=config_name, marker="s")
    axes[0].set_xlabel("Sequence length")
    axes[0].set_ylabel("Time (ms)")
    axes[0].set_title("Forward pass")
    axes[0].legend()
    axes[1].set_xlabel("Sequence length")
    axes[1].set_ylabel("Time (ms)")
    axes[1].set_title("Forward + Backward")
    axes[1].legend()
    plt.tight_layout()
    plt.show()

## 4. Warmup vs no-warmup (CS336 A2)

Compare the mean forward+backward time measured with 0 warmup steps against the same measurement with a few warmup steps, to show the effect of CUDA warmup.

In [None]:
# Time the same "small" forward+backward run twice: the two calls differ only
# in the warmup-step argument (0 vs 3). Both runs finish before anything is printed.
if device != "cuda":
    print("Skipping (CPU). Run on CUDA for warmup comparison.")
else:
    w0, w3 = (
        benchmark_model("small", 4, 256, warmup_steps, 10, "forward_backward", False, device)
        for warmup_steps in (0, 3)
    )
    print("No warmup (0): mean_ms =", w0["mean_ms"], "std_ms =", w0["std_ms"])
    print("Warmup (3):   mean_ms =", w3["mean_ms"], "std_ms =", w3["std_ms"])

## CS336 Assignment 2 alignment

This notebook runs: **benchmark script** (forward / forward+backward), **warmup vs no-warmup**, **mixed precision (FP32 vs BF16)**, and **plots**. It does **not** include Nsight reports, memory profiling scripts, or written answers.

- **Full checklist:** [docs/CS336_ASSIGNMENT2_CHECKLIST.md](../docs/CS336_ASSIGNMENT2_CHECKLIST.md) — maps every A2 problem to this codebase and to writeup deliverables.
- **Written / external:** Nsight Compute/Systems, memory profiling (record + snapshot + memory_viz), attention-only benchmark, torch.compile comparison, DDP scaling, optimizer sharding table — see checklist and complete in writeup/scripts as needed.

## 3. Mixed precision (BF16) comparison (CUDA only)

In [None]:
# Run the same "small" forward+backward benchmark with the 7th argument set to
# False, then True (labelled FP32 and BF16), tabulate the mean time and the
# optional peak-memory value, and draw the mean times as a bar chart.
if device != "cuda":
    print("Skipping (CPU). Run on CUDA for mixed-precision comparison.")
else:
    comp = []
    for use_bf16 in (False, True):
        r = benchmark_model("small", 4, 256, 3, 10, "forward_backward", use_bf16, device)
        comp.append(
            {
                "mixed_precision": "BF16" if use_bf16 else "FP32",
                "mean_ms": r["mean_ms"],
                # Peak memory may be missing from the result dict; fall back to 0.
                "mem_mb": r.get("peak_mem_mb", 0),
            }
        )
    comp_df = pd.DataFrame(comp)
    display(comp_df)

    plt.bar(
        comp_df["mixed_precision"],
        comp_df["mean_ms"],
        color=["steelblue", "coral"],
    )
    plt.ylabel("Time (ms)")
    plt.title("Forward+Backward: FP32 vs BF16")
    plt.show()