# 02 — PyTorch training loop (synthetic)

This notebook profiles a small training loop on synthetic data.

It demonstrates a useful pattern:

- profile one **epoch** at a time (stable window)
- log/print the summary per epoch

If PyTorch/CUDA is not available, the training cells will be skipped.

> **PyTorch install:** `pip install torch --index-url https://download.pytorch.org/whl/cu124`
> (works for any CUDA 12.x driver — there is no `cu126` index). Requires Python ≥ 3.9.


In [1]:
# Import PyTorch defensively so the rest of the notebook can still execute
# (and skip the GPU cells) on machines where it is missing or broken.
try:
    import torch
    import torch.nn as nn
    import torch.optim as optim
except Exception as e:
    # Any import-time failure is reported instead of raised; `torch = None`
    # is the sentinel the later cells test before touching the GPU.
    torch = None
    print("torch not available:", e)

# Tell the reader up front when the training demo below will be a no-op.
if torch is None or not torch.cuda.is_available():
    print("CUDA not available; skipping training demo.")


In [None]:
if torch is not None and torch.cuda.is_available():
    from profgpu import GpuMonitor

    class SmallMLP(nn.Module):
        """Two-layer perceptron: Linear -> ReLU -> Linear.

        Args:
            d_in: width of the input features.
            d_hidden: width of the hidden layer.
            d_out: number of output logits.
        """

        def __init__(self, d_in=1024, d_hidden=2048, d_out=10):
            super().__init__()
            # Layers are created in forward order (hidden first, then output).
            hidden_layer = nn.Linear(d_in, d_hidden)
            output_layer = nn.Linear(d_hidden, d_out)
            self.net = nn.Sequential(hidden_layer, nn.ReLU(), output_layer)

        def forward(self, x):
            """Run `x` through the sequential stack and return the result."""
            out = self.net(x)
            return out

    # Seed before building the model so both the weight initialisation and the
    # synthetic batches are reproducible across "Restart & Run All".
    torch.manual_seed(0)

    device = torch.device("cuda")

    # Problem dimensions shared by the model and the synthetic batch generator,
    # so the two cannot silently drift apart.
    d_in = 1024
    n_classes = 10

    model = SmallMLP(d_in=d_in, d_out=n_classes).to(device)
    opt = optim.AdamW(model.parameters(), lr=1e-3)
    loss_fn = nn.CrossEntropyLoss()

    summaries = []  # one GpuMonitor summary per epoch; read by the table cell below
    epochs = 3
    batches_per_epoch = 500  # enough batches so each epoch lasts > 1 s
    batch_size = 256

    for epoch in range(epochs):
        # A fresh monitor per epoch gives one profiling window per epoch.
        # interval_s=0.05 for fast sampling; warmup_s=0 so short epochs still collect data
        with GpuMonitor(interval_s=0.05, sync_fn=torch.cuda.synchronize, warmup_s=0.0) as mon:
            for _ in range(batches_per_epoch):
                # Synthetic batch generated directly on the GPU.
                x = torch.randn(batch_size, d_in, device=device)
                y = torch.randint(0, n_classes, (batch_size,), device=device)

                opt.zero_grad(set_to_none=True)
                logits = model(x)
                loss = loss_fn(logits, y)
                loss.backward()
                opt.step()

        # The summary is read only after the `with` block has exited.
        summaries.append(mon.summary)
        print(f"epoch {epoch}\n{mon.summary.format()}\n")

epoch 0
[GPU 0] NVIDIA A10G
  duration: 0.164s | samples: 0 @ 0.200s
  util.gpu: mean n/a | p50 n/a | p95 n/a | max n/a
  util.mem: mean n/a
  power: mean n/a | max n/a
  temp: max n/a
  busy time (est): n/a
  notes: warmup ignored: first 0.20s

epoch 1
[GPU 0] NVIDIA A10G
  duration: 0.172s | samples: 0 @ 0.200s
  util.gpu: mean n/a | p50 n/a | p95 n/a | max n/a
  util.mem: mean n/a
  power: mean n/a | max n/a
  temp: max n/a
  busy time (est): n/a
  notes: warmup ignored: first 0.20s

epoch 2
[GPU 0] NVIDIA A10G
  duration: 0.172s | samples: 0 @ 0.200s
  util.gpu: mean n/a | p50 n/a | p95 n/a | max n/a
  util.mem: mean n/a
  power: mean n/a | max n/a
  temp: max n/a
  busy time (est): n/a
  notes: warmup ignored: first 0.20s



In [4]:
# Optional: render the per-epoch summaries as a small table.
# Does nothing when the training cell was skipped (no `summaries` defined)
# or produced no entries.

if globals().get("summaries"):
    rows = [
        dict(
            epoch=epoch_idx,
            duration_s=summary.duration_s,
            util_mean=summary.util_gpu_mean,
            util_p95=summary.util_gpu_p95,
            mem_max_mb=summary.mem_used_max_mb,
            power_mean_w=summary.power_mean_w,
        )
        for epoch_idx, summary in enumerate(summaries)
    ]

    try:
        import pandas as pd

        df = pd.DataFrame(rows)
        display(df)
    except Exception:
        # Best effort: on any failure above (e.g. pandas is not installed),
        # fall back to printing one plain dict per epoch.
        for row in rows:
            print(row)

{'epoch': 0, 'duration_s': 0.1637926520779729, 'util_mean': nan, 'util_p95': nan, 'mem_max_mb': nan, 'power_mean_w': nan}
{'epoch': 1, 'duration_s': 0.17233654204756021, 'util_mean': nan, 'util_p95': nan, 'mem_max_mb': nan, 'power_mean_w': nan}
{'epoch': 2, 'duration_s': 0.17244787397794425, 'util_mean': nan, 'util_p95': nan, 'mem_max_mb': nan, 'power_mean_w': nan}
