# 01 — Decorator quickstart

This notebook demonstrates:

- wrapping a function with `@gpu_profile`
- the importance of `sync_fn` for CUDA-async frameworks (PyTorch)
- capturing the summary programmatically

If PyTorch or CUDA is not available, the PyTorch demo cell prints a message and is skipped.

> **PyTorch install:** PyTorch is **not** bundled with `profgpu`.
> Install it separately with the correct CUDA index:
> ```
> pip install torch --index-url https://download.pytorch.org/whl/cu124
> ```
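> Use `cu124` for CUDA 12.x drivers. The set of published CUDA indexes changes between PyTorch releases, so check the PyTorch "Get Started" page if this one does not fit your setup.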
> Requires Python ≥ 3.9. See [installation docs](../docs/installation.md) for details.


In [1]:
import time

from profgpu import gpu_profile


@gpu_profile(interval_s=0.2, strict=False)
def do_work():
    """Placeholder workload: sleep for two seconds.

    Swap the body for real GPU work from your own codebase. With the
    settings used here the decorator prints a report once the call returns.
    """
    time.sleep(2)


do_work()

[GPU 0] NVIDIA A10G
  duration: 2.000s | samples: 10 @ 0.200s
  util.gpu: mean 0.0% | p50 0.0% | p95 0.0% | max 0.0%
  util.mem: mean 0.0%
  memory: max used 517 MB / total 23028 MB
  power: mean 15.6 W | max 15.6 W
  temp: max 22 °C
  busy time (est): 0.000s
  util trace: ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁


In [3]:
# Optional PyTorch demo: runs only when torch imports and CUDA is usable.

try:
    import torch
except Exception as e:
    # Any import failure simply disables the demo below.
    torch = None
    print("torch not available:", e)

if torch is not None and torch.cuda.is_available():
    from profgpu import gpu_profile

    # PyTorch queues CUDA work asynchronously, so the profiler is handed
    # torch.cuda.synchronize as its sync_fn (presumably invoked around the
    # measured region -- see the profgpu docs for the exact semantics).
    @gpu_profile(interval_s=0.1, sync_fn=torch.cuda.synchronize, warmup_s=0.2)
    def matmul_bench(n=4096, steps=30):
        """Multiply two random ``n`` x ``n`` CUDA matrices ``steps`` times.

        NOTE: on a fast GPU the default workload may finish within a few
        sampling intervals of ``warmup_s``; raise ``steps`` if the report
        shows very few samples.
        """
        lhs = torch.randn(n, n, device="cuda")
        rhs = torch.randn(n, n, device="cuda")
        for _ in range(steps):
            # The product is discarded; only the GPU load matters here.
            _ = lhs @ rhs

    matmul_bench()
else:
    print("CUDA not available; skipping PyTorch demo.")

  cpu = _conversion_method_template(device=torch.device("cpu"))


[GPU 0] NVIDIA A10G
  duration: 0.260s | samples: 1 @ 0.100s
  util.gpu: mean 2.0% | p50 2.0% | p95 2.0% | max 2.0%
  util.mem: mean 0.0%
  memory: max used 997 MB / total 23028 MB
  power: mean 47.9 W | max 47.9 W
  temp: max 28 °C
  busy time (est): 0.001s
  util trace: ████████████████████████████████████████
  notes: warmup ignored: first 0.20s


In [4]:
# Capture the profile as a structured object instead of a printed report.

import time

from profgpu import gpu_profile


@gpu_profile(report=False, return_profile=True, interval_s=0.2, strict=False)
def work_and_return_value():
    """Sleep for one second, then return a small payload."""
    time.sleep(1)
    return {"ok": True}


# With return_profile=True the call yields a result object: `.value` is the
# wrapped function's own return value and `.gpu` carries the summary stats.
res = work_and_return_value()
print("value:", res.value)
print("util mean:", res.gpu.util_gpu_mean)
print("p95:", res.gpu.util_gpu_p95)

value: {'ok': True}
util mean: 0.0
p95: 0.0
