# 03 — CUDA async pitfalls

A classic measurement footgun:

- Many frameworks queue GPU work asynchronously.
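- A Python function can return quickly even though the GPU keeps working, so a host-side timer stopped at that point measures only how long it took to enqueue the work, not how long the work took.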

This notebook shows:

- naive timing (incorrect)
- corrected timing with `torch.cuda.synchronize()`
- how `gpu-profile`'s `sync_fn` helps

If PyTorch/CUDA is not available, the demo cells will be skipped.


In [None]:
import time

# PyTorch is optional for this notebook. If importing it fails for any reason,
# report the cause and leave `torch` bound to None so later cells can test for
# it and skip themselves instead of raising NameError.
try:
    import torch
except Exception as exc:
    print('torch not available:', exc)
    torch = None

# Negation of the guard used by the demo cells below: announce up front that
# they will be skipped when there is no usable CUDA device.
if not (torch is not None and torch.cuda.is_available()):
    print('CUDA not available; skipping async demo.')


In [None]:
if torch is not None and torch.cuda.is_available():
    n = 8192
    steps = 10
    a = torch.randn(n, n, device='cuda')
    b = torch.randn(n, n, device='cuda')

    # Warm-up: run one untimed matmul and wait for it, so one-time lazy
    # initialisation on the first call does not land inside the naive timing
    # and get mistaken for enqueue cost. The synchronize also guarantees the
    # randn kernels above have finished before any timer starts.
    _ = a @ b
    torch.cuda.synchronize()

    # Naive timing: measures enqueue time, not completion time.
    # The loop returns as soon as the kernels are queued; the GPU is still busy
    # when the timer stops.
    t0 = time.perf_counter()
    for _ in range(steps):
        _ = a @ b
    naive = time.perf_counter() - t0

    # Drain the work the naive loop left in flight. Without this, the
    # synchronize() in the next section would also wait for these kernels and
    # the "correct" number would cover roughly 2 * steps matmuls.
    torch.cuda.synchronize()

    # Correct timing: synchronize before stopping the timer.
    t0 = time.perf_counter()
    for _ in range(steps):
        _ = a @ b
    torch.cuda.synchronize()
    correct = time.perf_counter() - t0

    print(f'naive enqueue time:   {naive:.6f}s')
    print(f'correct wall time:   {correct:.6f}s')


In [None]:
# Compare gpu-profile with and without sync_fn
#
# Relies on `a`, `b` and `steps` created by the timing cell above, so run that
# cell first.

if torch is not None and torch.cuda.is_available():
    # Imported here rather than at the top so the notebook still runs (and
    # skips this cell) on machines without CUDA.
    from gpu_profile import gpu_profile

    @gpu_profile(interval_s=0.1, report=True)  # no sync
    def without_sync():
        """Enqueue `steps` matmuls and return without waiting for the GPU."""
        for _ in range(steps):
            _ = a @ b

    # NOTE(review): sync_fn is presumably invoked by gpu_profile before it stops
    # measuring, so the report covers the queued GPU work -- confirm against
    # the gpu_profile documentation.
    @gpu_profile(interval_s=0.1, sync_fn=torch.cuda.synchronize, report=True)
    def with_sync():
        """Enqueue the same `steps` matmuls; the decorator is given a sync_fn."""
        for _ in range(steps):
            _ = a @ b

    # Start each profiled run from an idle device so the two reports are
    # comparable even if this cell is re-run.
    torch.cuda.synchronize()

    print()
    print('--- without sync_fn ---')
    without_sync()

    # without_sync() returns while its matmuls are still queued. Drain them
    # here; otherwise with_sync()'s synchronize would wait for this backlog as
    # well and its report would be inflated by work it did not launch.
    torch.cuda.synchronize()

    print()
    print('--- with sync_fn ---')
    with_sync()
