# GPU Performance Tuning (PyTorch)

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

In [2]:
import os

# Hardware setup
print("PyTorch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU device:", torch.cuda.get_device_name(0))

# Set default device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Using device:", device)

# Enable TF32 for better performance on Ampere GPUs
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

# Test matrix multiplication performance
A = torch.randn(4096, 4096, device=device, dtype=torch.float32)

%timeit (A @ A).cpu()

PyTorch version: 2.5.1+cu124
CUDA available: True
GPU device: NVIDIA A100-SXM4-40GB
Using device: cuda
54.4 ms ± 1.52 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [3]:
n_features = 4096  # default width of the input and of every hidden layer


class Net(nn.Module):
    """Three equal-width GELU (tanh approximation) layers plus a linear head.

    Args:
        dtype: Parameter dtype forwarded to every ``nn.Linear``.
        in_features: Width of the input and of each hidden layer. ``None``
            (the default) uses the module-level ``n_features`` as it is at
            construction time.
        n_classes: Number of output logits produced by the head (default 2).
    """

    def __init__(self, dtype=torch.float32, in_features=None, n_classes=2):
        super().__init__()
        # Resolve the width lazily so that changing the global n_features
        # before constructing a model is still honoured.
        width = n_features if in_features is None else in_features
        self.layer1 = nn.Linear(width, width, dtype=dtype)
        self.layer2 = nn.Linear(width, width, dtype=dtype)
        self.layer3 = nn.Linear(width, width, dtype=dtype)
        self.out = nn.Linear(width, n_classes, dtype=dtype)

    def forward(self, x):
        """Map ``x`` with last dimension ``in_features`` to ``n_classes`` logits."""
        for hidden in (self.layer1, self.layer2, self.layer3):
            x = F.gelu(hidden(x), approximate='tanh')
        return self.out(x)

In [4]:
# Initialize model: an fp32 Net moved to the target device and wrapped by
# torch.compile; AdamW then optimizes the wrapped module's parameters.
model = torch.compile(Net(dtype=torch.float32).to(device))
optimizer = torch.optim.AdamW(params=model.parameters(), lr=3e-4)

In [5]:
# Generate synthetic data
B = 32             # samples per batch
N = B * 10000      # total number of samples
n_batches = N // B

# Random features and binary labels, already split into batches and
# allocated directly on `device`.
X = torch.randn(n_batches, B, n_features, dtype=torch.float32, device=device)
Y = torch.randint(low=0, high=2, size=(n_batches, B, 1), dtype=torch.long, device=device)
print(X.shape, Y.shape)
print(X[0, 0, 0].item(), Y[0, :5].cpu().numpy())

torch.Size([10000, 32, 4096]) torch.Size([10000, 32, 1])
2.6152403354644775 [[1]
 [1]
 [0]
 [1]
 [0]]


In [14]:
@torch.compile
def train_step(model, optimizer, X, Y, i):
    """Apply one optimizer update using batch ``i`` and return its loss.

    ``X[i]`` is passed through ``model``; the logits are flattened to
    ``(-1, 2)`` and scored with cross-entropy against the flattened ``Y[i]``.
    Gradients are cleared before the backward pass.

    Returns:
        The batch loss as a Python float.
    """
    inputs = X[i]
    targets = Y[i].view(-1)

    optimizer.zero_grad()
    batch_loss = F.cross_entropy(model(inputs).view(-1, 2), targets)
    batch_loss.backward()
    optimizer.step()

    # .item() converts the scalar loss tensor to a host-side Python number.
    return batch_loss.item()

In [16]:
import time
from IPython.display import clear_output

num_epochs = 1

# CUDA work is asynchronous, so timing needs explicit synchronization, but
# torch.cuda.synchronize() is only valid when a CUDA device is present.
# Guarding it lets the loop also run on the CPU fallback.
use_cuda = torch.cuda.is_available()
if use_cuda:
    torch.cuda.synchronize()  # Ensure GPU is synchronized before timing

# Running mean step time in ms for the current epoch; -1 until the first step.
avg_iter_time = -1

# NOTE(review): the mean includes the earliest iterations; train_step is
# wrapped in torch.compile, so those presumably include compilation time.
# Consider excluding a few warm-up steps when quoting a steady-state figure.
model.train()
for e in range(num_epochs):
    epoch_time_ms = 0.0  # the mean restarts at the beginning of every epoch
    for i in range(N // B):
        # perf_counter is monotonic and high-resolution, unlike time.time().
        start = time.perf_counter()

        loss = train_step(model, optimizer, X, Y, i)
        if use_cuda:
            torch.cuda.synchronize()  # Ensure computation is complete before timing

        epoch_time_ms += (time.perf_counter() - start) * 1000
        avg_iter_time = epoch_time_ms / (i + 1)

        print(f"Epoch: {e}, Iter: {i}, Loss: {loss:.4f}, Iter time: {avg_iter_time:.4f} ms")
        clear_output(wait=True)

Epoch: 0, Iter: 9999, Loss: 0.6934, Iter time: 3.3634 ms
