In [3]:
import torch
import torch.nn as nn
from torch.amp import autocast, GradScaler

In [9]:
# Fully connected network: 10 input features -> 128 -> 64 -> 1 output,
# with a ReLU after each hidden layer. The last layer has no activation,
# so the model emits raw (unbounded) values.
model = nn.Sequential(
    nn.Linear(in_features=10, out_features=128),
    nn.ReLU(),
    nn.Linear(in_features=128, out_features=64),
    nn.ReLU(),
    nn.Linear(in_features=64, out_features=1),
)

**Mixed Precision Training**

In [None]:
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.SGD(params=model.parameters(), lr=0.0001)

# torch.amp.autocast needs an explicit device type (calling it with no
# arguments raises TypeError). Take it from the model so autocast targets
# the device the forward pass actually runs on.
amp_device_type = next(model.parameters()).device.type

# Gradient scaling is only enabled on CUDA; elsewhere the scaler is created
# disabled, in which case scale()/step()/update() fall through to a plain
# backward pass and optimizer step.
scaler = GradScaler(amp_device_type, enabled=(amp_device_type == "cuda"))

for epoch in range(100):
    # NOTE(review): assumes train_loader yields (inputs, targets) already on
    # the model's device, with float targets shaped like the model output
    # (as BCEWithLogitsLoss expects) - confirm against the loader.
    for batch_x, batch_y in train_loader:
        optimizer.zero_grad()

        # Forward pass with autocast: eligible ops run in reduced precision.
        with autocast(device_type=amp_device_type):
            outputs = model(batch_x)
            loss = criterion(outputs, batch_y)

        # Backward pass with scaler: scale the loss before backward, then let
        # the scaler drive the optimizer step and adjust its scale factor.
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

**Gradient Accumulation**

In [None]:
# Number of mini-batches whose gradients are summed before each optimizer step.
accumulation_steps = 4
optimizer.zero_grad()

num_batches = 0  # stays 0 if train_loader yields nothing
for num_batches, (batch_x, batch_y) in enumerate(train_loader, start=1):
    outputs = model(batch_x)
    # Divide by accumulation_steps so the summed gradients of a full group
    # equal the gradient of the group's mean loss.
    loss = criterion(outputs, batch_y) / accumulation_steps
    loss.backward()

    if num_batches % accumulation_steps == 0:
        optimizer.step()
        optimizer.zero_grad()

# Flush the trailing partial group: when the batch count is not a multiple of
# accumulation_steps, the last gradients would otherwise never be applied and
# would linger in .grad for whatever runs next. The partial group is still
# divided by accumulation_steps, so this final update is proportionally smaller.
if num_batches % accumulation_steps != 0:
    optimizer.step()
    optimizer.zero_grad()

**Learning Rate Scheduling**

In [None]:
# Cosine annealing schedule attached to the existing optimizer.
# NOTE(review): T_max=100 matches the 100-epoch loops in this notebook, which
# suggests one scheduler.step() per epoch - confirm where step() is called.
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
    optimizer=optimizer,
    T_max=100,
)

In [None]:
# One-cycle
# The schedule length is epochs * steps_per_epoch and the scheduler is stepped
# once per batch below, so both values must describe the real loop: take the
# step count from the loader itself and share one epoch count with the loop.
# A hard-coded step count that is too small makes scheduler.step() raise once
# the schedule is exhausted; one that is too large never finishes the cycle.
num_epochs = 100
scheduler = torch.optim.lr_scheduler.OneCycleLR(
    optimizer,
    max_lr=0.01,
    steps_per_epoch=len(train_loader),
    epochs=num_epochs,
)

for epoch in range(num_epochs):
    for batch in train_loader:
        # ... training code ...
        # (forward, backward and optimizer.step() go here; the scheduler is
        # meant to be stepped after the optimizer)
        scheduler.step() # Update learning rate