In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

# Gradient lifecycle demo: shows what `.grad` holds before any backward pass,
# after each backward pass inside a training loop, after the last
# optimizer.step(), and after a final zero_grad(set_to_none=True).

# --- Configuration ---
SEED = 0             # fixes weight init and the random input below
LEARNING_RATE = 0.1  # SGD step size
NUM_EPOCHS = 5       # number of forward/backward/step iterations

# Seed BEFORE building the model and sampling the input: both draw from the
# global RNG, so without this the printed gradients differ on every
# Restart & Run All. Outputs recorded from an earlier unseeded run will not
# match; re-run the cell to refresh them.
torch.manual_seed(SEED)

# Simple model with two linear layers and a ReLU in between
model = nn.Sequential(
    nn.Linear(10, 5),
    nn.ReLU(),
    nn.Linear(5, 1),
)

# model[2] is the second Linear layer; its weight has a single row of
# 5 values, so grad[0][0:5] below is that entire row.
output_layer = model[2]

optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE)
some_input = torch.randn(1, 10)
ideal_output = torch.tensor([[1.0]])

# --- BEFORE training ---
print("Before any backward:")
print(output_layer.weight.grad)  # Expect None: no backward() has run yet

# --- PROPER TRAINING LOOP ---
for epoch in range(NUM_EPOCHS):
    # 1. Clear the previous gradients; backward() accumulates into .grad,
    #    so skipping this would sum gradients across iterations.
    optimizer.zero_grad(set_to_none=True)

    # 2. Forward pass
    prediction = model(some_input)

    # 3. Squared-error loss, summed to a scalar so backward() needs no argument
    loss = (ideal_output - prediction).pow(2).sum()

    # 4. Backward pass (populates .grad on every parameter)
    loss.backward()

    # 5. (Optional) Inspect the gradient of the second Linear layer's weight.
    #    .grad is not attached to the autograd graph, so no detach is needed.
    print(f"Epoch {epoch}: grad[0][0:5] =",
          output_layer.weight.grad[0][0:5])

    # 6. Update model parameters using the gradients just computed
    optimizer.step()

# --- AFTER training ---
print("\nAfter final optimizer.step():")
# step() consumes gradients but does not clear them, so the gradient from
# the last backward() is still stored here.
print(output_layer.weight.grad[0][0:5])

# Clear once more so a subsequent training round starts from a clean state
optimizer.zero_grad(set_to_none=True)
print("\nAfter final zero_grad:")
print(output_layer.weight.grad)  # Prints None because set_to_none=True


Before any backward:
None
Epoch 0: grad[0][0:5] = tensor([-0.2345,  0.0000, -0.1838, -0.1284,  0.0000])
Epoch 1: grad[0][0:5] = tensor([-0.2053,  0.0000, -0.2430,  0.0000,  0.0000])
Epoch 2: grad[0][0:5] = tensor([-0.0653,  0.0000, -0.0802,  0.0000,  0.0000])
Epoch 3: grad[0][0:5] = tensor([-0.0094,  0.0000, -0.0117,  0.0000,  0.0000])
Epoch 4: grad[0][0:5] = tensor([-0.0010,  0.0000, -0.0012,  0.0000,  0.0000])

After final optimizer.step():
tensor([-0.0010,  0.0000, -0.0012,  0.0000,  0.0000])

After final zero_grad:
None
