In [1]:
import torch
import torch.nn.functional as F

# Section banner: the title framed by two 60-character rules.
print("=" * 60, "BACKPROPAGATION (Reverse-Mode AD)", "=" * 60, sep="\n")

# Simple neural network function
def network(x, W1, W2):
    """Evaluate the bias-free two-layer ReLU network ``W2 @ relu(W1 @ x)``.

    Args:
        x: Input that the first weight matrix is applied to via ``W1 @ x``.
        W1: First-layer weights.
        W2: Second-layer weights, applied to the ReLU activations.

    Returns:
        The result of ``W2 @ relu(W1 @ x)``.
    """
    return W2 @ F.relu(W1 @ x)

# Leaf tensors tracked by autograd: a fixed 3-vector input and randomly
# drawn weights for the 3 -> 4 -> 2 network.
x = torch.tensor([1.0, 2.0, 3.0], requires_grad=True)
W1 = torch.randn((4, 3), requires_grad=True)
W2 = torch.randn((2, 4), requires_grad=True)

# Forward evaluation; report the input and output shapes.
y = network(x, W1, W2)
print(f"Input x shape: {x.shape}", f"Output y shape: {y.shape}", sep="\n")

# Reverse-mode pass. Summing y yields a scalar, so a single backward() call
# populates .grad on x, W1 and W2.
loss = y.sum()
loss.backward()

# The labels below read "dy/d*", but the stored values are the gradients of
# the scalar `loss` (i.e. of y.sum()).
print(
    "\nGradients via backprop:",
    f"  dy/dx shape: {x.grad.shape}",
    f"  dy/dW1 shape: {W1.grad.shape}",
    f"  dy/dW2 shape: {W2.grad.shape}",
    sep="\n",
)

# Section banner, preceded by a blank line.
print(
    "\n" + "=" * 60,
    "JACOBIAN-VECTOR PRODUCTS (Forward-Mode AD)",
    "=" * 60,
    sep="\n",
)

# Drop the gradients accumulated by the earlier backward() call.
x.grad = W1.grad = W2.grad = None

# Tangent (direction) for the input: the first basis vector, so the JVP is
# the derivative of the output with respect to x[0].
v_x = torch.tensor([1.0, 0.0, 0.0])
# Zero tangents shaped like the weights. Nothing in the visible script reads
# them afterwards.
v_W1 = torch.zeros_like(W1)
v_W2 = torch.zeros_like(W2)

print(f"\nTangent vector v_x: {v_x}")

# JVP helper from torch.autograd.functional.
# NOTE(review): the PyTorch docs describe this function as computing the JVP
# with the double-backward trick rather than a true forward-mode pass
# (torch.func.jvp / torch.autograd.forward_ad provide that) — confirm
# against the installed version before calling this "forward-mode".
from torch.autograd.functional import jvp
# JVP computes: J(f) @ v where J is the Jacobian
def f(x_input):
    """Run ``network`` on ``x_input`` with the module-level ``W1`` and ``W2``.

    Fixing the weights leaves a function of the input alone, so a JVP taken
    through ``f`` differentiates with respect to the input only.
    """
    return network(x=x_input, W1=W1, W2=W2)

# jvp returns the function value at x together with the Jacobian-vector
# product along v_x; the input and the tangent are each passed as
# one-element tuples.
output, jvp_result = jvp(f, (x,), (v_x,))

print(
    "\nJVP result (directional derivative):",
    f"  Output shape: {output.shape}",
    f"  JVP shape: {jvp_result.shape}",
    f"  JVP value: {jvp_result}",
    sep="\n",
)

# Section banner, preceded by a blank line.
print("\n" + "=" * 60)
print("KEY DIFFERENCES")
print("=" * 60)

# Static summary contrasting the two modes; the text is printed verbatim.
# NOTE(review): `model.backward()` in the usage line does not look like a
# standard torch.nn.Module call — gradients are normally started from a
# tensor, as in `loss.backward()`; confirm and correct the text.
# NOTE(review): "Hessian computation, per-example gradients" should probably
# read Hessian-vector products / directional derivatives — verify.
print("""
BACKPROPAGATION (VJP - Vector-Jacobian Product):
  • Reverse-mode automatic differentiation
  • Efficient when outputs << inputs (typical in neural networks)
  • Computes: v^T @ J (gradient vector × Jacobian)
  • One backward pass gives gradients w.r.t. ALL parameters
  • Usage: model.backward(), loss.backward()
  
JVP (Jacobian-Vector Product):
  • Forward-mode automatic differentiation
  • Efficient when inputs << outputs
  • Computes: J @ v (Jacobian × tangent vector)
  • Computes directional derivatives
  • Usage: torch.autograd.functional.jvp()
  • Useful for: Hessian computation, per-example gradients
""")

# Section banner (no leading blank line: the preceding text block already
# ends with one).
print("=" * 60, "COMPUTATIONAL COST COMPARISON", "=" * 60, sep="\n")

# Illustrative problem size; nothing is executed at this scale, the numbers
# are only interpolated into the text below.
n_inputs = 1000
n_outputs = 10

print(f"\nScenario: {n_inputs} inputs → {n_outputs} outputs")

# NOTE(review): n_outputs is only used in the scenario line above. The
# "1 backward pass" figure presumably refers to a single scalar objective
# such as a loss; confirm the ~n_inputs× ratio is meant for that case.
print(
    "\nBackprop (VJP):",
    "  • Gradients w.r.t. all inputs: 1 backward pass",
    "  • Cost: O(cost of forward pass)",
    sep="\n",
)

print(
    "\nForward-mode (JVP):",
    "  • Gradient w.r.t. 1 input direction: 1 forward pass",
    f"  • Gradients w.r.t. all inputs: {n_inputs} forward passes",
    f"  • Cost: O({n_inputs} × cost of forward pass)",
    sep="\n",
)

print(f"\n→ Backprop is ~{n_inputs}× more efficient for this case!")

# Section banner, preceded by a blank line.
print(
    "\n" + "=" * 60,
    "PRACTICAL EXAMPLE: Per-Example Gradients",
    "=" * 60,
    sep="\n",
)

# Fresh random batch of 3-feature inputs and a separate pair of weight
# matrices for the batched example; all are autograd-tracked leaf tensors.
batch_size = 3
x_batch = torch.randn((batch_size, 3), requires_grad=True)
W1_new = torch.randn((4, 3), requires_grad=True)
W2_new = torch.randn((2, 4), requires_grad=True)

def batch_network(x_batch):
    """Apply ``network`` to each row of ``x_batch`` and stack the outputs.

    Every row is evaluated with the module-level ``W1_new`` and ``W2_new``;
    the per-row results are joined along a new leading dimension.
    """
    per_example = [
        network(x_batch[row], W1_new, W2_new)
        for row in range(x_batch.shape[0])
    ]
    return torch.stack(per_example)

# Using JVP for directional derivatives per example
print(f"\nBatch size: {batch_size}")
print("Computing per-example directional derivatives using JVP...")

# One jvp call per example; every call evaluates batch_network on the whole
# batch, with a tangent that selects a single row.
for i in range(batch_size):
    # Tangent that is zero everywhere except row i, which is set to ones.
    # The length 3 matches the second dimension x_batch was created with.
    v = torch.zeros_like(x_batch)
    v[i] = torch.ones(3)  # Direction for example i
    
    # The function value is discarded; only row i of the JVP is printed.
    _, jvp_i = jvp(batch_network, (x_batch,), (v,))
    print(f"  Example {i} JVP: {jvp_i[i]}")

print("\n✓ Demo complete!")

BACKPROPAGATION (Reverse-Mode AD)
Input x shape: torch.Size([3])
Output y shape: torch.Size([2])

Gradients via backprop:
  dy/dx shape: torch.Size([3])
  dy/dW1 shape: torch.Size([4, 3])
  dy/dW2 shape: torch.Size([2, 4])

JACOBIAN-VECTOR PRODUCTS (Forward-Mode AD)

Tangent vector v_x: tensor([1., 0., 0.])

JVP result (directional derivative):
  Output shape: torch.Size([2])
  JVP shape: torch.Size([2])
  JVP value: tensor([ 1.4405, -0.6085])

KEY DIFFERENCES

BACKPROPAGATION (VJP - Vector-Jacobian Product):
  • Reverse-mode automatic differentiation
  • Efficient when outputs << inputs (typical in neural networks)
  • Computes: v^T @ J (gradient vector × Jacobian)
  • One backward pass gives gradients w.r.t. ALL parameters
  • Usage: model.backward(), loss.backward()
  
JVP (Jacobian-Vector Product):
  • Forward-mode automatic differentiation
  • Efficient when inputs << outputs
  • Computes: J @ v (Jacobian × tangent vector)
  • Computes directional derivatives
  • Usage: torch.auto