# Tutorial 04: Matrix Calculus

Essential identities for backpropagation.

In [None]:
import numpy as np
# Seed NumPy's global RNG so the random matrices drawn later in the
# notebook are the same on every run.
np.random.seed(42)

## Part 1: Scalar-by-Vector Derivatives

In [None]:
def numerical_gradient(f, x, eps=1e-5):
    """Estimate the gradient of a scalar function with central differences.

    Each entry of ``x`` is perturbed by ``+eps`` and ``-eps`` in turn and the
    slope ``(f(x + eps*e_i) - f(x - eps*e_i)) / (2*eps)`` is recorded.

    Args:
        f: Callable mapping an array shaped like ``x`` to a scalar.
        x: Point at which to differentiate. Array-like of any shape;
            non-floating input (e.g. integers) is promoted to float.
        eps: Half-width of the finite-difference step.

    Returns:
        Array with the same shape as ``x`` holding the estimated partial
        derivatives. The caller's ``x`` is never modified.
    """
    x = np.asarray(x)
    # Work in floating point: adding eps to an integer array is truncated
    # back to the original value, which would yield an all-zero gradient.
    if x.dtype.kind not in "fc":
        x = x.astype(float)

    grad = np.zeros_like(x)
    # ndindex visits every element, so matrices work as well as vectors.
    for idx in np.ndindex(x.shape):
        x_plus = x.copy()
        x_minus = x.copy()
        x_plus[idx] += eps
        x_minus[idx] -= eps
        grad[idx] = (f(x_plus) - f(x_minus)) / (2 * eps)
    return grad

# Identity 1: the gradient of the linear form a^T x with respect to x is a.
a = np.array([1.0, 2.0, 3.0])
x = np.array([0.5, 1.5, 2.5])


def f1(x):
    """Linear form a^T x."""
    return np.dot(a, x)


analytical = a
numerical = numerical_gradient(f1, x)

# Report the closed-form and finite-difference gradients side by side.
print(
    "Identity 1: ∂/∂x(a^T x) = a",
    f"  Analytical: {analytical}",
    f"  Numerical:  {numerical}",
    f"  Match: {np.allclose(analytical, numerical)}",
    sep="\n",
)

In [None]:
# Identity 2: the gradient of the squared norm x^T x is 2x.
def f2(x):
    """Squared Euclidean norm x^T x."""
    return np.dot(x, x)


analytical = 2 * x
numerical = numerical_gradient(f2, x)

# Report the closed-form and finite-difference gradients side by side.
print(
    "Identity 2: ∂/∂x(x^T x) = 2x",
    f"  x = {x}",
    f"  Analytical: {analytical}",
    f"  Numerical:  {numerical}",
    f"  Match: {np.allclose(analytical, numerical)}",
    sep="\n",
)

In [None]:
# Identity 3: the gradient of the quadratic form x^T A x is (A + A^T)x.
# A is a random (generally non-symmetric) 3x3 matrix.
A = np.random.randn(3, 3)


def f3(x):
    """Quadratic form x^T A x."""
    return x @ A @ x


analytical = (A + A.T) @ x
numerical = numerical_gradient(f3, x)

# Report the closed-form and finite-difference gradients side by side.
print(
    "Identity 3: ∂/∂x(x^T A x) = (A + A^T)x",
    f"  Analytical: {analytical}",
    f"  Numerical:  {numerical}",
    f"  Match: {np.allclose(analytical, numerical)}",
    sep="\n",
)

In [None]:
# Identity 4: the gradient of ||x - a||^2 is 2(x - a) -- the L2 loss gradient.
def f4(x):
    """Squared distance between x and the fixed target a."""
    return np.sum((x - a)**2)


analytical = 2 * (x - a)
numerical = numerical_gradient(f4, x)

# Report the closed-form and finite-difference gradients side by side.
print(
    "Identity 4: ∂/∂x ||x - a||² = 2(x - a)  [L2 loss!]",
    f"  Analytical: {analytical}",
    f"  Numerical:  {numerical}",
    f"  Match: {np.allclose(analytical, numerical)}",
    sep="\n",
)

## Part 2: Jacobian of Linear Transformation

In [None]:
def numerical_jacobian(f, x, eps=1e-5):
    """Estimate the Jacobian of ``f`` at ``x`` with central differences.

    Column ``j`` of the result is
    ``(f(x + eps*e_j) - f(x - eps*e_j)) / (2*eps)``.

    Args:
        f: Callable mapping a 1-D array of length ``n`` to a 1-D array of
            length ``m`` (a scalar result is treated as ``m = 1``).
        x: 1-D array-like point at which to differentiate; non-floating
            input (e.g. integers) is promoted to float.
        eps: Half-width of the finite-difference step.

    Returns:
        Array of shape ``(m, n)`` with ``J[i, j]`` approximating
        ``d f_i / d x_j``. The caller's ``x`` is never modified.
    """
    x = np.asarray(x)
    # Work in floating point: adding eps to an integer array is truncated
    # back to the original value, which would yield an all-zero Jacobian.
    if x.dtype.kind not in "fc":
        x = x.astype(float)

    # atleast_1d lets scalar-valued functions produce a 1 x n Jacobian.
    y = np.atleast_1d(f(x))
    m, n = len(y), len(x)
    J = np.zeros((m, n))

    for j in range(n):
        x_plus = x.copy()
        x_minus = x.copy()
        x_plus[j] += eps
        x_minus[j] -= eps
        column = (np.asarray(f(x_plus)) - np.asarray(f(x_minus))) / (2 * eps)
        J[:, j] = column

    return J

# The Jacobian of the linear map y = Ax is the matrix A itself.
A = np.array([[1, 2, 3], [4, 5, 6]])
x = np.array([1.0, 2.0, 3.0])


def f_linear(x):
    """Linear map y = Ax."""
    return A @ x


analytical_jacobian = A
numerical_J = numerical_jacobian(f_linear, x)

# Show A next to its finite-difference estimate.
print(
    "Jacobian of y = Ax is J = A",
    f"\nA = \n{A}",
    f"\nNumerical Jacobian = \n{numerical_J}",
    f"\nMatch: {np.allclose(analytical_jacobian, numerical_J)}",
    sep="\n",
)

## Part 3: Backprop Through Linear Layer

In [None]:
class LinearLayer:
    """Affine layer ``y = Wx + b`` with a hand-written backward pass.

    ``forward`` caches its input so that ``backward`` can form the weight
    gradient; call ``forward`` before ``backward``.
    """

    def __init__(self, in_features, out_features):
        """Create a layer mapping ``in_features`` inputs to ``out_features`` outputs."""
        # Weights have shape (out_features, in_features): standard-normal
        # draws scaled by 0.1. The bias starts at zero.
        self.W = np.random.randn(out_features, in_features) * 0.1
        self.b = np.zeros(out_features)
        self.x = None  # Input of the most recent forward(), cached for backward

    def forward(self, x):
        """Return ``W @ x + b`` and remember ``x`` for the backward pass."""
        self.x = x
        return self.W @ x + self.b

    def backward(self, grad_output):
        """Propagate an upstream gradient through the layer.

        Args:
            grad_output: dL/dy, the gradient of the loss with respect to the
                output of the most recent ``forward`` call.

        Returns:
            Tuple ``(dL/dx, dL/dW, dL/db)``.

        Raises:
            RuntimeError: If ``forward`` has not been called yet.
        """
        if self.x is None:
            raise RuntimeError("LinearLayer.backward() called before forward()")

        grad_output = np.asarray(grad_output)

        # dL/dx = W^T @ dL/dy
        grad_x = self.W.T @ grad_output

        # dL/dW = dL/dy @ x^T (outer product)
        grad_W = np.outer(grad_output, self.x)

        # dL/db = dL/dy. Return a copy so that an in-place update of the
        # bias gradient cannot corrupt the caller's upstream gradient.
        grad_b = grad_output.copy()

        return grad_x, grad_W, grad_b

# Demo: one forward and one backward pass through a 3 -> 2 linear layer.
layer = LinearLayer(3, 2)
x = np.array([1.0, 2.0, 3.0])
y = layer.forward(x)

# Pretend the loss produced this gradient with respect to y.
grad_y = np.array([0.5, -0.3])

grad_x, grad_W, grad_b = layer.backward(grad_y)

# Print the inputs followed by the three gradients returned by backward().
print(
    "Linear Layer Backprop:",
    f"  x = {x}",
    f"  y = Wx + b = {y}",
    f"  dL/dy = {grad_y}",
    f"\n  dL/dx = W^T @ dL/dy = {grad_x}",
    f"  dL/dW = dL/dy ⊗ x = \n{grad_W}",
    f"  dL/db = dL/dy = {grad_b}",
    sep="\n",
)

In [None]:
# Gradient check for the linear layer: eps is the central-difference step.
eps = 1e-5


def compute_loss(W, b, x):
    """Simple scalar loss L = sum(y) for the affine map y = Wx + b."""
    y = W @ x + b
    return np.sum(y)

# Numerical gradient for W: central difference on one weight at a time.
numerical_grad_W = np.zeros_like(layer.W)
for i, j in np.ndindex(layer.W.shape):
    original_weight = layer.W[i, j]

    layer.W[i, j] = original_weight + eps
    loss_plus = compute_loss(layer.W, layer.b, x)

    layer.W[i, j] = original_weight - eps
    loss_minus = compute_loss(layer.W, layer.b, x)

    # Restore the exact stored value. Undoing the perturbation with
    # "+= eps" is not exact in floating point and would leave round-off
    # in the layer's weights.
    layer.W[i, j] = original_weight

    numerical_grad_W[i, j] = (loss_plus - loss_minus) / (2 * eps)

# For L = sum(y), dL/dy = [1, 1]
grad_y_check = np.ones(2)
_, analytical_grad_W, _ = layer.backward(grad_y_check)

print("\nVerification (L = sum(y), so dL/dy = [1,1]):")
print(f"  Analytical dL/dW = \n{analytical_grad_W}")
print(f"  Numerical dL/dW = \n{numerical_grad_W}")
print(f"  Match: {np.allclose(analytical_grad_W, numerical_grad_W)}")

## Part 4: Chain Rule with Matrices

In [None]:
# Two-layer network: y = W2 @ relu(W1 @ x)

def relu(x):
    """Rectified linear unit: the elementwise maximum of 0 and ``x``."""
    floor = 0
    return np.maximum(floor, x)

def relu_derivative(x):
    """Return the elementwise derivative of ReLU at ``x``.

    The result is 1.0 where ``x > 0`` and 0.0 elsewhere (including at
    ``x == 0``).

    Args:
        x: Pre-activation values; a NumPy array, a nested list, or a scalar.

    Returns:
        Float array (or NumPy float scalar for scalar input) of 0.0/1.0
        values with the same shape as ``x``.
    """
    # asarray lets lists and plain Python numbers through, matching the
    # inputs that np.maximum-based relu() already accepts.
    return (np.asarray(x) > 0).astype(float)

class TwoLayerNet:
    """Bias-free two-layer network ``y = W2 @ relu(W1 @ x)``.

    ``forward`` stores the input and every intermediate value on the
    instance; ``backward`` reads them to apply the chain rule.
    """

    def __init__(self, d_in, d_hidden, d_out):
        """Draw both weight matrices from a standard normal scaled by 0.1."""
        first_shape = (d_hidden, d_in)
        second_shape = (d_out, d_hidden)
        # W1 is drawn before W2, which fixes the order of RNG consumption.
        self.W1 = np.random.randn(*first_shape) * 0.1
        self.W2 = np.random.randn(*second_shape) * 0.1

    def forward(self, x):
        """Compute the output for ``x`` and cache x, z1, a1 and y."""
        self.x = x
        pre_activation = self.W1 @ x
        self.z1 = pre_activation
        hidden = relu(pre_activation)
        self.a1 = hidden
        self.y = self.W2 @ hidden
        return self.y

    def backward(self, grad_y):
        """Return ``(dL/dW1, dL/dW2)`` given the upstream gradient dL/dy."""
        # Output layer: dL/dW2 is the outer product of dL/dy and a1.
        d_W2 = np.outer(grad_y, self.a1)

        # Back through W2: dL/da1 = W2^T @ dL/dy.
        d_hidden_out = self.W2.T @ grad_y

        # Back through ReLU: gate elementwise by relu'(z1).
        d_hidden_in = d_hidden_out * relu_derivative(self.z1)

        # Input layer: dL/dW1 is the outer product of dL/dz1 and x.
        d_W1 = np.outer(d_hidden_in, self.x)

        return d_W1, d_W2

# Demo: run a random 4-vector through a 4 -> 3 -> 2 network.
net = TwoLayerNet(4, 3, 2)
x = np.random.randn(4)
y = net.forward(x)

# Show the input, both hidden quantities and the output.
print(
    "Two-layer network: y = W2 @ relu(W1 @ x)",
    f"\nInput x: {x}",
    f"Hidden z1 = W1 @ x: {net.z1}",
    f"Hidden a1 = relu(z1): {net.a1}",
    f"Output y = W2 @ a1: {y}",
    sep="\n",
)

# Backward pass with dL/dy = [1, 1]
grad_y = np.ones(2)
grad_W1, grad_W2 = net.backward(grad_y)

print(
    f"\ndL/dW2 shape: {grad_W2.shape}",
    f"dL/dW1 shape: {grad_W1.shape}",
    sep="\n",
)

In [None]:
# Gradient check for the two-layer net: eps is the central-difference step.
eps = 1e-5


def compute_loss_net(W1, W2, x):
    """Scalar loss L = sum(y) for y = W2 @ relu(W1 @ x)."""
    hidden = relu(W1 @ x)
    output = W2 @ hidden
    return np.sum(output)

# Numerical gradient for W1: central difference on one weight at a time.
numerical_grad_W1 = np.zeros_like(net.W1)
for i, j in np.ndindex(net.W1.shape):
    original_weight = net.W1[i, j]

    net.W1[i, j] = original_weight + eps
    loss_plus = compute_loss_net(net.W1, net.W2, x)

    net.W1[i, j] = original_weight - eps
    loss_minus = compute_loss_net(net.W1, net.W2, x)

    # Restore the exact stored value. Undoing the perturbation with
    # "+= eps" is not exact in floating point and would leave round-off
    # in the network's weights.
    net.W1[i, j] = original_weight

    numerical_grad_W1[i, j] = (loss_plus - loss_minus) / (2 * eps)

print("Verification of dL/dW1:")
print(f"  Analytical:\n{grad_W1}")
print(f"  Numerical:\n{numerical_grad_W1}")
print(f"  Max difference: {np.max(np.abs(grad_W1 - numerical_grad_W1)):.2e}")

## Summary

**Key matrix calculus identities:**

| Expression | Derivative |
|------------|------------|
| $a^T x$ | $a$ |
| $x^T x$ | $2x$ |
| $x^T A x$ | $(A + A^T)x$ |
| $\lVert x - a \rVert^2$ | $2(x - a)$ |
| $Ax$ | $A$ (Jacobian) |

**Backprop through linear layer:**
- $\frac{\partial L}{\partial x} = W^T \frac{\partial L}{\partial y}$
- $\frac{\partial L}{\partial W} = \frac{\partial L}{\partial y} x^T$
- $\frac{\partial L}{\partial b} = \frac{\partial L}{\partial y}$