# Tutorial 08: Backpropagation — Implementation from Scratch

This notebook implements backpropagation step by step, verifies against PyTorch, and visualizes the entropy connection.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
# Seed NumPy's global RNG so that the np.random.randn weight initialisation
# used by the network later in this notebook is reproducible across runs.
np.random.seed(42)

## Part 1: Building Blocks — Activation Functions and Derivatives

In [None]:
# Activation functions and their derivatives

def sigmoid(z):
    """Logistic function σ(z) = 1 / (1 + e^(-z)), evaluated element-wise.

    The input is clipped to [-500, 500] before exponentiation so that
    np.exp cannot overflow for extreme pre-activations.
    """
    z_bounded = np.clip(z, -500, 500)
    return 1 / (1 + np.exp(-z_bounded))

def sigmoid_derivative(z):
    """Derivative of the logistic function: σ'(z) = σ(z)(1 - σ(z)).

    Evaluates sigmoid(z) once and reuses it for both factors.
    """
    activation = sigmoid(z)
    complement = 1 - activation
    return activation * complement

def relu(z):
    """Rectified linear unit: ReLU(z) = max(0, z), applied element-wise."""
    rectified = np.maximum(0, z)
    return rectified

def relu_derivative(z):
    """ReLU'(z): 1.0 where z is strictly positive, 0.0 elsewhere (including z == 0).

    Expects an array-like object whose comparison result supports `.astype`.
    """
    is_active = z > 0
    return is_active.astype(float)

def tanh(z):
    """Hyperbolic tangent activation, applied element-wise via np.tanh."""
    squashed = np.tanh(z)
    return squashed

def tanh_derivative(z):
    """Derivative of tanh: tanh'(z) = 1 - tanh²(z), applied element-wise."""
    t = np.tanh(z)
    return 1 - t**2

def softmax(z, axis=0):
    """Numerically stable softmax along `axis`.

    Parameters
    ----------
    z : array of logits.
    axis : int, default 0
        Axis holding the class scores. The default (0) matches the
        (n_classes, n_samples) column-per-sample layout; pass axis=1 for a
        (n_samples, n_classes) row-per-sample layout.

    Returns
    -------
    Array of the same shape as `z` whose entries along `axis` are
    non-negative and sum to 1.
    """
    # Subtracting the per-distribution maximum leaves the result unchanged
    # mathematically but keeps np.exp from overflowing on large logits.
    z_shifted = z - np.max(z, axis=axis, keepdims=True)
    exp_z = np.exp(z_shifted)
    return exp_z / np.sum(exp_z, axis=axis, keepdims=True)

In [None]:
# Visualize activation functions (top row) and their derivatives (bottom row)
z = np.linspace(-5, 5, 100)

fig, axes = plt.subplots(2, 3, figsize=(14, 8))

# One column per activation: (title, function, derivative, derivative peak or None).
# A peak value draws a dashed reference line with a legend on the derivative panel.
activation_panels = [
    ('Sigmoid', sigmoid, sigmoid_derivative, 0.25),
    ('ReLU', relu, relu_derivative, None),
    ('Tanh', tanh, tanh_derivative, 1),
]

for col, (title, fn, fn_prime, peak) in enumerate(activation_panels):
    top, bottom = axes[0, col], axes[1, col]

    top.plot(z, fn(z), 'b-', linewidth=2)
    top.set_title(title)
    top.grid(True, alpha=0.3)

    bottom.plot(z, fn_prime(z), 'r-', linewidth=2)
    bottom.set_title(f'{title} Derivative')
    if peak is not None:
        bottom.axhline(peak, color='gray', linestyle='--', label=f'max={peak}')
        bottom.legend()
    bottom.grid(True, alpha=0.3)

plt.tight_layout()
plt.suptitle('Activation Functions and Their Derivatives', y=1.02, fontsize=14)
plt.show()

print("Key insight: Sigmoid derivative is maximized at z=0 (maximum uncertainty)")
print("This is the ENTROPY connection - uncertain outputs have high gradient flow!")

## Part 2: Manual Backprop for a Single Neuron

In [None]:
# Single neuron: z = w·x + b, a = σ(z), L = (a - y)²
# The names x, w, b, y and dL_dw defined here are read again by the
# gradient-verification cell that follows, so they must stay in scope.

# Forward pass
x = np.array([1.0, 2.0, 3.0])  # Input
w = np.array([0.5, -0.5, 0.2])  # Weights
b = 0.1  # Bias
y = 0.8  # Target

z = np.dot(w, x) + b  # Pre-activation: dot product of two length-3 vectors plus the bias
a = sigmoid(z)  # Neuron output
L = (a - y) ** 2  # Squared-error loss for this single example

print("=== Forward Pass ===")
print(f"z = w·x + b = {z:.4f}")
print(f"a = σ(z) = {a:.4f}")
print(f"L = (a - y)² = {L:.4f}")

# Backward pass (manually applying chain rule), walking from the loss back to the parameters
# dL/da = 2(a - y)
dL_da = 2 * (a - y)

# da/dz = σ'(z) = σ(z)(1 - σ(z))
da_dz = sigmoid_derivative(z)

# dL/dz = dL/da · da/dz
dL_dz = dL_da * da_dz  # This is δ (delta)

# dz/dw = x, dz/db = 1
dL_dw = dL_dz * x  # One gradient entry per weight (same length as x)
dL_db = dL_dz * 1

print("\n=== Backward Pass ===")
print(f"dL/da = 2(a-y) = {dL_da:.4f}")
print(f"da/dz = σ'(z) = {da_dz:.4f}")
print(f"δ = dL/dz = {dL_dz:.4f}")
print(f"dL/dw = {dL_dw}")
print(f"dL/db = {dL_db:.4f}")

In [None]:
# Verify with numerical gradient
def numerical_gradient(f, x, eps=1e-5):
    """Estimate the gradient of a scalar function with central differences.

    Parameters
    ----------
    f : callable
        Maps an array shaped like `x` to a scalar.
    x : array-like
        Point at which to evaluate the gradient. Any shape is accepted; every
        element is perturbed individually. Integer input is converted to
        float first, otherwise the `eps` perturbation would be truncated away
        on assignment and every gradient entry would come out as zero.
    eps : float, default 1e-5
        Half-width of the finite-difference step.

    Returns
    -------
    Float array with the same shape as `x`, where entry `idx` approximates
    ∂f/∂x[idx] as (f(x + eps·e_idx) - f(x - eps·e_idx)) / (2·eps).
    The caller's `x` is never modified.
    """
    x = np.asarray(x, dtype=float)
    grad = np.zeros_like(x)
    # np.ndindex visits every element of an array of any rank, so a matrix of
    # weights is handled entry-by-entry rather than row-by-row.
    for idx in np.ndindex(x.shape):
        x_plus = x.copy()
        x_plus[idx] += eps
        x_minus = x.copy()
        x_minus[idx] -= eps
        grad[idx] = (f(x_plus) - f(x_minus)) / (2 * eps)
    return grad

# Define loss as function of w
def loss_fn(w_test):
    """Squared-error loss of the single sigmoid neuron as a function of its weights.

    Reads the notebook-level input `x`, bias `b` and target `y`; only the
    weight vector is varied, which is what the numerical gradient check needs.
    """
    activation = sigmoid(np.dot(w_test, x) + b)
    residual = activation - y
    return residual ** 2

# Compare the hand-derived gradient with the finite-difference estimate
numerical_dw = numerical_gradient(loss_fn, w)
max_abs_error = np.abs(dL_dw - numerical_dw).max()

print("=== Gradient Verification ===")
print(f"Analytical dL/dw: {dL_dw}")
print(f"Numerical dL/dw:  {numerical_dw}")
print(f"Difference: {max_abs_error:.2e}")

## Part 3: Full Neural Network from Scratch

In [None]:
class NeuralNetwork:
    """
    Fully connected classifier trained with hand-written backpropagation.

    Architecture: Input → Hidden layer(s) with ReLU → Softmax output.
    Loss: cross-entropy, averaged over the samples of a batch.
    Data layout: one column per sample, i.e. inputs are
    (n_features, n_samples) and targets are one-hot (n_classes, n_samples).
    """

    def __init__(self, layer_sizes):
        """
        Build the weight matrices and bias vectors.

        layer_sizes: list of ints giving the width of every layer from input
        to output, e.g., [784, 128, 10]
        """
        self.num_layers = len(layer_sizes)
        self.layer_sizes = layer_sizes

        self.weights = []
        self.biases = []

        # He initialization: N(0, 1) samples scaled by sqrt(2 / fan_in);
        # biases start at zero.
        for fan_in, fan_out in zip(layer_sizes[:-1], layer_sizes[1:]):
            he_scale = np.sqrt(2 / fan_in)
            self.weights.append(np.random.randn(fan_out, fan_in) * he_scale)
            self.biases.append(np.zeros((fan_out, 1)))

    def forward(self, X):
        """
        Run the network on X and cache what backward() needs.

        X: input, shape (n_features, n_samples)
        Returns the softmax output. Side effects: self.activations holds the
        input followed by every layer's output, self.z_values holds every
        layer's pre-activation.
        """
        self.activations = [X]  # activations[0] is the input itself
        self.z_values = []

        hidden = X
        # Every layer except the last is affine followed by ReLU
        for W, bias in zip(self.weights[:-1], self.biases[:-1]):
            pre_activation = W @ hidden + bias
            self.z_values.append(pre_activation)
            hidden = relu(pre_activation)
            self.activations.append(hidden)

        # The last layer is affine followed by softmax
        logits = self.weights[-1] @ hidden + self.biases[-1]
        self.z_values.append(logits)
        probs = softmax(logits)
        self.activations.append(probs)

        return probs

    def cross_entropy_loss(self, y_pred, y_true):
        """
        Mean cross-entropy over the batch: L = -Σ y_true * log(y_pred) / m

        y_true: one-hot encoded, shape (n_classes, n_samples)
        A small constant (1e-10) is added inside the log to avoid log(0).
        """
        n_samples = y_true.shape[1]
        weighted_log_probs = y_true * np.log(y_pred + 1e-10)
        return -np.sum(weighted_log_probs) / n_samples

    def backward(self, y_true):
        """
        Backpropagate from the cached forward pass.

        Fills self.dW and self.db with one gradient per layer, ordered from
        the first layer to the last. forward() must have been called first.
        """
        inv_m = 1 / y_true.shape[1]  # 1 / number of samples

        self.dW = []
        self.db = []

        # Softmax combined with cross-entropy has the simple gradient
        # dL/dz = y_pred - y_true at the output; shape (n_classes, m)
        delta = self.activations[-1] - y_true

        # Walk the layers from last to first
        for layer in reversed(range(len(self.weights))):
            layer_input = self.activations[layer]
            grad_W = (inv_m * delta) @ layer_input.T
            grad_b = inv_m * np.sum(delta, axis=1, keepdims=True)

            # Prepending keeps the lists in first-to-last layer order
            self.dW.insert(0, grad_W)
            self.db.insert(0, grad_b)

            if layer == 0:
                # Nothing to propagate into the input
                break

            # Push the error through the weights, then through the ReLU of
            # the layer below
            upstream = self.weights[layer].T @ delta
            delta = upstream * relu_derivative(self.z_values[layer - 1])

    def update(self, learning_rate):
        """Take one gradient-descent step using the gradients from backward()."""
        for layer, _ in enumerate(self.weights):
            weight_step = learning_rate * self.dW[layer]
            bias_step = learning_rate * self.db[layer]
            self.weights[layer] -= weight_step
            self.biases[layer] -= bias_step

    def train_step(self, X, y, learning_rate):
        """Forward, loss, backward, update. Returns the loss measured before the update."""
        probs = self.forward(X)
        loss = self.cross_entropy_loss(probs, y)
        self.backward(y)
        self.update(learning_rate)
        return loss

    def predict(self, X):
        """Return the index of the most probable class for every column of X."""
        return np.argmax(self.forward(X), axis=0)

## Part 4: Train on XOR Problem

In [None]:
# XOR problem - not linearly separable!
# Columns are the four samples; rows are the two input features.
X_xor = np.array([[0, 0, 1, 1],
                  [0, 1, 0, 1]])  # Shape: (2, 4)

# One-hot targets: row 0 marks XOR = 0, row 1 marks XOR = 1
y_xor = np.array([[1, 0, 0, 1],
                  [0, 1, 1, 0]])

# Integer class label of every sample, used for accuracy and the final report
xor_labels = np.argmax(y_xor, axis=0)

# Create network: 2 inputs → 4 hidden → 2 outputs
nn = NeuralNetwork([2, 4, 2])

# Train with full-batch gradient descent, reporting every 1000 epochs
losses = []
for epoch in range(5000):
    loss = nn.train_step(X_xor, y_xor, learning_rate=0.5)
    losses.append(loss)
    if epoch % 1000 != 0:
        continue
    acc = np.mean(nn.predict(X_xor) == xor_labels)
    print(f"Epoch {epoch}: Loss = {loss:.4f}, Accuracy = {acc:.2%}")

# Final predictions, one sample at a time
print("\n=== Final Predictions ===")
for i in range(4):
    sample = X_xor[:, i:i+1]  # slicing keeps the (n_features, 1) column shape
    pred = nn.predict(sample)[0]
    true = xor_labels[i]
    print(f"Input: {X_xor[:, i]} → Predicted: {pred}, True: {true}")

In [None]:
# Visualize training: per-epoch loss on a log scale
fig, ax = plt.subplots(figsize=(10, 4))
ax.plot(losses)
ax.set_xlabel('Epoch')
ax.set_ylabel('Cross-Entropy Loss')
ax.set_title('Training Loss (XOR Problem)')
ax.set_yscale('log')
ax.grid(True, alpha=0.3)
plt.show()

## Part 5: Verify Against PyTorch

In [None]:
import torch
import torch.nn as nn

# Create identical network in PyTorch
class PyTorchNN(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(2, 4)
        self.fc2 = nn.Linear(4, 2)
    
    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# Copy weights from our implementation
torch_nn = PyTorchNN()
with torch.no_grad():
    torch_nn.fc1.weight.copy_(torch.tensor(nn.weights[0], dtype=torch.float32))
    torch_nn.fc1.bias.copy_(torch.tensor(nn.biases[0].flatten(), dtype=torch.float32))
    torch_nn.fc2.weight.copy_(torch.tensor(nn.weights[1], dtype=torch.float32))
    torch_nn.fc2.bias.copy_(torch.tensor(nn.biases[1].flatten(), dtype=torch.float32))

# Forward pass comparison
X_torch = torch.tensor(X_xor.T, dtype=torch.float32)  # PyTorch: (batch, features)
y_torch = torch.tensor(np.argmax(y_xor, axis=0), dtype=torch.long)

# Our implementation
our_output = nn.forward(X_xor)

# PyTorch
torch_output = torch.softmax(torch_nn(X_torch), dim=1)

print("=== Output Comparison ===")
print(f"Our implementation:\n{our_output.T}")
print(f"\nPyTorch:\n{torch_output.detach().numpy()}")
print(f"\nMax difference: {np.abs(our_output.T - torch_output.detach().numpy()).max():.2e}")

In [None]:
# Gradient comparison
criterion = nn.CrossEntropyLoss()
torch_logits = torch_nn(X_torch)
loss_torch = criterion(torch_logits, y_torch)
loss_torch.backward()

# Our gradients
nn.forward(X_xor)
nn.backward(y_xor)

print("=== Gradient Comparison (Layer 1 Weights) ===")
print(f"Our dW1:\n{nn.dW[0]}")
print(f"\nPyTorch dW1:\n{torch_nn.fc1.weight.grad.numpy()}")
print(f"\nMax difference: {np.abs(nn.dW[0] - torch_nn.fc1.weight.grad.numpy()).max():.2e}")

## Part 6: The Entropy Connection — Visualized

In [None]:
# Demonstrate: Cross-entropy loss = entropy + KL divergence

def entropy(p):
    """Shannon entropy: H(p) = -Σ p log p (natural log, so the unit is nats).

    Zero-probability entries contribute exactly 0 (the 0·log 0 = 0
    convention), so a one-hot distribution has entropy exactly 0 rather than
    a small positive value caused by clipping.

    p: array-like of probabilities. Entries that are not strictly positive
    are ignored.
    """
    p = np.asarray(p, dtype=float)
    support = p[p > 0]
    # `0.0 - total` instead of unary minus: a zero sum then yields +0.0, not
    # -0.0, which would otherwise be printed as "-0.0000".
    return 0.0 - np.sum(support * np.log(support))

def cross_entropy(p, q):
    """Cross-entropy: H(p, q) = -Σ p log q (natural log).

    q is clipped to [1e-10, 1] before taking the log so that a predicted
    probability of exactly 0 gives a large finite value instead of infinity.
    """
    log_q = np.log(np.clip(q, 1e-10, 1))
    return -np.sum(p * log_q)

def kl_divergence(p, q):
    """KL divergence: D_KL(p || q) = Σ p log(p/q) (natural log).

    The sum runs only over entries where p > 0 (the 0·log 0 = 0 convention),
    so zero-probability entries of p add nothing instead of a small clipping
    error. q is clipped to [1e-10, 1] so that q = 0 on the support of p
    gives a large finite value instead of infinity.

    p, q: array-likes of probabilities with the same shape.
    """
    p = np.asarray(p, dtype=float)
    q = np.clip(q, 1e-10, 1)
    support = p > 0
    return np.sum(p[support] * np.log(p[support] / q[support]))

# Example: a one-hot true distribution (class 0) scored against several predictions
p_true = np.array([1.0, 0.0, 0.0])

# Predicted distributions, from confident-and-right to confident-and-wrong
predictions = [
    np.array([0.9, 0.05, 0.05]),   # Good prediction
    np.array([0.6, 0.2, 0.2]),     # Medium
    np.array([0.33, 0.33, 0.34]),  # Bad (near uniform)
    np.array([0.1, 0.8, 0.1]),     # Wrong class
]

# H(p) depends only on the true distribution, so it is computed once
H_true = entropy(p_true)

print("=== Cross-Entropy Decomposition ===")
print(f"True distribution: {p_true}")
print(f"H(true) = {H_true:.4f} (always 0 for one-hot)\n")

for q in predictions:
    H_cross = cross_entropy(p_true, q)
    D_KL = kl_divergence(p_true, q)

    print(f"Predicted: {q}")
    print(f"  Cross-entropy H(p,q) = {H_cross:.4f}")
    print(f"  = H(p) + D_KL(p||q) = {H_true:.4f} + {D_KL:.4f} = {H_true + D_KL:.4f}")
    print()

In [None]:
# Visualize: Gradient magnitude vs entropy of prediction

# For softmax + cross-entropy, gradient = y_pred - y_true
# The magnitude depends on how "wrong" the prediction is

p_range = np.linspace(0.01, 0.99, 100)

# True label is class 0
y_true_binary = np.array([1, 0])

# Dedicated names (ce_losses / ce_loss) so this cell does not overwrite the
# training-loss history `losses` from Part 4 or the scalar `L` from Part 2.
gradients = []
entropies = []
ce_losses = []

for p in p_range:
    y_pred = np.array([p, 1-p])

    # Gradient: y_pred - y_true
    grad = y_pred - y_true_binary
    grad_magnitude = np.linalg.norm(grad)

    # Entropy of prediction
    H = entropy(y_pred)

    # Cross-entropy loss
    ce_loss = cross_entropy(y_true_binary, y_pred)

    gradients.append(grad_magnitude)
    entropies.append(H)
    ce_losses.append(ce_loss)

fig, axes = plt.subplots(1, 3, figsize=(14, 4))

axes[0].plot(p_range, ce_losses, 'b-', linewidth=2)
axes[0].set_xlabel('P(class 0)')
axes[0].set_ylabel('Cross-Entropy Loss')
axes[0].set_title('Loss vs Prediction Confidence')
axes[0].axvline(1, color='g', linestyle='--', label='True class')
axes[0].legend()  # without this call the 'True class' label is never displayed
axes[0].grid(True, alpha=0.3)

axes[1].plot(p_range, gradients, 'r-', linewidth=2)
axes[1].set_xlabel('P(class 0)')
axes[1].set_ylabel('Gradient Magnitude |∂L/∂z|')
axes[1].set_title('Gradient vs Prediction')
axes[1].grid(True, alpha=0.3)

axes[2].plot(p_range, entropies, 'purple', linewidth=2)
axes[2].set_xlabel('P(class 0)')
axes[2].set_ylabel('Entropy H(y_pred)')
axes[2].set_title('Prediction Entropy (Uncertainty)')
axes[2].axhline(np.log(2), color='gray', linestyle='--', label='Max entropy')
axes[2].legend()
axes[2].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("Key insight: Large gradients when prediction is WRONG (far from true class)")
print("Maximum uncertainty (entropy) is at p=0.5 — the network is most 'confused'")

In [None]:
# Final visualization: Gradient flow and entropy

# For sigmoid activation: σ'(z) = σ(z)(1 - σ(z))
# This is maximized at z=0 where σ(z) = 0.5 (maximum entropy!)

# Sample the pre-activation on [-6, 6] and evaluate the sigmoid and its
# derivative there; all three arrays are plotted against each other below.
z = np.linspace(-6, 6, 100)
sigmoid_out = sigmoid(z)
sigmoid_grad = sigmoid_derivative(z)

# Entropy of sigmoid output (treating it as Bernoulli probability)
def binary_entropy(p):
    """Entropy (in nats) of a Bernoulli variable with success probability p.

    H(p) = -p log p - (1 - p) log(1 - p), evaluated element-wise. p is
    clipped to [1e-10, 1 - 1e-10] so that neither logarithm sees 0.
    """
    p_safe = np.clip(p, 1e-10, 1-1e-10)
    q_safe = 1 - p_safe
    return -p_safe * np.log(p_safe) - q_safe * np.log(q_safe)

entropy_sigmoid = binary_entropy(sigmoid_out)

fig, ax = plt.subplots(figsize=(10, 6))

# (values, line format, legend label) for each curve; the entropy is divided
# by ln(2) so that its peak is 1 and it shares the y-axis with the other two.
curves = [
    (sigmoid_out, 'b-', 'σ(z) - Output'),
    (sigmoid_grad, 'r-', "σ'(z) - Gradient"),
    (entropy_sigmoid / np.log(2), 'g-', 'H(σ(z)) / ln(2) - Entropy'),
]
for values, fmt, label in curves:
    ax.plot(z, values, fmt, linewidth=2, label=label)

# Reference lines through z = 0 and σ(z) = 0.5
ax.axvline(0, color='gray', linestyle='--', alpha=0.5)
ax.axhline(0.5, color='gray', linestyle=':', alpha=0.5)

ax.set_xlabel('Pre-activation z', fontsize=12)
ax.set_ylabel('Value', fontsize=12)
ax.set_title('The Entropy-Gradient Connection in Sigmoid Activation', fontsize=14)
ax.legend(fontsize=11)
ax.grid(True, alpha=0.3)

# Annotate the shared maximum (the arrow points at the gradient's peak, σ'(0) = 0.25)
ax.annotate('Max gradient & entropy\nat z=0', xy=(0, 0.25), xytext=(2, 0.4),
            arrowprops=dict(arrowstyle='->', color='black'),
            fontsize=11)

plt.show()

banner = "="*60
print("\n" + banner)
print("THE FUNDAMENTAL CONNECTION:")
print(banner)
print("• High entropy (uncertainty) → High gradient flow")
print("• Low entropy (saturation) → Vanishing gradients")
print("• Backpropagation naturally learns through uncertain neurons!")
print(banner)

## Summary

**What we learned:**
1. Backpropagation is repeated application of the chain rule
2. Gradients flow backward, computing ∂L/∂θ for all parameters
3. Cross-entropy loss connects directly to information theory
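4. **The entropy connection**: Sigmoid neurons with high output uncertainty (entropy), i.e. σ(z) near 0.5, have the largest local gradient σ'(z); saturated neurons (σ(z) near 0 or 1) pass back almost no gradient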

**The deep insight**: Training neural networks is fundamentally about reducing the "surprise" of predictions — because H(p, q) = H(p) + D_KL(p‖q) and H(p) is fixed by the data, minimizing cross-entropy IS minimizing the KL divergence between the true and predicted distributions!