# Tutorial 11: Learning Rate

Exploring the most important hyperparameter in deep learning.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
np.random.seed(42)
torch.manual_seed(42)

## Part 1: Visualizing Learning Rate Effect

In [None]:
# Simple 2D optimization problem
def loss_fn(x, y):
    """Evaluate the Rosenbrock-like loss (1 - x)^2 + 10 * (y - x^2)^2.

    Both squared terms vanish at (1, 1), which is therefore the global
    minimum with value 0. Only arithmetic operators are used, so scalars
    and NumPy arrays are both accepted (arrays are handled elementwise).
    """
    offset = 1 - x
    valley = y - x**2
    return offset**2 + 10 * valley**2

def grad_fn(x, y):
    """Return the gradient of (1 - x)^2 + 10 * (y - x^2)^2 as array [d/dx, d/dy]."""
    # Both partial derivatives share the residual y - x^2.
    residual = y - x**2
    d_dx = -2 * (1 - x) - 40 * x * residual
    d_dy = 20 * residual
    return np.array([d_dx, d_dy])

def gradient_descent(lr, start=(-1, 1), n_steps=100):
    """Run fixed-step gradient descent and record every visited point.

    Args:
        lr: step size multiplied with the gradient on every update.
        start: initial (x, y) position.
        n_steps: maximum number of updates to perform.

    Returns:
        Array with one row per visited point, beginning with ``start``.
        It has fewer than ``n_steps + 1`` rows when the run stops early.
    """
    current = np.array(start, dtype=float)
    path = [current.copy()]

    for _ in range(n_steps):
        step = lr * grad_fn(current[0], current[1])
        current = current - step
        path.append(current.copy())

        # A coordinate beyond +/-10 is treated as divergence: stop early.
        if (np.abs(current) > 10).any():
            break

    return np.array(path)

# Learning rates to compare, one per subplot
lrs = [0.001, 0.01, 0.03, 0.05]

# Evaluate the loss on a regular grid for the contour background
x_range = np.linspace(-2, 2, 100)
y_range = np.linspace(-1, 3, 100)
X, Y = np.meshgrid(x_range, y_range)
Z = loss_fn(X, Y)

fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# Log-spaced contour levels, shared by all four subplots
contour_levels = np.logspace(-1, 3, 20)

for ax, lr in zip(axes.flatten(), lrs):
    ax.contour(X, Y, Z, levels=contour_levels, cmap='viridis', alpha=0.7)

    # Descent path for this learning rate, split into coordinate columns
    traj = gradient_descent(lr)
    xs, ys = traj[:, 0], traj[:, 1]
    ax.plot(xs, ys, 'r.-', linewidth=1, markersize=3)
    ax.plot(xs[0], ys[0], 'go', markersize=10, label='Start')
    ax.plot(1, 1, 'r*', markersize=15, label='Minimum')

    # Loss at the last recorded point goes into the subplot title
    final_loss = loss_fn(xs[-1], ys[-1])
    ax.set_title(f'LR = {lr}\nFinal loss: {final_loss:.4f}')
    ax.set_xlabel('x')
    ax.set_ylabel('y')
    ax.legend()

plt.tight_layout()
plt.show()

for takeaway in (
    "Too small LR: Slow convergence",
    "Too large LR: Oscillation or divergence",
    "Just right: Fast and stable convergence",
):
    print(takeaway)

## Part 2: Implementing Optimizers from Scratch

In [None]:
class SGD:
    """Plain gradient descent: params <- params - lr * grads."""

    def __init__(self, lr=0.01):
        self.lr = lr

    def step(self, params, grads):
        """Return the parameters after one descent step."""
        update = self.lr * grads
        return params - update

class Momentum:
    """Gradient descent with a velocity buffer.

    Each step updates the velocity as v <- beta * v + grads and then moves
    the parameters along it: params <- params - lr * v.
    """

    def __init__(self, lr=0.01, beta=0.9):
        self.lr, self.beta = lr, beta
        self.v = None  # velocity; created with the shape of params on first use

    def step(self, params, grads):
        """Update the velocity and return the new parameters."""
        previous = np.zeros_like(params) if self.v is None else self.v
        self.v = self.beta * previous + grads
        return params - self.lr * self.v

class RMSprop:
    """Gradient descent scaled by a running average of squared gradients.

    v <- beta * v + (1 - beta) * grads^2
    params <- params - lr * grads / (sqrt(v) + eps)
    """

    def __init__(self, lr=0.01, beta=0.9, eps=1e-8):
        self.lr, self.beta, self.eps = lr, beta, eps
        self.v = None  # running mean of grads**2; created on the first step

    def step(self, params, grads):
        """Update the squared-gradient average and return the new parameters."""
        if self.v is None:
            self.v = np.zeros_like(params)

        decayed = self.beta * self.v
        fresh = (1 - self.beta) * grads**2
        self.v = decayed + fresh

        # eps keeps the division finite while v is (close to) zero
        denom = np.sqrt(self.v) + self.eps
        return params - self.lr * grads / denom

class Adam:
    """Adam: bias-corrected running averages of the gradient and its square.

    m <- beta1 * m + (1 - beta1) * grads
    v <- beta2 * v + (1 - beta2) * grads^2
    params <- params - lr * m_hat / (sqrt(v_hat) + eps)
    where m_hat = m / (1 - beta1^t) and v_hat = v / (1 - beta2^t).
    """

    def __init__(self, lr=0.01, beta1=0.9, beta2=0.999, eps=1e-8):
        self.lr = lr
        self.beta1, self.beta2 = beta1, beta2
        self.eps = eps
        self.m = None  # first-moment average, created on the first step
        self.v = None  # second-moment average, created on the first step
        self.t = 0     # number of steps taken so far

    def step(self, params, grads):
        """Advance the moment estimates by one step and return new parameters."""
        if self.m is None:
            self.m = np.zeros_like(params)
            self.v = np.zeros_like(params)

        self.t += 1
        b1, b2 = self.beta1, self.beta2
        self.m = b1 * self.m + (1 - b1) * grads
        self.v = b2 * self.v + (1 - b2) * grads**2

        # Both averages start at zero; dividing by (1 - beta^t) undoes that bias.
        m_hat = self.m / (1 - b1**self.t)
        v_hat = self.v / (1 - b2**self.t)

        scale = np.sqrt(v_hat) + self.eps
        return params - self.lr * m_hat / scale

# Compare optimizers
def optimize(optimizer, start=(-1, 1), n_steps=100):
    """Drive ``optimizer`` for ``n_steps`` updates and record the path.

    Args:
        optimizer: object whose ``step(params, grads)`` returns new params.
        start: initial (x, y) position.
        n_steps: number of updates to perform.

    Returns:
        Array with ``n_steps + 1`` rows: ``start`` followed by every
        position returned by the optimizer.
    """
    point = np.array(start, dtype=float)
    visited = [point.copy()]

    for _ in range(n_steps):
        gradient = grad_fn(point[0], point[1])
        point = optimizer.step(point, gradient)
        visited.append(point.copy())

    return np.array(visited)

# Optimizers under comparison, keyed by their legend label
optimizers = {
    'SGD (lr=0.001)': SGD(lr=0.001),
    'Momentum': Momentum(lr=0.001),
    'RMSprop': RMSprop(lr=0.01),
    'Adam': Adam(lr=0.1)
}

# One line colour per optimizer, in the dict's insertion order
colors = ['blue', 'green', 'red', 'purple']

fig, ax = plt.subplots(figsize=(10, 8))
ax.contour(X, Y, Z, levels=np.logspace(-1, 3, 20), cmap='viridis', alpha=0.5)

for color, (name, opt) in zip(colors, optimizers.items()):
    traj = optimize(opt)
    xs, ys = traj[:, 0], traj[:, 1]
    ax.plot(xs, ys, '.-', linewidth=1, markersize=3,
            color=color, label=name, alpha=0.8)

# Mark the minimum of the loss at (1, 1)
ax.plot(1, 1, 'r*', markersize=20)
ax.set_xlabel('x')
ax.set_ylabel('y')
ax.set_title('Optimizer Comparison on Rosenbrock-like Function')
ax.legend()
plt.show()

## Part 3: Learning Rate Schedules

In [None]:
epochs = 100
base_lr = 0.1

# Schedule functions: each maps an epoch index to a learning rate,
# starting from the module-level base_lr.
def constant(epoch):
    """Return base_lr regardless of the epoch."""
    return base_lr

def step_decay(epoch, drop=0.5, epochs_drop=30):
    """Multiply base_lr by ``drop`` once for every ``epochs_drop`` full epochs."""
    completed_drops = epoch // epochs_drop
    return base_lr * (drop ** completed_drops)

def exponential_decay(epoch, k=0.05):
    """Return base_lr * exp(-k * epoch)."""
    decay = np.exp(-k * epoch)
    return base_lr * decay

def cosine_annealing(epoch, T_max=100, eta_min=0.001):
    """Follow half a cosine period from base_lr (epoch 0) to eta_min (epoch T_max)."""
    span = base_lr - eta_min
    phase = np.pi * epoch / T_max
    return eta_min + span * (1 + np.cos(phase)) / 2

def warmup_cosine(epoch, warmup=10, T_max=100):
    """Ramp linearly from 0 to base_lr over ``warmup`` epochs, then cosine-anneal.

    The cosine part runs over the remaining ``T_max - warmup`` epochs.
    """
    if epoch < warmup:
        return base_lr * epoch / warmup
    return cosine_annealing(epoch - warmup, T_max - warmup)

# Plot schedules
epochs_range = np.arange(epochs)

plt.figure(figsize=(12, 5))

# Evaluate every schedule at each epoch; keys double as legend labels
schedules = {
    'Constant': [constant(e) for e in epochs_range],
    'Step Decay': [step_decay(e) for e in epochs_range],
    'Exponential': [exponential_decay(e) for e in epochs_range],
    'Cosine': [cosine_annealing(e) for e in epochs_range],
    'Warmup + Cosine': [warmup_cosine(e) for e in epochs_range]
}

# The loop variable is deliberately not called `lrs`: at notebook top level
# that would overwrite the global list of learning rates defined earlier.
for name, schedule_lrs in schedules.items():
    plt.plot(epochs_range, schedule_lrs, linewidth=2, label=name)

plt.xlabel('Epoch')
plt.ylabel('Learning Rate')
plt.title('Learning Rate Schedules')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

## Part 4: Learning Rate Range Test

In [None]:
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

# MNIST training split, converted to tensors and served in shuffled batches of 128
transform = transforms.Compose([transforms.ToTensor()])
train_dataset = datasets.MNIST(
    root='./data',
    train=True,
    download=True,
    transform=transform,
)
train_loader = DataLoader(dataset=train_dataset, batch_size=128, shuffle=True)

# Simple model
class SimpleNet(nn.Module):
    """Small MLP: Flatten -> Linear(784, 128) -> ReLU -> Linear(128, 10)."""

    def __init__(self):
        super().__init__()
        layers = [
            nn.Flatten(),
            nn.Linear(784, 128),
            nn.ReLU(),
            nn.Linear(128, 10),
        ]
        self.fc = nn.Sequential(*layers)

    def forward(self, x):
        """Flatten each sample and return its 10 raw output scores."""
        scores = self.fc(x)
        return scores

def lr_range_test(model, loader, lr_min=1e-7, lr_max=10, num_iters=100):
    """Sweep the learning rate geometrically and record the training loss.

    Trains ``model`` with SGD and cross-entropy loss for up to ``num_iters``
    mini-batches. The learning rate starts at ``lr_min`` and is multiplied
    by a constant factor after every step.

    Args:
        model: module to train; its parameters are updated in place.
        loader: iterable yielding ``(inputs, targets)`` batches. It is
            restarted from the beginning whenever it runs out.
        lr_min: learning rate used for the first step.
        lr_max: value the learning rate would reach after ``num_iters``
            multiplications; the last rate actually used is one factor
            below it.
        num_iters: maximum number of training steps.

    Returns:
        ``(lrs, losses)``: two equal-length lists holding the learning rate
        used at each recorded step and the finite loss measured there.
        The sweep ends early once a recorded loss exceeds 100, or as soon
        as the loss is NaN/inf (that step is neither applied nor recorded).
    """
    # Constant per-step growth factor of the learning rate
    lr_mult = (lr_max / lr_min) ** (1 / num_iters)

    optimizer = optim.SGD(model.parameters(), lr=lr_min)
    criterion = nn.CrossEntropyLoss()

    lrs = []
    losses = []
    lr = lr_min

    model.train()
    data_iter = iter(loader)

    for _ in range(num_iters):
        try:
            x, y = next(data_iter)
        except StopIteration:
            # Loader exhausted: start another pass over the data
            data_iter = iter(loader)
            x, y = next(data_iter)

        optimizer.zero_grad()
        loss = criterion(model(x), y)

        # NaN compares False against any threshold, so the "> 100" guard
        # below would never fire for it. Stop before the step so that no
        # non-finite gradient reaches the weights or the recorded curve.
        if not torch.isfinite(loss):
            break

        loss.backward()
        optimizer.step()

        loss_value = loss.item()
        lrs.append(lr)
        losses.append(loss_value)

        # Increase LR for the next step
        lr *= lr_mult
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr

        # Stop if loss explodes
        if loss_value > 100:
            break

    return lrs, losses

# Run test
model = SimpleNet()
lrs, losses = lr_range_test(model, train_loader)

# Learning rate at which the recorded loss was lowest
min_loss_lr = lrs[np.argmin(losses)]

# Plot
plt.figure(figsize=(10, 5))
plt.plot(lrs, losses)
plt.xscale('log')
plt.xlabel('Learning Rate (log scale)')
plt.ylabel('Loss')
plt.title('Learning Rate Range Test')
# Scientific notation: the sweep starts at tiny rates that a fixed
# 4-decimal format would render as 0.0000.
plt.axvline(min_loss_lr, color='r', linestyle='--', label=f'Min loss at LR={min_loss_lr:.2e}')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

# Find suggested LR (where loss decreases fastest)
grad = np.gradient(losses)
suggested_idx = np.argmin(grad[:len(grad)//2])  # Only search the first half of the sweep
print(f"\nSuggested learning rate: {lrs[suggested_idx]:.2e}")
print("(Choose LR where loss decreases steepest, before it diverges)")

## Summary

**Key insights:**
1. **Learning rate** controls step size in parameter space
2. **Too small** → slow convergence, **too large** → divergence
3. **Adaptive optimizers** (Adam, RMSprop) adjust LR per parameter
4. **Schedules** (cosine, warmup) improve final performance
5. **LR range test** helps find a good starting LR