# Tutorial 09: Regularization — Preventing Overfitting

This notebook demonstrates regularization techniques: L1, L2, Dropout, and their effects on training.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split
# One seed for both global RNGs (NumPy and PyTorch) so repeated runs start from the same state.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

## Part 1: Create a Dataset That's Easy to Overfit

In [None]:
# Small, noisy two-moons problem: 200 points, 30% of them held out for testing.
X, y = make_moons(n_samples=200, noise=0.25, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# PyTorch copies of each split: float features, and float labels reshaped to
# column vectors (N, 1) via unsqueeze(1).
X_train_t, X_test_t = (torch.FloatTensor(features) for features in (X_train, X_test))
y_train_t, y_test_t = (
    torch.FloatTensor(labels).unsqueeze(1) for labels in (y_train, y_test)
)

# Side-by-side scatter plots of the two splits, one colour per class.
plt.figure(figsize=(10, 4))
split_panels = [
    (f'Training Data ({len(X_train)} samples)', X_train, y_train),
    (f'Test Data ({len(X_test)} samples)', X_test, y_test),
]
class_styles = [(0, 'blue', 'Class 0'), (1, 'red', 'Class 1')]
for panel_no, (panel_title, pts, lbls) in enumerate(split_panels, start=1):
    plt.subplot(1, 2, panel_no)
    for cls, colour, cls_label in class_styles:
        member = lbls == cls
        plt.scatter(pts[member, 0], pts[member, 1], c=colour, label=cls_label)
    plt.title(panel_title)
    plt.legend()
plt.tight_layout()
plt.show()

## Part 2: Define Models with Different Regularization

In [None]:
class MLP(nn.Module):
    """Deliberately oversized fully connected binary classifier.

    Three ReLU hidden layers of ``hidden_size`` units map 2-feature inputs to
    a single sigmoid output. One shared ``nn.Dropout(dropout_rate)`` module is
    applied after each hidden activation.
    """

    def __init__(self, hidden_size=128, dropout_rate=0.0):
        super().__init__()
        # Attribute names and creation order are part of the contract: they
        # fix the state_dict keys and the order in which the layers are
        # initialised from the global RNG.
        self.fc1 = nn.Linear(2, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, hidden_size)
        self.fc4 = nn.Linear(hidden_size, 1)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        """Map inputs with 2 features (last dim) to sigmoid outputs with last dim 1."""
        hidden = x
        for layer in (self.fc1, self.fc2, self.fc3):
            # Linear -> ReLU -> dropout, repeated for every hidden layer.
            hidden = self.dropout(torch.relu(layer(hidden)))
        return torch.sigmoid(self.fc4(hidden))

def train_model(model, X_train, y_train, X_test, y_test,
                epochs=500, lr=0.01, weight_decay=0.0, l1_lambda=0.0):
    """Full-batch training with Adam and binary cross-entropy, recording metrics per epoch.

    Args:
        model: module whose output is fed directly to ``nn.BCELoss`` (so it
            must produce probabilities in [0, 1]).
        X_train, y_train: training inputs and targets, passed whole to the
            model every epoch (no mini-batching).
        X_test, y_test: held-out inputs and targets, used for monitoring only.
        epochs: number of optimisation steps (one per epoch).
        lr: Adam learning rate.
        weight_decay: forwarded to ``optim.Adam``.
        l1_lambda: when > 0, ``l1_lambda * sum(|p|)`` over every model
            parameter is added to the training loss before backprop.

    Returns:
        ``(train_losses, test_losses, train_accs, test_accs)``: four lists
        with one float per epoch, measured in eval mode after that epoch's
        update. Recorded losses are the plain BCE, without the L1 term;
        accuracies threshold the outputs at 0.5.
    """
    bce = nn.BCELoss()
    adam = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    use_l1 = l1_lambda > 0

    def _score(inputs, targets):
        # BCE and thresholded accuracy of the current model on one split.
        probs = model(inputs)
        hits = (probs > 0.5) == targets
        return bce(probs, targets).item(), hits.float().mean().item()

    history = {'train_loss': [], 'test_loss': [], 'train_acc': [], 'test_acc': []}

    for _ in range(epochs):
        # --- optimisation step (train mode, so dropout is active) ---
        model.train()
        adam.zero_grad()
        objective = bce(model(X_train), y_train)
        if use_l1:
            l1_penalty = sum(weight.abs().sum() for weight in model.parameters())
            objective = objective + l1_lambda * l1_penalty
        objective.backward()
        adam.step()

        # --- monitoring (eval mode, no gradient tracking) ---
        model.eval()
        with torch.no_grad():
            train_loss, train_acc = _score(X_train, y_train)
            test_loss, test_acc = _score(X_test, y_test)
        history['train_loss'].append(train_loss)
        history['test_loss'].append(test_loss)
        history['train_acc'].append(train_acc)
        history['test_acc'].append(test_acc)

    return (history['train_loss'], history['test_loss'],
            history['train_acc'], history['test_acc'])

## Part 3: Compare No Regularization vs L2 (Weight Decay)

In [None]:
# Baseline: default architecture trained with every regulariser switched off
model_none = MLP()
history_none = train_model(model_none, X_train_t, y_train_t, X_test_t, y_test_t)
train_loss_none, test_loss_none, train_acc_none, test_acc_none = history_none

# Same architecture with an L2 penalty, applied through Adam's weight_decay argument
model_l2 = MLP()
history_l2 = train_model(
    model_l2, X_train_t, y_train_t, X_test_t, y_test_t, weight_decay=0.01
)
train_loss_l2, test_loss_l2, train_acc_l2, test_acc_l2 = history_l2

# Learning curves: loss on the left, accuracy on the right.
# Blue = no regularisation, red = L2; solid = train, dashed = test.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

curve_styles = [
    ('b-', 'Train (No reg)'),
    ('b--', 'Test (No reg)'),
    ('r-', 'Train (L2)'),
    ('r--', 'Test (L2)'),
]
curve_panels = [
    ('Loss', 'Loss: No Regularization vs L2',
     (train_loss_none, test_loss_none, train_loss_l2, test_loss_l2)),
    ('Accuracy', 'Accuracy: No Regularization vs L2',
     (train_acc_none, test_acc_none, train_acc_l2, test_acc_l2)),
]
for panel_ax, (y_label, panel_title, curves) in zip(axes, curve_panels):
    for curve, (fmt, curve_label) in zip(curves, curve_styles):
        panel_ax.plot(curve, fmt, label=curve_label, alpha=0.7)
    panel_ax.set_xlabel('Epoch')
    panel_ax.set_ylabel(y_label)
    panel_ax.set_title(panel_title)
    panel_ax.legend()
    panel_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print(f"Final Test Accuracy - No Reg: {test_acc_none[-1]:.2%}, L2: {test_acc_l2[-1]:.2%}")

## Part 4: Visualize Decision Boundaries

In [None]:
def plot_decision_boundary(model, X, y, title, ax=None, resolution=200):
    """Draw a model's predicted P(class=1) surface with the data points on top.

    Args:
        model: callable module taking a float tensor of shape (n, 2) and
            returning one probability per row. It is switched to eval mode
            and left in eval mode.
        X: 2-D array of points; column 0 is the horizontal coordinate and
            column 1 the vertical one (assumed to be a NumPy array, since it
            is indexed with boolean masks built from ``y``).
        y: labels aligned with the rows of ``X``; points with label 0 are
            drawn blue and points with label 1 red.
        title: axes title.
        ax: Matplotlib axes to draw on. Defaults to the current axes, so
            existing ``plt.sca(...)``-based callers keep working.
        resolution: number of grid points per axis used to sample the model.
    """
    if ax is None:
        ax = plt.gca()

    # Pad the data's bounding box so that no point sits on the plot border.
    pad = 0.5
    x_min, x_max = X[:, 0].min() - pad, X[:, 0].max() + pad
    y_min, y_max = X[:, 1].min() - pad, X[:, 1].max() + pad

    xx, yy = np.meshgrid(np.linspace(x_min, x_max, resolution),
                         np.linspace(y_min, y_max, resolution))

    model.eval()
    with torch.no_grad():
        # Evaluate the model on every grid node, then fold the flat
        # predictions back into the grid's 2-D shape.
        grid = torch.FloatTensor(np.c_[xx.ravel(), yy.ravel()])
        Z = model(grid).numpy().reshape(xx.shape)

    # 'RdBu_r' runs blue (low) -> red (high), so regions with high P(class=1)
    # are red like the class-1 markers. Plain 'RdBu' is the reverse and would
    # paint class-1 territory blue, contradicting the scatter colours.
    surface = ax.contourf(xx, yy, Z, levels=50, cmap='RdBu_r', alpha=0.7)
    ax.figure.colorbar(surface, ax=ax, label='P(class=1)')
    ax.scatter(X[y==0, 0], X[y==0, 1], c='blue', edgecolor='white', s=50)
    ax.scatter(X[y==1, 0], X[y==1, 1], c='red', edgecolor='white', s=50)
    ax.set_title(title)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# One panel per trained model, both drawn over the held-out test points.
boundary_panels = [
    (model_none, 'No Regularization (Overfitting)'),
    (model_l2, 'L2 Regularization (Smoother)'),
]
for panel_ax, (panel_model, panel_title) in zip(axes, boundary_panels):
    plt.sca(panel_ax)  # the helper draws on whichever axes is current
    plot_decision_boundary(panel_model, X_test, y_test, panel_title)

plt.tight_layout()
plt.show()

print("Notice: Without regularization, boundary is more complex (overfitting to noise)")

## Part 5: L1 Regularization (Sparsity)

In [None]:
# Same architecture, trained with an explicit L1 penalty added to the loss
model_l1 = MLP()
history_l1 = train_model(
    model_l1, X_train_t, y_train_t, X_test_t, y_test_t, l1_lambda=0.001
)
train_loss_l1, test_loss_l1, train_acc_l1, test_acc_l1 = history_l1

# Compare weight distributions
def get_weights(model):
    """Return every parameter of ``model`` (weights and biases) as one flat NumPy array.

    Values appear in ``model.parameters()`` order; the result is detached
    from the autograd graph.
    """
    pieces = []
    for param in model.parameters():
        pieces.append(param.flatten())
    flat = torch.cat(pieces)
    return flat.detach().numpy()

weights_none = get_weights(model_none)
weights_l1 = get_weights(model_l1)
weights_l2 = get_weights(model_l2)

# Magnitude below which a parameter is counted as "near zero" in the panel titles
NEAR_ZERO_TOL = 0.01

fig, axes = plt.subplots(1, 3, figsize=(14, 4))

# One histogram per model, over all of its parameters
weight_panels = [
    ('No Regularization', weights_none, 'blue'),
    ('L2 Regularization', weights_l2, 'green'),
    ('L1 Regularization', weights_l1, 'red'),
]
for panel_ax, (panel_name, panel_weights, panel_color) in zip(axes, weight_panels):
    near_zero = int(np.sum(np.abs(panel_weights) < NEAR_ZERO_TOL))
    panel_ax.hist(panel_weights, bins=50, alpha=0.7, color=panel_color)
    panel_ax.set_title(f'{panel_name}\n{near_zero} near-zero weights')
    panel_ax.set_xlabel('Weight value')
    panel_ax.set_ylabel('Count')

plt.tight_layout()
plt.show()

# The L1 term here is optimised with ordinary gradient steps (Adam), which push
# weights toward zero but do not land them exactly on it; this cell also only
# measures |w| < NEAR_ZERO_TOL. The message therefore says "toward zero"
# instead of claiming exact zeros.
print("L1 drives many weights toward zero (approximately SPARSE) - useful for feature selection!")

## Part 6: Dropout

In [None]:
# Dropout: same architecture, with nn.Dropout(p=0.5) applied after every hidden activation
model_dropout = MLP(dropout_rate=0.5)
train_loss_drop, test_loss_drop, train_acc_drop, test_acc_drop = train_model(
    model_dropout, X_train_t, y_train_t, X_test_t, y_test_t)

# Compare all methods on their final-epoch accuracies
fig, ax = plt.subplots(figsize=(10, 6))

methods = ['No Reg', 'L2', 'L1', 'Dropout']
train_accs = [train_acc_none[-1], train_acc_l2[-1], train_acc_l1[-1], train_acc_drop[-1]]
test_accs = [test_acc_none[-1], test_acc_l2[-1], test_acc_l1[-1], test_acc_drop[-1]]
# Generalisation gap per method: train accuracy minus test accuracy
gaps = [train - test for train, test in zip(train_accs, test_accs)]

x = np.arange(len(methods))
width = 0.35

# Grouped bars: train on the left and test on the right of each tick
ax.bar(x - width/2, train_accs, width, label='Train Accuracy', color='blue', alpha=0.7)
ax.bar(x + width/2, test_accs, width, label='Test Accuracy', color='green', alpha=0.7)

ax.set_ylabel('Accuracy')
ax.set_xlabel('Regularization Method')
ax.set_title('Train vs Test Accuracy by Regularization Method')
ax.set_xticks(x)
ax.set_xticklabels(methods)
ax.legend()
# Leave headroom above 1.0. Each gap label is anchored 0.02 above the taller
# bar in data coordinates, and Matplotlib does not draw an annotation whose
# anchor lies outside the axes limits. An upper limit of exactly 1.0 would hide
# the label of any method whose accuracy exceeds 0.98.
ax.set_ylim(0.5, 1.1)

# Annotate gaps
for i, gap in enumerate(gaps):
    ax.annotate(f'Gap: {gap:.1%}', (i, max(train_accs[i], test_accs[i]) + 0.02), ha='center')

plt.tight_layout()
plt.show()

print("Smaller train-test gap = less overfitting = better generalization!")

## Part 7: The Bayesian Interpretation

In [None]:
# Densities of the two priors that correspond to L2 and L1 penalties
x = np.linspace(-3, 3, 1000)

# L2 <-> zero-mean, unit-variance Gaussian density
gaussian = np.exp(-0.5 * x**2) / np.sqrt(2 * np.pi)

# L1 <-> zero-mean, unit-scale Laplace density
laplace = 0.5 * np.exp(-np.abs(x))

plt.figure(figsize=(10, 5))
prior_curves = [
    (gaussian, 'b-', 'Gaussian (L2 prior)'),
    (laplace, 'r-', 'Laplace (L1 prior)'),
]
for density, fmt, prior_label in prior_curves:
    plt.plot(x, density, fmt, linewidth=2, label=prior_label)
plt.axvline(0, color='gray', linestyle='--', alpha=0.5)  # marks θ = 0
plt.xlabel('Weight value θ')
plt.ylabel('Prior probability P(θ)')
plt.title('Bayesian Interpretation: L2 = Gaussian Prior, L1 = Laplace Prior')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

print("Laplace (L1) has a SPIKE at zero → encourages exact sparsity")
print("Gaussian (L2) is smooth at zero → shrinks but doesn't zero out")

## Summary

**Key takeaways:**
1. **No regularization** → model memorizes training data (overfits)
2. **L2 (weight decay)** → shrinks all weights, smoother decision boundary
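3. **L1** → pushes many weights toward zero (sparse solutions; with plain gradient-based training they end up near zero rather than exactly zero)
4. **Dropout** → trains an implicit ensemble of sub-networks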

All of these methods aim to reduce the train-test gap by limiting the model's effective capacity!