# Prerequisites & Mathematical Foundations - Interactive Notebook

Welcome! This notebook provides hands-on practice with the mathematical concepts underlying transformers. Work through each cell to build your intuition.

## Setup

In [None]:
import numpy as np
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
from IPython.display import display, HTML
import warnings
# Suppress every warning category for the rest of the session (no filter
# arguments are given, so nothing is exempt).
warnings.filterwarnings('ignore')

# Set random seeds for reproducibility
# Both NumPy's and PyTorch's global generators are seeded with the same value.
np.random.seed(42)
torch.manual_seed(42)

# Helper function for pretty printing
def print_matrix(matrix, name="Matrix"):
    """Print a labelled array: a header with its shape, the array, a blank line.

    Args:
        matrix: Object exposing a ``.shape`` attribute (e.g. a NumPy array).
        name: Label shown in front of the shape.
    """
    header = f"{name} (shape: {matrix.shape}):"
    # A single call emits the header, the matrix and the trailing empty line.
    print(header, matrix, "", sep="\n")

## 1. Linear Algebra Essentials

### 1.1 Vectors and Dot Products

In [None]:
# Create two vectors
# Both are 1-D integer arrays of length 3, so their dot product is a scalar.
v1 = np.array([1, 2, 3])
v2 = np.array([4, 5, 6])

print("v1:", v1)
print("v2:", v2)

# Dot product - three ways to compute it
# All three evaluate 1*4 + 2*5 + 3*6 = 32.
dot_method1 = np.dot(v1, v2)
dot_method2 = v1 @ v2  # Python 3.5+ matrix multiplication operator
# Element-wise products summed in pure Python, to show what the library calls do.
dot_method3 = sum(a * b for a, b in zip(v1, v2))

print(f"\nDot product:")
print(f"  np.dot: {dot_method1}")
print(f"  @ operator: {dot_method2}")
print(f"  Manual: {dot_method3}")

# Visualize what dot product means
# Spell out each multiplied pair so the sum can be checked by hand.
print(f"\nBreaking it down: {v1[0]}×{v2[0]} + {v1[1]}×{v2[1]} + {v1[2]}×{v2[2]} = {dot_method1}")

### Exercise 1: Compute these dot products by hand, then verify with code

In [None]:
# Try these exercises:
# NOTE: the two `# YOUR CODE HERE` placeholders below leave their assignments
# without a right-hand side, so this cell raises a SyntaxError until both are
# replaced with an expression.
a = np.array([2, 3])
b = np.array([4, -1])

# YOUR CODE: Compute a · b
# Expected: 2*4 + 3*(-1) = 8 - 3 = 5

dot_ab = # YOUR CODE HERE
print(f"a · b = {dot_ab}")

# What happens with orthogonal vectors?
# c and d are the two standard basis vectors of the plane.
c = np.array([1, 0])
d = np.array([0, 1])

dot_cd = # YOUR CODE HERE
print(f"\nc · d = {dot_cd} (orthogonal vectors have dot product = 0)")

### 1.2 Understanding Similarity Through Dot Products

In [None]:
# Dot products as a similarity score - the idea attention is built on.

# Toy 2-D "embeddings", small enough to plot directly.
word_vectors = {
    "king": np.array([0.9, 0.1]),
    "queen": np.array([0.85, 0.15]),
    "man": np.array([0.8, -0.1]),
    "woman": np.array([0.75, -0.05]),
    "apple": np.array([-0.2, 0.9]),
    "orange": np.array([-0.15, 0.85])
}

# Score every word (the query itself included) against the query word.
query = "king"
similarities = {
    word: np.dot(word_vectors[query], vector)
    for word, vector in word_vectors.items()
}

# Highest score first.
sorted_similarities = sorted(
    similarities.items(),
    key=lambda pair: pair[1],
    reverse=True,
)

print(f"Words most similar to '{query}':")
for word, sim in sorted_similarities:
    print(f"  {word}: {sim:.3f}")

# Visualize
# One scatter point per word, labelled with the word itself.
plt.figure(figsize=(8, 6))
for word, vec in word_vectors.items():
    plt.scatter(vec[0], vec[1])
    # Offset the label by 5 points so it does not sit on top of the marker.
    plt.annotate(word, (vec[0], vec[1]), xytext=(5, 5), textcoords='offset points')

# Faint axis lines through the origin make the sign of each coordinate visible.
plt.axhline(y=0, color='k', linestyle='-', alpha=0.3)
plt.axvline(x=0, color='k', linestyle='-', alpha=0.3)
plt.xlabel('Dimension 1')
plt.ylabel('Dimension 2')
plt.title('Word Vectors in 2D Space')
plt.grid(True, alpha=0.3)
plt.show()

### 1.3 Matrix Multiplication

In [None]:
# Matrix multiplication step by step
A = np.array([[1, 2],
              [3, 4]])
B = np.array([[5, 6],
              [7, 8]])

print_matrix(A, "A")
print_matrix(B, "B")

# Manual multiplication to understand the process
# np.zeros yields a float array, so the manual result prints as floats while
# A @ B below stays integer.
C_manual = np.zeros((2, 2))

# C[0,0] = A[0,0]*B[0,0] + A[0,1]*B[1,0]
# i.e. row 0 of A dotted with column 0 of B.
C_manual[0, 0] = A[0, 0] * B[0, 0] + A[0, 1] * B[1, 0]
print(f"C[0,0] = {A[0,0]}×{B[0,0]} + {A[0,1]}×{B[1,0]} = {C_manual[0,0]}")

# Fill in the rest
# Each entry C[i, j] is row i of A dotted with column j of B.
C_manual[0, 1] = A[0, 0] * B[0, 1] + A[0, 1] * B[1, 1]
C_manual[1, 0] = A[1, 0] * B[0, 0] + A[1, 1] * B[1, 0]
C_manual[1, 1] = A[1, 0] * B[0, 1] + A[1, 1] * B[1, 1]

print("\nManual result:")
print_matrix(C_manual, "C (manual)")

# NumPy result
C_numpy = A @ B
print_matrix(C_numpy, "C (NumPy)")

# Verify they're the same
# allclose compares values with a tolerance, so the float/int dtype difference is fine.
print(f"Results match: {np.allclose(C_manual, C_numpy)}")

### Exercise 2: Matrix Multiplication Practice

In [None]:
# Given these matrices, predict the output shape and compute the result
# NOTE: the `# YOUR CODE HERE` placeholder below leaves the assignment without
# a right-hand side, so this cell raises a SyntaxError until it is filled in.
X = np.array([[1, 2, 3],
              [4, 5, 6]])  # Shape: (2, 3)

W = np.array([[0.1, 0.2],
              [0.3, 0.4],
              [0.5, 0.6]])  # Shape: (3, 2)

# YOUR CODE: What will be the shape of X @ W?
# Remember: (2,3) @ (3,2) = (2,2)
# The inner dimensions (3 and 3) must match; the outer ones give the result shape.

result = # YOUR CODE HERE
print(f"Result shape: {result.shape}")
print_matrix(result, "X @ W")

# This is exactly what happens in neural networks!
# X could be your input data (2 samples, 3 features)
# W could be your weight matrix (3 inputs, 2 outputs)

## 2. Building a Neural Network from Scratch

### 2.1 The Forward Pass

In [None]:
class SimpleNeuralNetwork:
    """Two-layer dense network: ReLU hidden layer followed by a sigmoid output.

    ``forward`` stores the pre-activations (``z1``, ``z2``) and activations
    (``a1``, ``a2``) on the instance.
    """

    def __init__(self, input_size, hidden_size, output_size):
        """Create small random weights and zero biases for both layers.

        Args:
            input_size: Number of input features.
            hidden_size: Number of hidden units.
            output_size: Number of output units.
        """
        # W1 is drawn before W2 so a fixed NumPy seed reproduces the same weights.
        self.W1 = np.random.randn(input_size, hidden_size) * 0.01
        self.W2 = np.random.randn(hidden_size, output_size) * 0.01
        # Biases are row vectors so they broadcast over the sample axis.
        self.b1 = np.zeros((1, hidden_size))
        self.b2 = np.zeros((1, output_size))

    def relu(self, x):
        """Return the element-wise maximum of 0 and ``x``."""
        return np.maximum(0, x)

    def sigmoid(self, x):
        """Return the logistic function of ``x``, element-wise."""
        # Clip the input to [-500, 500] before exponentiating.
        bounded = np.clip(x, -500, 500)
        return 1 / (1 + np.exp(-bounded))

    def forward(self, X, verbose=False):
        """Run ``X`` through both layers and return the output activations.

        Args:
            X: Input array with one sample per row.
            verbose: When true, print the shapes of the input and of each
                layer's activations plus the first sample's values.

        Returns:
            The sigmoid activations of the output layer (also kept as ``a2``).
        """
        # Input -> hidden
        hidden_pre = X @ self.W1 + self.b1
        self.z1 = hidden_pre
        self.a1 = self.relu(hidden_pre)

        # Hidden -> output
        output_pre = self.a1 @ self.W2 + self.b2
        self.z2 = output_pre
        self.a2 = self.sigmoid(output_pre)

        if not verbose:
            return self.a2

        print(f"Input shape: {X.shape}")
        print(f"After layer 1: {self.a1.shape}")
        print(f"Output shape: {self.a2.shape}")
        print(f"\nLayer 1 activations (first sample):\n{self.a1[0]}")
        print(f"\nOutput (first sample): {self.a2[0]}")
        return self.a2

# Create a small network
# NOTE(review): this rebinds the module-level name `nn`, which the setup cell
# bound to `torch.nn`. From here on, code that expects `nn` to be the PyTorch
# module (e.g. `nn.Module`, `nn.Linear`) will see this network object instead.
nn = SimpleNeuralNetwork(input_size=3, hidden_size=4, output_size=2)

# Sample input (2 samples, 3 features each)
X = np.array([[0.5, 0.3, 0.2],
              [0.8, 0.1, 0.9]])

# Forward pass
# verbose=True prints the shapes and the first sample's activations.
output = nn.forward(X, verbose=True)

### 2.2 Visualizing the Network

In [None]:
# Let's visualize what happens in each layer
def visualize_activations(nn, X):
    """Draw the input, hidden activations and output as three side-by-side heatmaps.

    Args:
        nn: Network object providing ``forward(X)`` and, once that has been
            called, the ``a1`` and ``a2`` attributes. Inside this function the
            parameter name hides the ``torch.nn`` alias.
        X: Input array handed to ``nn.forward`` and drawn with ``imshow``;
            rows are labelled as samples, columns as features.
    """
    # Run the forward pass purely for its side effect of refreshing a1 / a2.
    nn.forward(X)

    panels = [
        (X, 'Input', 'Features'),
        (nn.a1, 'Hidden Layer (after ReLU)', 'Hidden Units'),
        (nn.a2, 'Output (after Sigmoid)', 'Output Units'),
    ]
    _, axes = plt.subplots(1, len(panels), figsize=(15, 5))

    for ax, (values, title, xlabel) in zip(axes, panels):
        image = ax.imshow(values, cmap='RdBu', aspect='auto')
        ax.set_title(title)
        ax.set_xlabel(xlabel)
        ax.set_ylabel('Samples')
        # Every panel gets its colorbar the same way; nothing is stored on the Axes.
        plt.colorbar(image, ax=ax)

    plt.tight_layout()
    plt.show()

# Generate more samples
# 10 random samples with 3 features each, matching the input_size=3 network
# created above; `nn` here is that SimpleNeuralNetwork instance.
X_batch = np.random.randn(10, 3)
visualize_activations(nn, X_batch)

## 3. Understanding Backpropagation

### 3.1 Computing Gradients Step by Step

In [None]:
# Let's implement backpropagation for our simple network
class NeuralNetworkWithBackprop(SimpleNeuralNetwork):
    """SimpleNeuralNetwork plus a gradient-descent step and a cross-entropy loss."""

    def backward(self, X, y_true, learning_rate=0.01):
        """Compute gradients from the cached forward pass and update weights in place.

        Reads ``self.a1``, ``self.a2`` and ``self.z1``, so ``forward`` must
        already have been called on the same ``X``.

        Args:
            X: Input batch, one sample per row.
            y_true: Target values, same shape as ``self.a2``.
            learning_rate: Step size applied to every parameter.

        Returns:
            Tuple ``(dW1, dW2)`` with the weight gradients of both layers.
        """
        inv_m = 1 / X.shape[0]  # averages gradients over the samples

        # Output error: with a sigmoid output and binary cross-entropy the
        # derivative w.r.t. z2 collapses to prediction minus target.
        delta_out = self.a2 - y_true
        grad_W2 = inv_m * self.a1.T @ delta_out
        grad_b2 = inv_m * np.sum(delta_out, axis=0, keepdims=True)

        # Hidden error: send delta_out back through W2, then gate it with the
        # ReLU derivative (1 where z1 > 0, else 0).
        relu_gate = self.z1 > 0
        delta_hidden = (delta_out @ self.W2.T) * relu_gate
        grad_W1 = inv_m * X.T @ delta_hidden
        grad_b1 = inv_m * np.sum(delta_hidden, axis=0, keepdims=True)

        # All gradients are computed before any parameter changes.
        self.W1 -= learning_rate * grad_W1
        self.b1 -= learning_rate * grad_b1
        self.W2 -= learning_rate * grad_W2
        self.b2 -= learning_rate * grad_b2

        return grad_W1, grad_W2

    def compute_loss(self, y_pred, y_true):
        """Return the mean binary cross-entropy between ``y_pred`` and ``y_true``."""
        eps = 1e-7
        # Keep predictions away from exactly 0 and 1 so both logs stay finite.
        probs = np.clip(y_pred, eps, 1 - eps)
        positive_term = y_true * np.log(probs)
        negative_term = (1 - y_true) * np.log(1 - probs)
        return -np.mean(positive_term + negative_term)

# Example: Training on XOR problem
# The four XOR input pairs and their targets, one row per sample.
X_xor = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
y_xor = np.array([[0], [1], [1], [0]])

# Create network
nn_backprop = NeuralNetworkWithBackprop(input_size=2, hidden_size=4, output_size=1)

# Training loop
# Full-batch gradient descent: every epoch uses all four samples.
losses = []  # loss measured before each update, one entry per epoch
for epoch in range(1000):
    # Forward pass
    y_pred = nn_backprop.forward(X_xor)

    # Compute loss
    loss = nn_backprop.compute_loss(y_pred, y_xor)
    losses.append(loss)

    # Backward pass
    # backward() uses the activations cached by the forward call above.
    nn_backprop.backward(X_xor, y_xor, learning_rate=0.5)

    if epoch % 200 == 0:
        print(f"Epoch {epoch}, Loss: {loss:.4f}")

# Plot training progress
plt.figure(figsize=(10, 6))
plt.plot(losses)
plt.title('Training Loss over Time')
plt.xlabel('Epoch')
plt.ylabel('Binary Cross-Entropy Loss')
# Log scale on the loss axis.
plt.yscale('log')
plt.grid(True, alpha=0.3)
plt.show()

# Check final predictions
# One more forward pass with the trained weights, then one line per XOR sample.
print("\nFinal predictions:")
final_pred = nn_backprop.forward(X_xor)
# zip alone is enough here: no sample index is needed in the output.
for x, pred, true in zip(X_xor, final_pred, y_xor):
    print(f"Input: {x}, Predicted: {pred[0]:.3f}, True: {true[0]}")

### 3.2 Gradient Flow Visualization

In [None]:
# Visualize how gradients flow through the network
def visualize_gradient_flow(nn, X, y):
    """Show W1, W2 and their gradients dW1, dW2 as a 2x2 grid of heatmaps.

    Args:
        nn: Network exposing ``forward(X)``, ``backward(X, y, learning_rate)``
            returning ``(dW1, dW2)``, and the ``W1`` / ``W2`` arrays.
        X: Input batch passed to ``forward`` and ``backward``.
        y: Targets passed to ``backward``.
    """
    # The forward call runs first; its return value is not needed here.
    nn.forward(X)

    # Snapshot the weights before backward() is invoked.
    weights_1 = nn.W1.copy()
    weights_2 = nn.W2.copy()

    # A zero learning rate is passed so the call is used only for its gradients.
    grad_1, grad_2 = nn.backward(X, y, learning_rate=0.0)

    _, axes = plt.subplots(2, 2, figsize=(12, 10))

    # (axes cell, data, title, x label, y label) - weights on top, gradients below.
    panels = (
        (axes[0, 0], weights_1, 'W1 (Weights)', 'Hidden Units', 'Input Units'),
        (axes[0, 1], weights_2, 'W2 (Weights)', 'Output Units', 'Hidden Units'),
        (axes[1, 0], grad_1, 'dW1 (Gradients)', 'Hidden Units', 'Input Units'),
        (axes[1, 1], grad_2, 'dW2 (Gradients)', 'Output Units', 'Hidden Units'),
    )
    for ax, values, title, xlabel, ylabel in panels:
        image = ax.imshow(values, cmap='RdBu', aspect='auto')
        ax.set_title(title)
        ax.set_xlabel(xlabel)
        ax.set_ylabel(ylabel)
        plt.colorbar(image, ax=ax)

    plt.tight_layout()
    plt.show()

# Create a fresh network and visualize
# A newly initialised (untrained) network is used, so the plots show the
# weights and gradients at the very start of training on the XOR data.
nn_vis = NeuralNetworkWithBackprop(input_size=2, hidden_size=4, output_size=1)
visualize_gradient_flow(nn_vis, X_xor, y_xor)

## 4. Optimization Algorithms

### 4.1 Comparing SGD, Momentum, and Adam

In [None]:
# Implement different optimizers
class Optimizers:
    """Namespace of in-place parameter update rules: SGD, momentum and Adam.

    Each rule mutates the arrays in ``params`` directly and returns the
    optimizer state, which the caller passes back in on the next step.
    """

    @staticmethod
    def sgd(params, grads, learning_rate=0.01, state=None):
        """Plain gradient descent; ``state`` is returned unchanged."""
        for weights, gradient in zip(params, grads):
            weights -= learning_rate * gradient
        return state

    @staticmethod
    def momentum(params, grads, learning_rate=0.01, state=None, beta=0.9):
        """Step along an exponential moving average of the gradients.

        ``state`` is a list with one velocity array per parameter; it is
        created on the first call and updated in place afterwards.
        """
        if state is None:
            state = [np.zeros_like(p) for p in params]

        for idx, (weights, gradient) in enumerate(zip(params, grads)):
            velocity = beta * state[idx] + (1 - beta) * gradient
            state[idx] = velocity
            weights -= learning_rate * velocity

        return state

    @staticmethod
    def adam(params, grads, learning_rate=0.001, state=None,
             beta1=0.9, beta2=0.999, epsilon=1e-8):
        """Adam update with bias-corrected first and second moment estimates.

        ``state`` is a dict holding the moment lists ``'m'`` and ``'v'`` and
        the step counter ``'t'``; it is created on the first call.
        """
        if state is None:
            state = {
                'm': [np.zeros_like(p) for p in params],
                'v': [np.zeros_like(p) for p in params],
                't': 0
            }

        state['t'] += 1
        step = state['t']
        first_moments = state['m']
        second_moments = state['v']

        # The bias-correction denominators depend only on the step count.
        correction_1 = 1 - beta1**step
        correction_2 = 1 - beta2**step

        for idx, (weights, gradient) in enumerate(zip(params, grads)):
            # Biased running averages of the gradient and the squared gradient.
            first_moments[idx] = beta1 * first_moments[idx] + (1 - beta1) * gradient
            second_moments[idx] = beta2 * second_moments[idx] + (1 - beta2) * gradient**2

            m_hat = first_moments[idx] / correction_1
            v_hat = second_moments[idx] / correction_2

            weights -= learning_rate * m_hat / (np.sqrt(v_hat) + epsilon)

        return state

# Visualize optimizer behavior on a simple 2D function
def rosenbrock(x, y):
    """Evaluate the Rosenbrock function (1 - x)^2 + 100 (y - x^2)^2.

    Works on scalars and, element-wise, on arrays.
    """
    offset_term = (1 - x)**2
    valley_term = 100 * (y - x**2)**2
    return offset_term + valley_term

def rosenbrock_grad(x, y):
    """Return the gradient ``[d/dx, d/dy]`` of the Rosenbrock function as an array."""
    # Both partial derivatives share the factor (y - x^2).
    residual = y - x**2
    dx = -2 * (1 - x) - 400 * x * residual
    dy = 200 * residual
    return np.array([dx, dy])

# Run different optimizers
# Every optimizer starts from the same point and takes 200 steps.
start_point = np.array([-1.0, 2.0])
# name -> (update function, learning rate used for that optimizer)
optimizers = {
    'SGD': (Optimizers.sgd, 0.001),
    'Momentum': (Optimizers.momentum, 0.001),
    'Adam': (Optimizers.adam, 0.01)
}

trajectories = {}
for name, (optimizer, lr) in optimizers.items():
    # The optimizers modify `point` in place, so work on a copy of the start
    # point and store copies in the trajectory.
    point = start_point.copy()
    trajectory = [point.copy()]
    state = None  # each optimizer creates its own state on the first call

    for _ in range(200):
        grad = rosenbrock_grad(point[0], point[1])
        state = optimizer([point], [grad], learning_rate=lr, state=state)
        trajectory.append(point.copy())

    # Rows are successive positions: the start point plus one per step.
    trajectories[name] = np.array(trajectory)

# Plot the optimization paths
fig, ax = plt.subplots(figsize=(12, 8))

# Create contour plot
# NOTE: this rebinds the globals `x`, `y` and `X`; `X` held the network input
# batch in earlier cells.
x = np.linspace(-2, 2, 100)
y = np.linspace(-1, 3, 100)
X, Y = np.meshgrid(x, y)
Z = rosenbrock(X, Y)

# Contour levels are spaced logarithmically from 1e-1 to 1e3.
contours = ax.contour(X, Y, Z, levels=np.logspace(-1, 3, 20), alpha=0.3)

# Plot trajectories
colors = {'SGD': 'red', 'Momentum': 'blue', 'Adam': 'green'}
for name, trajectory in trajectories.items():
    # Column 0 is the x coordinate and column 1 the y coordinate of each step.
    ax.plot(trajectory[:, 0], trajectory[:, 1], 
            'o-', color=colors[name], label=name, 
            markersize=3, alpha=0.7)

# Mark the optimum
# rosenbrock(1, 1) evaluates to 0, the function's minimum.
ax.plot(1, 1, 'k*', markersize=15, label='Optimum')

ax.set_xlabel('x')
ax.set_ylabel('y')
ax.set_title('Optimizer Comparison on Rosenbrock Function')
ax.legend()
ax.grid(True, alpha=0.3)
plt.show()

## 5. PyTorch Comparison

### 5.1 Our Implementation vs PyTorch

In [None]:
# Let's compare our implementation with PyTorch
import torch.nn.functional as F

# Our NumPy implementation
class NumpyNet:
    """Tiny 2-2-1 network with hard-coded weights, written in plain NumPy.

    Weight matrices are stored as (inputs, outputs) so a batch of row vectors
    is multiplied from the left.
    """

    def __init__(self):
        # Layer 1: 2 inputs -> 2 hidden units
        self.W1 = np.array([[0.1, 0.2], [0.3, 0.4]])
        self.b1 = np.array([[0.1, 0.2]])
        # Layer 2: 2 hidden units -> 1 output
        self.W2 = np.array([[0.5], [0.6]])
        self.b2 = np.array([[0.3]])

    def forward(self, x):
        """Return the raw (un-activated) output for the batch ``x``."""
        hidden = np.maximum(0, x @ self.W1 + self.b1)  # ReLU
        return hidden @ self.W2 + self.b2

# PyTorch implementation
class TorchNet(torch.nn.Module):
    """PyTorch counterpart of NumpyNet with the same hard-coded weights.

    ``torch.nn`` is spelled out in full instead of using the ``nn`` alias:
    an earlier cell rebinds ``nn`` to a ``SimpleNeuralNetwork`` instance, so
    ``nn.Module`` / ``nn.Linear`` would fail when the notebook runs in order.
    """

    def __init__(self):
        super().__init__()
        self.fc1 = torch.nn.Linear(2, 2, bias=True)
        self.fc2 = torch.nn.Linear(2, 1, bias=True)

        # Set same weights as NumPy version.
        # Linear keeps its weight as (out_features, in_features), i.e. the
        # transpose of the (inputs, outputs) matrices used by NumpyNet.
        # copy_ under no_grad overwrites the existing parameters in place
        # without recording the write in the autograd graph.
        with torch.no_grad():
            self.fc1.weight.copy_(torch.tensor([[0.1, 0.3], [0.2, 0.4]]))
            self.fc1.bias.copy_(torch.tensor([0.1, 0.2]))
            self.fc2.weight.copy_(torch.tensor([[0.5, 0.6]]))
            self.fc2.bias.copy_(torch.tensor([0.3]))

    def forward(self, x):
        """Return the raw (un-activated) output for the batch ``x``."""
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# Test both implementations
# A single sample with two features, fed to both networks.
test_input = np.array([[1.0, 2.0]])

# NumPy
numpy_net = NumpyNet()
numpy_output = numpy_net.forward(test_input)

# PyTorch
torch_net = TorchNet()
# The NumPy input is float64; it is converted to float32 for the PyTorch model.
torch_input = torch.tensor(test_input, dtype=torch.float32)
torch_output = torch_net(torch_input)

print("Input:", test_input)
print("NumPy output:", numpy_output)
# detach() drops the autograd graph so the tensor can be converted to NumPy.
print("PyTorch output:", torch_output.detach().numpy())
print("\nOutputs match:", np.allclose(numpy_output, torch_output.detach().numpy()))

### 5.2 Automatic Differentiation in PyTorch

In [None]:
# PyTorch's autograd in action: build a small computation, call backward(),
# and compare the resulting gradients with a hand-derived formula.
x = torch.tensor([1.0, 2.0, 3.0], requires_grad=True)
y = torch.tensor([4.0, 5.0, 6.0], requires_grad=True)

# Some operations
z = x + y
w = z * z
loss = w.mean()

# detach() gives a graph-free view of each tensor for printing; it is the
# recommended replacement for the legacy `.data` attribute.
print("Forward pass:")
print(f"x = {x.detach()}")
print(f"y = {y.detach()}")
print(f"z = x + y = {z.detach()}")
print(f"w = z * z = {w.detach()}")
print(f"loss = mean(w) = {loss.detach()}")

# Compute gradients
# backward() fills in .grad on the leaf tensors x and y.
loss.backward()

print("\nGradients:")
print(f"∂loss/∂x = {x.grad}")
print(f"∂loss/∂y = {y.grad}")

# Verify manually
# loss = (1/3) * sum((x + y)^2), so each partial derivative is 2 * (x + y) / 3.
print("\nManual verification:")
print("loss = mean((x + y)²)")
print("∂loss/∂x = 2(x + y)/3")
manual_grad_x = 2 * (x.detach() + y.detach()) / 3
print(f"Manual gradient: {manual_grad_x}")
print(f"Matches autograd: {torch.allclose(x.grad, manual_grad_x)}")

## 6. Key Takeaways

### What You've Learned

In [None]:
# Summary visualization
# Four panels, one per concept covered in the notebook.
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# 1. Dot product measures similarity
# vectors @ vectors.T holds the dot product of every pair of the 5 random vectors.
vectors = np.random.randn(5, 2)
similarity_matrix = vectors @ vectors.T
im1 = axes[0, 0].imshow(similarity_matrix, cmap='RdBu')
axes[0, 0].set_title('Dot Products = Similarity Matrix')
plt.colorbar(im1, ax=axes[0, 0])

# 2. Matrix multiplication transforms data
data = np.random.randn(50, 2)
W = np.array([[0.7, -0.7], [0.7, 0.7]])  # Approximately a 45° rotation (0.7 ≈ cos 45° = sin 45°), not an exact one
transformed = data @ W
axes[0, 1].scatter(data[:, 0], data[:, 1], alpha=0.5, label='Original')
axes[0, 1].scatter(transformed[:, 0], transformed[:, 1], alpha=0.5, label='Transformed')
axes[0, 1].set_title('Matrix Multiplication = Transformation')
axes[0, 1].legend()
axes[0, 1].grid(True, alpha=0.3)

# 3. Neural networks stack transformations
# The identity line is plotted next to ReLU and sigmoid applied to the same input.
x = np.linspace(-2, 2, 100)
axes[1, 0].plot(x, x, label='Linear')
axes[1, 0].plot(x, np.maximum(0, x), label='ReLU(Linear)')
axes[1, 0].plot(x, 1/(1 + np.exp(-x)), label='Sigmoid(Linear)')
axes[1, 0].set_title('Activation Functions Add Non-linearity')
axes[1, 0].legend()
axes[1, 0].grid(True, alpha=0.3)

# 4. Gradients flow backward
# These magnitudes are hand-picked illustrative values, not measurements from
# any network trained above.
layer_names = ['Input', 'Hidden 1', 'Hidden 2', 'Output']
gradient_magnitudes = [0.001, 0.01, 0.1, 1.0]
axes[1, 1].bar(layer_names, gradient_magnitudes)
axes[1, 1].set_title('Gradient Flow in Backpropagation')
axes[1, 1].set_ylabel('Gradient Magnitude')
axes[1, 1].set_yscale('log')

plt.tight_layout()
plt.show()

print("🎉 Congratulations! You now understand:")
print("✓ Linear algebra operations (dot products, matrix multiplication)")
print("✓ How neural networks transform data")
print("✓ Backpropagation and gradient flow")
print("✓ Different optimization algorithms")
print("\n🚀 Ready for the next topic: Sequence Modeling!")

## 7. Exercises

Try these exercises to solidify your understanding:

In [None]:
# This cell only lists exercise prompts; it defines nothing and just prints
# the two messages at the bottom.

# Exercise 1: Implement a 3-layer network
# TODO: Create a network with architecture: input(4) -> hidden1(8) -> hidden2(4) -> output(2)

# Exercise 2: Implement dropout
# TODO: Add dropout to the forward pass (hint: randomly zero out activations)

# Exercise 3: Implement batch normalization
# TODO: Normalize activations to have mean=0, std=1

# Exercise 4: Visualize the decision boundary
# TODO: Train a network on 2D data and plot its decision boundary

print("Exercises are waiting for you! 💪")
print("Solutions are in the prerequisites.py file.")