# Implementing Backpropagation with SGD from Scratch

This notebook demonstrates the implementation of backpropagation with Stochastic Gradient Descent (SGD) using only NumPy.
We solve the XOR problem using a simple feedforward neural network with one hidden layer.

In [2]:
import numpy as np
import matplotlib.pyplot as plt

## 1. Neural Network Architecture

We'll implement a simple feedforward neural network with configurable layers.

In [3]:
class NeuralNetwork:
    def __init__(self, layer_sizes, learning_rate=0.01):
        """
        Build a fully connected network and randomly initialize its parameters.

        Parameters:
        - layer_sizes: Sequence of neuron counts, one entry per layer
                      (e.g., [2, 3, 1] is 2 inputs, 3 hidden neurons, 1 output)
        - learning_rate: Step size applied to the gradients during SGD
        """
        self.layer_sizes = layer_sizes
        self.learning_rate = learning_rate
        self.num_layers = len(layer_sizes)

        # (fan_in, fan_out) for every pair of consecutive layers
        layer_pairs = list(zip(layer_sizes[:-1], layer_sizes[1:]))

        # Gaussian weights scaled by sqrt(1 / fan_in); one matrix per layer pair
        self.weights = [
            np.random.randn(fan_in, fan_out) * np.sqrt(1.0 / fan_in)
            for fan_in, fan_out in layer_pairs
        ]
        # Biases start at zero, stored as row vectors so they broadcast over a batch
        self.biases = [np.zeros((1, fan_out)) for _, fan_out in layer_pairs]

        # Caches filled during the forward pass and read by the backward pass
        self.z_values = []  # Weighted inputs of each layer
        self.activations = []  # Outputs of each layer

## 2. Activation Functions

In [4]:
def sigmoid(z):
    """
    Numerically stable sigmoid activation function.

    Parameters:
    - z: Scalar or array of pre-activation values

    Returns:
    - 1 / (1 + exp(-z)) evaluated element-wise, with values in [0, 1]
    """
    z = np.asarray(z)
    # exp(-|z|) always lies in (0, 1], so it can never overflow, whereas a
    # direct exp(-z) overflows (RuntimeWarning) for large negative z.
    decay = np.exp(-np.abs(z))
    # z >= 0: 1 / (1 + exp(-z));  z < 0: exp(z) / (1 + exp(z)) -- the same value.
    numerator = np.where(z >= 0, 1.0, decay)
    return numerator / (1.0 + decay)

def sigmoid_derivative(z):
    """
    Derivative of the sigmoid function, evaluated at z.

    Uses the identity sigmoid'(z) = sigmoid(z) * (1 - sigmoid(z)).
    """
    activation = sigmoid(z)
    complement = 1 - activation
    return activation * complement

def relu(z):
    """
    ReLU activation function: element-wise max(0, z).
    """
    floor = 0  # negative inputs are raised to this value
    return np.maximum(floor, z)

def relu_derivative(z):
    """
    Derivative of the ReLU function: 1 where z > 0, otherwise 0.
    """
    is_positive = z > 0
    return np.where(is_positive, 1, 0)

## 3. Forward Propagation

In [5]:
def forward_propagation(self, X):
    """
    Run a batch through every layer and cache the intermediate values.

    Parameters:
    - X: Input data (batch)

    Returns:
    - Output of the network (activation of the last layer)
    """
    # Start from clean caches; the input counts as the first activation
    self.z_values = []
    self.activations = [X]

    output_index = self.num_layers - 2  # index of the final weight matrix
    current = X
    for layer_index in range(self.num_layers - 1):
        # Affine step: weighted input of this layer
        weighted_input = np.dot(current, self.weights[layer_index]) + self.biases[layer_index]
        self.z_values.append(weighted_input)

        # ReLU on hidden layers, sigmoid on the output layer
        activate = relu if layer_index < output_index else sigmoid
        current = activate(weighted_input)
        self.activations.append(current)

    return current

# Attach forward_propagation to the NeuralNetwork class so that instances
# can call it as a method (net.forward_propagation(X))
NeuralNetwork.forward_propagation = forward_propagation

## 4. Loss Function

In [6]:
def compute_loss(self, y_true, y_pred):
    """
    Mean binary cross-entropy between labels and predicted probabilities.

    Parameters:
    - y_true: True labels
    - y_pred: Predicted probabilities

    Returns:
    - Loss value averaged over all elements
    """
    eps = 1e-15
    # Keep probabilities strictly inside (0, 1) so neither log sees 0
    probs = np.clip(y_pred, eps, 1 - eps)

    positive_term = y_true * np.log(probs)
    negative_term = (1 - y_true) * np.log(1 - probs)
    return -np.mean(positive_term + negative_term)

# Attach compute_loss to the NeuralNetwork class so that instances
# can call it as a method (net.compute_loss(y_true, y_pred))
NeuralNetwork.compute_loss = compute_loss

## 5. Backpropagation

In [7]:
def backpropagation(self, X, y):
    """
    Perform backpropagation to compute gradients of the binary cross-entropy loss.

    Parameters:
    - X: Input data (batch)
    - y: True labels, with the same shape as the network output

    Returns:
    - Gradients for weights and biases: two lists ordered like self.weights
      and self.biases, each gradient averaged over the batch
    """
    batch_size = X.shape[0]

    # Initialize gradients
    nabla_w = [np.zeros(w.shape) for w in self.weights]
    nabla_b = [np.zeros(b.shape) for b in self.biases]

    # Forward pass (also refreshes self.z_values and self.activations)
    y_pred = self.forward_propagation(X)

    # Backward pass
    # Output layer error (delta). For a sigmoid output trained with binary
    # cross-entropy, dL/dz simplifies to (y_pred - y): the sigmoid'(z) factor
    # cancels against the derivative of the loss. Multiplying by sigmoid'(z)
    # here would instead give the squared-error gradient, which does not match
    # the loss the network reports.
    delta = y_pred - y

    # Last layer gradients
    nabla_w[-1] = np.dot(self.activations[-2].T, delta) / batch_size
    nabla_b[-1] = np.sum(delta, axis=0, keepdims=True) / batch_size

    # Propagate error backwards through the hidden layers; `layer` counts
    # from the end of the network (2 = last hidden layer)
    for layer in range(2, self.num_layers):
        # Push delta back through the next layer's weights, then through
        # this layer's ReLU
        z = self.z_values[-layer]
        delta = np.dot(delta, self.weights[-layer + 1].T) * relu_derivative(z)

        # Compute gradients
        nabla_w[-layer] = np.dot(self.activations[-layer - 1].T, delta) / batch_size
        nabla_b[-layer] = np.sum(delta, axis=0, keepdims=True) / batch_size

    return nabla_w, nabla_b

# Attach backpropagation to the NeuralNetwork class so that instances
# can call it as a method (net.backpropagation(X, y))
NeuralNetwork.backpropagation = backpropagation

## 6. Stochastic Gradient Descent (SGD)

In [8]:
def sgd_update(self, X_batch, y_batch):
    """
    Take one gradient-descent step on a single batch.

    Parameters:
    - X_batch: Input data batch
    - y_batch: True labels batch
    """
    # Gradients of the loss with respect to every weight matrix and bias vector
    weight_grads, bias_grads = self.backpropagation(X_batch, y_batch)

    # Move each parameter against its gradient, scaled by the learning rate
    step = self.learning_rate
    for layer, (grad_w, grad_b) in enumerate(zip(weight_grads, bias_grads)):
        self.weights[layer] -= step * grad_w
        self.biases[layer] -= step * grad_b

# Attach sgd_update to the NeuralNetwork class so that instances
# can call it as a method (net.sgd_update(X_batch, y_batch))
NeuralNetwork.sgd_update = sgd_update

## 7. Training Loop

In [9]:
def train(self, X, y, epochs=1000, batch_size=32, verbose=True):
    """
    Fit the network with mini-batch SGD and record the loss after every epoch.

    Parameters:
    - X: Training data
    - y: Training labels
    - epochs: Number of passes over the training data
    - batch_size: Number of samples per mini-batch
    - verbose: Whether to print the loss every 100 epochs and on the last epoch

    Returns:
    - List with the full-dataset loss after each epoch
    """
    sample_count = X.shape[0]
    final_epoch = epochs - 1
    history = []

    for epoch in range(epochs):
        # Fresh random visiting order for this epoch
        order = np.random.permutation(sample_count)

        # One SGD step per consecutive slice of the shuffled order
        for start in range(0, sample_count, batch_size):
            picked = order[start:start + batch_size]
            self.sgd_update(X[picked], y[picked])

        # Loss on the whole (unshuffled) dataset after the epoch's updates
        epoch_loss = self.compute_loss(y, self.forward_propagation(X))
        history.append(epoch_loss)

        # Report every 100th epoch and the final one
        should_report = epoch % 100 == 0 or epoch == final_epoch
        if verbose and should_report:
            print(f"Epoch {epoch}, Loss: {epoch_loss:.4f}")

    return history

# Attach train to the NeuralNetwork class so that instances
# can call it as a method (net.train(X, y, ...))
NeuralNetwork.train = train

## 8. Predict Method

In [10]:
def predict(self, X):
    """
    Return the network's output for the given inputs.

    Parameters:
    - X: Input data

    Returns:
    - Predictions (the result of a forward pass on X)
    """
    outputs = self.forward_propagation(X)
    return outputs

# Attach predict to the NeuralNetwork class so that instances
# can call it as a method (net.predict(X))
NeuralNetwork.predict = predict

## 9. Example: XOR Problem

Apply the implemented neural network to solve the XOR problem.

1. Define the input points and labels for the XOR problem.
2. Train the neural network on them.
3. Plot the loss curve.

## 10. Visualizing Decision Boundary

Plot the decision boundary of the trained neural network.