In [2]:
import torchvision
import torchvision.transforms as transforms
import numpy as np

# **The Mathematical Derivation for a 3-Layer Network**

Let's assume a 3-layer architecture: $X \longrightarrow H_1 \longrightarrow H_2 \longrightarrow \text{Output}$


### **1. Forward Pass Equations**

* **Layer 1**: $Z^{(1)} = W^{(1)} A^{(0)} + b^{(1)}$ , where $\quad A^{(0)} = X$
* **Activation 1**: $A^{(1)} = \sigma\!\left(Z^{(1)}\right)$ (ReLU)
* **Layer 2**: $Z^{(2)} = W^{(2)} A^{(1)} + b^{(2)}$
* **Activation 2**: $A^{(2)} = \sigma\!\left(Z^{(2)}\right)$ (ReLU)
* **Layer 3 (Output)**: $Z^{(3)} = W^{(3)} A^{(2)} + b^{(3)}$
* **Final Prediction**: $\hat{y} = \operatorname{Softmax}\!\left(Z^{(3)}\right)$

### **2. The Loss Function (Cross-Entropy)**

$$\mathcal{L}(y, \hat{y}) = -\sum_{i=1}^{K} y_i \log(\hat{y}_i)$$

### **3. Backpropagation (Chain Rule)**

For the output layer, the derivative of the loss with respect to the pre-activation $Z^{(3)}$ simplifies elegantly to:
$$\delta^{(3)} = \frac{\partial \mathcal{L}}{\partial Z^{(3)}} = \hat{y} - y$$

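Now, we propagate this error backward to find the gradients for the weights and biases. The formulas below are written for a single sample; for a mini-batch of $m$ samples stored as columns, the weight gradients are scaled by $\frac{1}{m}$ and each bias gradient is the mean of $\delta$ over the batch columns: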

**For Layer 3:**

* $\frac{\partial \mathcal{L}}{\partial W^{(3)}} = \delta^{(3)} \left(A^{(2)}\right)^T$
* $\frac{\partial \mathcal{L}}{\partial b^{(3)}} = \delta^{(3)}$

**For Layer 2 (Hidden):**

* $\delta^{(2)} = \left(W^{(3)}\right)^T \delta^{(3)} \odot \sigma'\!\left(Z^{(2)}\right)$
* $\frac{\partial \mathcal{L}}{\partial W^{(2)}} = \delta^{(2)} \left(A^{(1)}\right)^T$
* $\frac{\partial \mathcal{L}}{\partial b^{(2)}} = \delta^{(2)}$

**For Layer 1 (Hidden):**

* $\delta^{(1)} = \left(W^{(2)}\right)^T \delta^{(2)} \odot \sigma'\!\left(Z^{(1)}\right)$
* $\frac{\partial \mathcal{L}}{\partial W^{(1)}} = \delta^{(1)} \left(A^{(0)}\right)^T$
* $\frac{\partial \mathcal{L}}{\partial b^{(1)}} = \delta^{(1)}$

> **Note**: The symbol $\odot$ represents the **Hadamard Product** (element-wise multiplication). This is crucial because each neuron's error is gated by the derivative of its own activation function.


In [1]:
class ScratchNet:
    """Fully connected ReLU network with a softmax output, trained with SGD.

    Samples are stored as columns: inputs are ``(layers[0], m)`` and one-hot
    targets are ``(layers[-1], m)`` for a batch of ``m`` samples.

    Weight layer ``i`` (1-indexed) owns ``params['W{i}']`` with shape
    ``(layers[i], layers[i-1])`` and ``params['b{i}']`` with shape
    ``(layers[i], 1)``.  Every layer except the last is followed by ReLU; the
    last layer is followed by a column-wise softmax.

    ``forward`` caches each layer's pre-activation and activation on the
    instance as ``Z{i}`` / ``A{i}`` (``Z1``, ``A1``, ...), and ``backward``
    reads them back, so ``forward`` must be called on a batch before
    ``backward`` is called on it.
    """

    def __init__(self, layers=(3072, 256, 128, 10)):
        """Create He-initialised weights and zero biases.

        Args:
            layers: Sequence of layer widths ``[n_in, hidden..., n_out]``.
                Any depth with at least one weight layer is supported.

        Raises:
            ValueError: If fewer than two sizes are given or a size is not
                positive.
        """
        # Copy so the (immutable) default or a caller-owned list is never shared.
        layers = list(layers)
        if len(layers) < 2:
            raise ValueError("layers must contain at least an input and an output size")
        if any(n <= 0 for n in layers):
            raise ValueError("every layer size must be a positive integer")

        self.params = {}
        # He Initialization for all layers: std = sqrt(2 / fan_in)
        for i in range(1, len(layers)):
            fan_in = layers[i - 1]
            self.params[f'W{i}'] = np.random.randn(layers[i], fan_in) * np.sqrt(2. / fan_in)
            self.params[f'b{i}'] = np.zeros((layers[i], 1))

    def _num_layers(self):
        """Return the number of weight layers (each contributes one W and one b)."""
        return len(self.params) // 2

    def relu(self, Z):
        """Element-wise ReLU: max(0, Z)."""
        return np.maximum(0, Z)

    def relu_deriv(self, Z):
        """Boolean mask of where ReLU has slope 1 (Z > 0); used as a 0/1 gate."""
        return Z > 0

    def softmax(self, Z):
        """Column-wise softmax; every column of the result sums to 1."""
        # Numeric stability: subtract each column's max before exponentiating
        expZ = np.exp(Z - np.max(Z, axis=0, keepdims=True))
        return expZ / np.sum(expZ, axis=0, keepdims=True)

    def forward(self, X):
        """Run the forward pass and cache intermediates for ``backward``.

        Args:
            X: Input batch with one sample per column, shape ``(n_in, m)``.

        Returns:
            Softmax probabilities of shape ``(n_out, m)``.
        """
        depth = self._num_layers()
        A = X
        for i in range(1, depth + 1):
            Z = np.dot(self.params[f'W{i}'], A) + self.params[f'b{i}']
            # Hidden layers use ReLU; only the final layer uses softmax.
            A = self.softmax(Z) if i == depth else self.relu(Z)
            # Keep the Z1/A1/... attribute names that backward() reads.
            setattr(self, f'Z{i}', Z)
            setattr(self, f'A{i}', A)
        return A

    def backward(self, X, Y, A3, lr):
        """Backpropagate softmax cross-entropy and apply one SGD step in place.

        Uses the ``Z{i}`` / ``A{i}`` values cached by the most recent
        ``forward`` call, which must have been made on the same ``X``.

        Args:
            X: Input batch that produced ``A3``, shape ``(n_in, m)``.
            Y: One-hot targets, shape ``(n_out, m)``.
            A3: Network output returned by ``forward(X)``.  The name is kept
                from the three-layer version; it is the final activation
                whatever the depth.
            lr: Learning rate for the SGD update.
        """
        m = X.shape[1]
        depth = self._num_layers()
        grads = {}

        # Output Error (Softmax + Cross-Entropy derivative)
        dZ = A3 - Y
        for i in range(depth, 0, -1):
            # Activation feeding layer i: the raw input for the first layer.
            A_prev = X if i == 1 else getattr(self, f'A{i - 1}')
            grads[f'dW{i}'] = (1/m) * np.dot(dZ, A_prev.T)
            grads[f'db{i}'] = (1/m) * np.sum(dZ, axis=1, keepdims=True)
            if i > 1:
                # Push the error through W_i, then gate it by ReLU'(Z_{i-1}).
                dA_prev = np.dot(self.params[f'W{i}'].T, dZ)
                dZ = dA_prev * self.relu_deriv(getattr(self, f'Z{i - 1}'))

        # SGD Update (only after every gradient was computed with the old weights)
        for i in range(1, depth + 1):
            self.params[f'W{i}'] -= lr * grads[f'dW{i}']
            self.params[f'b{i}'] -= lr * grads[f'db{i}']

In [3]:
# Download CIFAR-10 (download=True lets torchvision fetch it into DATA_ROOT if needed)
DATA_ROOT = './nn_from_scratch/data'
train_set = torchvision.datasets.CIFAR10(root=DATA_ROOT, train=True, download=True)
test_set = torchvision.datasets.CIFAR10(root=DATA_ROOT, train=False, download=True)

# Preprocessing: Flatten each image (32*32*3 = 3072) and Normalize (0-1).
# Samples become columns, so X has shape (3072, n_samples).
# The sample count is taken from the data itself rather than hard-coded, so the
# same cell keeps working if a subset of the dataset is loaded.
X_train = train_set.data.reshape(len(train_set.data), -1).T / 255.0
X_test = test_set.data.reshape(len(test_set.data), -1).T / 255.0

# Extract labels as 1D arrays of integer class ids
Y_train_labels = np.array(train_set.targets)
Y_test_labels = np.array(test_set.targets)

100.0%
  entry = pickle.load(f, encoding="latin1")


In [4]:
def one_hot(Y, num_classes=10):
    """One-hot encode integer class labels, one column per sample.

    Args:
        Y: 1D sequence or array of integer labels, e.g. ``[3, 0, 9, ...]``.
        num_classes: Number of rows (classes) in the encoding.

    Returns:
        Float array of shape ``(num_classes, m)`` in which column ``i`` has a
        single 1 in row ``Y[i]`` and zeros elsewhere.

    Raises:
        IndexError: If any label lies outside ``[0, num_classes)``.  Negative
            labels are rejected explicitly because NumPy indexing would
            otherwise wrap them around to the last rows without complaint.
    """
    labels = np.asarray(Y)  # also accept plain Python lists
    m = labels.shape[0]
    one_hot_Y = np.zeros((num_classes, m))
    if m == 0:
        # Nothing to mark; also avoids min()/max() on an empty array.
        return one_hot_Y

    if labels.min() < 0 or labels.max() >= num_classes:
        raise IndexError(
            f"labels must lie in [0, {num_classes}); "
            f"got min={labels.min()}, max={labels.max()}"
        )

    # Use advanced indexing:
    # For each column 'i', set the row 'Y[i]' to 1
    one_hot_Y[labels, np.arange(m)] = 1

    return one_hot_Y
# Pre-encode the training labels once, up front, so the training loop can slice
# ready-made one-hot columns; with one_hot's default num_classes=10, Y_train has
# 10 rows and one column per entry of Y_train_labels.
Y_train = one_hot(Y_train_labels)

In [5]:
def get_batches(X, y, batch_size):
    """Yield shuffled mini-batches of column-aligned inputs and targets.

    A fresh random permutation of the sample columns is drawn (from NumPy's
    global RNG) each time the generator is started, so calling this once per
    epoch reshuffles the data every epoch.  The last batch is smaller when the
    sample count is not a multiple of ``batch_size``.

    Args:
        X: Inputs with one sample per column, e.g. shape ``(3072, 50000)``.
        y: Targets with one sample per column, e.g. shape ``(10, 50000)``.
        batch_size: Positive number of samples per batch.

    Yields:
        ``(X_batch, y_batch)`` pairs holding the same sample columns.

    Raises:
        ValueError: If ``batch_size`` is not positive or ``X`` and ``y`` hold
            a different number of samples.  Because this is a generator, the
            checks run when iteration starts, not when it is called.
    """
    if batch_size <= 0:
        # A negative step would make range() empty and silently yield nothing.
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    m = X.shape[1]
    if y.shape[1] != m:
        raise ValueError(
            f"X and y must have the same number of columns, got {m} and {y.shape[1]}"
        )

    indices = np.arange(m)
    np.random.shuffle(indices)  # Shuffle indices for every epoch

    for i in range(0, m, batch_size):
        batch_indices = indices[i:i + batch_size]
        yield X[:, batch_indices], y[:, batch_indices]

In [None]:
# Seed NumPy's global RNG before anything stochastic happens in this cell.
# Weight initialisation (np.random.randn) and the per-epoch shuffle
# (np.random.shuffle) both draw from it, so re-running the cell now reproduces
# the same loss curve instead of a different one each time.
seed = 42
np.random.seed(seed)

# Initialize Model
model = ScratchNet(layers=[3072, 256, 128, 10])
epochs = 20
lr = 0.01
batch_size = 128

for epoch in range(epochs):
    epoch_loss = 0
    num_batches = 0

    for x_batch, y_batch in get_batches(X_train, Y_train, batch_size):
        # 1. Forward Pass
        A3 = model.forward(x_batch)

        # 2. Compute Loss: mean cross-entropy over the batch.
        #    The 1e-8 keeps log() finite if a predicted probability is 0.
        loss = -np.mean(np.sum(y_batch * np.log(A3 + 1e-8), axis=0))
        epoch_loss += loss

        # 3. Backward Pass (also applies the SGD update inside the model)
        model.backward(x_batch, y_batch, A3, lr)
        num_batches += 1

    # Report the average per-batch loss for this epoch
    print(f"Epoch {epoch+1}/{epochs} | Loss: {epoch_loss/num_batches:.4f}")

Epoch 1/20 | Loss: 1.9804
Epoch 2/20 | Loss: 1.8110
Epoch 3/20 | Loss: 1.7476
Epoch 4/20 | Loss: 1.7008
Epoch 5/20 | Loss: 1.6646
Epoch 6/20 | Loss: 1.6305
Epoch 7/20 | Loss: 1.6031
Epoch 8/20 | Loss: 1.5784
Epoch 9/20 | Loss: 1.5600
Epoch 10/20 | Loss: 1.5417
Epoch 11/20 | Loss: 1.5168
Epoch 12/20 | Loss: 1.5028
Epoch 13/20 | Loss: 1.4864
Epoch 14/20 | Loss: 1.4699
Epoch 15/20 | Loss: 1.4566
Epoch 16/20 | Loss: 1.4427
Epoch 17/20 | Loss: 1.4328
Epoch 18/20 | Loss: 1.4150
Epoch 19/20 | Loss: 1.4031
Epoch 20/20 | Loss: 1.3955
