# Convolutional Neural Network (CNN) from Scratch
This project implements a CNN from scratch using NumPy to classify handwritten digits from scikit-learn's `digits` dataset. It includes custom implementations of a convolutional layer, a ReLU activation, max pooling, and a softmax output layer.

## Step 1: Import Required Libraries
We use NumPy for numerical computations and `sklearn` for loading the dataset and preprocessing.

In [6]:
import numpy as np
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

## Step 2: Load and Preprocess the Dataset
- Normalize pixel values to the range [0, 1].
- One-hot encode the labels for multi-class classification.
- Reshape the input data for CNN compatibility.

In [7]:
# Fetch the digits dataset and prepare inputs/labels for the network
digits = load_digits()

# Inputs: scale pixel intensities by 1/16 (normalization), then add a
# singleton channel axis so the layout is (N, C=1, H=8, W=8)
X = (digits.images / 16.0).reshape(-1, 1, 8, 8)

# Labels: class ids as a column vector, expanded to dense one-hot rows
y = digits.target.reshape(-1, 1)
encoder = OneHotEncoder(sparse_output=False)
y_encoded = encoder.fit_transform(y)

# Hold out 20% of the samples for testing; the fixed seed keeps the split
# reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
)

## Step 3: Define CNN Components
The CNN consists of:
1. **Convolutional Layer**: Extracts features using filters.
2. **ReLU Activation**: Introduces non-linearity.
3. **Max Pooling**: Reduces spatial dimensions.
4. **Softmax Layer**: A fully connected layer followed by softmax; outputs a probability for each class.

In [8]:
# --- Revised CNN Components ---
class Conv2D:
    """Valid (no padding), stride-1 2D convolution layer over (N, C, H, W) input.

    Each of the ``num_filters`` filters spans all ``in_channels`` input
    channels, so the filter bank has shape
    ``(num_filters, in_channels, filter_size, filter_size)``.

    Args:
        num_filters: Number of output feature maps.
        filter_size: Side length of the square kernels.
        in_channels: Number of channels in the input. Defaults to 1, which
            reproduces the original single-channel behaviour.
    """

    def __init__(self, num_filters, filter_size, in_channels=1):
        self.num_filters = num_filters
        self.filter_size = filter_size
        self.in_channels = in_channels
        # He initialization: the fan-in of one output unit is the number of
        # input values a single filter looks at.
        fan_in = in_channels * filter_size * filter_size
        scale = np.sqrt(2. / fan_in)
        self.filters = np.random.randn(
            num_filters, in_channels, filter_size, filter_size
        ) * scale
        self.biases = np.zeros(num_filters)
        # Input of the most recent forward() call, needed by backward().
        self.last_input = None

    def iterate_regions(self, image):
        """Yield ``(i, j, region)`` for every valid filter position.

        ``region`` is the ``(N, C, filter_size, filter_size)`` window of
        ``image`` whose top-left corner is at row ``i``, column ``j``.
        """
        h, w = image.shape[2], image.shape[3]
        for i in range(h - self.filter_size + 1):
            for j in range(w - self.filter_size + 1):
                img_region = image[:, :, i:i + self.filter_size, j:j + self.filter_size]
                yield i, j, img_region

    def forward(self, input):
        """Convolve a batch and cache it for the backward pass.

        Args:
            input: Array of shape ``(N, C, H, W)``.

        Returns:
            Array of shape
            ``(N, num_filters, H - filter_size + 1, W - filter_size + 1)``.

        Raises:
            ValueError: If the channel count of ``input`` does not match the
                filter bank.
        """
        n, c, h, w = input.shape
        if c != self.filters.shape[1]:
            raise ValueError(
                f"expected input with {self.filters.shape[1]} channel(s), got {c}"
            )
        self.last_input = input
        out_h = h - self.filter_size + 1
        out_w = w - self.filter_size + 1
        output = np.zeros((n, self.num_filters, out_h, out_w))

        for i, j, region in self.iterate_regions(input):
            # Contract channel and both kernel axes:
            # (N, C, f, f) x (F, C, f, f) -> (N, F)
            output[:, :, i, j] = np.tensordot(
                region, self.filters, axes=([1, 2, 3], [1, 2, 3])
            ) + self.biases
        return output

    def backward(self, dL_dout, learning_rate):
        """Backpropagate through the layer and apply one gradient step.

        Args:
            dL_dout: Gradient of the loss w.r.t. this layer's output, with
                the same shape as the array returned by ``forward``.
            learning_rate: Step size for the parameter update.

        Returns:
            Gradient of the loss w.r.t. the cached input, same shape as the
            input. It is computed with the filters as they were before the
            update.
        """
        n = self.last_input.shape[0]
        dL_dfilters = np.zeros_like(self.filters)
        dL_dinput = np.zeros_like(self.last_input)
        # Every output position of a filter shares one bias, so its gradient
        # is the sum over the batch and both spatial axes.
        dL_dbiases = dL_dout.sum(axis=(0, 2, 3))

        for i, j, region in self.iterate_regions(self.last_input):
            grad = dL_dout[:, :, i, j]  # (N, F)
            # Filter gradient: sum over the batch of grad * input window,
            # (N, F) x (N, C, f, f) -> (F, C, f, f)
            dL_dfilters += np.tensordot(grad, region, axes=([0], [0]))
            # Input gradient: sum over filters of grad * kernel,
            # (N, F) x (F, C, f, f) -> (N, C, f, f)
            dL_dinput[:, :, i:i + self.filter_size, j:j + self.filter_size] += (
                np.tensordot(grad, self.filters, axes=([1], [0]))
            )

        # Update parameters with the batch-averaged gradients
        self.filters -= learning_rate * dL_dfilters / n
        self.biases -= learning_rate * dL_dbiases / n

        return dL_dinput

class ReLU:
    """Element-wise rectified linear unit: negative values become zero."""

    def __init__(self):
        # Input of the latest forward() call; backward() derives its mask
        # from it.
        self.last_input = None

    def forward(self, input):
        """Cache ``input`` and return it with negative entries clamped to 0."""
        self.last_input = input
        rectified = np.maximum(0, input)
        return rectified

    def backward(self, dL_dout):
        """Let the upstream gradient through only where the cached input was > 0."""
        active = self.last_input > 0
        return dL_dout * active

class MaxPool2:
    """2x2 max pooling with stride 2 over (N, C, H, W) arrays.

    Only complete 2x2 windows are pooled: when H or W is odd, the trailing
    row/column is ignored in ``forward`` and receives zero gradient in
    ``backward``.
    """

    def __init__(self):
        # Input of the latest forward() call.
        self.last_input = None
        # Same shape as the input; non-zero where an element equals the
        # maximum of its 2x2 window.
        self.mask = None

    def iterate_regions(self, image):
        """Yield ``(i, j, region)`` for every complete 2x2 window.

        ``i`` and ``j`` are output (pooled) coordinates; ``region`` is the
        matching ``(N, C, 2, 2)`` slice of ``image``.
        """
        _, _, h, w = image.shape
        # Iterating over output coordinates keeps the indices within the
        # (h // 2, w // 2) output even when h or w is odd.
        for i in range(h // 2):
            for j in range(w // 2):
                yield i, j, image[:, :, 2 * i:2 * i + 2, 2 * j:2 * j + 2]

    def forward(self, input):
        """Pool a batch and record which elements were the window maxima.

        Args:
            input: Array of shape ``(N, C, H, W)``.

        Returns:
            Array of shape ``(N, C, H // 2, W // 2)``.
        """
        self.last_input = input
        n, c, h, w = input.shape
        output = np.zeros((n, c, h // 2, w // 2))
        self.mask = np.zeros_like(input)

        for i, j, region in self.iterate_regions(input):
            max_vals = np.max(region, axis=(2, 3), keepdims=True)  # (N, C, 1, 1)
            # Index the two singleton axes explicitly; squeeze() would also
            # drop a batch or channel axis of length 1 and break the
            # assignment.
            output[:, :, i, j] = max_vals[:, :, 0, 0]
            # Ties: every element equal to the maximum is marked.
            self.mask[:, :, i * 2:i * 2 + 2, j * 2:j * 2 + 2] = region == max_vals
        return output

    def backward(self, dL_dout):
        """Route the upstream gradient back to the positions marked in the mask.

        Args:
            dL_dout: Gradient w.r.t. the pooled output, shape
                ``(N, C, H // 2, W // 2)``.

        Returns:
            Gradient w.r.t. the cached input, same shape as that input.
        """
        dL_dinput = np.zeros_like(self.last_input)
        for i, j, _ in self.iterate_regions(self.last_input):
            rows = slice(i * 2, i * 2 + 2)
            cols = slice(j * 2, j * 2 + 2)
            grad = dL_dout[:, :, i, j][:, :, None, None]  # broadcast over the window
            dL_dinput[:, :, rows, cols] += grad * self.mask[:, :, rows, cols]
        return dL_dinput

class Softmax:
    """Fully connected layer followed by a softmax over its outputs.

    Args:
        input_len: Number of features per sample once the input is flattened.
        nodes: Number of output classes.
    """

    def __init__(self, input_len, nodes):
        # Xavier initialization
        fan_sum = input_len + nodes
        scale = np.sqrt(2. / fan_sum)
        self.weights = np.random.randn(input_len, nodes) * scale
        self.biases = np.zeros(nodes)
        # Un-flattened input of the latest forward() call.
        self.last_input = None

    def forward(self, input):
        """Return class probabilities of shape ``(N, nodes)`` for a batch.

        The input is cached in its original shape and flattened to
        ``(N, -1)`` before the affine transform.
        """
        self.last_input = input
        batch = input.shape[0]
        flat = input.reshape(batch, -1)
        logits = np.dot(flat, self.weights) + self.biases
        # Subtract each row's maximum before exponentiating to avoid overflow.
        shifted = logits - np.max(logits, axis=1, keepdims=True)
        exp = np.exp(shifted)
        normalizer = np.sum(exp, axis=1, keepdims=True)
        return exp / normalizer

    def backward(self, dL_dout, learning_rate):
        """Backpropagate through the dense part and apply one gradient step.

        Args:
            dL_dout: Array of shape ``(N, nodes)``. It is applied directly to
                the affine transform, i.e. it is treated as the gradient
                w.r.t. the pre-softmax totals (the training loop passes
                ``probs - y_batch``).
            learning_rate: Step size for the parameter update.

        Returns:
            Gradient w.r.t. the cached input, reshaped to that input's shape
            and computed with the weights as they were before the update.
        """
        n = dL_dout.shape[0]
        flat = self.last_input.reshape(n, -1)

        # Gradients of the affine transform
        grad_weights = np.dot(flat.T, dL_dout)
        grad_biases = np.sum(dL_dout, axis=0)
        grad_input = np.dot(dL_dout, self.weights.T)
        dL_dinput = grad_input.reshape(self.last_input.shape)

        # Batch-averaged gradient descent step
        self.weights -= learning_rate * grad_weights / n
        self.biases -= learning_rate * grad_biases / n

        return dL_dinput

## Step 4: Initialize CNN Layers
We define the CNN architecture with one convolutional layer, ReLU activation, max pooling, and a softmax output layer.

In [9]:
# --- Build the network: conv -> ReLU -> 2x2 max pool -> softmax ---
# Layers without trainable parameters
relu = ReLU()
pool = MaxPool2()

# Trainable layers. `conv` is created before `softmax`, so the random weight
# draws happen in the same order as before.
conv = Conv2D(filter_size=3, num_filters=16)
# 8x8 input -> 6x6 after the 3x3 valid convolution -> 3x3 after pooling,
# for each of the 16 filters.
softmax = Softmax(nodes=10, input_len=16 * 3 * 3)

## Step 5: Train the CNN
The model is trained using mini-batch gradient descent for up to 100 epochs, stopping early once the training accuracy reaches 90%.

In [10]:
# --- Hyperparameters ---
epochs, batch_size = 100, 32
learning_rate = 0.001
target_accuracy = 0.90  # stop early once training accuracy reaches this fraction

# --- Mini-batch training ---
n_train = len(X_train)
for epoch in range(epochs):
    # Running totals for this epoch
    total_loss = 0
    correct = 0

    # Visit the samples in a fresh random order every epoch
    indices = np.random.permutation(n_train)
    X_shuffled, y_shuffled = X_train[indices], y_train[indices]

    for i in range(0, n_train, batch_size):
        batch = slice(i, i + batch_size)
        X_batch = X_shuffled[batch]
        y_batch = y_shuffled[batch]

        # Forward: conv -> ReLU -> max pool -> softmax
        probs = softmax.forward(pool.forward(relu.forward(conv.forward(X_batch))))

        # Summed cross-entropy (epsilon guards against log(0)) and the number
        # of correct predictions, both measured before this batch's update
        loss = -(y_batch * np.log(probs + 1e-8)).sum()
        total_loss += loss
        predicted = probs.argmax(axis=1)
        expected = y_batch.argmax(axis=1)
        correct += (predicted == expected).sum()

        # Backward: each layer returns the gradient for the layer before it;
        # the softmax and conv layers also update their own parameters
        grad = softmax.backward(probs - y_batch, learning_rate)
        grad = pool.backward(grad)
        grad = relu.backward(grad)
        conv.backward(grad, learning_rate)

    # Per-sample averages over the whole epoch
    acc = correct / n_train
    avg_loss = total_loss / n_train
    print(f"Epoch {epoch+1} | Loss: {avg_loss:.4f} | Accuracy: {acc*100:.2f}%")

    if acc < target_accuracy:
        continue
    print(f"Reached {acc*100:.2f}% accuracy at epoch {epoch+1}")
    break

Epoch 1 | Loss: 2.6842 | Accuracy: 9.88%
Epoch 2 | Loss: 2.5243 | Accuracy: 8.28%
Epoch 3 | Loss: 2.4569 | Accuracy: 7.38%
Epoch 4 | Loss: 2.4147 | Accuracy: 8.28%
Epoch 5 | Loss: 2.3820 | Accuracy: 8.84%
Epoch 6 | Loss: 2.3543 | Accuracy: 9.53%
Epoch 7 | Loss: 2.3290 | Accuracy: 9.88%
Epoch 8 | Loss: 2.3048 | Accuracy: 10.65%
Epoch 9 | Loss: 2.2818 | Accuracy: 11.69%
Epoch 10 | Loss: 2.2593 | Accuracy: 12.39%
Epoch 11 | Loss: 2.2369 | Accuracy: 13.22%
Epoch 12 | Loss: 2.2154 | Accuracy: 14.61%
Epoch 13 | Loss: 2.1936 | Accuracy: 15.59%
Epoch 14 | Loss: 2.1727 | Accuracy: 17.12%
Epoch 15 | Loss: 2.1517 | Accuracy: 17.95%
Epoch 16 | Loss: 2.1309 | Accuracy: 19.62%
Epoch 17 | Loss: 2.1102 | Accuracy: 21.29%
Epoch 18 | Loss: 2.0901 | Accuracy: 22.41%
Epoch 19 | Loss: 2.0701 | Accuracy: 24.08%
Epoch 20 | Loss: 2.0501 | Accuracy: 24.98%
Epoch 21 | Loss: 2.0305 | Accuracy: 27.14%
Epoch 22 | Loss: 2.0110 | Accuracy: 29.37%
Epoch 23 | Loss: 1.9917 | Accuracy: 30.97%
Epoch 24 | Loss: 1.9725 | A… [remaining training output truncated]

## Step 6: Evaluate the Model
After training, the model is evaluated on the test set to calculate its accuracy.

In [11]:
# --- Test-set evaluation (forward passes only, no parameter updates) ---
correct_test = 0
for i in range(0, len(X_test), batch_size):
    batch = slice(i, i + batch_size)
    X_batch, y_batch = X_test[batch], y_test[batch]

    # Same layer pipeline as in training
    probs = softmax.forward(pool.forward(relu.forward(conv.forward(X_batch))))

    # A prediction is correct when the most probable class matches the
    # one-hot label
    predicted = probs.argmax(axis=1)
    expected = y_batch.argmax(axis=1)
    correct_test += (predicted == expected).sum()

test_acc = correct_test / len(X_test)
print(f"\nTest Accuracy: {test_acc*100:.2f}%")


Test Accuracy: 85.28%
