In [2]:
## Getting the dataset 
from tensorflow.keras.datasets import mnist
import numpy as np


In [3]:
 #Loads data into training and test sets
(x_train, y_train), (x_test, y_test) = mnist.load_data()

In [4]:
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)


(60000, 28, 28) (60000,)
(10000, 28, 28) (10000,)


In [5]:
x_train.shape[1]

28

In [6]:
x_train = x_train.reshape((x_train.shape[0], 28, 28, 1)).astype('float32') / 255.0
x_test = x_test.reshape((x_test.shape[0], 28, 28, 1)).astype('float32') / 255.0


In [7]:
x_train.shape

(60000, 28, 28, 1)

In [8]:
from tensorflow.keras.utils import to_categorical
y_train = to_categorical(y_train, 10)
y_test = to_categorical(y_test, 10)


In [9]:
y_train

array([[0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.]], shape=(60000, 10))

In [10]:
x_train.shape

(60000, 28, 28, 1)

In [11]:
y_train.shape

(60000, 10)

In [18]:
class HandwrittenNumberClassification:
    def __init__(self, batch_size = 32, channel_size = 1, kernal_size = 4, filters = 8, stride = 4, pool_size = 4,lr=0.01):
        self.lr = lr
        self.B = batch_size
        self.C_in = channel_size
        self.KH = kernal_size ## Kernal height
        self.KW = kernal_size ## Each kernal width
        self.F = filters
        self.stride = stride
        self.pool_size = pool_size
        self.kernel = np.random.randn(self.F, self.C_in, self.KH, self.KW)
        self.input = None

    
    def softmax(self, x):
        """
        x: (B, num_classes)
        Returns: probabilities for each class per sample
        """
        exp_x = np.exp(x - np.max(x, axis=1, keepdims=True))  # stability trick
        probs = exp_x / np.sum(exp_x, axis=1, keepdims=True)
        return probs
    
    def compute_loss(self, probs, y_true):
        """
        probs: (B, num_classes)
        y_true: (B,)
        """
        B = y_true.shape[0]
        y_encoded = np.eye(self.no_of_outputs)[y_true]
        log_likelihood = -np.log(probs[range(B), y_true] + 1e-9)
        loss = np.sum(log_likelihood) / B
        return loss

    


    def generate_feature_map(self, X=None):

        ''' 
        It will take two things:
        1. Kernal = (filters,Channel_input, kernal_Height, kernal_width)
        2. Input = (Batch , sequence_size_width, sequence_size_height, Channel_input)
        
        It will give output : feature_maps: list of shape(batch, Filters, L_output_height, l_output_width)
        L_output_height = Sequence_length - kernal_Height + 1
        L_output_width = Sequence_length - kernal_width + 1

        '''
        if X is None:
            X = self.input

        B, H, W, C_in = X.shape
        F,_,KH,KW = self.kernel.shape
        L_out_h = X.shape[1] - KH + 1
        L_out_w = X.shape[2] - KW + 1

        ## Initialize feature map
        self.feature_map = np.zeros((B,F,L_out_h,L_out_w))

        ## Start the sliding window technique
        for b in range(B):
            for f in range(F):
                feature = np.zeros((L_out_h, L_out_w))
                for c in range(C_in):
                    x = X[b, :, :, c]
                    k = self.kernel[f, c]
                    for i in range(L_out_h):
                        for j in range(L_out_w):
                            window = x[i:i+KH, j:j+KW]
                            feature[i, j] += np.sum(window * k)
                self.feature_map[b, f] = feature
        return self.feature_map
    
    def relu(self,x):
        return np.maximum(0, x)

    def generate_max_pool(self,x):
        '''
        x: shape (B,F,L_out_h, L_out_w)
        
        This function does 2 important work:
        1. Generate a pooled map using pool_size and stride.
        2. Generate Binary map with address of the max value that we will select, it will help us to 
        understand and locate when distributing loss in backpropogation


        Returns : Pooled output of shape (B,F,L_pooled)
        '''

        B,F,LH,LW = x.shape
        L_pool_h = (LH-self.pool_size)//self.stride + 1
        L_pool_w = (LW-self.pool_size)//self.stride + 1

        pooled = np.zeros((B,F,L_pool_h,L_pool_w))
        self.binary_max_pool = np.zeros((B,F,LH,LW)) ## Since we need binary of the shape as feature map

        for b in range(B):
            for f in range(F):
                for i in range(L_pool_h):
                    for j in range(L_pool_w):
                        start_h = i * self.stride
                        end_h = start_h + self.pool_size
                        start_w = j * self.stride
                        end_w = start_w + self.pool_size

                        window = x[b, f, start_h:end_h, start_w:end_w]
                        max_val = np.max(window)
                        pooled[b, f, i, j] = max_val

                        # Binary mask for backprop (optional for now)
                        max_indices = np.where(window == max_val)
                        self.binary_max_pool[b, f, start_h:end_h, start_w:end_w][max_indices] = 1
        return pooled
    
    def generate_flattened_array(self, pooled_array):
        '''
        pooled array : [B,F,L_pooled_H, L_pooled_W]
        '''
        B= pooled_array.shape[0]
        return pooled_array.reshape(B,-1)
    
    def unpool(self,pooled_grad, binary_mask):
        '''
        pooled_grad : (B,F,L_Pooled_H, L_pooled_w)
        Binary_mask : (B,F,LH,LW)
        '''

        B,F,LH,LW = binary_mask.shape
        _,_,L_pool_H,L_pool_W = pooled_grad.shape

        unpooled = np.zeros_like(binary_mask, dtype=pooled_grad.dtype)

        for b in range(B):
            for f in range(F):
                for i in range(L_pool_H):
                    for j in range(L_pool_W):
                        start_h = i * self.stride
                        end_h = start_h + self.pool_size
                        start_w = j * self.stride
                        end_w = start_w + self.pool_size

                        ## Gradient for this reagion
                        grad = pooled_grad[b,f,i,j]

                        ## Distributing gradient only to maximum value of that pool
                        mask_window = binary_mask[b,f,start_h:end_h, start_w:end_w]
                        unpooled[b,f,start_h:end_h, start_w:end_w] += mask_window * grad

        return unpooled
        
    def generate_kernal_loss(self,relu_grad):
        '''
        relu_grad = Shape (B,F,LH,LW)
        Filter = shape(F,C_in,KH,KW)
        input = shape(B,H,W,C_in)

        return dl_dk = shape(F,C_in,KH,KW)
        '''

        B,F,LH,LW = relu_grad.shape
        _,C_in,KH,KW = self.kernel.shape
        _,H,W,_ = self.input.shape 
        L_out_H = LH - KH + 1
        L_out_W = LW - KW + 1

        dl_dk = np.zeros_like(self.kernel)


        for f in range(F):
            for c in range(C_in):
                for b in range(B):
                    x = self.input[b, :, :, c] ## Shape (H,W)
                    grad_out = relu_grad[b,f] ## Shape (LH,LW) : it means take windows from x, each window has it's corresponding loss
                    for i in range(L_out_H):
                        for j in range(L_out_W):
                            ## Let's make a window so that each element from that window will get multiplied to a kernal value and then we can get the loss
                            window = x[i:i+KH,j:j+KW]

                            ## Multiply input gradient with output value and then add to loss of each kernal value
                            dl_dk[f,c] += window * grad_out[i,j]

        ## Taking average of batch to remove the factor of big or small batch
        dl_dk /= self.B

        return dl_dk


    def feature_extraction(self):
        feature_map = self.generate_feature_map()
        self.relu_feature_map = self.relu(feature_map)
        self.max_pool = self.generate_max_pool(self.relu_feature_map)

        return self.max_pool
    
    def forward_decision_making(self, pooled_array):
        '''
        pooled array : [B,F,L_pooled_H, L_pooled_W]
        '''
        B,F,L_pooled_H,L_pooled_W = pooled_array.shape
        #Now Flapttened array

        self.flattened_array = self.generate_flattened_array(pooled_array)

        ## Now pass it with dense layer
        N_in = F * L_pooled_H * L_pooled_W
        self.no_of_outputs = 10
        self.W_dense = np.random.randn(N_in, self.no_of_outputs)
        self.B_dense = np.random.randn(self.no_of_outputs)

        output = self.flattened_array @ self.W_dense + self.B_dense

        return output # [B,no_of_outputs]
        

    def forward(self):
        max_pooled_arr = self.feature_extraction()
        final_decision = self.forward_decision_making(max_pooled_arr)
        self.probabilities = self.softmax(final_decision)
        return self.probabilities
    
    def backprop_decision_making(self,y_train):

        y_one_hot = np.eye(self.no_of_outputs)[y_train]  # Shape: (B, 10)

        # Compute gradient of loss w.r.t. logits (softmax cross-entropy derivative)
        dl_dz = y_one_hot - self.probabilities  # Shape (B, 10)
        dl_dw = self.flattened_array.T @ dl_dz / self.flattened_array.shape[0] ## flattened array's first item in it's shape will give the B value so shape is [F*L_pool_h*L_pool_w,no_of_outputs]
        dl_db = np.mean(dl_dz,axis=0)

        ## Loss with flattened array to just propogate further
        dl_input = dl_dz @ self.W_dense.T # Shape [B,F*L_pool_h*L_pool_w]

        self.W_dense -= self.lr * dl_dw
        self.B_dense -= self.lr * dl_db

        return dl_input
    
    def backprop_convolution(self,y_train):
        dl_input = self.backprop_decision_making(y_train)

        ## We need to unflatten this before going to unpooling and passing loss to kernel
        dl_unflatten_input = dl_input.reshape(self.B, self.F, self.max_pool.shape[2], self.max_pool.shape[3]) ## Shape [B,F,L_Pooled_H, L_pooled_w]

        ## Unpooling of loss
        dl_unpooled_grad = self.unpool(dl_unflatten_input,self.binary_max_pool) ## Shape (B,F,LH,LW)

        ## Calculating loss with relu layer
        dl_drelu_grad = dl_unpooled_grad * (self.relu_feature_map > 0) ## Shape (B,F,LH,LW)

        ## Passing loss to kernal through feature map
        dk = self.generate_kernal_loss(dl_drelu_grad)

        ## Update kernal values
        self.kernel -= self.lr * dk

    def train_model(self, X, y, epochs=5):
        """
        X: shape (num_samples, 28, 28, 1)
        y: shape (num_samples, 10) or (num_samples,) -> one-hot or integer labels
        """
        num_samples = X.shape[0]
        num_batches = num_samples // self.B

        for epoch in range(epochs):
            epoch_loss = 0

            # Shuffle data at the start of each epoch
            indices = np.arange(num_samples)
            np.random.shuffle(indices)
            X = X[indices]
            y = y[indices]

            for batch_idx in range(num_batches):
                # Create mini-batch
                start = batch_idx * self.B
                end = start + self.B
                X_batch = X[start:end]
                y_batch = y[start:end]

                # ✅ If labels are one-hot, convert to integer labels
                if y_batch.ndim > 1:
                    y_batch_indices = np.argmax(y_batch, axis=1)
                else:
                    y_batch_indices = y_batch

                # Save input for convolution
                self.input = X_batch

                # Forward pass
                probs = self.forward()  # -> (B, 10)
            
                # Compute loss
                loss = self.compute_loss(probs, y_batch_indices)
                epoch_loss += loss

                # Backward pass
                self.backprop_convolution(y_batch_indices)

                if batch_idx % 10 == 0:
                    print(f"Epoch {epoch+1}, Batch {batch_idx}/{num_batches}, Loss: {loss:.4f}")

            avg_epoch_loss = epoch_loss / num_batches
            print(f"Epoch [{epoch+1}/{epochs}]  Loss: {avg_epoch_loss:.4f}")

    def predict(self, X):
        """
        X: shape (num_samples, H, W, C)
        Returns: predicted class indices -> (num_samples,)
        """
        num_samples = X.shape[0]
        all_preds = []

        # Process in batches for efficiency
        num_batches = int(np.ceil(num_samples / self.B))
        for batch_idx in range(num_batches):
            start = batch_idx * self.B
            end = min(start + self.B, num_samples)

            X_batch = X[start:end]
            self.input = X_batch

            probs = self.forward()  # -> (B, 10)
            preds = np.argmax(probs, axis=1)
            all_preds.extend(preds)

        return np.array(all_preds)


    def evaluate(self, X_test, y_test):
        """
        X_test: shape (num_samples, H, W, C)
        y_test: shape (num_samples, 10) or (num_samples,) -> one-hot or integer labels
        Returns: accuracy (float)
        """
        # ✅ Convert one-hot labels to integers if necessary
        if y_test.ndim > 1:
            y_true = np.argmax(y_test, axis=1)
        else:
            y_true = y_test

        # Get predictions
        y_pred = self.predict(X_test)

        # Compute accuracy
        accuracy = np.mean(y_pred == y_true)
        print(f"Evaluation Accuracy: {accuracy * 100:.2f}%")
        return accuracy






In [None]:
model = HandwrittenNumberClassification()
## Since it is very slow I used smaller dataset.
model.train_model(X=x_train[:100],y=y_train[:100],epochs=10)

Epoch 1, Batch 0/3, Loss: 17.4368
Epoch [1/10]  Loss: 16.5764
Epoch 2, Batch 0/3, Loss: 18.4559
Epoch [2/10]  Loss: 17.2652
Epoch 3, Batch 0/3, Loss: 17.9892
Epoch [3/10]  Loss: 18.2082
Epoch 4, Batch 0/3, Loss: 15.1071
Epoch [4/10]  Loss: 15.1830
Epoch 5, Batch 0/3, Loss: 18.1809
Epoch [5/10]  Loss: 18.1201
Epoch 6, Batch 0/3, Loss: 16.0751
Epoch [6/10]  Loss: 18.0196
Epoch 7, Batch 0/3, Loss: 14.2472
Epoch [7/10]  Loss: 17.2853
Epoch 8, Batch 0/3, Loss: 13.9096
Epoch [8/10]  Loss: 16.9389
Epoch 9, Batch 0/3, Loss: 17.3788
Epoch [9/10]  Loss: 16.8857
Epoch 10, Batch 0/3, Loss: 19.5680
Epoch [10/10]  Loss: 18.5139
