In [7]:
import numpy as np
import pandas as pd

class Neural_Network:
    """Fully connected 784-16-16-10 sigmoid network trained on MNIST-style CSVs.

    Training is plain mini-batch gradient descent on a squared-error loss:
    every sample is fed forward, back-propagated, and its gradient added to
    running sums; after ``batchSize`` samples the averaged gradient is applied.
    Samples are visited in file order (no shuffling), and samples that do not
    fill a final complete batch are skipped in every epoch.

    Each CSV must contain a ``label`` column plus 784 pixel columns holding
    values in 0..255; pixels are scaled to the range 0..1 on load.
    """

    # Layer widths: input pixels, both hidden layers, output classes.
    INPUT_SIZE = 784
    HIDDEN_SIZE = 16
    OUTPUT_SIZE = 10

    def __init__(self, epochs, learningRate, batchSize,
                 trainPath='./mnist-dataset/mnist_train.csv',
                 testPath='./mnist-dataset/mnist_test.csv'):
        """Load both datasets and randomly initialise the parameters.

        Args:
            epochs: Number of full passes over the training set.
            learningRate: Step size applied to the batch-averaged gradient.
            batchSize: Number of samples accumulated per weight update.
            trainPath: CSV holding the training set.
            testPath: CSV holding the test set.

        Raises:
            ValueError: If ``batchSize`` is smaller than 1, or a CSV does not
                have exactly 784 pixel columns.
        """
        if batchSize < 1:
            raise ValueError(f"batchSize must be at least 1, got {batchSize!r}")

        self.epochs = epochs
        self.learningRate = learningRate
        self.batchSize = batchSize

        # Training set; the row count is taken from the file itself.
        self.data_initial = pd.read_csv(trainPath)
        self.data, self.labels = self._splitFrame(self.data_initial)
        self.trainingSetSize = self.labels.size

        # Test set, normalised exactly like the training set.
        self.testInitial = pd.read_csv(testPath)
        self.testData, self.testLabels = self._splitFrame(self.testInitial)
        self.testSetSize = self.testLabels.size

        nIn, nHid, nOut = self.INPUT_SIZE, self.HIDDEN_SIZE, self.OUTPUT_SIZE
        rng = np.random.default_rng()

        self.a_0 = np.zeros([nIn, 1])

        # Weights ~ N(0, 1/fan_in) keep the initial pre-activations small so
        # the sigmoids do not start out saturated; biases ~ N(0, 1).
        self.W_1 = rng.normal(loc=0, scale=1 / np.sqrt(nIn), size=(nHid, nIn))
        self.b_1 = rng.normal(loc=0, scale=1, size=(nHid, 1))
        self.z_1 = np.zeros([nHid, 1])
        self.a_1 = np.zeros([nHid, 1])
        self.error_1 = np.zeros([nHid, 1])

        self.W_2 = rng.normal(loc=0, scale=1 / np.sqrt(nHid), size=(nHid, nHid))
        self.b_2 = rng.normal(loc=0, scale=1, size=(nHid, 1))
        self.z_2 = np.zeros([nHid, 1])
        self.a_2 = np.zeros([nHid, 1])
        self.error_2 = np.zeros([nHid, 1])

        self.W_3 = rng.normal(loc=0, scale=1 / np.sqrt(nHid), size=(nOut, nHid))
        self.b_3 = rng.normal(loc=0, scale=1, size=(nOut, 1))
        self.z_3 = np.zeros([nOut, 1])
        self.a_3 = np.zeros([nOut, 1])
        self.error_out = np.zeros([nOut, 1])

        # Running gradient sums for the current mini-batch.
        self.dW_1 = np.zeros([nHid, nIn])
        self.dB_1 = np.zeros([nHid, 1])
        self.dW_2 = np.zeros([nHid, nHid])
        self.dB_2 = np.zeros([nHid, 1])
        self.dW_3 = np.zeros([nOut, nHid])
        self.dB_3 = np.zeros([nOut, 1])

        # Kept for backward compatibility; backProp builds its own one-hot target.
        self.y = np.zeros([nOut, 1]).astype(int)

    @staticmethod
    def _splitFrame(frame):
        """Split a loaded CSV frame into (pixels scaled to 0..1, label column)."""
        labels = frame['label'].to_numpy().reshape(-1, 1)
        pixels = frame.drop('label', axis=1).to_numpy()
        if pixels.shape[1] != Neural_Network.INPUT_SIZE:
            raise ValueError(
                f"expected {Neural_Network.INPUT_SIZE} pixel columns, "
                f"got {pixels.shape[1]}"
            )
        return pixels / 255.0, labels

    def sigmoid(self, colVector):
        """Return the element-wise logistic function of ``colVector``."""
        # Clip so np.exp cannot overflow for very negative inputs.
        colVector = np.clip(colVector, -500, 500)
        return 1 / (1 + np.exp(-colVector))

    def dSigmoid(self, colVector):
        """Return the element-wise derivative of the sigmoid at ``colVector``."""
        s = self.sigmoid(colVector)  # evaluate once, reuse for both factors
        return s * (1 - s)

    def feedForward(self, x, dataset):
        """Run sample ``x`` through the network, storing every z_k and a_k.

        Args:
            x: Row index into the chosen dataset.
            dataset: ``"training"`` or ``"testing"``.

        Raises:
            ValueError: If ``dataset`` is neither of the two names above.
        """
        if dataset == "training":
            source = self.data
        elif dataset == "testing":
            source = self.testData
        else:
            # Falling through would silently re-run the previous input.
            raise ValueError(
                f"dataset must be 'training' or 'testing', got {dataset!r}"
            )

        # Inputs are already scaled to 0..1, so they are used as-is.
        self.a_0 = source[x, :].reshape(-1, 1)

        # Layer 1
        self.z_1 = np.dot(self.W_1, self.a_0) + self.b_1
        self.a_1 = self.sigmoid(self.z_1)

        # Layer 2
        self.z_2 = np.dot(self.W_2, self.a_1) + self.b_2
        self.a_2 = self.sigmoid(self.z_2)

        # Layer 3 (output layer)
        self.z_3 = np.dot(self.W_3, self.a_2) + self.b_3
        self.a_3 = self.sigmoid(self.z_3)

    def backProp(self, x):
        """Compute the per-layer error terms for training sample ``x``.

        Must follow ``feedForward(x, "training")``, because it reads the
        z_k / a_k values that call stored.
        """
        target = np.zeros([self.OUTPUT_SIZE, 1])
        target[self.labels[x, 0], 0] = 1  # one-hot encoding of the label

        self.error_out = (self.a_3 - target) * self.dSigmoid(self.z_3)
        self.error_2 = np.dot(self.W_3.T, self.error_out) * self.dSigmoid(self.z_2)
        self.error_1 = np.dot(self.W_2.T, self.error_2) * self.dSigmoid(self.z_1)

    def accumulateGradients(self):
        """Add the current sample's gradient to the mini-batch sums."""
        self.dW_1 += np.dot(self.error_1, self.a_0.T)
        self.dB_1 += self.error_1

        self.dW_2 += np.dot(self.error_2, self.a_1.T)
        self.dB_2 += self.error_2

        self.dW_3 += np.dot(self.error_out, self.a_2.T)
        self.dB_3 += self.error_out

    def applyAvgGradient(self):
        """Take one descent step with the batch-averaged gradient, then clear the sums."""
        step = self.learningRate / self.batchSize
        pairs = (
            (self.W_1, self.dW_1), (self.b_1, self.dB_1),
            (self.W_2, self.dW_2), (self.b_2, self.dB_2),
            (self.W_3, self.dW_3), (self.b_3, self.dB_3),
        )
        for param, grad in pairs:
            param -= step * grad  # in-place update of the stored array
            # Clear the sum so the next batch starts from zero; otherwise
            # gradients would keep piling up across batches.
            grad.fill(0)

    def startTraining(self):
        """Train for ``self.epochs`` passes over the training set.

        Only complete batches are processed; the trailing
        ``trainingSetSize % batchSize`` samples are skipped each epoch.
        """
        batchCount = self.trainingSetSize // self.batchSize
        for epoch in range(self.epochs):
            for batch in range(batchCount):
                first = batch * self.batchSize
                for data_index in range(first, first + self.batchSize):
                    self.feedForward(data_index, "training")
                    self.backProp(data_index)
                    self.accumulateGradients()

                self.applyAvgGradient()

            print(f"Finished epoch {epoch}")

    def evaluate(self):
        """Return the percentage of test samples whose arg-max output equals the label."""
        correct = 0
        for x in range(self.testSetSize):
            self.feedForward(x, "testing")
            if np.argmax(self.a_3) == self.testLabels[x, 0]:
                correct += 1
        return (correct / self.testSetSize) * 100

# Build the network, train it, and report accuracy on the test set.
# A batch size of 1 applies an update after every sample; together with the
# raised learning rate this gives faster updates.
nn = Neural_Network(batchSize=1, learningRate=0.5, epochs=10)
nn.startTraining()
accuracy = nn.evaluate()
print("Accuracy of Model:", accuracy, "%")

Finished epoch 0
Finished epoch 1
Finished epoch 2
Finished epoch 3
Finished epoch 4
Finished epoch 5
Finished epoch 6
Finished epoch 7
Finished epoch 8
Finished epoch 9
Accuracy of Model: 93.27 %


### Conclusion:
This was a very frustrating but still very fun project. There are still many things I am trying to learn about this network. For instance, why does the accuracy seem to decrease when I increase the batch size?