# Neural Network to train MNIST dataset from scratch with numpy

#### Import required packages

In [None]:
import numpy as np
import idx2numpy

In [None]:
rel_path_train_images = 'MNIST_data/train-images-idx3-ubyte'
rel_path_train_labels = 'MNIST_data/train-labels-idx1-ubyte'
rel_path_test_images = 'MNIST_data/t10k-images-idx3-ubyte'
rel_path_test_labels = 'MNIST_data/t10k-labels-idx1-ubyte'

##### Function to get one-hot encoded labels

In [None]:
def one_hot_encode(arr):
    arr = arr.flatten()
    num_classes = np.max(arr) + 1
    one_hot = np.eye(num_classes)[arr]
    return one_hot

#### Load Training and Test Data

In [25]:
X_train = idx2numpy.convert_from_file(rel_path_train_images)
X_train = X_train.reshape((60000, 28*28))    
X_train = X_train/255

Y_train = idx2numpy.convert_from_file(rel_path_train_labels)
Y_train = one_hot_encode(Y_train)

X_test = idx2numpy.convert_from_file(rel_path_test_images)
X_test = X_test.reshape((10000, 28*28))    
X_test = X_test/255

Y_test = idx2numpy.convert_from_file(rel_path_test_labels)
Y_test = one_hot_encode(Y_test)

#### Define hyperparameters

In [None]:
input_size = 784
hidden_size = 128
output_size = 10
batch_size = 64
epochs = 10
learning_rate = 0.1

#### Defining Activation and Loss Functions

In [26]:
def relu(input):
        output = np.maximum(0, input)
        return output

def grad_relu(input):
    grad = np.where(input > 0, 1, 0)
    return grad

def softmax(inputs):
        exp_values = np.exp(inputs - np.max(inputs, axis=1, keepdims=True))
        output = exp_values / np.sum(exp_values, axis=1, keepdims=True)
        return output

def ce_loss(y_pred, y_true):
    loss = -np.mean(np.sum(y_true * np.log(y_pred + 1e-15), axis = 1))
    return loss

#### Initialize weights and Train the model

In [27]:
W1 = np.random.randn(input_size, hidden_size) * np.sqrt(2.0 / input_size)
B1 = np.zeros((1, hidden_size))
W2 = np.random.randn(hidden_size, output_size) * np.sqrt(2.0 / hidden_size)
B2 = np.zeros((1, output_size))


for epoch in range(epochs):
    epoch_loss = 0.0
    num_batches = 0

    for i in range(0, X_train.shape[0], batch_size):
        x = X_train[i:i+batch_size]
        y = Y_train[i:i+batch_size]
    
        z1 = np.dot(x, W1) + B1
        a1 = relu(z1)
        z2 = np.dot(a1, W2) + B2
        a2 = softmax(z2)
    
        batch_loss = ce_loss(a2, y)
        
        epoch_loss += batch_loss
        num_batches += 1
    
        dz2 = (a2 - y)
        dW2 = (1 / batch_size) * np.dot(a1.T, dz2)
        dB2 = (1 / batch_size) * np.sum(dz2, axis=0, keepdims=True)
    
        da1 = np.dot(dz2, W2.T)
        dz1 = da1 * grad_relu(z1)
        dW1 = (1 / batch_size) * np.dot(x.T, dz1)
        dB1 = (1 / batch_size) * np.sum(dz1, axis=0, keepdims=True)
    
        W2 -= learning_rate * dW2
        B2 -= learning_rate * dB2
        W1 -= learning_rate * dW1
        B1 -= learning_rate * dB1
    
    
    z1 = np.dot(X_train, W1) + B1
    a1 = relu(z1)
    z2 = np.dot(a1, W2) + B2
    a2 = softmax(z2)
    predictions = np.argmax(a2, axis=1)
    targets = np.argmax(Y_train, axis=1)
    accuracy = np.mean(predictions == targets)

    
    print(f"Epoch {epoch+1}, Loss: {epoch_loss/X_train.shape[0]:.4f}, Accuracy: {accuracy:.4f}")

Epoch 1, Loss: 0.0057, Accuracy: 0.9292
Epoch 2, Loss: 0.0031, Accuracy: 0.9498
Epoch 3, Loss: 0.0023, Accuracy: 0.9612
Epoch 4, Loss: 0.0019, Accuracy: 0.9686
Epoch 5, Loss: 0.0016, Accuracy: 0.9730
Epoch 6, Loss: 0.0014, Accuracy: 0.9764
Epoch 7, Loss: 0.0012, Accuracy: 0.9791
Epoch 8, Loss: 0.0011, Accuracy: 0.9815
Epoch 9, Loss: 0.0010, Accuracy: 0.9832
Epoch 10, Loss: 0.0009, Accuracy: 0.9845


#### Evaluate model on test data

In [28]:
z1_test = np.dot(X_test, W1) + B1
a1_test = relu(z1_test)
z2_test = np.dot(a1_test, W2) + B2
a2_test = softmax(z2_test)

predictions_test = np.argmax(a2_test, axis=1)
targets_test = np.argmax(Y_test, axis=1)
test_acc = np.mean(predictions_test == targets_test)
print(f"Epoch {epoch+1} Test Accuracy: {test_acc:.4f}")

Epoch 10 Test Accuracy: 0.9742
