<a href="https://colab.research.google.com/github/mkhi238/NumPy-Neural-Network/blob/main/Numpy_Neural_Network.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
import kagglehub
# Download the "hojjatk/mnist-dataset" dataset through kagglehub and print the
# location that the call returns.
# NOTE(review): `path` is not referenced by any later cell in this notebook; the
# CSVs used below are read from /content/sample_data instead. Confirm whether
# this download is still needed.
path = kagglehub.dataset_download("hojjatk/mnist-dataset")
print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/mnist-dataset


In [None]:
# Read the CSVs with header=None: without it pandas turns the first data row
# into column names (the label column ends up being called "6" / "7") and that
# sample is silently dropped from the data.
# Column 0 is named "label" here; the remaining columns are the features.
# NOTE(review): this assumes the label is the first column of both files.
train_df = pd.read_csv("/content/sample_data/mnist_train_small.csv", header=None).rename(columns={0: "label"})
test_df = pd.read_csv("/content/sample_data/mnist_test.csv", header=None).rename(columns={0: "label"})

In [None]:
# Give the label column a readable name. Its source name is "6" in the training
# frame and "7" in the test frame; columns that do not match are left untouched.
train_df = train_df.rename(columns={"6": "label"})
test_df = test_df.rename(columns={"7": "label"})

In [None]:
# Separate features from labels. The feature matrices are transposed so that
# every column holds one sample; the label vectors are one-dimensional, so a
# transpose would leave them unchanged and is omitted.
# NOTE(review): feature values are used exactly as read from the CSV, with no
# scaling applied. Confirm that this is intended.
train_features = train_df.drop(columns="label")
test_features = test_df.drop(columns="label")

X_train = train_features.values.T
y_train = train_df["label"].values
X_test = test_features.values.T
y_test = test_df["label"].values

In [None]:
# Sanity check: the first column of X_train is a single sample's feature vector.
X_train[:,0].shape

(784,)

Version 1: Full-batch gradient descent with He (Kaiming) initialization and LeakyReLU

In [None]:
import numpy as np

# He (Kaiming) initialization
def init_params(n_input=784, n_hidden=155, n_output=10):
    """Create the parameters of a two-layer network.

    Weights are drawn from a standard normal distribution and scaled by
    sqrt(2 / fan_in); biases start at zero. The defaults reproduce the
    784 -> 155 -> 10 architecture used throughout this notebook.

    Parameters
    ----------
    n_input : int, default 784
        Number of input features (columns of W1).
    n_hidden : int, default 155
        Number of hidden units (rows of W1, columns of W2).
    n_output : int, default 10
        Number of output units (rows of W2).

    Returns
    -------
    tuple of np.ndarray
        (W1, B1, W2, B2) with shapes (n_hidden, n_input), (n_hidden, 1),
        (n_output, n_hidden) and (n_output, 1).
    """
    W1 = np.random.randn(n_hidden, n_input) * np.sqrt(2.0 / n_input)
    B1 = np.zeros((n_hidden, 1))
    W2 = np.random.randn(n_output, n_hidden) * np.sqrt(2.0 / n_hidden)
    B2 = np.zeros((n_output, 1))
    return W1, B1, W2, B2

# LeakyReLU activation
def LeakyReLU(Z, alpha=0.01):
    """Apply the leaky ReLU element-wise.

    Entries strictly greater than zero are returned unchanged; all other
    entries are multiplied by `alpha`.
    """
    is_positive = Z > 0
    leaked = alpha * Z
    return np.where(is_positive, Z, leaked)

def deriv_LeakyReLU(Z, alpha=0.01):
    """Element-wise slope of the leaky ReLU.

    The slope is 1 where Z is strictly positive and `alpha` everywhere else
    (including exactly at zero).
    """
    positive_mask = Z > 0
    return np.where(positive_mask, 1, alpha)

def softmax(Z):
    """Softmax along axis 0, so every column of the result sums to one.

    The column maximum is subtracted before exponentiating, which keeps the
    exponentials from overflowing without changing the result.
    """
    column_max = np.max(Z, axis=0, keepdims=True)
    exponentials = np.exp(Z - column_max)
    column_totals = np.sum(exponentials, axis=0, keepdims=True)
    return exponentials / column_totals

def forward1(X, W1, B1, W2, B2):
    """Run the two-layer network forward on X.

    The hidden layer is an affine map followed by LeakyReLU; the output layer
    is an affine map followed by a column-wise softmax.

    Returns
    -------
    tuple
        (Z1, A1, Z2, A2): hidden pre-activation, hidden activation, output
        pre-activation and softmax output, in that order.
    """
    hidden_pre = W1 @ X + B1
    hidden_act = LeakyReLU(hidden_pre)
    output_pre = W2 @ hidden_act + B2
    output_probs = softmax(output_pre)
    return hidden_pre, hidden_act, output_pre, output_probs

def one_hot(Y, num_classes=None):
    """One-hot encode integer labels, one column per sample.

    Parameters
    ----------
    Y : np.ndarray
        One-dimensional array of integer class labels.
    num_classes : int, optional
        Number of rows in the result. When omitted it is inferred as
        ``Y.max() + 1``, which only matches the true class count if the
        highest class is present in ``Y``; pass it explicitly otherwise.

    Returns
    -------
    np.ndarray
        Array of shape (num_classes, Y.size) holding 1 at (Y[i], i) and 0
        elsewhere.
    """
    if num_classes is None:
        num_classes = Y.max() + 1
    # Built sample-major and transposed at the end, as the callers expect
    # classes along axis 0.
    encoded = np.zeros((Y.size, num_classes))
    encoded[np.arange(Y.size), Y] = 1
    return encoded.T

def backprop(Z1, A1, Z2, A2, W1, W2, X, Y):
    """Compute the parameter gradients for one forward pass.

    The output-layer error is the softmax output minus the one-hot labels;
    it is propagated back through W2 and the LeakyReLU slope at Z1. All
    gradients are averaged over the Y.size samples. Z2 and W1 are accepted
    but not used.

    Returns
    -------
    tuple
        (dW1, dB1, dW2, dB2).
    """
    inv_m = 1 / Y.size
    targets = one_hot(Y)

    # Output layer
    delta_out = A2 - targets
    grad_W2 = (inv_m * delta_out) @ A1.T
    grad_B2 = inv_m * np.sum(delta_out, axis=1, keepdims=True)

    # Hidden layer
    delta_hidden = (W2.T @ delta_out) * deriv_LeakyReLU(Z1)
    grad_W1 = (inv_m * delta_hidden) @ X.T
    grad_B1 = inv_m * np.sum(delta_hidden, axis=1, keepdims=True)

    return grad_W1, grad_B1, grad_W2, grad_B2

def update_params(W1, B1, W2, B2, dW1, dB1, dW2, dB2, alpha):
    """Take one gradient-descent step of size `alpha` on every parameter.

    The update uses augmented assignment, so NumPy arrays passed in are
    modified in place and the same objects are returned.
    """
    params = [W1, B1, W2, B2]
    grads = (dW1, dB1, dW2, dB2)
    for position, grad in enumerate(grads):
        params[position] -= alpha * grad
    return tuple(params)

def get_predictions(A2):
    """Return, for each column of A2, the row index of its largest entry."""
    predicted_rows = np.argmax(A2, axis=0)
    return predicted_rows

def get_accuracy(predictions, Y):
    """Return the fraction of positions where `predictions` equals `Y`."""
    hits = predictions == Y
    return np.mean(hits)

def gradient_descent(X, Y, alpha, iterations):
    """Train the two-layer network with full-batch gradient descent.

    Every iteration runs one forward pass over all of X, back-propagates and
    applies one parameter update. Progress is printed every 10 iterations.

    Parameters
    ----------
    X : np.ndarray
        Input matrix passed to forward1 (one sample per column).
    Y : np.ndarray
        Integer labels, one per column of X.
    alpha : float
        Learning rate.
    iterations : int
        Number of update steps.

    Returns
    -------
    tuple
        The trained parameters (W1, B1, W2, B2).
    """
    W1, B1, W2, B2 = init_params()
    for i in range(iterations):
        Z1, A1, Z2, A2 = forward1(X, W1, B1, W2, B2)
        dW1, dB1, dW2, dB2 = backprop(Z1, A1, Z2, A2, W1, W2, X, Y)
        W1, B1, W2, B2 = update_params(W1, B1, W2, B2, dW1, dB1, dW2, dB2, alpha)
        if i % 10 == 0:
            # A2 comes from the forward pass above, so the reported accuracy
            # reflects the parameters as they were before this update.
            predictions = get_predictions(A2)
            print("Iteration:", i)
            print("Accuracy:", get_accuracy(predictions, Y))
    return W1, B1, W2, B2



In [None]:
# Train version 1: 600 full-batch updates with a learning rate of 0.01.
W1, B1, W2, B2 = gradient_descent(
    X_train,
    y_train,
    alpha=0.01,
    iterations=600,
)


Iteration: 0
Accuracy: 0.0823041152057603
Iteration: 10
Accuracy: 0.10105505275263763
Iteration: 20
Accuracy: 0.09810490524526226
Iteration: 30
Accuracy: 0.027001350067503375
Iteration: 40
Accuracy: 0.19620981049052452
Iteration: 50
Accuracy: 0.2710635531776589
Iteration: 60
Accuracy: 0.21336066803340167
Iteration: 70
Accuracy: 0.31496574828741436
Iteration: 80
Accuracy: 0.28311415570778536
Iteration: 90
Accuracy: 0.37131856592829643
Iteration: 100
Accuracy: 0.3183659182959148
Iteration: 110
Accuracy: 0.4147207360368018
Iteration: 120
Accuracy: 0.391619580979049
Iteration: 130
Accuracy: 0.39476973848692437
Iteration: 140
Accuracy: 0.47677383869193457
Iteration: 150
Accuracy: 0.5021251062553128
Iteration: 160
Accuracy: 0.5110255512775639
Iteration: 170
Accuracy: 0.49712485624281216
Iteration: 180
Accuracy: 0.4430221511075554
Iteration: 190
Accuracy: 0.6469823491174559
Iteration: 200
Accuracy: 0.5688784439221961
Iteration: 210
Accuracy: 0.7072853642682134
Iteration: 220
Accuracy: 0.62288

In [None]:
# Evaluate version 1 on the test set. forward1 returns (Z1, A1, Z2, A2); the
# fourth value is the softmax output, so it is named A2_test here rather than
# the previous, misleading A1_test.
_, _, _, A2_test = forward1(X_test, W1, B1, W2, B2)
test_preds = get_predictions(A2_test)
test_acc = get_accuracy(test_preds, y_test)
print(f"Test accuracy: {test_acc:.4f}")

Test accuracy: 0.8802


Version 2: SGD with He (Kaiming) initialization, LeakyReLU, mini-batches, and the Adam optimizer

In [None]:
import numpy as np

# He (Kaiming) initialization
def init_params(n_input=784, n_hidden=155, n_output=10):
    """Build He-initialised weights and zero biases for a two-layer network.

    Each weight matrix is sampled from a standard normal distribution and
    multiplied by sqrt(2 / fan_in). Calling the function without arguments
    gives the 784 -> 155 -> 10 layout used by this notebook.

    Parameters
    ----------
    n_input : int, default 784
        Size of the input layer.
    n_hidden : int, default 155
        Size of the hidden layer.
    n_output : int, default 10
        Size of the output layer.

    Returns
    -------
    tuple of np.ndarray
        (W1, B1, W2, B2) shaped (n_hidden, n_input), (n_hidden, 1),
        (n_output, n_hidden) and (n_output, 1) respectively.
    """
    W1 = np.random.randn(n_hidden, n_input) * np.sqrt(2.0 / n_input)
    B1 = np.zeros((n_hidden, 1))
    W2 = np.random.randn(n_output, n_hidden) * np.sqrt(2.0 / n_hidden)
    B2 = np.zeros((n_output, 1))
    return W1, B1, W2, B2

# LeakyReLU activation
def LeakyReLU(Z, alpha=0.01):
    """Element-wise leaky ReLU: Z where Z > 0, otherwise alpha * Z."""
    negative_branch = alpha * Z
    keep_as_is = Z > 0
    return np.where(keep_as_is, Z, negative_branch)

def deriv_LeakyReLU(Z, alpha=0.01):
    """Slope of the leaky ReLU at each entry of Z.

    Returns 1 for entries strictly above zero and `alpha` for the rest.
    """
    above_zero = Z > 0
    return np.where(above_zero, 1, alpha)

def softmax(Z):
    """Normalise each column of Z into a probability distribution.

    The per-column maximum is removed first so the exponentials stay finite.
    """
    shifted = Z - np.max(Z, axis=0, keepdims=True)
    numerators = np.exp(shifted)
    denominators = np.sum(numerators, axis=0, keepdims=True)
    return numerators / denominators

def forward2(X, W1, B1, W2, B2):
    """Forward pass: affine -> LeakyReLU -> affine -> column-wise softmax.

    Returns
    -------
    tuple
        (Z1, A1, Z2, A2), i.e. the pre-activation and activation of the
        hidden layer followed by those of the output layer.
    """
    pre_hidden = W1 @ X + B1
    act_hidden = LeakyReLU(pre_hidden)
    pre_output = W2 @ act_hidden + B2
    act_output = softmax(pre_output)
    return pre_hidden, act_hidden, pre_output, act_output

def one_hot(Y, num_classes):
    """One-hot encode the labels in Y as a (num_classes, Y.size) float32 array.

    Column i contains a single 1.0, placed in row Y[i].
    """
    n_samples = Y.size
    encoded = np.zeros((num_classes, n_samples), dtype=np.float32)
    sample_columns = np.arange(n_samples)
    encoded[Y, sample_columns] = 1.0
    return encoded

def backprop(Z1, A1, Z2, A2, W1, W2, X, Y):
    """Gradients of the network parameters for one batch.

    The labels are one-hot encoded with as many classes as A2 has rows. The
    output error (A2 minus the targets) is pushed back through W2 and the
    LeakyReLU slope at Z1; every gradient is averaged over the Y.size samples.
    Z2 and W1 are accepted but not used.

    Returns
    -------
    tuple
        (dW1, dB1, dW2, dB2).
    """
    scale = 1 / Y.size
    targets = one_hot(Y, num_classes=A2.shape[0])

    # Output layer
    err_out = A2 - targets
    grad_W2 = (scale * err_out) @ A1.T
    grad_B2 = scale * np.sum(err_out, axis=1, keepdims=True)

    # Hidden layer
    err_hidden = (W2.T @ err_out) * deriv_LeakyReLU(Z1)
    grad_W1 = (scale * err_hidden) @ X.T
    grad_B1 = scale * np.sum(err_hidden, axis=1, keepdims=True)

    return grad_W1, grad_B1, grad_W2, grad_B2

def update_params(W1, B1, W2, B2, dW1, dB1, dW2, dB2, alpha):
    """Subtract `alpha` times each gradient from the matching parameter.

    Augmented assignment is used, so NumPy arrays are updated in place and
    the returned tuple refers to the same array objects that were passed in.
    """
    updated = [W1, B1, W2, B2]
    for slot, gradient in enumerate((dW1, dB1, dW2, dB2)):
        updated[slot] -= alpha * gradient
    return tuple(updated)

def get_predictions(A2):
    """Pick the row with the highest value in every column of A2."""
    winning_rows = np.argmax(A2, axis=0)
    return winning_rows

def get_accuracy(predictions, Y):
    """Share of entries in `predictions` that are equal to the entry in `Y`."""
    is_correct = predictions == Y
    return np.mean(is_correct)

def iter_minibatch_generator(X, Y, batchsize=256, shuffle=True):
    """Yield (X_batch, Y_batch) pairs that together cover every sample once.

    Samples are the columns of X and the entries of Y. When `shuffle` is true
    the sample order is randomised with np.random.shuffle before slicing.
    Each batch holds `batchsize` samples, except possibly the last one.
    """
    n_samples = X.shape[1]
    order = np.arange(n_samples)
    if shuffle:
        np.random.shuffle(order)

    for start in range(0, n_samples, batchsize):
        stop = start + batchsize
        chosen = order[start:stop]
        yield X[:, chosen], Y[chosen]


def gradient_descent_adam(X, Y, alpha=1e-3, iterations=200, batch_size=256, beta1=0.9, beta2=0.999, e=1e-8):
    """Train the two-layer network with mini-batches and the Adam optimizer.

    Each of the `iterations` outer passes walks through every mini-batch
    produced by iter_minibatch_generator (i.e. one epoch) and applies one Adam
    update per batch. Every 10th epoch the accuracy on the full (X, Y) is
    computed and printed.

    Parameters
    ----------
    X : np.ndarray
        Input matrix with one sample per column.
    Y : np.ndarray
        Integer labels, one per column of X.
    alpha : float, default 1e-3
        Adam step size.
    iterations : int, default 200
        Number of epochs.
    batch_size : int, default 256
        Number of samples per mini-batch.
    beta1, beta2 : float
        Decay rates of the first- and second-moment running averages.
    e : float, default 1e-8
        Constant added to the denominator of the update.

    Returns
    -------
    tuple
        The trained parameters (W1, B1, W2, B2).
    """
    W1, B1, W2, B2 = init_params()

    # First (m*) and second (v*) moment estimates, one pair per parameter.
    mW1 = np.zeros_like(W1)
    vW1 = np.zeros_like(W1)
    mB1 = np.zeros_like(B1)
    vB1 = np.zeros_like(B1)
    mW2 = np.zeros_like(W2)
    vW2 = np.zeros_like(W2)
    mB2 = np.zeros_like(B2)
    vB2 = np.zeros_like(B2)
    t = 0  # number of updates applied so far; drives the bias correction

    def adam_step(param, grad, m, v, t):
        """Return the updated (param, m, v) after one Adam step at time t."""
        m = beta1 * m + (1 - beta1) * grad
        v = beta2 * v + (1 - beta2) * (grad * grad)
        # Bias-corrected moments: compensate for the zero initialisation.
        m_hat = m / (1 - beta1 ** t)
        v_hat = v / (1 - beta2 ** t)
        param = param - alpha * m_hat / (np.sqrt(v_hat) + e)
        return param, m, v

    for i in range(iterations):

        for X_batch, Y_batch in iter_minibatch_generator(X, Y, batch_size):
            Z1, A1, Z2, A2 = forward2(X_batch, W1, B1, W2, B2)
            dW1, dB1, dW2, dB2 = backprop(Z1, A1, Z2, A2, W1, W2, X_batch, Y_batch)
            t += 1
            W1, mW1, vW1 = adam_step(W1, dW1, mW1, vW1, t)
            B1, mB1, vB1 = adam_step(B1, dB1, mB1, vB1, t)
            W2, mW2, vW2 = adam_step(W2, dW2, mW2, vW2, t)
            B2, mB2, vB2 = adam_step(B2, dB2, mB2, vB2, t)

        # The full-dataset forward pass is only needed for the progress line,
        # so it is skipped on epochs that do not report.
        if i % 10 == 0:
            _, _, _, A2_full = forward2(X, W1, B1, W2, B2)
            preds = get_predictions(A2_full)
            acc = get_accuracy(preds, Y)
            print(f"Epoch {i:02d} | Accuracy: {acc:.4f}")

    return W1, B1, W2, B2



In [None]:
# Train version 2: 100 epochs of Adam on mini-batches of 64 samples.
W1, B1, W2, B2 = gradient_descent_adam(
    X_train,
    y_train,
    alpha=0.01,
    iterations=100,
    batch_size=64,
    beta1=0.9,
    beta2=0.999,
    e=1e-8,
)

Epoch 00 | Accuracy: 0.8543
Epoch 10 | Accuracy: 0.8859
Epoch 20 | Accuracy: 0.8878
Epoch 30 | Accuracy: 0.9318
Epoch 40 | Accuracy: 0.9345
Epoch 50 | Accuracy: 0.9334
Epoch 60 | Accuracy: 0.9574
Epoch 70 | Accuracy: 0.9540
Epoch 80 | Accuracy: 0.9665
Epoch 90 | Accuracy: 0.9689


In [None]:
# Evaluate version 2 on the test set: only the softmax output (the last value
# returned by forward2) is needed to derive predictions.
*_, A2_test = forward2(X_test, W1, B1, W2, B2)
test_preds = get_predictions(A2_test)
test_acc = get_accuracy(test_preds, y_test)
print(f"Test accuracy: {test_acc:.4f}")

Test accuracy: 0.9165
