In [1]:
import numpy as np
import pandas as pd

In [2]:
data = pd.read_csv('/content/sample_data/mnist_train_small.csv')
data_test_val = pd.read_csv('/content/sample_data/mnist_test.csv')
data.shape

(19999, 785)

In [3]:
data.head()

Unnamed: 0,6,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,...,0.581,0.582,0.583,0.584,0.585,0.586,0.587,0.588,0.589,0.590
0,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,7,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
data = np.array(data)

In [5]:
np.random.shuffle(data)

In [6]:
m, n = data.shape

In [7]:
print(m, n)

19999 785


In [8]:
data_val = data[0:1000].T
Y_val = data_val[0]
X_val = data_val[1:n]
X_val = X_val / 255.

data_train = data[1000:m].T
Y_train = data_train[0]
X_train = data_train[1:n]
X_train = X_train / 255.
_,m_train = X_train.shape

In [9]:
def initial_param():
    W1 = np.random.rand(20, 784) - 0.5  # First hidden layer with 20 neurons
    b1 = np.random.rand(20, 1) - 0.5

    W2 = np.random.rand(15, 20) - 0.5  # Second hidden layer with 15 neurons
    b2 = np.random.rand(15, 1) - 0.5

    W3 = np.random.rand(10, 15) - 0.5  # Output layer (10 classes, connected to 15 neurons)
    b3 = np.random.rand(10, 1) - 0.5

    return W1, b1, W2, b2, W3, b3


In [10]:
def ReLU_Function(Z):
    return np.maximum(0, Z)

In [11]:
def softmax_func(Z):
    A = np.exp(Z) / sum(np.exp(Z))
    return A

In [12]:
def forward_prop(W1, b1, W2, b2, W3, b3, X):
    Z1 = W1.dot(X) + b1
    A1 = ReLU_Function(Z1)

    Z2 = W2.dot(A1) + b2
    A2 = ReLU_Function(Z2)

    Z3 = W3.dot(A2) + b3
    A3 = softmax_func(Z3)

    return Z1, A1, Z2, A2, Z3, A3

In [13]:
def one_hot(Y):
    hot_y = np.zeros((Y.size, Y.max() + 1))
    hot_y[np.arange(Y.size), Y] = 1
    hot_y = hot_y.T
    return hot_y

In [14]:
def derivative_ReLU(Z):
    return Z > 0

In [15]:
def back_prop(Z1, A1, Z2, A2, Z3, A3, W1, W2, W3, X, Y):
    m = Y.size
    hot_y = one_hot(Y)

    # Gradients for Layer 3 (Output Layer)
    dz3 = A3 - hot_y  # Error at the output layer
    dw3 = 1 / m * dz3.dot(A2.T)  # Gradient of W3
    db3 = 1 / m * np.sum(dz3, axis=1, keepdims=True)  # Gradient of b3

    # Gradients for Layer 2
    dz2 = W3.T.dot(dz3) * derivative_ReLU(Z2)
    dw2 = 1 / m * dz2.dot(A1.T)
    db2 = 1 / m * np.sum(dz2, axis=1, keepdims=True)

    # Gradients for Layer 1
    dz1 = W2.T.dot(dz2) * derivative_ReLU(Z1)
    dw1 = 1 / m * dz1.dot(X.T)
    db1 = 1 / m * np.sum(dz1, axis=1, keepdims=True)

    return dw1, db1, dw2, db2, dw3, db3


In [16]:
def get_pred(A3):
    return np.argmax(A3, 0)

In [17]:
def get_acc(pred, Y):
    print(pred, Y)
    return np.sum(pred == Y) / Y.size

In [18]:
def initialize_adam(parameters):
    adam = {}
    for key in parameters:
        adam[f"m_{key}"] = np.zeros_like(parameters[key])
        adam[f"v_{key}"] = np.zeros_like(parameters[key])
    return adam

In [19]:
def adam_update(parameters, gradients, adam, alpha=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8):
    for key in parameters:
        # Update first moment
        adam[f"m_{key}"] = beta1 * adam[f"m_{key}"] + (1 - beta1) * gradients[key]

        # Update second moment
        adam[f"v_{key}"] = beta2 * adam[f"v_{key}"] + (1 - beta2) * (gradients[key] ** 2)

        # Parameter update
        parameters[key] -= alpha * adam[f"m_{key}"] / (np.sqrt(adam[f"v_{key}"]) + epsilon)

    return parameters, adam


In [20]:
def train_nn_adam(X, Y, epochs, alpha=0.001):
    W1, b1, W2, b2, W3, b3 = initial_param()
    parameters = {"W1": W1, "b1": b1, "W2": W2, "b2": b2, "W3": W3, "b3": b3}

    adam = initialize_adam(parameters)

    for t in range(epochs):
        Z1, A1, Z2, A2, Z3, A3 = forward_prop(W1, b1, W2, b2, W3, b3, X)

        dw1, db1, dw2, db2, dw3, db3 = back_prop(Z1, A1, Z2, A2, Z3, A3, W1, W2, W3, X, Y)
        gradients = {"W1": dw1, "b1": db1, "W2": dw2, "b2": db2, "W3": dw3, "b3": db3}

        parameters, adam = adam_update(parameters, gradients, adam, alpha)

        W1, b1 = parameters["W1"], parameters["b1"]
        W2, b2 = parameters["W2"], parameters["b2"]
        W3, b3 = parameters["W3"], parameters["b3"]

        if t % 10 == 0:
            print(f"Iteration {t}:")
            print(f"Accuracy: {get_acc(get_pred(A3), Y)}")

    return W1, b1, W2, b2, W3, b3


In [22]:
# Train
W1, b1, W2, b2, W3, b3 = train_nn_adam(X_train, Y_train, epochs=1500, alpha=0.001)


Iteration 0:
[2 6 2 ... 6 6 2] [1 1 5 ... 7 0 1]
Accuracy: 0.07263540186325596
Iteration 10:
[2 2 9 ... 7 0 2] [1 1 5 ... 7 0 1]
Accuracy: 0.257539870519501
Iteration 20:
[1 1 9 ... 9 0 1] [1 1 5 ... 7 0 1]
Accuracy: 0.5292910153165956
Iteration 30:
[1 1 9 ... 7 0 1] [1 1 5 ... 7 0 1]
Accuracy: 0.6954576556660876
Iteration 40:
[1 1 9 ... 7 0 1] [1 1 5 ... 7 0 1]
Accuracy: 0.7675667140375809
Iteration 50:
[1 1 9 ... 7 0 1] [1 1 5 ... 7 0 1]
Accuracy: 0.8071477446181378
Iteration 60:
[1 1 9 ... 7 0 1] [1 1 5 ... 7 0 1]
Accuracy: 0.8365177114584978
Iteration 70:
[1 1 9 ... 7 0 1] [1 1 5 ... 7 0 1]
Accuracy: 0.8567819358913628
Iteration 80:
[1 1 9 ... 7 0 1] [1 1 5 ... 7 0 1]
Accuracy: 0.8680983209642613
Iteration 90:
[1 1 9 ... 7 0 1] [1 1 5 ... 7 0 1]
Accuracy: 0.8769408916258751
Iteration 100:
[1 1 9 ... 7 0 1] [1 1 5 ... 7 0 1]
Accuracy: 0.8839412600663192
Iteration 110:
[1 1 9 ... 7 0 1] [1 1 5 ... 7 0 1]
Accuracy: 0.8885204484446549
Iteration 120:
[1 1 9 ... 7 0 1] [1 1 5 ... 7 0 1]


In [23]:
#Test
data_test_val = np.array(data_test_val)

X_test_val = data_test_val[:, 1:].T / 255.0
Y_test_val = data_test_val[:, 0]

def test_model(W1, b1, W2, b2, W3, b3, X_test, Y_test):
    Z1, A1, Z2, A2, Z3, A3 = forward_prop(W1, b1, W2, b2, W3, b3, X_test_val)

    predictions = get_pred(A3)

    accuracy = get_acc(predictions, Y_test_val)
    return accuracy

test_accuracy = test_model(W1, b1, W2, b2, W3, b3, X_test_val, Y_test_val)
print(f"Test Accuracy: {test_accuracy:.4f}%")

[2 1 0 ... 4 5 6] [2 1 0 ... 4 5 6]
Test Accuracy: 0.9279%


Gradient Descent

In [None]:
def update_func(W1, b1, W2, b2, W3, b3, dw1, db1, dw2, db2, dw3, db3, alpha):
    W1 = W1 - alpha * dw1
    b1 = b1 - alpha * db1

    W2 = W2 - alpha * dw2
    b2 = b2 - alpha * db2

    W3 = W3 - alpha * dw3
    b3 = b3 - alpha * db3

    return W1, b1, W2, b2, W3, b3


In [None]:
def grad_desc(X, Y, epochs, alpha):
    # Initialize weights and biases for 3 layers
    W1, b1, W2, b2, W3, b3 = initial_param()

    for i in range(epochs):
        Z1, A1, Z2, A2, Z3, A3 = forward_prop(W1, b1, W2, b2, W3, b3, X)

        dw1, db1, dw2, db2, dw3, db3 = back_prop(Z1, A1, Z2, A2, Z3, A3, W1, W2, W3, X, Y)

        W1, b1, W2, b2, W3, b3 = update_func(W1, b1, W2, b2, W3, b3, dw1, db1, dw2, db2, dw3, db3, alpha)

        if i % 10 == 0:
            print('Iteration #: ', i)
            print('Accuracy: ', get_acc(get_pred(A3), Y))

    return W1, b1, W2, b2, W3, b3

In [None]:
# Train
W1, b1, W2, b2, W3, b3 = grad_desc(X_train, Y_train, epochs=1500, alpha=0.13)

Iteration #:  0
[1 6 3 ... 6 3 6] [1 9 6 ... 2 8 7]
Accuracy:  0.13895468182535922
