In [1]:
import numpy as np

In [2]:
with np.load('./mnist.npz') as data:
    X_train = data['x_train']
    y_train = data['y_train']
    X_test = data['x_test']
    y_test = data['y_test']

In [3]:
X_train = X_train.reshape(-1, 28*28) / 255.0
X_test = X_test.reshape(-1, 28*28) / 255.0

In [4]:
y_train = np.eye(10)[y_train]
y_test = np.eye(10)[y_test]

In [5]:
def initialize_weights(input_size, hidden_size, output_size):
    W1 = np.random.randn(input_size, hidden_size) * 0.01
    b1 = np.zeros((1, hidden_size))
    W2 = np.random.randn(hidden_size, output_size) * 0.01
    b2 = np.zeros((1, output_size))
    return W1, b1, W2, b2

In [6]:
def relu(Z):
    return np.maximum(0, Z)

def softmax(Z):
    expZ = np.exp(Z - np.max(Z, axis=1, keepdims=True))
    return expZ / np.sum(expZ, axis=1, keepdims=True)

In [7]:
def forward_propagation(X, W1, b1, W2, b2):
    Z1 = np.dot(X, W1) + b1
    A1 = relu(Z1)
    Z2 = np.dot(A1, W2) + b2
    A2 = softmax(Z2)
    return Z1, A1, Z2, A2

In [8]:
def backward_propagation(X, Y, Z1, A1, Z2, A2, W1, W2):
    m = X.shape[0]
    
    dZ2 = A2 - Y
    dW2 = np.dot(A1.T, dZ2) / m
    db2 = np.sum(dZ2, axis=0, keepdims=True) / m
    
    dA1 = np.dot(dZ2, W2.T)
    dZ1 = dA1 * (Z1 > 0)
    dW1 = np.dot(X.T, dZ1) / m
    db1 = np.sum(dZ1, axis=0, keepdims=True) / m
    
    return dW1, db1, dW2, db2

In [9]:
def update_weights(W1, b1, W2, b2, dW1, db1, dW2, db2, learning_rate):
    W1 -= learning_rate * dW1
    b1 -= learning_rate * db1
    W2 -= learning_rate * dW2
    b2 -= learning_rate * db2
    return W1, b1, W2, b2

In [10]:
def train(X_train, y_train, hidden_size, learning_rate, epochs):
    input_size = X_train.shape[1]
    output_size = y_train.shape[1]
    
    W1, b1, W2, b2 = initialize_weights(input_size, hidden_size, output_size)
    
    for epoch in range(epochs):
        Z1, A1, Z2, A2 = forward_propagation(X_train, W1, b1, W2, b2)
        dW1, db1, dW2, db2 = backward_propagation(X_train, y_train, Z1, A1, Z2, A2, W1, W2)
        W1, b1, W2, b2 = update_weights(W1, b1, W2, b2, dW1, db1, dW2, db2, learning_rate)
        
        if epoch % 10 == 0:
            loss = -np.mean(y_train * np.log(A2))
            print(f'Epoch {epoch}, Loss: {loss}')
    
    return W1, b1, W2, b2

In [12]:
def predict(X, W1, b1, W2, b2):
    _, _, _, A2 = forward_propagation(X, W1, b1, W2, b2)
    return np.argmax(A2, axis=1)

In [13]:
def accuracy(y_pred, y_true):
    return np.mean(y_pred == y_true)

In [11]:
W1, b1, W2, b2 = train(X_train, y_train, hidden_size=128, learning_rate=0.1, epochs=100)

y_pred = predict(X_test, W1, b1, W2, b2)
test_accuracy = accuracy(y_pred, np.argmax(y_test, axis=1))
print(f'Test Accuracy: {test_accuracy}')

Epoch 0, Loss: 0.23024881665440888
Epoch 10, Loss: 0.22904900167344633
Epoch 20, Loss: 0.22683208683754652
Epoch 30, Loss: 0.22215769752625814
Epoch 40, Loss: 0.2128087869723757
Epoch 50, Loss: 0.19645404215756507
Epoch 60, Loss: 0.17293371428189472
Epoch 70, Loss: 0.14665729417783
Epoch 80, Loss: 0.12373508572960028
Epoch 90, Loss: 0.10621010290352065
Test Accuracy: 0.7972
