In [None]:
import numpy as np
from keras.datasets import mnist

(train_images, train_labels), (test_images, test_labels) = mnist.load_data()

# Flatten every image to a single row of 28*28 values and divide by 255.0
# (presumably mapping 8-bit pixel intensities into [0, 1] -- confirm dtype).
# The generator keeps its loop variable out of the module namespace.
train_images, test_images = (
    images.reshape((-1, 28*28)) / 255.0
    for images in (train_images, test_images)
)

# One-hot encode the labels: indexing the 10x10 identity matrix with an
# integer label selects the row that has a single 1 at that label's position.
train_labels, test_labels = (
    np.eye(10)[labels]
    for labels in (train_labels, test_labels)
)

def sigmoid(x):
    """Return the logistic function 1 / (1 + exp(-x)), element-wise.

    The naive formula overflows in ``exp(-x)`` for large negative ``x``.
    This version only ever exponentiates ``-|x|``, which lies in (0, 1],
    and picks the algebraically equivalent form for each sign of ``x``:

    * ``x >= 0``: ``1 / (1 + exp(-x))``
    * ``x <  0``: ``exp(x) / (1 + exp(x))``

    Args:
        x: A scalar or array-like of real numbers.

    Returns:
        A NumPy scalar for scalar input, otherwise an array with the same
        shape as ``x``, with every value in [0, 1].
    """
    x = np.asarray(x)
    # exp(-|x|) can underflow to 0 but can never overflow.
    decay = np.exp(-np.abs(x))
    for_non_negative = 1 / (1 + decay)
    for_negative = decay / (1 + decay)
    # Indexing with () unwraps a 0-d result to a NumPy scalar (matching what
    # the plain formula returns for scalar input) and is a no-op otherwise.
    return np.where(x >= 0, for_non_negative, for_negative)[()]

def sigmoid_derivative(x):
    """Return ``x * (1 - x)``, element-wise.

    This equals the derivative of the logistic function only when ``x`` is
    already the *output* of the sigmoid (s'(z) = s(z) * (1 - s(z))), not the
    pre-activation value. ``backward_pass`` calls it with the hidden layer's
    sigmoid output.

    Args:
        x: Scalar or array of sigmoid activations.

    Returns:
        The product ``x * (1 - x)``, with the same shape as ``x``.
    """
    complement = 1 - x
    return x * complement

def softmax(x):
    """Return the row-wise softmax of a 2-D array.

    Each row has its maximum subtracted before exponentiation, so the
    largest exponent is exp(0) = 1 and cannot overflow; the shift cancels
    in the ratio and does not change the result.

    Args:
        x: Array with at least two dimensions; normalisation runs along
            axis 1.

    Returns:
        Array of the same shape as ``x`` whose entries along axis 1 are
        non-negative and sum to 1.
    """
    shifted = x - np.max(x, axis=1, keepdims=True)
    numerators = np.exp(shifted)
    denominators = np.sum(numerators, axis=1, keepdims=True)
    return numerators / denominators

input_size = 784  # 28x28 pixels
hidden_size = 64
output_size = 10  # digits 0-9

# Glorot (Xavier) normal initialisation: draw each weight from a zero-mean
# normal with std = sqrt(2 / (fan_in + fan_out)). Unscaled standard-normal
# weights summed over 784 inputs give pre-activations far outside the
# sigmoid's responsive range, so the hidden units start saturated and their
# gradients (a * (1 - a)) are close to zero. Scaling keeps the initial
# pre-activations in a range where the network can learn from the first step.
weights_input_hidden = np.random.randn(input_size, hidden_size) * np.sqrt(
    2.0 / (input_size + hidden_size)
)
# Biases start at zero; the random weights alone break the symmetry.
biases_hidden = np.zeros((1, hidden_size))
weights_hidden_output = np.random.randn(hidden_size, output_size) * np.sqrt(
    2.0 / (hidden_size + output_size)
)
biases_output = np.zeros((1, output_size))

def forward_pass(x):
    """Propagate a batch of inputs through the two-layer network.

    Reads the module-level parameters ``weights_input_hidden``,
    ``biases_hidden``, ``weights_hidden_output`` and ``biases_output``.

    Args:
        x: Input batch; its last dimension must match the first dimension
            of ``weights_input_hidden``.

    Returns:
        A tuple ``(hidden_layer_output, output)``: the sigmoid activations
        of the hidden layer and the softmax of the output layer.
    """
    # Hidden layer: affine map followed by the sigmoid non-linearity.
    hidden_pre_activation = np.dot(x, weights_input_hidden) + biases_hidden
    hidden = sigmoid(hidden_pre_activation)

    # Output layer: affine map on the hidden activations, then softmax.
    logits = np.dot(hidden, weights_hidden_output) + biases_output
    return hidden, softmax(logits)

def backward_pass(x, y, hidden_layer_output, output, learning_rate):
    """Apply one gradient-descent update to the module-level parameters.

    Back-propagates the error of a single batch through the output and
    hidden layers and updates ``weights_hidden_output``, ``biases_output``,
    ``weights_input_hidden`` and ``biases_hidden`` in place.

    Gradients are *summed* over the batch (not averaged), so the effective
    step size grows with the number of rows in ``x``.

    Args:
        x: Input batch that was fed to the forward pass.
        y: Targets with the same shape as ``output`` (one-hot rows in this
            file's training loop).
        hidden_layer_output: Hidden-layer sigmoid activations for ``x``.
        output: Network output for ``x`` (softmax probabilities when it
            comes from ``forward_pass``).
        learning_rate: Multiplier applied to every gradient before it is
            subtracted from its parameter.

    Returns:
        None. The module-level parameters are modified as a side effect.
    """
    # The declaration has to precede every use of these names inside the
    # function: reading ``weights_hidden_output`` before a later ``global``
    # statement is a SyntaxError ("used prior to global declaration").
    global weights_hidden_output, biases_output, weights_input_hidden, biases_hidden

    # ``output - y`` is the gradient w.r.t. the output pre-activations for
    # softmax combined with cross-entropy loss.
    output_error = output - y
    # Push the error back through the (not yet updated) output weights and
    # the sigmoid; sigmoid_derivative expects the sigmoid's output.
    hidden_layer_error = np.dot(output_error, weights_hidden_output.T) * sigmoid_derivative(hidden_layer_output)

    weights_hidden_output_gradient = np.dot(hidden_layer_output.T, output_error)
    biases_output_gradient = np.sum(output_error, axis=0, keepdims=True)

    weights_input_hidden_gradient = np.dot(x.T, hidden_layer_error)
    biases_hidden_gradient = np.sum(hidden_layer_error, axis=0, keepdims=True)

    # Update weights and biases
    weights_hidden_output -= learning_rate * weights_hidden_output_gradient
    biases_output -= learning_rate * biases_output_gradient
    weights_input_hidden -= learning_rate * weights_input_hidden_gradient
    biases_hidden -= learning_rate * biases_hidden_gradient

# Training hyperparameters.
epochs = 10
learning_rate = 0.1
batch_size = 32

for epoch in range(epochs):
    # One sweep over the training set in stored order (no shuffling),
    # updating the parameters after every mini-batch. The final batch may
    # be shorter than batch_size.
    for i in range(0, len(train_images), batch_size):
        x_batch, y_batch = train_images[i:i+batch_size], train_labels[i:i+batch_size]
        hidden_layer_output, output = forward_pass(x_batch)
        backward_pass(x_batch, y_batch, hidden_layer_output, output, learning_rate)

    # End-of-epoch metrics on the full training set. The 1e-8 keeps log()
    # finite when a predicted probability is exactly zero.
    _, train_output = forward_pass(train_images)
    train_loss = -(train_labels * np.log(train_output + 1e-8)).sum(axis=1).mean()
    train_accuracy = (train_output.argmax(axis=1) == train_labels.argmax(axis=1)).mean()

    print(f"Epoch {epoch + 1}, Loss: {train_loss}, Accuracy: {train_accuracy * 100}%")

# Final evaluation on the held-out test set.
_, test_output = forward_pass(test_images)
test_accuracy = (test_output.argmax(axis=1) == test_labels.argmax(axis=1)).mean()

print(f"Test Accuracy: {test_accuracy * 100}%")