## Classificaiton of MNIST [numpy]

In [1]:
import numpy as np

### Data

- Train images: t10k-images-idx3-ubyte.gz
- Train labels: t10k-labels-idx1-ubyte.gz
- Test images: train-images-idx3-ubyte.gz
- Test labels: train-labels-idx1-ubyte.gz

In [2]:
import os
import gzip

def load_mnist_images(data_dir, filename):
    data_path = os.path.join(data_dir, filename)
    with gzip.open(data_path, 'rb') as f:
        data = np.frombuffer(f.read(), np.uint8, offset=16)
    return data.reshape(-1, 28, 28)

def load_mnist_labels(data_dir, filename):
    data_path = os.path.join(data_dir, filename)
    with gzip.open(data_path, 'rb') as f:
        data = np.frombuffer(f.read(), np.uint8, offset=8)
    return data

data_dir = "/mnt/d/datasets/fashion_mnist_29M/"
x_train = load_mnist_images(data_dir, "train-images-idx3-ubyte.gz")
y_train = load_mnist_labels(data_dir, "train-labels-idx1-ubyte.gz")
x_test = load_mnist_images(data_dir, "t10k-images-idx3-ubyte.gz")
y_test = load_mnist_labels(data_dir, "t10k-labels-idx1-ubyte.gz")

print(f">> Train images: {x_train.shape}, {x_train.dtype}")
print(f">> Train labels: {y_train.shape}, {y_train.dtype}")
print(f">> Test images:  {x_test.shape}, {x_test.dtype}")
print(f">> Test labels:  {y_test.shape}, {y_test.dtype}")

>> Train images: (60000, 28, 28), uint8
>> Train labels: (60000,), uint8
>> Test images:  (10000, 28, 28), uint8
>> Test labels:  (10000,), uint8


### Preprocessing

In [3]:
def one_hot(y, n_classes):
    return np.eye(n_classes)[y]

x_train_scaled = x_train.astype(np.float32).reshape(-1, 28*28) / 255
x_test_scaled = x_test.astype(np.float32).reshape(-1, 28*28) / 255

y_train_onehot = one_hot(y_train, n_classes=10).astype(np.int64)
y_test_onehot = one_hot(y_test, n_classes=10).astype(np.int64)

print(f">> Train images: {x_train_scaled.shape}, {x_train_scaled.dtype}")
print(f">> Train labels: {y_train_onehot.shape}, {y_train_onehot.dtype}")
print(f">> Test images:  {x_test_scaled.shape}, {x_test_scaled.dtype}")
print(f">> Test labels:  {y_test_onehot.shape}, {y_test_onehot.dtype}")

>> Train images: (60000, 784), float32
>> Train labels: (60000, 10), int64
>> Test images:  (10000, 784), float32
>> Test labels:  (10000, 10), int64


### Modeling

In [4]:
## Model: 2-layer MLP
np.random.seed(42)
input_size, hidden_size, output_size = 28*28, 256, 10

w1 = np.random.randn(input_size, hidden_size)   # weight of 1st layer
b1 = np.zeros(hidden_size)                      # bias of 1st layer
w2 = np.random.randn(hidden_size, output_size)  # weight of 2nd layer
b2 = np.zeros(output_size)                      # bias of 2nd layer

### Training

In [5]:
from scipy.special import expit as sigmoid

def softmax(x):
    e_x = np.exp(x - np.max(x, axis=1, keepdims=True))
    return e_x / np.sum(e_x, axis=1, keepdims=True)

def cross_entropy(y_pred, y_true):
    batch_size = y_pred.shape[0] if y_pred.ndim == 2 else 1
    return -np.sum(y_true*np.log(y_pred + 1.0E-8))/batch_size

def accuracy(y_pred, y_true):
    return (y_pred.argmax(axis=1) == y_true.argmax(axis=1)).mean()

In [6]:
n_epochs = 500
learning_rate = 0.01

x_train = x_train_scaled
y_train = y_train_onehot

batch_size = 32
for epoch in range(1, n_epochs + 1):
    batch_loss = 0
    batch_acc = 0
    indices = np.random.permutation(len(x_train))
    for i in range(len(x_train) // batch_size):
        x = x_train[indices[i*batch_size: (i+1)*batch_size]]
        y = y_train[indices[i*batch_size: (i+1)*batch_size]]
        
        # Forward propagation
        z1 = np.dot(x, w1) + b1
        a1 = sigmoid(z1)
        z2 = np.dot(a1, w2) + b2
        out = softmax(z2)

        loss = cross_entropy(out, y)
        acc = accuracy(out, y)

        # Backward propagation
        grad_z2 = (z2 - y) / y.shape[0]
        grad_w2 = np.dot(a1.T, grad_z2)
        grad_b2 = np.sum(grad_z2, axis=0)

        grad_a1 = np.dot(grad_z2, w2.T)
        grad_z1 = a1 * (1 - a1) * grad_a1
        grad_w1 = np.dot(x.T, grad_z1)
        grad_b1 = np.sum(grad_z1, axis=0)

        # Update weights and biases
        w1 -= learning_rate * grad_w1
        b1 -= learning_rate * grad_b1
        w2 -= learning_rate * grad_w2
        b2 -= learning_rate * grad_b2
        
        batch_loss += loss
        batch_acc += acc

    if epoch % (n_epochs // 10) == 0:
        print(f"[{epoch}/{n_epochs}] loss: {batch_loss/(i+1):.3f} acc: {batch_acc/(i+1):.3f}")

[50/500] loss: 1.822 acc: 0.738
[100/500] loss: 1.790 acc: 0.774
[150/500] loss: 1.772 acc: 0.788
[200/500] loss: 1.758 acc: 0.797
[250/500] loss: 1.748 acc: 0.803
[300/500] loss: 1.740 acc: 0.808
[350/500] loss: 1.733 acc: 0.813


KeyboardInterrupt: 

In [7]:
x_test = x_test_scaled
y_test = y_test_onehot

# Forward propagation
z1 = np.dot(x_test, w1) + b1
a1 = sigmoid(z1)
z2 = np.dot(a1, w2) + b2
y_pred = softmax(z2)

loss = cross_entropy(y_pred, y_test)
acc = accuracy(y_pred, y_test)

print(f"loss: {loss:.3f} acc: {acc:.3f}")

loss: 1.739 acc: 0.800
