## Classificaiton of MNIST [numpy]

In [1]:
import numpy as np

### Data

- Train images: t10k-images-idx3-ubyte.gz
- Train labels: t10k-labels-idx1-ubyte.gz
- Test images: train-images-idx3-ubyte.gz
- Test labels: train-labels-idx1-ubyte.gz

In [2]:
import os
import gzip

def load_mnist_images(data_dir, filename):
    data_path = os.path.join(data_dir, filename)
    with gzip.open(data_path, 'rb') as f:
        data = np.frombuffer(f.read(), np.uint8, offset=16)
    return data.reshape(-1, 28, 28)

def load_mnist_labels(data_dir, filename):
    data_path = os.path.join(data_dir, filename)
    with gzip.open(data_path, 'rb') as f:
        data = np.frombuffer(f.read(), np.uint8, offset=8)
    return data

data_dir = "/mnt/d/datasets/fashion_mnist_29M/"
x_train = load_mnist_images(data_dir, "train-images-idx3-ubyte.gz")
y_train = load_mnist_labels(data_dir, "train-labels-idx1-ubyte.gz")
x_test = load_mnist_images(data_dir, "t10k-images-idx3-ubyte.gz")
y_test = load_mnist_labels(data_dir, "t10k-labels-idx1-ubyte.gz")

print(f">> Train images: {x_train.shape}, {x_train.dtype}")
print(f">> Train labels: {y_train.shape}, {y_train.dtype}")
print(f">> Test images:  {x_test.shape}, {x_test.dtype}")
print(f">> Test labels:  {y_test.shape}, {y_test.dtype}")

>> Train images: (60000, 28, 28), uint8
>> Train labels: (60000,), uint8
>> Test images:  (10000, 28, 28), uint8
>> Test labels:  (10000,), uint8


### Preprocessing

In [3]:
def one_hot(y, n_classes):
    return np.eye(n_classes)[y]

x_train_scaled = x_train.astype(np.float32).reshape(-1, 28*28) / 255
x_test_scaled = x_test.astype(np.float32).reshape(-1, 28*28) / 255

y_train_onehot = one_hot(y_train, n_classes=10).astype(np.int64)
y_test_onehot = one_hot(y_test, n_classes=10).astype(np.int64)

print(f">> Train images: {x_train_scaled.shape}, {x_train_scaled.dtype}")
print(f">> Train labels: {y_train_onehot.shape}, {y_train_onehot.dtype}")
print(f">> Test images:  {x_test_scaled.shape}, {x_test_scaled.dtype}")
print(f">> Test labels:  {y_test_onehot.shape}, {y_test_onehot.dtype}")

>> Train images: (60000, 784), float32
>> Train labels: (60000, 10), int64
>> Test images:  (10000, 784), float32
>> Test labels:  (10000, 10), int64


### Modeling

In [7]:
import torch

## Model: 2-layer MLP
torch.manual_seed(42)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
input_size, hidden_size, output_size = 28*28, 256, 10

w1 = torch.randn(input_size, hidden_size).to(device)
b1 = torch.zeros(hidden_size).to(device)
w2 = torch.randn(hidden_size, output_size).to(device)
b2 = torch.zeros(output_size).to(device)

### Training

In [None]:
def accuracy(y_pred, y_true):
    y_pred = y_pred.argmax(dim=1)
    y_true = y_true.argmax(dim=1)
    return torch.eq(y_pred, y_true).float().mean()

In [12]:
n_epochs = 100
learning_rate = 0.01

x_train = torch.tensor(x_train_scaled).float().to(device)
y_train = torch.tensor(y_train_onehot).float().to(device)

batch_size = 32
for epoch in range(1, n_epochs + 1):
    batch_loss = 0
    batch_acc = 0
    indices = torch.randperm(len(x_train))
    for i in range(len(x_train) // batch_size):
        x = x_train[indices[i*batch_size: (i+1)*batch_size]]
        y = y_train[indices[i*batch_size: (i+1)*batch_size]]
        
        # Forward propagation
        z1 = torch.mm(x, w1) + b1
        a1 = torch.sigmoid(z1)
        z2 = torch.mm(a1, w2) + b2
        out = torch.softmax(z2, dim=1)

        loss = torch.nn.functional.cross_entropy(out, y)
        acc = accuracy(out, y)

        # Backward propagation
        grad_z2 = (z2 - y) / y.shape[0]
        grad_w2 = torch.mm(a1.T, grad_z2)
        grad_b2 = torch.sum(grad_z2, dim=0)

        grad_a1 = torch.mm(grad_z2, w2.T)
        grad_z1 = a1 * (1 - a1) * grad_a1
        grad_w1 = torch.mm(x.T, grad_z1)
        grad_b1 = torch.sum(grad_z1, dim=0)

        # Update weights and biases
        w1 -= learning_rate * grad_w1
        b1 -= learning_rate * grad_b1
        w2 -= learning_rate * grad_w2
        b2 -= learning_rate * grad_b2
        
        batch_loss += loss.item()
        batch_acc += acc.item()

    if epoch % (n_epochs // 10) == 0:
        print(f"[{epoch}/{n_epochs}] loss: {batch_loss/(i+1):.3f} acc: {batch_acc/(i+1):.3f}")

[10/100] loss: 2.242 acc: 0.660
[20/100] loss: 2.240 acc: 0.695
[30/100] loss: 2.238 acc: 0.712
[40/100] loss: 2.237 acc: 0.723
[50/100] loss: 2.235 acc: 0.732
[60/100] loss: 2.234 acc: 0.740
[70/100] loss: 2.233 acc: 0.752
[80/100] loss: 2.232 acc: 0.759
[90/100] loss: 2.231 acc: 0.767
[100/100] loss: 2.230 acc: 0.770


In [15]:
x_test = torch.tensor(x_test_scaled).float().to(device)
y_test = torch.tensor(y_test_onehot).float().to(device)

with torch.no_grad():
    # Forward propagation
    z1 = torch.mm(x_test, w1) + b1
    a1 = torch.sigmoid(z1)
    z2 = torch.mm(a1, w2) + b2
    y_pred = torch.softmax(z2, dim=1)

    loss = torch.nn.functional.cross_entropy(y_pred, y_test)
    acc = accuracy(y_pred, y_test)

print(f"loss: {loss.item():.3f} acc: {acc.item():.3f}")

loss: 2.231 acc: 0.769
