In [1]:
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
%matplotlib inline

In [22]:
class Linear:
    def __init__(self, in_features, out_features, bias=True):
        self.weight = np.random.randn(out_features, in_features) * np.sqrt(2.0 / in_features)
        self.bias = np.zeros(out_features) if bias else None
        self.grad_weight = np.zeros_like(self.weight)
        self.grad_bias = np.zeros_like(self.bias) if bias else None

    def forward(self, X):
        self.input = X
        return np.matmul(X, self.weight.T) + self.bias

    def backward(self, grad_output):
        self.grad_weight = np.matmul(grad_output.T, self.input)
        if self.bias is not None:
            self.grad_bias = np.sum(grad_output, axis=0)
        return np.matmul(grad_output, self.weight)

    def zero_grad(self):
        self.grad_weight.fill(0)
        if self.bias is not None:
            self.grad_bias.fill(0)

    def parameters(self):
        params = [{'value': self.weight, 'grad': self.grad_weight}]
        if self.bias is not None:
            params.append({'value': self.bias, 'grad': self.grad_bias})
        return params

class ReLU:
    def forward(self, x):
        self.input = x
        return np.maximum(0, x)

    def backward(self, grad_output):
        return grad_output * (self.input > 0)

class Softmax:
    def forward(self, x):
        exp_x = np.exp(x - np.max(x, axis=-1, keepdims=True))
        self.output = exp_x / np.sum(exp_x, axis=-1, keepdims=True)
        return self.output

    def backward(self, grad_output):
        return grad_output

class MLP:
    def __init__(self):
        self.layers = []

    def add(self, layer):
        self.layers.append(layer)

    def forward(self, x):
        for layer in self.layers:
            x = layer.forward(x)
        return x

    def backward(self, grad_output):
        for layer in reversed(self.layers):
            grad_output = layer.backward(grad_output)

    def zero_grad(self):
        for layer in self.layers:
            if hasattr(layer, 'zero_grad'):
                layer.zero_grad()

    def parameters(self):
        params = []
        for layer in self.layers:
            if hasattr(layer, 'parameters'):
                params.extend(layer.parameters())
        return params

def CrossEntropyLoss(y_true, y_pred, model, lambda_reg=0.00001):
    y_true = y_true.flatten()
    y_pred = np.clip(y_pred, 1e-7, 1 - 1e-7)  # Prevent log(0)
    loss = -np.log(y_pred[np.arange(len(y_true)), y_true])
    # L2 Regularization Term: Sum of squared weights
    l2_reg = 0
    for param in model.parameters():
        l2_reg += np.sum(param['value'] ** 2)
    # Include the regularization term in the loss
    return np.mean(loss) + (lambda_reg / 2) * l2_reg

def CrossEntropyLossBackward(y_true, y_pred):
    y_true = y_true.flatten()
    grad = y_pred.copy()
    grad[np.arange(len(y_true)), y_true] -= 1
    return grad / len(y_true)

class StochasticGradientDescentWithMomentum:
    def __init__(self, parameters, lr, momentum=0.9):
        self.parameters = parameters
        self.lr = lr
        self.momentum = momentum
        # Initialize velocity for each parameter gradient to zero
        self.velocities = [{'grad': np.zeros_like(param['grad'])} for param in parameters]

    def step(self):
        # Update parameters with momentum
        for param, velocity in zip(self.parameters, self.velocities):
            # Compute the velocity update with the momentum term
            velocity['grad'] = self.momentum * velocity['grad'] + param['grad']
            # Update the parameter using the velocity scaled by learning rate
            param['value'] -= self.lr * velocity['grad']


Epoch 1, Loss: 2.304229781651375
Epoch 2, Loss: 2.304246532141078
Epoch 3, Loss: 2.3042389861327033
Epoch 4, Loss: 2.3042450738534472
Epoch 5, Loss: 2.3042416304869584
Epoch 6, Loss: 2.3042460072738375
Epoch 7, Loss: 2.304235586380251
Epoch 8, Loss: 2.30423101829231
Epoch 9, Loss: 2.3042455857886393
Epoch 10, Loss: 2.3042517581310467
Epoch 11, Loss: 2.3042415998487633
Epoch 12, Loss: 2.304237221431178
Epoch 13, Loss: 2.3042469299317285
Epoch 14, Loss: 2.304239810198513
Epoch 15, Loss: 2.30424606657805
Epoch 16, Loss: 2.3042382420315817
Epoch 17, Loss: 2.304248043134171
Epoch 18, Loss: 2.3042407357412698
Epoch 19, Loss: 2.304237094757015
Epoch 20, Loss: 2.304248916864858
Epoch 21, Loss: 2.3042524454588604
Epoch 22, Loss: 2.304240381057896
Epoch 23, Loss: 2.304242399751738
Epoch 24, Loss: 2.3042424035672777
Epoch 25, Loss: 2.3042466082818174
Epoch 26, Loss: 2.3042336726432784
Epoch 27, Loss: 2.30424259993602
Epoch 28, Loss: 2.3042454270067645
Epoch 29, Loss: 2.304245961183281
Epoch 30, L

In [3]:
train_data = pd.read_csv('/Users/matthew/Downloads/archive/mnist_train.csv')
test_data = pd.read_csv('/Users/matthew/Downloads/archive/mnist_test.csv')

print(train_data.shape, test_data.shape)

(60000, 785) (10000, 785)


In [4]:
y_train = train_data.iloc[:, [0]]
X_train = train_data.iloc[:, 1:]
y_test = test_data.iloc[:, [0]]
X_test = test_data.iloc[:, 1:]

print(f"x_label shape: {y_train.shape}")
print(f"x_train shape: {X_train.shape}")
print(f"x_test shape: {y_test.shape}")
print(f"x_test shape: {X_test.shape}")

x_label shape: (60000, 1)
x_train shape: (60000, 784)
x_test shape: (10000, 1)
x_test shape: (10000, 784)


In [5]:
y_train = y_train.to_numpy()
X_train = X_train.to_numpy()
y_test = y_test.to_numpy()
X_test = X_test.to_numpy()


In [19]:
print(np.max(X_train[1]))
print(X_train)

255
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [None]:
model = MLP(Linear(784, 128))
model.add(ReLU())
model.add(Linear(128, 32))
model.add(ReLU())
model.add(Linear(32, 10))
model.add(Softmax())

# Define the optimizer
optimizer = StochasticGradientDescentWithMomentum(model.parameters(), lr=0.0001, momentum=0.9)

# Training settings
epochs = 30
batch_size = 32
num_batches = X_train.shape[0] // batch_size

# Normalizing the input data
X_train = X_train/255
X_test = X_test/255

for epoch in range(epochs):
    model.zero_grad()  # Reset gradients at the start of each epoch
    epoch_loss = 0

    for batch in range(num_batches):
        # Randomly select a batch of indices
        batch_indices = np.random.choice(X_train.shape[0], batch_size, replace=False)
        X_batch = X_train[batch_indices]
        y_batch = y_train[batch_indices]

        model.zero_grad()

        predictions = model.forward(X_batch)

        loss = CrossEntropyLoss(y_batch, predictions, model, lambda_reg=0.00001)
        epoch_loss += loss

        grad_loss = CrossEntropyLossBackward(y_batch, predictions)
        model.backward(grad_loss)

        optimizer.step()

    # Averaging loss over the number of batches
    epoch_loss /= num_batches
    print(f"Epoch {epoch + 1}, Loss: {epoch_loss}")


In [21]:
def evaluate_accuracy(model, x_test, y_test):
    predictions = model.forward(x_test)
    
    predicted_labels = np.argmax(predictions, axis=1)
    
    true_labels = y_test.flatten()
    
    accuracy = np.mean(predicted_labels == true_labels)
    
    return accuracy

accuracy = evaluate_accuracy(model, X_test, y_test)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

Test Accuracy: 10.51%
