In [13]:
import numpy as np
import csv
from google.colab import drive

In [14]:
# Mount Google Drive (via google.colab) at /content/drive so that files stored
# there can be opened with ordinary file paths.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [15]:
class ActivationFunction:
    """Stateless activation functions and their derivatives.

    Every ``*_derivative`` helper expects the activation's OUTPUT
    (``a = f(z)``), not the pre-activation ``z``. For example
    ``sigmoid_derivative(a)`` evaluates ``a * (1 - a)``.
    """

    @staticmethod
    def sigmoid(x):
        """Return the logistic function ``1 / (1 + exp(-x))`` element-wise.

        The value is computed from ``exp(-|x|)``, which lies in (0, 1], so
        large-magnitude inputs cannot overflow the exponential.
        """
        x = np.asarray(x)
        decay = np.exp(-np.abs(x))
        # x >= 0: 1 / (1 + e^-x);  x < 0: e^x / (1 + e^x). Both are the same function.
        result = np.where(x >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
        # Indexing with () turns a 0-d result back into a scalar and leaves
        # n-d arrays as they are.
        return result[()]

    @staticmethod
    def sigmoid_derivative(x):
        """Return ``x * (1 - x)``, where ``x`` is a sigmoid OUTPUT."""
        return x * (1 - x)

    @staticmethod
    def relu(x):
        """Return ``max(0, x)`` element-wise."""
        return np.maximum(0, x)

    @staticmethod
    def relu_derivative(x):
        """Return 1.0 where ``x > 0`` and 0.0 elsewhere, as a float array."""
        return (x > 0).astype(float)

    @staticmethod
    def tanh(x):
        """Return the hyperbolic tangent element-wise."""
        return np.tanh(x)

    @staticmethod
    def tanh_derivative(x):
        """Return ``1 - x**2``, where ``x`` is a tanh OUTPUT."""
        return 1 - x**2

    @staticmethod
    def softmax(x):
        """Return softmax probabilities.

        Inputs with two or more dimensions are normalised along axis 1 (one
        distribution per row of a 2-D batch). A 1-D score vector is normalised
        along its only axis. The maximum is subtracted before exponentiating
        so that large scores do not overflow.
        """
        x = np.asarray(x)
        axis = 1 if x.ndim > 1 else -1
        exp_x = np.exp(x - np.max(x, axis=axis, keepdims=True))
        return exp_x / np.sum(exp_x, axis=axis, keepdims=True)

In [16]:
class MLP:
    """Fully connected classifier with a softmax output layer.

    All hidden layers share one activation, chosen by name ('sigmoid', 'relu'
    or 'tanh'). Training uses full-batch gradient descent with momentum.
    """

    # Activation names that the hidden layers understand.
    _HIDDEN_CHOICES = ('sigmoid', 'relu', 'tanh')

    def __init__(self, layer_sizes, activation='sigmoid'):
        """Create weights, biases and zeroed momentum buffers.

        layer_sizes: sequence of unit counts, input layer first.
        activation:  name of the hidden-layer activation. It is checked the
                     first time a hidden layer is evaluated, not here.
        """
        self.layer_sizes = layer_sizes
        self.activation_name = activation
        self.L = len(layer_sizes) - 1

        self.weights = []
        self.biases = []
        for fan_in, fan_out in zip(layer_sizes[:-1], layer_sizes[1:]):
            # Standard-normal draws scaled by sqrt(2 / fan_in); biases start at zero.
            scale = np.sqrt(2.0 / fan_in)
            self.weights.append(np.random.randn(fan_in, fan_out) * scale)
            self.biases.append(np.zeros((1, fan_out)))

        self.velocity_w = list(map(np.zeros_like, self.weights))
        self.velocity_b = list(map(np.zeros_like, self.biases))

    def _hidden_pair(self):
        """Return ``(activation, derivative)`` callables for the hidden layers.

        Raises ValueError when ``self.activation_name`` is not recognised.
        """
        for candidate in self._HIDDEN_CHOICES:
            if self.activation_name == candidate:
                return (getattr(ActivationFunction, candidate),
                        getattr(ActivationFunction, candidate + '_derivative'))
        raise ValueError(f"Unknown activation: {self.activation_name}")

    def forward(self, X):
        """Run a forward pass and return the last layer's output.

        Stores every layer output in ``self.activations`` (starting with ``X``)
        and every pre-activation in ``self.zs`` for use by ``backward``.
        """
        self.activations = [X]
        self.zs = []

        # Hidden layers: all weight matrices except the last.
        for W, b in zip(self.weights[:-1], self.biases[:-1]):
            pre_activation = np.dot(self.activations[-1], W) + b
            self.zs.append(pre_activation)
            squash, _ = self._hidden_pair()
            self.activations.append(squash(pre_activation))

        if self.L <= 0:
            # No weight layers at all: the input is the output.
            return self.activations[-1]

        logits = np.dot(self.activations[-1], self.weights[-1]) + self.biases[-1]
        self.zs.append(logits)
        self.activations.append(ActivationFunction.softmax(logits))
        return self.activations[-1]

    def backward(self, X, y, learning_rate, momentum=0.0):
        """Run one forward/backward pass over ``X`` and update the parameters.

        y holds integer class labels, one per row of X. Gradients are those of
        the mean softmax cross-entropy. Weights and biases are modified in
        place and returned as ``(self.weights, self.biases)``.
        """
        n_samples = X.shape[0]

        targets = np.zeros((n_samples, self.layer_sizes[-1]))
        targets[np.arange(n_samples), y.astype(int)] = 1

        probabilities = self.forward(X)

        grad_w = [None] * self.L
        grad_b = [None] * self.L

        # Softmax + cross-entropy gives (p - t) as the output-layer delta.
        delta = probabilities - targets
        for layer in range(self.L - 1, -1, -1):
            if layer < self.L - 1:
                # Push the delta back through the next layer's weights and
                # this layer's activation (derivative taken from its output).
                _, slope = self._hidden_pair()
                delta = np.dot(delta, self.weights[layer + 1].T) * slope(self.activations[layer + 1])
            grad_w[layer] = np.dot(self.activations[layer].T, delta) / n_samples
            grad_b[layer] = np.sum(delta, axis=0, keepdims=True) / n_samples

        for layer, (g_w, g_b) in enumerate(zip(grad_w, grad_b)):
            self.velocity_w[layer] = momentum * self.velocity_w[layer] + learning_rate * g_w
            self.velocity_b[layer] = momentum * self.velocity_b[layer] + learning_rate * g_b
            self.weights[layer] -= self.velocity_w[layer]
            self.biases[layer] -= self.velocity_b[layer]

        return self.weights, self.biases

    def predict(self, X):
        """Return the index of the highest-scoring class for each row of ``X``."""
        return np.argmax(self.forward(X), axis=1)

    def accuracy(self, X, y):
        """Return the fraction of rows in ``X`` whose predicted class equals ``y``."""
        return np.mean(self.predict(X) == y)


In [17]:
def load_data(file_path):
    """Read a labelled CSV file into feature and label arrays.

    The first line is treated as a header and skipped. In every remaining row
    the first column is the label and the other columns are feature values.
    Blank lines are ignored, and an empty file yields empty arrays.

    Args:
        file_path: path of the CSV file to read.

    Returns:
        (x, y): ``x`` is a float32 array of the feature columns divided by
        255.0 (presumably 8-bit pixel intensities; confirm against the data),
        and ``y`` is an int64 array of labels.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a cell cannot be parsed as a number.
    """
    data = []
    labels = []

    # newline='' lets the csv module do its own line-ending handling, as its
    # documentation requires for files passed to csv.reader.
    with open(file_path, 'r', newline='') as file:
        reader = csv.reader(file)

        # Skip the header row. The default stops an empty file from raising StopIteration.
        next(reader, None)

        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines, such as a trailing newline.
                continue
            values = np.array(row, dtype=float)
            labels.append(values[0])
            data.append(values[1:])

    x = np.array(data, dtype=np.float32) / 255.0
    y = np.array(labels, dtype=np.int64)

    return x, y

In [18]:


# Training CSV on the mounted Drive; Drive must already be mounted at /content/drive.
file_path = "/content/drive/MyDrive/mnist_train.csv"


# x: float32 feature matrix (values divided by 255.0); y: int64 labels from the first column.
x, y = load_data(file_path)



In [19]:
def backprop_single_layer(x, y, weights, biases, learning_rate):
    """Take one full-batch gradient step for a softmax layer with no hidden units.

    Only ``weights[0]`` and ``biases[0]`` are used; both are updated in place.

    Args:
        x: input batch, one sample per row.
        y: integer class label for each row of ``x``.
        weights, biases: parameter lists; the class count is taken from the
            number of columns of ``weights[0]``.
        learning_rate: step size.

    Returns:
        (weights, biases, loss), where ``loss`` is the mean cross-entropy
        computed BEFORE the update.
    """
    n_samples = x.shape[0]
    n_classes = weights[0].shape[1]

    # One-hot encode the labels.
    targets = np.zeros((n_samples, n_classes))
    targets[np.arange(n_samples), y.astype(int)] = 1

    # Forward pass.
    logits = np.dot(x, weights[0]) + biases[0]
    probabilities = ActivationFunction.softmax(logits)

    # The 1e-8 offset keeps log() finite when a probability is exactly zero.
    log_probabilities = np.log(probabilities + 1e-8)
    loss = -np.sum(targets * log_probabilities) / n_samples

    # For softmax + cross-entropy the gradient w.r.t. the logits is (p - t).
    delta = probabilities - targets
    grad_w = np.dot(x.T, delta) / n_samples
    grad_b = np.sum(delta, axis=0, keepdims=True) / n_samples

    weights[0] -= learning_rate * grad_w
    biases[0] -= learning_rate * grad_b

    return weights, biases, loss

In [20]:
def backprop_two_layer(x, y, weights, biases, learning_rate):
    """Take one full-batch gradient step for a sigmoid-hidden, softmax-output network.

    ``weights[0]``/``biases[0]`` belong to the hidden layer and
    ``weights[1]``/``biases[1]`` to the output layer. All four are updated in
    place.

    Args:
        x: input batch, one sample per row.
        y: integer class label for each row of ``x``.
        weights, biases: two-element parameter lists.
        learning_rate: step size.

    Returns:
        (weights, biases, loss), where ``loss`` is the mean cross-entropy
        computed BEFORE the update.
    """
    m = x.shape[0]
    # Take the class count from the output weight matrix instead of assuming 10,
    # so the one-hot targets always match the prediction width.
    num_classes = weights[1].shape[1]

    y_one_hot = np.zeros((m, num_classes))
    y_one_hot[np.arange(m), y.astype(int)] = 1

    # Forward pass.
    z1 = np.dot(x, weights[0]) + biases[0]
    a1 = ActivationFunction.sigmoid(z1)

    z2 = np.dot(a1, weights[1]) + biases[1]
    predictions = ActivationFunction.softmax(z2)

    # The 1e-8 offset keeps log() finite when a probability is exactly zero.
    loss = -np.sum(y_one_hot * np.log(predictions + 1e-8)) / m

    # Output layer: softmax + cross-entropy gives (p - t) as the delta.
    error2 = predictions - y_one_hot
    dW2 = np.dot(a1.T, error2) / m
    db2 = np.sum(error2, axis=0, keepdims=True) / m

    # Hidden layer: sigmoid_derivative takes the sigmoid OUTPUT a1, not z1.
    error1 = np.dot(error2, weights[1].T) * ActivationFunction.sigmoid_derivative(a1)
    dW1 = np.dot(x.T, error1) / m
    db1 = np.sum(error1, axis=0, keepdims=True) / m

    weights[0] -= learning_rate * dW1
    weights[1] -= learning_rate * dW2
    biases[0] -= learning_rate * db1
    biases[1] -= learning_rate * db2

    return weights, biases, loss



In [21]:
def backprop_multi_layer(x, y, weights, biases, learning_rate):
    """Take one full-batch gradient step for a sigmoid network of any depth.

    Every layer except the last applies a sigmoid; the last applies softmax.
    The arrays inside ``weights`` and ``biases`` are updated in place.

    Args:
        x: input batch, one sample per row.
        y: integer class label for each row of ``x``.
        weights, biases: per-layer parameter lists of equal length.
        learning_rate: step size.

    Returns:
        (weights, biases, loss), where ``loss`` is the mean cross-entropy
        computed BEFORE the update.
    """
    n_samples = x.shape[0]
    depth = len(weights)
    n_classes = weights[-1].shape[1]

    targets = np.zeros((n_samples, n_classes))
    targets[np.arange(n_samples), y.astype(int)] = 1

    # Forward pass: layer_outputs[k] is the input seen by layer k.
    layer_outputs = [x]
    for layer in range(depth - 1):
        pre_activation = np.dot(layer_outputs[-1], weights[layer]) + biases[layer]
        layer_outputs.append(ActivationFunction.sigmoid(pre_activation))

    logits = np.dot(layer_outputs[-1], weights[-1]) + biases[-1]
    probabilities = ActivationFunction.softmax(logits)
    layer_outputs.append(probabilities)

    # The 1e-8 offset keeps log() finite when a probability is exactly zero.
    loss = -np.sum(targets * np.log(probabilities + 1e-8)) / n_samples

    # Backward pass from the output layer down to the first layer.
    gradients = [None] * depth
    delta = probabilities - targets
    for layer in range(depth - 1, -1, -1):
        if layer != depth - 1:
            # The sigmoid derivative is taken from the layer's OUTPUT.
            slope = ActivationFunction.sigmoid_derivative(layer_outputs[layer + 1])
            delta = np.dot(delta, weights[layer + 1].T) * slope
        gradients[layer] = (
            np.dot(layer_outputs[layer].T, delta) / n_samples,
            np.sum(delta, axis=0, keepdims=True) / n_samples,
        )

    # Update only after every gradient has been computed.
    for layer, (grad_w, grad_b) in enumerate(gradients):
        weights[layer] -= learning_rate * grad_w
        biases[layer] -= learning_rate * grad_b

    return weights, biases, loss

In [22]:
def backprop_multi_layer_activation(x, y, weights, biases, learning_rate, activation='sigmoid'):
    """Take one full-batch gradient step with a selectable hidden activation.

    Every layer except the last applies ``activation``; the last applies
    softmax. The arrays inside ``weights`` and ``biases`` are updated in place.

    Args:
        x: input batch, one sample per row.
        y: integer class label for each row of ``x``.
        weights, biases: per-layer parameter lists of equal length.
        learning_rate: step size.
        activation: 'sigmoid', 'relu' or 'tanh'.

    Returns:
        (weights, biases, loss), where ``loss`` is the mean cross-entropy
        computed BEFORE the update.

    Raises:
        ValueError: if ``activation`` is not one of the supported names.
    """
    n_samples = x.shape[0]
    depth = len(weights)
    n_classes = weights[-1].shape[1]

    targets = np.zeros((n_samples, n_classes))
    targets[np.arange(n_samples), y.astype(int)] = 1

    # Resolve the activation and its derivative by name on ActivationFunction.
    chosen = next((name for name in ('sigmoid', 'relu', 'tanh') if activation == name), None)
    if chosen is None:
        raise ValueError(f"Unsupported activation: {activation}")
    hidden_fn = getattr(ActivationFunction, chosen)
    hidden_slope = getattr(ActivationFunction, chosen + '_derivative')

    # Forward pass: outputs[k] is the input seen by layer k.
    outputs = [x]
    for W, b in zip(weights[:-1], biases[:-1]):
        outputs.append(hidden_fn(np.dot(outputs[-1], W) + b))

    logits = np.dot(outputs[-1], weights[-1]) + biases[-1]
    probabilities = ActivationFunction.softmax(logits)
    outputs.append(probabilities)

    # The 1e-8 offset keeps log() finite when a probability is exactly zero.
    loss = -np.sum(targets * np.log(probabilities + 1e-8)) / n_samples

    # Backward pass: compute every layer's delta first, last layer to first.
    deltas = [None] * depth
    deltas[-1] = probabilities - targets
    for layer in range(depth - 2, -1, -1):
        back_signal = np.dot(deltas[layer + 1], weights[layer + 1].T)
        # The derivative helpers take the layer's OUTPUT.
        deltas[layer] = back_signal * hidden_slope(outputs[layer + 1])

    grad_w = [np.dot(outputs[k].T, deltas[k]) / n_samples for k in range(depth)]
    grad_b = [np.sum(deltas[k], axis=0, keepdims=True) / n_samples for k in range(depth)]

    for k in range(depth):
        weights[k] -= learning_rate * grad_w[k]
        biases[k] -= learning_rate * grad_b[k]

    return weights, biases, loss



In [23]:
def train_with_momentum(x_train, y_train, weights, biases, learning_rate=0.01,
                        momentum=0.9, epochs=100, batch_size=None, activation='sigmoid'):
    """Train a softmax-output network by gradient descent with momentum.

    Every layer except the last applies ``activation``; the last applies
    softmax. The arrays inside ``weights`` and ``biases`` are updated in place.

    Args:
        x_train: training inputs, one sample per row.
        y_train: integer class label for each row of ``x_train``.
        weights, biases: per-layer parameter lists of equal length.
        learning_rate: step size.
        momentum: coefficient applied to the previous update.
        epochs: number of passes over the data.
        batch_size: samples per parameter update. ``None`` (the default) uses
            the whole data set in a single update per epoch.
        activation: 'sigmoid', 'relu' or 'tanh'.

    Returns:
        (weights, biases, losses). ``losses`` holds one value per epoch: the
        mean cross-entropy over all samples, each batch being scored with the
        parameters in effect just before its update.

    Raises:
        ValueError: if ``batch_size`` is given but smaller than 1, or if
            ``activation`` is not one of the supported names.
    """
    if batch_size is not None and batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer or None, got {batch_size}")

    if activation == 'sigmoid':
        activation_func = ActivationFunction.sigmoid
        activation_derivative = ActivationFunction.sigmoid_derivative
    elif activation == 'relu':
        activation_func = ActivationFunction.relu
        activation_derivative = ActivationFunction.relu_derivative
    elif activation == 'tanh':
        activation_func = ActivationFunction.tanh
        activation_derivative = ActivationFunction.tanh_derivative
    else:
        # Fail early instead of hitting an unbound name inside the training loop.
        raise ValueError(f"Unsupported activation: {activation}")

    num_layers = len(weights)
    m = x_train.shape[0]
    num_classes = weights[-1].shape[1]

    # Samples per update; never 0, so the batch range below stays valid.
    step = m if batch_size is None else min(int(batch_size), m)
    step = max(step, 1)

    velocity_w = [np.zeros_like(w) for w in weights]
    velocity_b = [np.zeros_like(b) for b in biases]

    losses = []

    print(f"  Training for {epochs} epochs with momentum = {momentum}")

    for epoch in range(epochs):
        indices = np.random.permutation(m)
        x_shuffled = x_train[indices]
        y_shuffled = y_train[indices]

        # Summed (not averaged) negative log-likelihood over the epoch.
        epoch_nll = np.float64(0.0)

        for start in range(0, m, step):
            x_batch = x_shuffled[start:start + step]
            y_batch = y_shuffled[start:start + step]
            n = x_batch.shape[0]

            # Forward pass: activations[k] is the input seen by layer k.
            activations = [x_batch]
            for l in range(num_layers - 1):
                z = np.dot(activations[-1], weights[l]) + biases[l]
                activations.append(activation_func(z))

            z = np.dot(activations[-1], weights[-1]) + biases[-1]
            predictions = ActivationFunction.softmax(z)
            activations.append(predictions)

            y_one_hot = np.zeros((n, num_classes))
            y_one_hot[np.arange(n), y_batch.astype(int)] = 1

            # The 1e-8 offset keeps log() finite when a probability is exactly zero.
            epoch_nll -= np.sum(y_one_hot * np.log(predictions + 1e-8))

            # Backward pass: compute every gradient before touching any weight.
            dW = [None] * num_layers
            db = [None] * num_layers

            error = predictions - y_one_hot
            dW[-1] = np.dot(activations[-2].T, error) / n
            db[-1] = np.sum(error, axis=0, keepdims=True) / n

            for l in reversed(range(num_layers - 1)):
                # The derivative helpers take the layer's OUTPUT.
                error = np.dot(error, weights[l + 1].T) * activation_derivative(activations[l + 1])
                dW[l] = np.dot(activations[l].T, error) / n
                db[l] = np.sum(error, axis=0, keepdims=True) / n

            for l in range(num_layers):
                velocity_w[l] = momentum * velocity_w[l] + learning_rate * dW[l]
                velocity_b[l] = momentum * velocity_b[l] + learning_rate * db[l]
                weights[l] -= velocity_w[l]
                biases[l] -= velocity_b[l]

        loss = epoch_nll / m
        losses.append(loss)

        if epoch % 10 == 0:
            print(f"    Epoch {epoch:3d}: Loss = {loss:.4f}")

    return weights, biases, losses

