In [1]:
import numpy as np

In [2]:
#functions of non-linear activations
def f_sigmoid(X, deriv=False):
    if not deriv:
        return 1 / (1 + np.exp(-X))
    else:
        return f_sigmoid(X)*(1 - f_sigmoid(X))


def f_softmax(X):
    Z = np.sum(np.exp(X), axis=1)
    Z = Z.reshape(Z.shape[0], 1)
    return np.exp(X) / Z


In [12]:
import sys
def exit_with_err(err_str):
    print(err_str, file=sys.stderr)
    sys.exit(1)

In [4]:
#Functionality of a single hidden layer
class Layer:
    def __init__(self, size, batch_size, is_input=False, is_output=False,
                 activation=f_sigmoid):
        self.is_input = is_input
        self.is_output = is_output

        # Z is the matrix that holds output values
        self.Z = np.zeros((batch_size, size[0]))
        # The activation function is an externally defined function (with a
        # derivative) that is stored here
        self.activation = activation

        # W is the outgoing weight matrix for this layer
        self.W = None
        # S is the matrix that holds the inputs to this layer
        self.S = None
        # D is the matrix that holds the deltas for this layer
        self.D = None
        # Fp is the matrix that holds the derivatives of the activation function
        self.Fp = None

        if not is_input:
            self.S = np.zeros((batch_size, size[0]))
            self.D = np.zeros((batch_size, size[0]))

        if not is_output:
            self.W = np.random.normal(size=size, scale=1E-4)

        if not is_input and not is_output:
            self.Fp = np.zeros((size[0], batch_size))

    def forward_propagate(self):
        if self.is_input:
            return self.Z.dot(self.W)

        self.Z = self.activation(self.S)
        if self.is_output:
            return self.Z
        else:
            # For hidden layers, we add the bias values here
            self.Z = np.append(self.Z, np.ones((self.Z.shape[0], 1)), axis=1)
            self.Fp = self.activation(self.S, deriv=True).T
            return self.Z.dot(self.W)


In [20]:
class MultiLayerPerceptron:
    def __init__(self, layer_config, batch_size=100):
        self.layers = []
        self.num_layers = len(layer_config)
        self.minibatch_size = batch_size

        for i in range(self.num_layers-1):
            if i == 0:
                print ("Initializing input layer with size {0}.".format(layer_config[i]))
                # Here, we add an additional unit at the input for the bias
                # weight.
                self.layers.append(Layer([layer_config[i]+1, layer_config[i+1]],
                                         batch_size,
                                         is_input=True))
            else:
                print ("Initializing hidden layer with size {0}.".format(layer_config[i]))
                # Here we add an additional unit in the hidden layers for the
                # bias weight.
                self.layers.append(Layer([layer_config[i]+1, layer_config[i+1]],
                                         batch_size,
                                         activation=f_sigmoid))

        print ("Initializing output layer with size {0}.".format(layer_config[-1]))
        self.layers.append(Layer([layer_config[-1], None],
                                 batch_size,
                                 is_output=True,
                                 activation=f_softmax))
        print ("Done!")

    def forward_propagate(self, data):
        # We need to be sure to add bias values to the input
        self.layers[0].Z = np.append(data, np.ones((data.shape[0], 1)), axis=1)

        for i in range(self.num_layers-1):
            self.layers[i+1].S = self.layers[i].forward_propagate()
        return self.layers[-1].forward_propagate()

    def backpropagate(self, yhat, labels):
        
        #exit_with_err("FIND ME IN THE CODE, What is computed in the next line of code?\n")
        #This line computes the error at the output layer by subtracting the true labels from the predicted outputs.
        self.layers[-1].D = (yhat - labels).T
        for i in range(self.num_layers-2, 0, -1):
            # We do not calculate deltas for the bias values
            W_nobias = self.layers[i].W[0:-1, :]
            
            #exit_with_err("FIND ME IN THE CODE, What does this 'for' loop do?\n")
            #This loop iterates backward through the hidden layers of the network, calculating the delta values 
            # for each layer based on the deltas of the next layer and the weights connecting them.
            
            self.layers[i].D = W_nobias.dot(self.layers[i+1].D) * self.layers[i].Fp

    def update_weights(self, eta):
        for i in range(0, self.num_layers-1):
            W_grad = -eta*(self.layers[i+1].D.dot(self.layers[i].Z)).T
            self.layers[i].W += W_grad

    def evaluate(self, train_data, train_labels, test_data, test_labels,
                 num_epochs=70, eta=0.05, eval_train=False, eval_test=True):

        N_train = len(train_labels)*len(train_labels[0])
        N_test = len(test_labels)*len(test_labels[0])

        print ("Training for {0} epochs...".format(num_epochs))
        for t in range(0, num_epochs):
            out_str = "[{0:4d}] ".format(t)
            
            for b_data, b_labels in zip(train_data, train_labels):
                output = self.forward_propagate(b_data)
                self.backpropagate(output, b_labels)
                
                #exit_with_err("FIND ME IN THE CODE, How does weight update is implemented? What is eta?\n")
                # This line updates the weights of the network using the computed deltas and the learning rate (eta). 
                # Eta is a hyperparameter that controls how much to change the weights during each update.
                self.update_weights(eta=eta)

            if eval_train:
                errs = 0
                accuracy_train = 0.0
        
                for b_data, b_labels in zip(train_data, train_labels):
                    output = self.forward_propagate(b_data)
                    yhat = np.argmax(output, axis=1)
                    accuracy_train += np.sum(yhat == np.argmax(b_labels, axis=1))
                    errs += np.sum(1-b_labels[np.arange(len(b_labels)), yhat])

                out_str = ("{0} Training error: {1:.5f} and Training accuracy: {2:.5f}".format(out_str,
                                                           float(errs)/N_train,
                                                           accuracy_train/N_train))
                

            if eval_test:
                errs = 0
                accuracy_test = 0.0
                for b_data, b_labels in zip(test_data, test_labels):
                    output = self.forward_propagate(b_data)
                    yhat = np.argmax(output, axis=1)
                    accuracy_test += np.sum(yhat == np.argmax(b_labels, axis=1))
                    errs += np.sum(1-b_labels[np.arange(len(b_labels)), yhat])

                out_str = ("{0} Test error: {1:.5f} and Test accuracy: {2:.5f}").format(out_str,
                                                       float(errs)/N_test,
                                                       accuracy_test/N_test)

            print (out_str)


In [6]:
def label_to_bit_vector(labels, nbits):
    bit_vector = np.zeros((labels.shape[0], nbits))
    for i in range(labels.shape[0]):
        bit_vector[i, labels[i]] = 1.0

    return bit_vector

In [7]:
def create_batches(data, labels, batch_size, create_bit_vector=False):
    N = data.shape[0]
    print ("Batch size {0}, the number of examples {1}.".format(batch_size,N))

    if N % batch_size != 0:
        print ("Warning in create_minibatches(): Batch size {0} does not " \
              "evenly divide the number of examples {1}.".format(batch_size,N))
    chunked_data = []
    chunked_labels = []
    idx = 0
    while idx + batch_size <= N:
        chunked_data.append(data[idx:idx+batch_size, :])
        if not create_bit_vector:
            chunked_labels.append(labels[idx:idx+batch_size])
        else:
            bit_vector = label_to_bit_vector(labels[idx:idx+batch_size], 10)
            chunked_labels.append(bit_vector)

        idx += batch_size

    return chunked_data, chunked_labels


In [8]:
def prepare_for_backprop(batch_size, Train_images, Train_labels, Valid_images, Valid_labels):
    
    print ("Creating data...")
    batched_train_data, batched_train_labels = create_batches(Train_images, Train_labels,
                                              batch_size,
                                              create_bit_vector=True)
    batched_valid_data, batched_valid_labels = create_batches(Valid_images, Valid_labels,
                                              batch_size,
                                              create_bit_vector=True)
    print ("Done!")


    return batched_train_data, batched_train_labels,  batched_valid_data, batched_valid_labels



In [9]:
from keras.datasets import mnist

2025-11-25 11:15:49.641124: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [10]:
(Xtr, Ltr), (X_test, L_test)=mnist.load_data()

Xtr = Xtr.reshape(60000, 784)
X_test = X_test.reshape(10000, 784)
Xtr = Xtr.astype('float32')
X_test = X_test.astype('float32')
Xtr /= 255
X_test /= 255
print(Xtr.shape[0], 'train samples')
print(X_test.shape[0], 'test samples')


60000 train samples
10000 test samples


In [21]:
batch_size=100;

train_data, train_labels, valid_data, valid_labels=prepare_for_backprop(batch_size, Xtr, Ltr, X_test, L_test)

mlp = MultiLayerPerceptron(layer_config=[784, 100, 100, 10], batch_size=batch_size)

mlp.evaluate(train_data, train_labels, valid_data, valid_labels,
             eval_train=True)

print("Done:)\n")


Creating data...
Batch size 100, the number of examples 60000.
Batch size 100, the number of examples 10000.
Done!
Initializing input layer with size 784.
Initializing hidden layer with size 100.
Initializing hidden layer with size 100.
Initializing output layer with size 10.
Done!
Training for 70 epochs...
[   0]  Training error: 0.44910 and Training accuracy: 0.55090 Test error: 0.44660 and Test accuracy: 0.55340
[   1]  Training error: 0.07512 and Training accuracy: 0.92488 Test error: 0.07370 and Test accuracy: 0.92630
[   2]  Training error: 0.05305 and Training accuracy: 0.94695 Test error: 0.05800 and Test accuracy: 0.94200
[   3]  Training error: 0.03835 and Training accuracy: 0.96165 Test error: 0.04390 and Test accuracy: 0.95610
[   4]  Training error: 0.03050 and Training accuracy: 0.96950 Test error: 0.03790 and Test accuracy: 0.96210
[   5]  Training error: 0.02765 and Training accuracy: 0.97235 Test error: 0.03600 and Test accuracy: 0.96400
[   6]  Training error: 0.02750

### Task 3.1

1.a The principle of backpropagation is that after doing the forward pass and getting a prediction, the model computes how wrong it was using a loss function. The model then goes backward through the network and computes the gradients of this loss for every weight in the model. This is done by using the chain rule to circumvant the problem of calculating a differential. These gradients tell the algorithm how each weight influenced the error and how much. Gradient descent then updates all weights by small amounts in the direction that reduces the loss. This process is repeated layer by layer from the output back to the input.

1.b Softmax is an activation function used at the output of a classifier. It takes all the scores of the last layer and converts them into probabilities that sum to 1. This lets the network express how likely each class is and enables easy predictions. Softmax is usually used with the cross entropy loss which measures how far the predicted probability distribution is from the correct class.

1.c Several different output activation functions exist depending on the task. Sigmoid is used for binary classification. Softmax is used for multi-class classification. Linear output is used for regression. 


2. The classification accuracy on test is 97.4\%

In [22]:
batch_size=100;

train_data, train_labels, valid_data, valid_labels=prepare_for_backprop(batch_size, Xtr, Ltr, X_test, L_test)

mlp = MultiLayerPerceptron(layer_config=[784, 100, 100, 10], batch_size=batch_size)

etas = [0.005, 0.5]
for eta in etas:
    print(f"Evaluating with eta={eta}")
    mlp.evaluate(train_data, train_labels, valid_data, valid_labels,eta=eta,
                 eval_train=True)

    print("Done:)\n")


Creating data...
Batch size 100, the number of examples 60000.
Batch size 100, the number of examples 10000.
Done!
Initializing input layer with size 784.
Initializing hidden layer with size 100.
Initializing hidden layer with size 100.
Initializing output layer with size 10.
Done!
Evaluating with eta=0.005
Training for 70 epochs...
[   0]  Training error: 0.70335 and Training accuracy: 0.29665 Test error: 0.70090 and Test accuracy: 0.29910
[   1]  Training error: 0.64730 and Training accuracy: 0.35270 Test error: 0.64320 and Test accuracy: 0.35680
[   2]  Training error: 0.59963 and Training accuracy: 0.40037 Test error: 0.59790 and Test accuracy: 0.40210
[   3]  Training error: 0.46410 and Training accuracy: 0.53590 Test error: 0.47480 and Test accuracy: 0.52520
[   4]  Training error: 0.20897 and Training accuracy: 0.79103 Test error: 0.19900 and Test accuracy: 0.80100
[   5]  Training error: 0.11450 and Training accuracy: 0.88550 Test error: 0.11060 and Test accuracy: 0.88940
[   6

  return 1 / (1 + np.exp(-X))


[   0]  Training error: 0.88763 and Training accuracy: 0.11237 Test error: 0.88650 and Test accuracy: 0.11350
[   1]  Training error: 0.90070 and Training accuracy: 0.09930 Test error: 0.89680 and Test accuracy: 0.10320
[   2]  Training error: 0.88763 and Training accuracy: 0.11237 Test error: 0.88650 and Test accuracy: 0.11350
[   3]  Training error: 0.90248 and Training accuracy: 0.09752 Test error: 0.90260 and Test accuracy: 0.09740
[   4]  Training error: 0.90128 and Training accuracy: 0.09872 Test error: 0.90200 and Test accuracy: 0.09800
[   5]  Training error: 0.88763 and Training accuracy: 0.11237 Test error: 0.88650 and Test accuracy: 0.11350
[   6]  Training error: 0.89558 and Training accuracy: 0.10442 Test error: 0.89720 and Test accuracy: 0.10280
[   7]  Training error: 0.89558 and Training accuracy: 0.10442 Test error: 0.89720 and Test accuracy: 0.10280
[   8]  Training error: 0.90085 and Training accuracy: 0.09915 Test error: 0.89910 and Test accuracy: 0.10090
[   9]  Tr

3. With $eta = 0.005$ the learning does happen but it takes more epochs to do so. The errors also don't fluctuate as much as with a higher eta.
With $eta = 0.5$ the learning never happens so the error remains very high and the accuracy is very low (close to 0).

In [32]:
#Functionality of a single hidden layer

def f_Relu(X, deriv=False):
    if deriv:
        return (X > 0).astype(float)
    return np.maximum(0, X)

    
class Layer_ReLu:
    def __init__(self, size, batch_size, is_input=False, is_output=False,
                 activation=f_Relu):
        self.is_input = is_input
        self.is_output = is_output

        # Z is the matrix that holds output values
        self.Z = np.zeros((batch_size, size[0]))
        # The activation function is an externally defined function (with a
        # derivative) that is stored here
        self.activation = activation

        # W is the outgoing weight matrix for this layer
        self.W = None
        # S is the matrix that holds the inputs to this layer
        self.S = None
        # D is the matrix that holds the deltas for this layer
        self.D = None
        # Fp is the matrix that holds the derivatives of the activation function
        self.Fp = None

        if not is_input:
            self.S = np.zeros((batch_size, size[0]))
            self.D = np.zeros((batch_size, size[0]))

        if not is_output:
            self.W = np.random.normal(size=size, scale=1E-4)

        if not is_input and not is_output:
            self.Fp = np.zeros((size[0], batch_size))

    def forward_propagate(self):
        if self.is_input:
            return self.Z.dot(self.W)

        self.Z = self.activation(self.S)
        if self.is_output:
            return self.Z
        else:
            # For hidden layers, we add the bias values here
            self.Z = np.append(self.Z, np.ones((self.Z.shape[0], 1)), axis=1)
            self.Fp = self.activation(self.S, deriv=True).T
            return self.Z.dot(self.W)


class MultiLayerPerceptron_ReLu:
    def __init__(self, layer_config, batch_size=100):
        self.layers = []
        self.num_layers = len(layer_config)
        self.minibatch_size = batch_size

        for i in range(self.num_layers-1):
            if i == 0:
                print ("Initializing input layer with size {0}.".format(layer_config[i]))
                # Here, we add an additional unit at the input for the bias
                # weight.
                self.layers.append(Layer_ReLu([layer_config[i]+1, layer_config[i+1]],
                                         batch_size,
                                         is_input=True))
            else:
                print ("Initializing hidden layer with size {0}.".format(layer_config[i]))
                # Here we add an additional unit in the hidden layers for the
                # bias weight.
                self.layers.append(Layer_ReLu([layer_config[i]+1, layer_config[i+1]],
                                         batch_size,
                                         activation=f_Relu))

        print ("Initializing output layer with size {0}.".format(layer_config[-1]))
        self.layers.append(Layer_ReLu([layer_config[-1], None],
                                 batch_size,
                                 is_output=True,
                                 activation=f_softmax))
        print ("Done!")

    def forward_propagate(self, data):
        # We need to be sure to add bias values to the input
        self.layers[0].Z = np.append(data, np.ones((data.shape[0], 1)), axis=1)

        for i in range(self.num_layers-1):
            self.layers[i+1].S = self.layers[i].forward_propagate()
        return self.layers[-1].forward_propagate()

    def backpropagate(self, yhat, labels):
        
        #exit_with_err("FIND ME IN THE CODE, What is computed in the next line of code?\n")
        #This line computes the error at the output layer by subtracting the true labels from the predicted outputs.
        self.layers[-1].D = (yhat - labels).T
        for i in range(self.num_layers-2, 0, -1):
            # We do not calculate deltas for the bias values
            W_nobias = self.layers[i].W[0:-1, :]
            
            #exit_with_err("FIND ME IN THE CODE, What does this 'for' loop do?\n")
            #This loop iterates backward through the hidden layers of the network, calculating the delta values 
            # for each layer based on the deltas of the next layer and the weights connecting them.
            
            self.layers[i].D = W_nobias.dot(self.layers[i+1].D) * self.layers[i].Fp

    def update_weights(self, eta):
        for i in range(0, self.num_layers-1):
            W_grad = -eta*(self.layers[i+1].D.dot(self.layers[i].Z)).T
            self.layers[i].W += W_grad

    def evaluate(self, train_data, train_labels, test_data, test_labels,
                 num_epochs=70, eta=0.05, eval_train=False, eval_test=True):

        N_train = len(train_labels)*len(train_labels[0])
        N_test = len(test_labels)*len(test_labels[0])

        print ("Training for {0} epochs...".format(num_epochs))
        for t in range(0, num_epochs):
            out_str = "[{0:4d}] ".format(t)
            
            for b_data, b_labels in zip(train_data, train_labels):
                output = self.forward_propagate(b_data)
                self.backpropagate(output, b_labels)
                
                #exit_with_err("FIND ME IN THE CODE, How does weight update is implemented? What is eta?\n")
                # This line updates the weights of the network using the computed deltas and the learning rate (eta). 
                # Eta is a hyperparameter that controls how much to change the weights during each update.
                self.update_weights(eta=eta)

            if eval_train:
                errs = 0
                accuracy_train = 0.0
        
                for b_data, b_labels in zip(train_data, train_labels):
                    output = self.forward_propagate(b_data)
                    yhat = np.argmax(output, axis=1)
                    accuracy_train += np.sum(yhat == np.argmax(b_labels, axis=1))
                    errs += np.sum(1-b_labels[np.arange(len(b_labels)), yhat])

                out_str = ("{0} Training error: {1:.5f} and Training accuracy: {2:.5f}".format(out_str,
                                                           float(errs)/N_train,
                                                           accuracy_train/N_train))
                

            if eval_test:
                errs = 0
                accuracy_test = 0.0
                for b_data, b_labels in zip(test_data, test_labels):
                    output = self.forward_propagate(b_data)
                    yhat = np.argmax(output, axis=1)
                    accuracy_test += np.sum(yhat == np.argmax(b_labels, axis=1))
                    errs += np.sum(1-b_labels[np.arange(len(b_labels)), yhat])

                out_str = ("{0} Test error: {1:.5f} and Test accuracy: {2:.5f}").format(out_str,
                                                       float(errs)/N_test,
                                                       accuracy_test/N_test)

            print (out_str)


In [38]:
batch_size=100;

train_data, train_labels, valid_data, valid_labels=prepare_for_backprop(batch_size, Xtr, Ltr, X_test, L_test)

mlp = MultiLayerPerceptron_ReLu(layer_config=[784, 100, 100, 10], batch_size=batch_size)

mlp.evaluate(train_data, train_labels, valid_data, valid_labels,eta=5e-3,
             eval_train=True)

print("Done:)\n")


Creating data...
Batch size 100, the number of examples 60000.
Batch size 100, the number of examples 10000.
Done!
Initializing input layer with size 784.
Initializing hidden layer with size 100.
Initializing hidden layer with size 100.
Initializing output layer with size 10.
Done!
Training for 70 epochs...
[   0]  Training error: 0.89558 and Training accuracy: 0.10442 Test error: 0.89720 and Test accuracy: 0.10280
[   1]  Training error: 0.89558 and Training accuracy: 0.10442 Test error: 0.89720 and Test accuracy: 0.10280
[   2]  Training error: 0.89558 and Training accuracy: 0.10442 Test error: 0.89720 and Test accuracy: 0.10280
[   3]  Training error: 0.89558 and Training accuracy: 0.10442 Test error: 0.89720 and Test accuracy: 0.10280
[   4]  Training error: 0.89558 and Training accuracy: 0.10442 Test error: 0.89720 and Test accuracy: 0.10280
[   5]  Training error: 0.89558 and Training accuracy: 0.10442 Test error: 0.89720 and Test accuracy: 0.10280
[   6]  Training error: 0.89558

4. With a learning rate of 0.05 no learning is done because the model can't get out of the initialisation of weights with negative and small numbers. With an eta of 0.005 the learning begins after 12 epochs and the results are similar to sigmoid.