In [1]:
import numpy as np

class StepFunction:
    @staticmethod
    def apply(x):
        return np.where(x > 0, 1, 0)

class Sigmoid:
    @staticmethod
    def apply(x):
        return 1 / (1 + np.exp(-x))
    
    def apply_(x):
        return np.exp(-x) / (1 + np.exp(-x)) ** 2

class ReLU:
    @staticmethod
    def apply(x):
        return np.maximum(0, x)

    def apply_(x):
        return np.where(x > 0, 1, 0)
    
class Softmax:
    @staticmethod
    def apply(x):
        return np.exp(x) / np.exp(x).sum()
        
class MSE:
    @staticmethod
    def loss(y_pred, y_true):
        return np.mean((y_true - y_pred) ** 2)
    
    def loss_(y_pred, y_true):
        return 2 * (y_true - y_pred)
        

class Layer:
    def __init__(self, weights,  biases=None, activation_func=Sigmoid):
    # you can choose whether to add bias manually or use padding as we saw in class
        self.weights = weights
        self.activation_func = activation_func
        if biases is None:
            self.biases = np.zeros(self.weights.shape[0])
        else:
            self.biases = biases
        
    
    def apply(self, x_in):
    # calculate the layer output on the given x_in using layer weights and biases
        output = np.dot(self.weights, x_in) + self.biases
        self.x_out = output
        self.x_in = x_in
        return self.activation_func.apply(output)
    
    def backwards(self, downstream_grad):
    # calculate the gradient of the layer weights and pad on the gradient to upstream layers
    # this implementation assumes biases are padded to the weights matrix
        x_out_grad = self.activation_func.apply_(self.x_out)
        downstream_grad = downstream_grad * x_out_grad
        self.weights_grad = np.outer(downstream_grad, self.x_in)
        self.bias_grad = downstream_grad
        downstream_grad = downstream_grad @ self.weights
        return downstream_grad


class Network:
    def __init__(self, layers=None, loss_func=MSE):
    # initialize the network with the given layers
        if layers is None:
            self.layers = []
        else:
            self.layers = layers
        self.check_shapes()

    def add_layer(self, new_layer, pos = None):
    # add a layer to the network
    # bonus - do not assume valid input, check the new layer shape matches the existing layers
        if pos is None:
            self.layers.insert(len(self.layers), new_layer)
        else:
            self.layers.insert(pos, new_layer)
        self.check_shapes()

    def remove_layer(self, pos = None):
        if pos is None:
            pass
        else:
            del self.layers[pos]
        self.check_shapes()
    
    def check_shapes(self):
        for i in range(1, len(self.layers)):
            assert self.layers[i-1].weights.shape[0] == self.layers[i].weights.shape[1]
            
    def forward(self, x):
    # do a forward pass of the network on the given input
        for i, layer in enumerate(self.layers):
            if i == 0:
                res = layer.apply(x)
            else:
                res = layer.apply(res)
        return res
    
    def backwards(self, loss_grad):
    # do a backwards pass and calculate the network gradients
        for layer in reversed(self.layers):
            loss_grad = layer.backwards(loss_grad)
        return loss_grad

**XOR**

In [2]:
w1 = np.random.normal(3, 1, size=(2,2))
b1 = np.array([0., -1.])
x1 = np.array([[0., 0.], [0., 1.], [1., 0.], [1., 1.]])
layer1 = Layer(w1, b1, activation_func=Sigmoid)
w2 = np.random.normal(3, 1, size=(1,2))
layer2 = Layer(w2, activation_func=ReLU)

y_true = [[0.], [1.], [1.], [0.]]

net = Network([layer1, layer2])

total_loss = 0
accuracy = 0

for i, item in enumerate(x1):
    y_pred_prob = net.forward(item)
    loss = MSE.loss(y_true[i], y_pred_prob)
    total_loss += loss / x1.shape[0]
    
    if y_pred_prob == y_true[i]:
        acc = 1
    else:
        acc = 0    
    
    accuracy += acc / len(y_true)
    
print('XOR loss is:', round(total_loss, 2))
print('XOR accuracy is:', round(accuracy, 2))

XOR loss is: 10.03
XOR accuracy is: 0.0


**XOR Training**

In [3]:
total_loss = 0

epochs = 1000

lr = 0.1

accuracy = 0

for i in range(epochs):
    for j in range(len(y_true)):
        y_t = y_true[j]
        x_t = x1[j]
        
        y_pred = net.forward(x_t)
        loss = MSE.loss(y_t, y_pred)
        loss_ = MSE.loss_(y_t, y_pred)
        
        total_loss += loss / x1.shape[0]
        
        net.backwards(loss_)
        
        for layer in net.layers:
            layer.weights -= layer.weights_grad * lr
            layer.biases -= layer.bias_grad * lr
                
        if round(y_pred[0], 10) == round(y_t[0], 10):
            acc = 1
        else:
            acc = 0    
    
        accuracy += acc / len(y_true)
    
    if i % 100 == 0:
        print('Iteration', i, 'has a total loss of:', total_loss)
        print('Iteration', i, 'has a total accuracy of:', accuracy)
    total_loss = 0
    accuracy = 0

Iteration 0 has a total loss of: 3.216951415935082
Iteration 0 has a total accuracy of: 0.0
Iteration 100 has a total loss of: 0.5173908018387787
Iteration 100 has a total accuracy of: 0.25
Iteration 200 has a total loss of: 0.5123842571077227
Iteration 200 has a total accuracy of: 0.25
Iteration 300 has a total loss of: 0.5101689330266611
Iteration 300 has a total accuracy of: 0.25
Iteration 400 has a total loss of: 0.34505711048560206
Iteration 400 has a total accuracy of: 0.0
Iteration 500 has a total loss of: 0.28296525837462294
Iteration 500 has a total accuracy of: 0.0
Iteration 600 has a total loss of: 0.248014174257486
Iteration 600 has a total accuracy of: 0.0
Iteration 700 has a total loss of: 0.20267058222638143
Iteration 700 has a total accuracy of: 0.0
Iteration 800 has a total loss of: 0.00047028331314161414
Iteration 800 has a total accuracy of: 0.0
Iteration 900 has a total loss of: 1.886641933609764e-07
Iteration 900 has a total accuracy of: 0.0


I will increase the number of epochs to see what happens...

In [4]:
w1 = np.random.normal(3, 1, size=(2,2))
b1 = np.array([0., -1.])
x1 = np.array([[0., 0.], [0., 1.], [1., 0.], [1., 1.]])
layer1 = Layer(w1, b1, activation_func=Sigmoid)
w2 = np.random.normal(3, 1, size=(1,2))
layer2 = Layer(w2, activation_func=ReLU)

y_true = [[0.], [1.], [1.], [0.]]

net = Network([layer1, layer2])

total_loss = 0

epochs = 3000

lr = 0.1

accuracy = 0

for i in range(epochs):
    for j in range(len(y_true)):
        y_t = y_true[j]
        x_t = x1[j]
        
        y_pred = net.forward(x_t)
        loss = MSE.loss(y_t, y_pred)
        loss_ = MSE.loss_(y_t, y_pred)
        
        total_loss += loss / x1.shape[0]
        
        net.backwards(loss_)
        
        for layer in net.layers:
            layer.weights -= layer.weights_grad * lr
            layer.biases -= layer.bias_grad * lr
                
        if round(y_pred[0], 10) == round(y_t[0], 10):
            acc = 1
        else:
            acc = 0    
    
        accuracy += acc / len(y_true)
    
    if i % 100 == 0:
        print('Iteration', i, 'has a total loss of:', total_loss)
        print('Iteration', i, 'has a total accuracy of:', accuracy)
    total_loss = 0
    accuracy = 0

Iteration 0 has a total loss of: 5.204799090195619
Iteration 0 has a total accuracy of: 0.0
Iteration 100 has a total loss of: 0.2697454723685644
Iteration 100 has a total accuracy of: 0.0
Iteration 200 has a total loss of: 0.0003840129233990444
Iteration 200 has a total accuracy of: 0.0
Iteration 300 has a total loss of: 2.6595069193625025e-08
Iteration 300 has a total accuracy of: 0.0
Iteration 400 has a total loss of: 6.422005899346851e-13
Iteration 400 has a total accuracy of: 0.25
Iteration 500 has a total loss of: 7.159480107396371e-18
Iteration 500 has a total accuracy of: 0.25
Iteration 600 has a total loss of: 7.87596686841585e-23
Iteration 600 has a total accuracy of: 1.0
Iteration 700 has a total loss of: 8.714078033814042e-28
Iteration 700 has a total accuracy of: 1.0
Iteration 800 has a total loss of: 2.4775162804597402e-30
Iteration 800 has a total accuracy of: 1.0
Iteration 900 has a total loss of: 2.4775162804597402e-30
Iteration 900 has a total accuracy of: 1.0
Iterati

We can see that compared to the first 1000 epochs this one converges to the desired accuracy. Also, with less epochs this could have happened, basically depends on the initialization weights.

In [5]:
w1 = np.random.normal(3, 1, size=(2,2))
b1 = np.array([0., -1.])
x1 = np.array([[0., 0.], [0., 1.], [1., 0.], [1., 1.]])
layer1 = Layer(w1, b1, activation_func=Sigmoid)
w2 = np.random.normal(3, 1, size=(1,2))
layer2 = Layer(w2, activation_func=ReLU)

y_true = [[0.], [1.], [1.], [0.]]

net = Network([layer1, layer2])

total_loss = 0

epochs = 3000

lr = 0.01

accuracy = 0

for i in range(epochs):
    for j in range(len(y_true)):
        y_t = y_true[j]
        x_t = x1[j]
        
        y_pred = net.forward(x_t)
        loss = MSE.loss(y_t, y_pred)
        loss_ = MSE.loss_(y_t, y_pred)
        
        total_loss += loss / x1.shape[0]
        
        net.backwards(loss_)
        
        for layer in net.layers:
            layer.weights -= layer.weights_grad * lr
            layer.biases -= layer.bias_grad * lr
                
        if round(y_pred[0], 10) == round(y_t[0], 10):
            acc = 1
        else:
            acc = 0    
    
        accuracy += acc / len(y_true)
    
    if i % 100 == 0:
        print('Iteration', i, 'has a total loss of:', total_loss)
        print('Iteration', i, 'has a total accuracy of:', accuracy)
    total_loss = 0
    accuracy = 0

Iteration 0 has a total loss of: 6.497271881905371
Iteration 0 has a total accuracy of: 0.0
Iteration 100 has a total loss of: 0.2541368934328439
Iteration 100 has a total accuracy of: 0.25
Iteration 200 has a total loss of: 0.23233098090241414
Iteration 200 has a total accuracy of: 0.25
Iteration 300 has a total loss of: 0.22002860258287538
Iteration 300 has a total accuracy of: 0.25
Iteration 400 has a total loss of: 0.2126586290442939
Iteration 400 has a total accuracy of: 0.0
Iteration 500 has a total loss of: 0.2083707490469045
Iteration 500 has a total accuracy of: 0.0
Iteration 600 has a total loss of: 0.2053428025455065
Iteration 600 has a total accuracy of: 0.0
Iteration 700 has a total loss of: 0.20298726779410428
Iteration 700 has a total accuracy of: 0.0
Iteration 800 has a total loss of: 0.2010687746586444
Iteration 800 has a total accuracy of: 0.0
Iteration 900 has a total loss of: 0.199465569041607
Iteration 900 has a total accuracy of: 0.0
Iteration 1000 has a total los

Smaller learning rate doesn't make it improve a lot... It doesn't let it 'escape'...

In [6]:
w1 = np.random.normal(3, 1, size=(2,2))
b1 = np.array([0., -1.])
x1 = np.array([[0., 0.], [0., 1.], [1., 0.], [1., 1.]])
layer1 = Layer(w1, b1, activation_func=Sigmoid)
w2 = np.random.normal(3, 1, size=(1,2))
layer2 = Layer(w2, activation_func=ReLU)

y_true = [[0.], [1.], [1.], [0.]]

net = Network([layer1, layer2])

total_loss = 0

epochs = 3000

lr = 0.4

accuracy = 0

for i in range(epochs):
    for j in range(len(y_true)):
        y_t = y_true[j]
        x_t = x1[j]
        
        y_pred = net.forward(x_t)
        loss = MSE.loss(y_t, y_pred)
        loss_ = MSE.loss_(y_t, y_pred)
        
        total_loss += loss / x1.shape[0]
        
        net.backwards(loss_)
        
        for layer in net.layers:
            layer.weights -= layer.weights_grad * lr
            layer.biases -= layer.bias_grad * lr
                
        if round(y_pred[0], 10) == round(y_t[0], 10):
            acc = 1
        else:
            acc = 0    
    
        accuracy += acc / len(y_true)
    
    if i % 100 == 0:
        print('Iteration', i, 'has a total loss of:', total_loss)
        print('Iteration', i, 'has a total accuracy of:', accuracy)
    total_loss = 0
    accuracy = 0

Iteration 0 has a total loss of: 5.117593580408551
Iteration 0 has a total accuracy of: 0.25
Iteration 100 has a total loss of: 0.5
Iteration 100 has a total accuracy of: 0.5
Iteration 200 has a total loss of: 0.5
Iteration 200 has a total accuracy of: 0.5
Iteration 300 has a total loss of: 0.5
Iteration 300 has a total accuracy of: 0.5
Iteration 400 has a total loss of: 0.5
Iteration 400 has a total accuracy of: 0.5
Iteration 500 has a total loss of: 0.5
Iteration 500 has a total accuracy of: 0.5
Iteration 600 has a total loss of: 0.5
Iteration 600 has a total accuracy of: 0.5
Iteration 700 has a total loss of: 0.5
Iteration 700 has a total accuracy of: 0.5
Iteration 800 has a total loss of: 0.5
Iteration 800 has a total accuracy of: 0.5
Iteration 900 has a total loss of: 0.5
Iteration 900 has a total accuracy of: 0.5
Iteration 1000 has a total loss of: 0.5
Iteration 1000 has a total accuracy of: 0.5
Iteration 1100 has a total loss of: 0.5
Iteration 1100 has a total accuracy of: 0.5
I

And a learning rate to high neither... It's kind of 'jumping' a lot between values in the function and not learning anything that's why we get basically the average of guessing if it was going to be a 1 or a 0.

In [11]:
targets = ['mnist__weights.npz', 'Random Shifted', 'Random']

npzfile = np.load('mnist__weights.npz')

x = npzfile['x']
y = npzfile['y']

for target in targets:
    
    print('\nWorking with', target, '........... \n')

    if target == 'mnist__weights.npz':
        
        b1 = npzfile['l1_bias']
        b2 = npzfile['l2_bias']
        b3 = npzfile['l3_bias']
        
        w1 = npzfile['l1_weights']
        w2 = npzfile['l2_weights']
        w3 = npzfile['l3_weights']
    
    elif target == 'Random Shifted':
        
        b1 = npzfile['l1_bias']
        b2 = npzfile['l2_bias']
        b3 = npzfile['l3_bias']
        
        w1 = npzfile['l1_weights'] + 2 * np.random.random_sample((b1.shape[0], x.shape[1])) - 1
        w2 = npzfile['l2_weights'] + 2 * np.random.random_sample((b2.shape[0], b1.shape[0])) - 1
        w3 = npzfile['l3_weights'] + 2 * np.random.random_sample((b3.shape[0], b2.shape[0])) - 1
    
    elif target == 'Random':
        
        b1 = np.random.normal(3, 1, size=(128,))
        b2 = np.random.normal(3, 1, size=(64,))
        b3 = np.random.normal(3, 1, size=(10,))
        
        w1 = npzfile['l1_weights'] + 2 * np.random.random_sample((b1.shape[0], x.shape[1])) - 1
        w2 = npzfile['l2_weights'] + 2 * np.random.random_sample((b2.shape[0], b1.shape[0])) - 1
        w3 = npzfile['l3_weights'] + 2 * np.random.random_sample((b3.shape[0], b2.shape[0])) - 1
    
    epochs = 50

    lr = 0.1
    
    total_loss = 0
    accuracy = 0
    
    layer1 = Layer(w1, b1, activation_func=Sigmoid)
    layer2 = Layer(w2, b2, activation_func=Sigmoid)
    layer3 = Layer(w3, b3, activation_func=Sigmoid)

    net = Network([layer1, layer2, layer3])
    
    for i in range(epochs):

        for j in range(x.shape[0]):
            
            obs = j
            y_pred_prob = np.array(net.forward(x[obs]))
            y_true = np.array([1 if y[obs] == i else 0 for i in range(y_pred_prob.shape[0])])
            
            net.backwards(MSE.loss_(y_true, y_pred_prob))
            
            for layer in net.layers:
                layer.weights -= layer.weights_grad * lr
                layer.biases -= layer.bias_grad * lr
            
            y_pred = y_pred_prob.argmax()
            y_true_pos = y_true.argmax()
    
            if y_pred == y_true_pos:
                acc = 1
            else:
                acc = 0
        
            accuracy += acc / x.shape[0]
    
            total_loss += MSE.loss(y_true, y_pred_prob) / x.shape[0]

    
        if i % 10 == 0:
            print('Iteration', i, 'has a total loss of:', total_loss)
            print('Iteration', i, 'has a total accuracy of:', accuracy)
        total_loss = 0
        accuracy = 0


Working with mnist__weights.npz ........... 

Iteration 0 has a total loss of: 0.007209368311541007
Iteration 0 has a total accuracy of: 0.9820000000000008
Iteration 10 has a total loss of: 0.0008513732864818982
Iteration 10 has a total accuracy of: 0.9960000000000008
Iteration 20 has a total loss of: 0.0004006761755174906
Iteration 20 has a total accuracy of: 0.9980000000000008
Iteration 30 has a total loss of: 0.00031838605855695186
Iteration 30 has a total accuracy of: 0.9980000000000008
Iteration 40 has a total loss of: 0.0002834493680628709
Iteration 40 has a total accuracy of: 0.9980000000000008

Working with Random Shifted ........... 

Iteration 0 has a total loss of: 0.03865353767544305
Iteration 0 has a total accuracy of: 0.8090000000000006
Iteration 10 has a total loss of: 0.0024124655344902085
Iteration 10 has a total accuracy of: 0.9860000000000008
Iteration 20 has a total loss of: 0.001496247598081146
Iteration 20 has a total accuracy of: 0.9880000000000008
Iteration 30 

Nice! we can clearly see that the pre trained weights gave the best accuracy after 50 epochs of 99.8%!. In the overall we can see how the model improves while adjusting the weights and biases based on the gradients!