In [33]:
import random as random
import numpy as np

class Layer:
    """One layer of the network: activations, biases and per-example scratch vectors.

    Attributes (all 1-D NumPy arrays with ``size`` entries):
        activationVector: Current activations of the layer's neurons.
        biasVector: Biases of the layer's neurons.
        adjBiasVector: Accumulated bias adjustment that has not been applied yet.
        zVector: Scratch vector for the weighted inputs of the neurons.
        errorVector: Scratch vector for the backpropagated error.
        size: Number of neurons in the layer.
    """

    def __init__(self, currentLayerLen):
        """Create a layer with ``currentLayerLen`` neurons.

        Activations are drawn uniformly from [0, 0.1] and biases uniformly
        from [-1.0, 0.0]; the remaining vectors start at zero.
        """
        self.activationVector = np.array([random.uniform(0, 0.1) for i in range(currentLayerLen)])
        self.biasVector = np.array([random.uniform(-1.0, 0.0) for i in range(currentLayerLen)])
        # Alternative initialisation: self.biasVector = np.zeros(currentLayerLen)
        self.adjBiasVector = np.zeros(currentLayerLen)
        # Kept 1-D like every other vector of the layer (and like resetLayer
        # produces). A (n, 1) column here cannot be accumulated into the 1-D
        # adjBiasVector because the broadcast result would be (n, n).
        self.zVector = np.zeros(currentLayerLen)
        self.errorVector = np.zeros(currentLayerLen)
        self.size = currentLayerLen

    def resetLayer(self):
        """Zero the per-example scratch vectors (zVector and errorVector)."""
        self.zVector = np.zeros(self.size)
        self.errorVector = np.zeros(self.size)

    def resetAdjBiasVector(self):
        """Zero the accumulated bias adjustment."""
        self.adjBiasVector = np.zeros(self.biasVector.size)

    def cout(self):
        """Print activations, biases, the z vector and the error vector."""
        self.coutBase()
        print("Z Vector: ")
        print(self.zVector)
        print("Error: ")
        print(self.errorVector)

    def coutBase(self):
        """Print only the activations and the biases."""
        print("Activations: ")
        print(self.activationVector)
        print("Biases: ")
        print(self.biasVector)

In [34]:
class weightMatrix:
    """Weights between two consecutive layers plus a pending-adjustment accumulator.

    Both ``matrix`` and ``adjMatrix`` have one row per neuron of the next
    layer and one column per neuron of the previous layer.
    """

    def __init__(self, prevLayerLen, nextLayerLen):
        """Create small random weights and a zeroed adjustment matrix."""
        dims = (nextLayerLen, prevLayerLen)
        # np.random.rand yields values in [0, 1); scale them down to [0, 0.001).
        self.matrix = 0.001 * np.random.rand(*dims)
        self.adjMatrix = np.zeros(dims)

    def resetAdjMatrix(self):
        """Replace the accumulated adjustment with zeros of the weight matrix's shape."""
        currentShape = self.matrix.shape
        self.adjMatrix = np.zeros(currentShape)

    def cout(self):
        """Print the weight matrix."""
        print(self.matrix)

In [35]:
def sigmoid(x):
    """Logistic function 1 / (1 + e^-x) for a scalar or a NumPy array.

    Inputs below -700 are clamped to -700 so that ``np.exp(-x)`` stays finite
    in float64. The clamp is applied element-wise, so arrays are accepted as
    well as scalars.
    """
    x = np.maximum(x, -700)
    return 1 / (1 + np.exp(-x))

def sigmoidDeriv(x):
    """Derivative of the logistic function, e^-x / (1 + e^-x)^2.

    Inputs below -350 are clamped to -350 so that the squared denominator
    stays finite in float64. The clamp is applied element-wise, so arrays are
    accepted as well as scalars.
    """
    x = np.maximum(x, -350)
    expNeg = np.exp(-x)
    return expNeg / ((1 + expNeg) * (1 + expNeg))

class Network:

    def __init__(self, start, first, second, end):
        startLayer = Layer(start)
        firstLayer = Layer(first)
        secondLayer = Layer(second)
        endLayer = Layer(end)

        self.Layers = np.array([startLayer, firstLayer, secondLayer, endLayer])

        firstMatrix = weightMatrix(start, first)
        secondMatrix = weightMatrix(first, second)
        endMatrix = weightMatrix(second, end)

        # Indexed with the layer before the matrix
        self.Matrices = np.array([firstMatrix, secondMatrix, endMatrix])

    def calculateZVector(self, layerIdx):
        # self.Layers[layerIdx - 1] is previous Layer
        # currLayer = self.Layers[layerIdx] is current Layer
        # self.Matrices[layerIdx - 1].matrix is weight matrix related to this calculation
        # self.Layers[layerIdx - 1].activationVector is previous activation
        # self.Layers[layerIdx].biasVector is current Bias

        # Weight matrix * previous activation vector
        self.Layers[layerIdx].zVector = np.dot(self.Matrices[layerIdx - 1].matrix, self.Layers[layerIdx - 1].activationVector)
        # += current bias vector
        self.Layers[layerIdx].zVector += self.Layers[layerIdx].biasVector

    def forwardPropagationStep(self, layerIdx):
        # currLayer = self.Layers[layerIdx] is current Layer

        self.calculateZVector(layerIdx)

        for i in range(self.Layers[layerIdx].size):
            self.Layers[layerIdx].activationVector[i] = sigmoid(self.Layers[layerIdx].zVector[i])

    # Assumes every value in the data lies between 0 and 255
    def setStartLayerActivations(self, dataset):
        # # FOR 4 3 3 2 test

        # for idx in range(0, 4):
        #     self.Layers[0].activationVector[idx] = dataset[idx]

        #
        # FOR MINST
        #

        # self.Layers[0] is start Layer

        if len(dataset) * len(dataset[0]) != self.Layers[0].size:
            print("There is a mismatch between the size of the input data and the start layer!")
            print("Size of dataset is: " + str(len(dataset) * len(dataset[0])))
            print("Size of first layer is: " + str(self.Layers[0].size))

        #print("1) Set the activations of the first layer")

        layerIdx = 0
        for row in range(0, len(dataset)):
            for col in range(0, len(dataset[0])):
                self.Layers[0].activationVector[layerIdx] = dataset[row][col] / 255
                layerIdx += 1

    def fullForwardPropagation(self, target):
        #print("2) Feedforward: Compute all activations for all layers")
        self.forwardPropagationStep(1)
        self.forwardPropagationStep(2)
        self.forwardPropagationStep(3)

        #print("Target is: " + str(target))
        #print("Cost is: " + str(self.cost(target)))
    
    def cost(self, target):
        # self.Layers[self.Layers.size - 1] is end Layer

        sum = 0

        for i in range(self.Layers[self.Layers.size - 1].size):
            if (i == target):
                sum += (1.0 - self.Layers[self.Layers.size - 1].activationVector[i]) * (1.0 - self.Layers[self.Layers.size - 1].activationVector[i])
            else:
                sum += (self.Layers[self.Layers.size - 1].activationVector[i]) * (self.Layers[self.Layers.size - 1].activationVector[i])
        return sum / (self.Layers[self.Layers.size - 1].size)
    
    def fullBackwardPropagation(self, target):
        #print("3) Output Error in last layer")
        self.calculateErrorInLastLayerForTarget(target)

        #print("4) Backpropagate error: calculate error for all layers")
        self.calculateErrorFromNextLayerError(2)
        self.calculateErrorFromNextLayerError(1)
    
    # The cost function is hard coded
    def calculateErrorInLastLayerForTarget(self, target):
        # self.Layers[self.Layers.size - 1] is end Layer

        # The desired value for target is 1
        CGradient = np.zeros(self.Layers[self.Layers.size - 1].size)
        for i in range(self.Layers[self.Layers.size - 1].size):
            CGradient[i] = self.Layers[self.Layers.size - 1].activationVector[i]
            if i == target:
                CGradient[i] -= 1.0

        # Apply sigmoid' to endLayer.zVector in place
        for i in range(self.Layers[self.Layers.size - 1].size):
            self.Layers[self.Layers.size - 1].zVector[i] = sigmoidDeriv(self.Layers[self.Layers.size - 1].zVector[i])

        # sigmoid' was applied to endLayer.zVector in place
        self.Layers[self.Layers.size - 1].errorVector = np.multiply(CGradient, self.Layers[self.Layers.size - 1].zVector)

    # Assumes error in next layer is up to date
    def calculateErrorFromNextLayerError(self, layerIdx):
        # self.Layers[layerIdx] is current Layer
        # self.Layers[layerIdx + 1] is next Layer

        # self.Matrices[layerIdx].matrix.transpose() is the transposed weight matrix
        # np.dot(transposedWeightMatrix, self.Layers[layerIdx + 1].errorVector) is transposed weight matrix * next layer error

        # Apply sigmoid' to currLayer.zVector in place
        for i in range(self.Layers[layerIdx].size):
            self.Layers[layerIdx].zVector[i] = sigmoidDeriv(self.Layers[layerIdx].zVector[i])

        # sigmoid' was applied to self.Layers[self.Layers.size - 1].zVector in place
        # error = (transposed weight matrix * next layer error) o sigmoid'(z)
        # Where o is index by index multiplication
        self.Layers[layerIdx].errorVector = np.multiply(np.dot(self.Matrices[layerIdx].matrix.transpose(), self.Layers[layerIdx + 1].errorVector), self.Layers[layerIdx].zVector)

    # All of this would be MUCH easier if W and B were stored as matrices and vectors
    def adjustBasedOnGradientDescentForCurrentExample(self, learningRate, numberInBatch):
        #print("5) Gradient Descent")

        # npm stands for n per m
        # where: n: learning rate
        #        m: number of train examples in batch
        npm = learningRate / numberInBatch

        # Adjust biases
        for layerIdx in range(1, self.Layers.size):
            # self.Layers[layerIdx] is current Layer

            #self.Layers[layerIdx].biasVector -= npm * self.Layers[layerIdx].errorVector
            #print("Delta is (bias): ", -npm * self.Layers[layerIdx].errorVector)

            self.Layers[layerIdx].adjBiasVector += npm * self.Layers[layerIdx].errorVector

        # # Adjust weights
        for weightMatrixIdx in range(self.Matrices.size):
            # self.Matrices[weightMatrixIdx].matrix is current weight matrix
            # self.Layers[weightMatrixIdx + 1] is current Layer
            # self.Layers[weightMatrixIdx] is previous Layer

            # For matrix mupltiplications the vectors need to be 2D
            # This is how we make them 2D
            # np.array([self.Layers[weightMatrixIdx].activationVector]) is previous activations
            # np.array([self.Layers[weightMatrixIdx + 1].errorVector]) is current error
            # Temp variable to make code more understandable
            
            # weight matrix -= (current Error)T * previous activation
            #self.Matrices[weightMatrixIdx].matrix -= npm * np.dot(np.array([self.Layers[weightMatrixIdx + 1].errorVector]).transpose(), np.array([self.Layers[weightMatrixIdx].activationVector]))
            #print("Delta is (weight): ", -npm * np.dot(np.array([self.Layers[weightMatrixIdx + 1].errorVector]).transpose(), np.array([self.Layers[weightMatrixIdx].activationVector])))

            self.Matrices[weightMatrixIdx].adjMatrix += npm * np.dot(np.array([self.Layers[weightMatrixIdx + 1].errorVector]).transpose(), np.array([self.Layers[weightMatrixIdx].activationVector]))

    def resetNetwork(self):
        for layer in self.Layers:
            layer.resetLayer()

    def resetAdjs(self):
        for layer in self.Layers:
            layer.resetAdjBiasVector()

        for matrix in self.Matrices:
            matrix.resetAdjMatrix()

    def adjustWithAdjustVariables(self):
        for layerIdx in range(1, self.Layers.size):
            self.Layers[layerIdx].biasVector -= self.Layers[layerIdx].adjBiasVector

        for weightMatrixIdx in range(self.Matrices.size):
            self.Matrices[weightMatrixIdx].matrix -= self.Matrices[weightMatrixIdx].adjMatrix

    def trainBatch(self, data, labels, learningRate):
        # Check if there is a length mismatch
        if (len(data) != len(labels)):
            print("There is a mismatch between the length of the data and lables")
            print("Length of data is: " + str(data.size))
            print("Length of labels are: " + str(labels.size))
        numberInBatch = len(data)

        self.resetAdjs()

        for idx in range(numberInBatch):
            # Steps of one training 
            # self.cout()
            # print("\n\nRESET NETWORK\n\n")
            self.resetNetwork()
            # self.cout()
            # print("\n\nSET START LAYER ACTIVATIONS\n\n")
            self.setStartLayerActivations(data[idx])
            # self.cout()
            # print("\n\nFULL FORWARD PROPAGATION\n\n")
            self.fullForwardPropagation(labels[idx])
            # self.cout()
            # print("\n\nFULL BACKWARD PROPAGATION\n\n")
            self.fullBackwardPropagation(labels[idx])
            # self.cout()
            # print("\n\nADJUST BASED ON GRADIENT DESCENT FOR CURRENT EXAMPLE\n\n")
            self.adjustBasedOnGradientDescentForCurrentExample(learningRate, numberInBatch)
        #     self.cout()

        # print("\n\nADJUST WITH ADJUST VARIABLES\n\n")
        self.adjustWithAdjustVariables()
        # self.cout()

    def findPrediction(self):
        endLayerActivations = self.Layers[self.Layers.size - 1].activationVector

        max = 0
        maxIdx = 11

        for idx in range(endLayerActivations.size):
            if endLayerActivations[idx] > max:
                max = endLayerActivations[idx]
                maxIdx = idx

        return maxIdx

    def test(self, data, labels):
        # Check if there is a length mismatch
        if (len(data) != len(labels)):
            print("There is a mismatch between the length of the data and lables")
            print("Length of data is: " + str(data.size))
            print("Length of labels are: " + str(labels.size))

        numberOfTest = len(data)
        sumCost = 0
        correct = 0

        for idx in range(numberOfTest):
            self.setStartLayerActivations(data[idx])
            self.fullForwardPropagation(labels[idx])

            sumCost += self.cost(labels[idx])

            correctIdx = self.findPrediction()
            if correctIdx == labels[idx]:
                correct += 1

        print("Average cost is: ", sumCost / numberOfTest)
        print("Percentage of correct is: ", correct / numberOfTest)

    def checkRandomExamples(self, data, labels):
        numberOfData = len(data)

        for num in range (0, 20):
            randIdx = random.randint(0, numberOfData)
            
            self.setStartLayerActivations(data[randIdx])
            self.fullForwardPropagation(labels[randIdx])

            print("Label for the data is: " + str(labels[randIdx]))
            self.coutLastLayer()
            print("The cost is: " + str(self.cost(labels[randIdx])))

    def coutLastLayer(self):
        print("The last layer activations are: ")
        print(self.Layers[self.Layers.size - 1].activationVector)

    def coutActivation(self):
        for idx in range(self.Layers.size):
            print("Layer: " + str(idx))
            print(self.Layers[idx].activationVector)

    def coutBase(self):
        for i in range(self.Matrices.size):
            print("Layer: ", i)
            self.Layers[i].coutBase()
            print()
            self.Matrices[i].cout()
            print()

        print("Layer: ", self.Layers.size - 1)
        self.Layers[self.Layers.size - 1].coutBase()

    def cout(self):
        for i in range(self.Matrices.size):
            print("Layer: ", i)
            self.Layers[i].cout()
            print()
            self.Matrices[i].cout()
            print()

        print("Layer: ", self.Layers.size - 1)
        self.Layers[self.Layers.size - 1].cout()

In [None]:
# MNIST Dataset: 28 x 28 = 784
import pickle

def load_pickle(path):
    """Unpickle and return the object stored in the file at ``path``.

    NOTE(review): pickle.load can run arbitrary code while loading, so this
    must only be pointed at files that are trusted.
    """
    with open(path, "rb") as infile:
        return pickle.load(infile)


# Resume from the previously saved network. To start from scratch instead,
# replace the load with one of:
# net = Network(4, 3, 3, 2)
# net = Network(784, 16, 16, 10)
net = load_pickle("network.pickle")

# Network is loaded into net

training = True
testing = False

if training:
    # 4 3 3 2 test run, kept for reference:
    # dummyData = [[0.125, 0.25, 0.675, 0.885]]
    # dummyLabel = [1]
    # for i in range(100):
    #     net.trainBatch(dummyData, dummyLabel, 5)
    #     net.coutLastLayer()
    # net.test(dummyData, dummyLabel)

    # Training on the actual data
    data_batch_01 = load_pickle("dataset/pickled/data_batch_01.pickle")
    label_batch_01 = load_pickle("dataset/pickled/label_batch_01.pickle")
    data_batch_02 = load_pickle("dataset/pickled/data_batch_02.pickle")
    label_batch_02 = load_pickle("dataset/pickled/label_batch_02.pickle")
    data_batch_03 = load_pickle("dataset/pickled/data_batch_03.pickle")
    label_batch_03 = load_pickle("dataset/pickled/label_batch_03.pickle")
    data_batch_04 = load_pickle("dataset/pickled/data_batch_04.pickle")
    label_batch_04 = load_pickle("dataset/pickled/label_batch_04.pickle")
    data_batch_05 = load_pickle("dataset/pickled/data_batch_05.pickle")
    label_batch_05 = load_pickle("dataset/pickled/label_batch_05.pickle")

    # NOTE(review): batches 06-10 re-read the files of batches 01-05, so they
    # hold the same content under different names. Confirm whether separate
    # files for batches 06-10 were intended.
    data_batch_06 = load_pickle("dataset/pickled/data_batch_01.pickle")
    label_batch_06 = load_pickle("dataset/pickled/label_batch_01.pickle")
    data_batch_07 = load_pickle("dataset/pickled/data_batch_02.pickle")
    label_batch_07 = load_pickle("dataset/pickled/label_batch_02.pickle")
    data_batch_08 = load_pickle("dataset/pickled/data_batch_03.pickle")
    label_batch_08 = load_pickle("dataset/pickled/label_batch_03.pickle")
    data_batch_09 = load_pickle("dataset/pickled/data_batch_04.pickle")
    label_batch_09 = load_pickle("dataset/pickled/label_batch_04.pickle")
    data_batch_10 = load_pickle("dataset/pickled/data_batch_05.pickle")
    label_batch_10 = load_pickle("dataset/pickled/label_batch_05.pickle")

    # data_batch_first_100 = load_pickle("dataset/pickled/data_batch_first_100.pickle")
    # label_batch_first_100 = load_pickle("dataset/pickled/label_batch_first_100.pickle")

    data_test = load_pickle("dataset/pickled/data_test.pickle")
    label_test = load_pickle("dataset/pickled/label_test.pickle")

    # Baseline before training.
    # print("Testing on batch data:")
    # net.test(data_batch_03, label_batch_03)
    print("Testing on test data:")
    net.test(data_test, label_test)

    # NOTE(review): the message below says "Testing" although the loop that
    # follows trains; the text is kept as it was.
    print("Testing on the batch")
    trainingBatches = (
        (data_batch_04, label_batch_04),
        (data_batch_05, label_batch_05),
        (data_batch_06, label_batch_06),
        (data_batch_07, label_batch_07),
        (data_batch_08, label_batch_08),
        (data_batch_09, label_batch_09),
        (data_batch_10, label_batch_10),
    )
    for i in range(200):
        # Number in one batch is 6000 so a learning rate of 6000 will result in an npm of 1
        for batchData, batchLabels in trainingBatches:
            net.trainBatch(batchData, batchLabels, 60)
        # net.cout()

        # net.test(data_batch_01, label_batch_01)
        # net.test(data_test, label_test)

    net.checkRandomExamples(data_test, label_test)

    # Result after training.
    # print("Testing on batch data:")
    # net.test(data_batch_03, label_batch_03)
    print("Testing on test data:")
    net.test(data_test, label_test)

    # Persist the trained network so that the next run resumes from here.
    with open("network.pickle", "wb") as outfile:
        pickle.dump(net, outfile)

if testing:
    data_test = load_pickle("dataset/pickled/data_test.pickle")
    label_test = load_pickle("dataset/pickled/label_test.pickle")

    net.test(data_test, label_test)
    net.checkRandomExamples(data_test, label_test)

Testing on batch data:
Average cost is:  0.019669833372326577
Percentage of correct is:  0.8793333333333333
Testing on test data:
Average cost is:  0.016305933312822218
Percentage of correct is:  0.9008
Testing on the batch
The last layer activations are: 
[4.25980119e-06 2.30153434e-02 7.99556544e-05 9.00530041e-04
 2.43848869e-06 6.59557883e-03 2.30482661e-08 1.72826996e-06
 9.34473885e-01 1.40036373e-02]
The last layer activations are: 
[2.69993413e-06 3.07536928e-02 7.80529850e-05 5.88404352e-04
 2.72870747e-06 3.37243026e-03 2.10981981e-08 1.79733351e-06
 9.00874040e-01 1.52074325e-02]
The last layer activations are: 
[3.49313194e-06 2.87423347e-02 7.18635373e-05 8.47456991e-04
 2.29221397e-06 3.70112753e-03 2.22587301e-08 1.57100628e-06
 8.84751025e-01 1.32704532e-02]
The last layer activations are: 
[2.73840425e-06 3.40509324e-02 6.39645531e-05 6.20050116e-04
 2.32697291e-06 2.49653104e-03 1.99868256e-08 1.50669283e-06
 8.91962997e-01 1.38010609e-02]
The last layer activations a