#Based on "Neural Networks and Deep Learning" Course by Deeplearning.ai

In [None]:
# 2 Layer NN(ReLU, Softmax) with no batch
import sys
import numpy as np
import tensorflow as tf

(images_train, labels_train), (images_test, labels_test) = tf.keras.datasets.mnist.load_data()
images_train = np.reshape(images_train / 255.0, (len(images_train), len(images_train[0]) * len(images_train[0][0]))).T
images_test = np.reshape(images_test / 255.0, (len(images_test), len(images_test[0]) * len(images_test[0][0]))).T

def one_hot_encoding(data):
    encoded = np.zeros((len(data), max(data) - min(data) + 1))
    for idx in range(len(data)):
        encoded[idx][data[idx]] = 1
    return encoded.T

labels_train, labels_test = one_hot_encoding(labels_train), one_hot_encoding(labels_test)

class NeuralNetwork_2():
    def __init__(self, n0, n1, n2):
        self.weight1 = np.random.rand(n1, n0) * 0.01
        self.weight2 = np.random.rand(n2, n1) * 0.01
        self.bias1 = np.random.rand(n1)
        self.bias2 = np.random.rand(n2)
    def linear_hypo(self, w, x, b):
        return (np.matmul(w, x).T + b).T
    def sigmoid(self, z):
        return 1 / (1 + np.exp(-z))
    def relu(self, z):
        return np.maximum(z, 0)
    def softmax(self, z):
        return np.exp(z) / np.exp(z).sum(axis=0)
    def hypothesis(self, func, lh):
        if func == 'sigmoid':
            return self.sigmoid(lh)
        elif func == 'relu':
            return self.relu(lh)
        elif func == 'softmax':
            return self.softmax(lh)
        else:
            sys.exit('Error in hypothesis: There is no {} function'.format(func))
    def cost(self, y, hypo):
        # cross entropy
        sum = np.multiply(y, np.log(hypo)).sum(axis=0)
        return -np.average(sum)
    def train(self, features, labels, learning_rate, EPOCHS):
        for iter in range(EPOCHS + 1):
            # layer1: relu, layer2: softmax, cost: cross-entropy
            z1 = self.linear_hypo(self.weight1, features, self.bias1)
            layer1 = self.hypothesis('relu', z1)
            z2 = self.linear_hypo(self.weight2, layer1, self.bias2)
            layer2 = self.hypothesis('softmax', z2)
            # gradient
            dz2 = layer2 - labels
            dw2 = np.matmul(dz2, layer1.T)
            db2 = np.average(dz2, axis=1)

            dz1 = np.multiply(np.matmul(self.weight2.T, dz2), np.where(z1 > 0, 1, 0)) # relu gradient: if x > 0: 1 else: 0 => np.where(z1 > 0, 1, 0)
            dw1 = np.matmul(dz1, features.T)
            db1 = np.average(dz1, axis=1)

            self.weight2 = self.weight2 - dw2 * learning_rate
            self.weight1 = self.weight1 - dw1 * learning_rate
            self.bias2 = self.bias2 - db2 * learning_rate
            self.bias1 = self.bias1 - db1 * learning_rate

            error = self.cost(labels, layer2)
            if iter % (EPOCHS / 10) == 0:
                print("iter: {:4} error: {:10.4f}".format(iter, error))
model = NeuralNetwork_2(len(images_train), 5, 10)
model.train(images_train, labels_train, 0.001, 100)


iter:    0 error:     2.3499
iter:   10 error:     2.3504
iter:   20 error:     2.3503
iter:   30 error:     2.3502
iter:   40 error:     2.3501
iter:   50 error:     2.3500
iter:   60 error:     2.3499
iter:   70 error:     2.3499
iter:   80 error:     2.3498
iter:   90 error:     2.3497
iter:  100 error:     2.3496


In [None]:
# 2 Layer NN(ReLU, Softmax) with batch
import sys
import numpy as np
import tensorflow as tf

(images_train, labels_train), (images_test, labels_test) = tf.keras.datasets.mnist.load_data()
images_train = np.reshape(images_train / 255.0, (len(images_train), len(images_train[0]) * len(images_train[0][0]))).T
images_test = np.reshape(images_test / 255.0, (len(images_test), len(images_test[0]) * len(images_test[0][0]))).T

def one_hot_encoding(data):
    encoded = np.zeros((len(data), max(data) - min(data) + 1))
    for idx in range(len(data)):
        encoded[idx][data[idx]] = 1
    return encoded.T

labels_train, labels_test = one_hot_encoding(labels_train), one_hot_encoding(labels_test)

class NeuralNetwork_2():
    def __init__(self, n0, n1, n2):
        self.weight1 = np.random.rand(n1, n0) * 0.01
        self.weight2 = np.random.rand(n2, n1) * 0.01
        self.bias1 = np.random.rand(n1)
        self.bias2 = np.random.rand(n2)
    def linear_hypo(self, w, x, b):
        return (np.matmul(w, x).T + b).T
    def sigmoid(self, z):
        return 1 / (1 + np.exp(-z))
    def relu(self, z):
        return np.maximum(z, 0)
    def softmax(self, z):
        return np.exp(z) / np.exp(z).sum(axis=0) 
    def hypothesis(self, func, lh):
        if func == 'sigmoid':
            return self.sigmoid(lh)
        elif func == 'relu':
            return self.relu(lh)
        elif func == 'softmax':
            return self.softmax(lh)
        else:
            sys.exit('Error in hypothesis: There is no {} function'.format(func))
    def cost(self, y, hypo):
        # cross entropy
        sum = np.multiply(y, np.log(hypo)).sum(axis=0)
        return -np.average(sum)
    def train(self, features, labels, learning_rate, EPOCHS, batch_size):
        for iter in range(EPOCHS + 1):
            for batch in range(int(len(features[0]) / batch_size)):
                features_batch = features[:, batch * batch_size : (batch + 1) * batch_size]
                labels_batch = labels[:, batch * batch_size : (batch + 1) * batch_size]
                # layer1: relu, layer2: softmax, cost: cross-entropy
                z1 = self.linear_hypo(self.weight1, features_batch, self.bias1)
                layer1 = self.hypothesis('relu', z1)
                z2 = self.linear_hypo(self.weight2, layer1, self.bias2)
                layer2 = self.hypothesis('softmax', z2)
                # gradient
                dz2 = layer2 - labels_batch #this is same result when you use sigmoid and sigmoid's cost function(y*log(hypo) + (1-y)*log(1-hypo))
                dw2 = np.matmul(dz2, layer1.T)
                db2 = np.average(dz2, axis=1)

                dz1 = np.multiply(np.matmul(self.weight2.T, dz2), np.where(z1 > 0, 1, 0)) # relu gradient: if x > 0: 1 else: 0 => np.where(z1 > 0, 1, 0)
                dw1 = np.matmul(dz1, features_batch.T)
                db1 = np.average(dz1, axis=1)

                self.weight2 = self.weight2 - dw2 * learning_rate
                self.weight1 = self.weight1 - dw1 * learning_rate
                self.bias2 = self.bias2 - db2 * learning_rate
                self.bias1 = self.bias1 - db1 * learning_rate

                error = self.cost(labels_batch, layer2)
            #if iter % (EPOCHS / 10) == 0:
            #    print("iter: {:4} error: {:10.4f}".format(iter, error))
    def test_accuracy(self, features, labels):
        hypothesis = self.hypothesis('softmax', self.linear_hypo(self.weight2, self.hypothesis('relu', self.linear_hypo(self.weight1, features, self.bias1)), self.bias2))
        prob = np.average((np.argmax(hypothesis, axis=0) == np.argmax(labels, axis=0))) * 100
        print("Test Accuracy: {:10.4f}".format(prob))
model = NeuralNetwork_2(len(images_train), 100, 10)
model.train(images_train, labels_train, 0.0001, 100, 100)
model.test_accuracy(images_test, labels_test)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz
Test Accuracy:    96.5800


###Find best setting of hyper-parameters evaluated by Test Accuracy

In [None]:
# Test Accuracy comparison with changing learning_rate - 0.0001 is the best
model = NeuralNetwork_2(len(images_train), 5, 10)
model.train(images_train, labels_train, 0.1, 100, 50)
model.test_accuracy(images_test, labels_test)

model = NeuralNetwork_2(len(images_train), 5, 10)
model.train(images_train, labels_train, 0.01, 100, 50)
model.test_accuracy(images_test, labels_test)

model = NeuralNetwork_2(len(images_train), 5, 10)
model.train(images_train, labels_train, 0.001, 100, 50)
model.test_accuracy(images_test, labels_test)

model = NeuralNetwork_2(len(images_train), 5, 10)
model.train(images_train, labels_train, 0.0001, 100, 50)
model.test_accuracy(images_test, labels_test)

model = NeuralNetwork_2(len(images_train), 5, 10)
model.train(images_train, labels_train, 0.00001, 100, 50)
model.test_accuracy(images_test, labels_test)

model = NeuralNetwork_2(len(images_train), 5, 10)
model.train(images_train, labels_train, 0.000001, 100, 50)
model.test_accuracy(images_test, labels_test)

Test Accuracy:    10.2800
Test Accuracy:    86.4100
Test Accuracy:    88.7300
Test Accuracy:    89.4500
Test Accuracy:    86.4200
Test Accuracy:    12.1900


In [None]:
# Test Accuracy comparison with changing batch_size - 100 is the best
model = NeuralNetwork_2(len(images_train), 5, 10)
model.train(images_train, labels_train, 0.0001, 100, 10)
model.test_accuracy(images_test, labels_test)

model = NeuralNetwork_2(len(images_train), 5, 10)
model.train(images_train, labels_train, 0.0001, 100, 50)
model.test_accuracy(images_test, labels_test)

model = NeuralNetwork_2(len(images_train), 5, 10)
model.train(images_train, labels_train, 0.0001, 100, 100)
model.test_accuracy(images_test, labels_test)

model = NeuralNetwork_2(len(images_train), 5, 10)
model.train(images_train, labels_train, 0.0001, 100, 1000)
model.test_accuracy(images_test, labels_test)

Test Accuracy:    89.5300
Test Accuracy:    89.5200
Test Accuracy:    89.7100
Test Accuracy:    89.6400


In [None]:
# Test Accuracy comparison with changing number of hidden layer nodes - 100 is the best
model = NeuralNetwork_2(len(images_train), 20, 10)
model.train(images_train, labels_train, 0.0001, 100, 100)
model.test_accuracy(images_test, labels_test)

model = NeuralNetwork_2(len(images_train), 50, 10)
model.train(images_train, labels_train, 0.0001, 100, 100)
model.test_accuracy(images_test, labels_test)

model = NeuralNetwork_2(len(images_train), 100, 10)
model.train(images_train, labels_train, 0.0001, 100, 100)
model.test_accuracy(images_test, labels_test)

model = NeuralNetwork_2(len(images_train), 1000, 10) # too much time needed to calculate
model.train(images_train, labels_train, 0.0001, 100, 100)
model.test_accuracy(images_test, labels_test)

Test Accuracy:    95.2200
Test Accuracy:    95.9300
Test Accuracy:    96.3200
Test Accuracy:    95.4800


In [None]:
# Test Accuracy comparison with changing EPOCHS - as many as possible
# 100 nodes of hidden layer's calculation takes too much time so decreased it to 20
model = NeuralNetwork_2(len(images_train), 20, 10)
model.train(images_train, labels_train, 0.0001, 10, 100)
model.test_accuracy(images_test, labels_test)

model = NeuralNetwork_2(len(images_train), 20, 10)
model.train(images_train, labels_train, 0.0001, 100, 100)
model.test_accuracy(images_test, labels_test)

model = NeuralNetwork_2(len(images_train), 20, 10)
model.train(images_train, labels_train, 0.0001, 1000, 100)
model.test_accuracy(images_test, labels_test)

Test Accuracy:    90.9200
Test Accuracy:    94.8000
Test Accuracy:    95.8800


###3 Layer NN with batch

In [None]:
# 3 Layer NN(ReLU, Softmax) with batch
import sys
import numpy as np
import tensorflow as tf

(images_train, labels_train), (images_test, labels_test) = tf.keras.datasets.mnist.load_data()
images_train = np.reshape(images_train / 255.0, (len(images_train), len(images_train[0]) * len(images_train[0][0]))).T
images_test = np.reshape(images_test / 255.0, (len(images_test), len(images_test[0]) * len(images_test[0][0]))).T

def one_hot_encoding(data):
    encoded = np.zeros((len(data), max(data) - min(data) + 1))
    for idx in range(len(data)):
        encoded[idx][data[idx]] = 1
    return encoded.T

labels_train, labels_test = one_hot_encoding(labels_train), one_hot_encoding(labels_test)

class NeuralNetwork_3():
    def __init__(self, n0, n1, n2, n3):
        self.weight1 = np.random.rand(n1, n0) * 0.01
        self.weight2 = np.random.rand(n2, n1) * 0.01
        self.weight3 = np.random.rand(n3, n2) * 0.01
        self.bias1 = np.random.rand(n1)
        self.bias2 = np.random.rand(n2)
        self.bias3 = np.random.rand(n3)
    def linear_hypo(self, w, x, b):
        return (np.matmul(w, x).T + b).T
    def sigmoid(self, z):
        return 1 / (1 + np.exp(-z))
    def relu(self, z):
        return np.maximum(z, 0)
    def softmax(self, z):
        return np.exp(z) / np.exp(z).sum(axis=0)
    def hypothesis(self, func, lh):
        if func == 'sigmoid':
            return self.sigmoid(lh)
        elif func == 'relu':
            return self.relu(lh)
        elif func == 'softmax':
            return self.softmax(lh)
        else:
            sys.exit('Error in hypothesis: There is no {} function'.format(func))
    def cost(self, y, hypo):
        # cross entropy
        sum = np.multiply(y, np.log(hypo)).sum(axis=0)
        return -np.average(sum)
    def train(self, features, labels, learning_rate, EPOCHS, batch_size):
        for iter in range(EPOCHS + 1):
            for batch in range(int(len(features[0]) / batch_size)):
                features_batch = features[:, batch * batch_size : (batch + 1) * batch_size]
                labels_batch = labels[:, batch * batch_size : (batch + 1) * batch_size]
                # layer1: relu, layer2: relu, layer3: softmax, cost: cross-entropy
                z1 = self.linear_hypo(self.weight1, features_batch, self.bias1)
                layer1 = self.hypothesis('relu', z1)
                z2 = self.linear_hypo(self.weight2, layer1, self.bias2)
                layer2 = self.hypothesis('relu', z2)
                z3 = self.linear_hypo(self.weight3, layer2, self.bias3)
                layer3 = self.hypothesis('softmax', z3)
                # gradient
                dz3 = layer3 - labels_batch
                dw3 = np.matmul(dz3, layer2.T)
                db3 = np.average(dz3, axis=1)

                dz2 = np.multiply(np.matmul(self.weight3.T, dz3), np.where(z2 > 0, 1, 0))
                dw2 = np.matmul(dz2, layer1.T)
                db2 = np.average(dz2, axis=1)

                dz1 = np.multiply(np.matmul(self.weight2.T, dz2), np.where(z1 > 0, 1, 0))
                dw1 = np.matmul(dz1, features_batch.T)
                db1 = np.average(dz1, axis=1)

                self.weight3 = self.weight3 - dw3 * learning_rate
                self.weight2 = self.weight2 - dw2 * learning_rate
                self.weight1 = self.weight1 - dw1 * learning_rate
                self.bias3 = self.bias3 - db3 * learning_rate
                self.bias2 = self.bias2 - db2 * learning_rate
                self.bias1 = self.bias1 - db1 * learning_rate

                error = self.cost(labels_batch, layer3)
            if iter % (EPOCHS / 10) == 0:
                print("iter: {:4} error: {:10.4f}".format(iter, error))
    def test_accuracy(self, features, labels):
        layer1 = self.linear_hypo(self.weight1, features, self.bias1)
        layer1 = self.hypothesis('relu', layer1)
        layer2 = self.linear_hypo(self.weight2, layer1, self.bias2)
        layer2 = self.hypothesis('relu', layer2)
        layer3 = self.linear_hypo(self.weight3, layer2, self.bias3)
        hypothesis = self.hypothesis('softmax', layer3)
        prob = np.average((np.argmax(hypothesis, axis=0) == np.argmax(labels, axis=0))) * 100
        print("Test Accuracy: {:10.4f}".format(prob))
model = NeuralNetwork_3(len(images_train), 100, 100, len(labels_train))
model.train(images_train, labels_train, 0.0001, 100, 100)
model.test_accuracy(images_test, labels_test)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz
iter:    0 error:     2.2831
iter:   10 error:     1.0475
iter:   20 error:     0.4677
iter:   30 error:     0.2965
iter:   40 error:     0.2318
iter:   50 error:     0.2020
iter:   60 error:     0.1896
iter:   70 error:     0.1801
iter:   80 error:     0.1705
iter:   90 error:     0.1593
iter:  100 error:     0.1515
Test Accuracy:    96.3000


#Multi Layers

###Try1 => Failed(Error in Backprop)

In [None]:
# x => z = wx + b => a = activation_func(z)
# a_l-1 => z_l = w_l * a_l-1 + b_l => a_l = activation_func_l(z_l)
# da <=> dJ / da
# J <=> cost function
# reference of gradient of cross entropy : https://en.wikipedia.org/wiki/Cross_entropy#Cross-entropy_minimization => Cross-entropy loss function and logistic regression
# => d/db * L(b) = X(Y^ - Y) <==> dw = x * (hypo - label) when J is a cross-entropy function.
import numpy as np

class Func():
    def sigmoid(z):
        return 1 / (1 + np.exp(-z))
    def relu(z):
        return np.maximum(z, 0)
    def softmax(z): #same column: one example => sum(axis=0): sum of all values of one example and divide each classes of one example by this sum
        return np.exp(z) / np.exp(z).sum(axis=0) #sum(axis=0) => add same column numbers and so #rows become 1
    def softmax_normalized(z):
        #since softmax has exp and exp can make numbers too large very easily and it causes overflow in caculation
        z = (z - z.min()) / (z.max() - z.min())
        #print("z:", z, "softmax:", np.exp(z) / np.exp(z).sum(axis=0))
        return np.exp(z) / np.exp(z).sum(axis=0)

class Layer():
    def __init__(self, input_size, output_size, act_func):
        self.weight = np.random.rand(output_size, input_size) * 0.01
        self.bias = np.random.rand(output_size, 1) * 0.01
        self.act_func = act_func
        self.z = None # result of linear
        self.a = None # result of act_func(linear)
        self.dz = None # equals dJ/dz
    def linear(self, input):
        self.z = np.matmul(self.weight, input) + self.bias #size: (n1, n0) * (n0, m) + (broadcasting)(n1, 1) => (n1, m)
        return self.z
    def activate_func(self, input):
        if self.act_func == 'sigmoid':
            self.a = Func.sigmoid(input)
        elif self.act_func == 'relu':
            self.a = Func.relu(input)
        elif self.act_func == 'softmax':
            self.a = Func.softmax_normalized(input)
        return self.a
    def output(self, input):
        return self.forward(input)
    def forward(self, input):
        return self.activate_func(self.linear(input))
    def backward(self, da_next, a_before, lr): #next: next layer(outputlayer if this is hidden in 3layerNN), before: before layer(inputlayer if this is hidden in 3layerNN)
        # dw_l = da_l * (depending on layer_l's activation func) * a_l-1
        # da_l * (dependin on ) part equals dz_l
        # if act_func is softmax, dz_l part equals (a_l - y) and so dw_l = matmul((a_l - y), a_l-1.T)
        # if act_func is relu, (depending on ) part means a matrix composed of 0 or 1 (if z_l is non-zero, 1 and the others are 0)
        # if act_func is relu, da_l equals matmul(dw_l+1.T, dz_l+1)
        if self.act_func == 'relu':
            dz = np.multiply(da_next, np.where(self.z > 0, 1, 0))
        else: #sigmoid or softmax
            dz = da_next
        self.dz = dz
        dw = np.matmul(dz, a_before.T)
        db = np.reshape(np.average(dz, axis=1), (len(dz), 1))

        self.weight = self.weight - lr * dw
        self.db = self.bias - lr * db

class NeuralNetwork():
    def __init__(self, inputUnits, outputUnits, hiddenUnits, hiddenLayers, activation_func):
        self.Layers = [Layer(inputUnits, hiddenUnits, activation_func)]
        for i in range(hiddenLayers):
            self.Layers.append(Layer(hiddenUnits, hiddenUnits, activation_func))
        self.Layers.append(Layer(hiddenUnits, outputUnits, 'softmax'))

    def cost(self, hypo, label):
        #cross entropy
        total = np.multiply(label, np.log(hypo)).sum(axis=0)
        return -np.average(total)

    def train(self, input_, output_, batch_size, lr, EPOCHS):
        for iter in range(EPOCHS + 1):
            for i in range(int(len(input_[0]) / batch_size)):
                input = input_[:, i * batch_size : (i + 1) * batch_size]
                output = output_[:, i * batch_size : (i + 1) * batch_size]

                #forward propagation
                self.Layers[0].forward(input)
                for idx in range(1, len(self.Layers)):
                    self.Layers[idx].forward(self.Layers[idx - 1].a)
                    if  np.isnan(self.Layers[idx].a).any() or np.isinf(self.Layers[idx].a).any():
                        print("Forward")
                        print("Layer idx:", idx, "batch:", i, "iter:", iter, "weight0:", self.Layers[0].weight, "weight1:", self.Layers[1].weight, "weight2:", self.Layers[2].weight)
                        print("a[0]:", self.Layers[0].a, "a[1]:", self.Layers[1].a, "a[2]:", self.Layers[2].a)
                        print("dz[0]:", self.Layers[0].dz, "dz[1]:", self.Layers[1].dz, "dz[2]:", self.Layers[2].dz)
                        raise ValueError()
                        return None

                #Back propagation
                idx = len(self.Layers) - 1
                self.Layers[-1].backward(self.Layers[-1].a - output, self.Layers[-2].a, lr)
                if np.isnan(self.Layers[-1].weight).any() or np.isinf(self.Layers[-1].weight).any():
                    print("Backprop")
                    print("Layer idx:", idx, "batch:", i, "iter:", iter, "weight0:", self.Layers[0].weight, "weight1:", self.Layers[1].weight, "weight2:", self.Layers[2].weight)
                    print("a[0]:", self.Layers[0].a, "a[1]:", self.Layers[1].a, "a[2]:", self.Layers[2].a)
                    print("dz[0]:", self.Layers[0].dz, "dz[1]:", self.Layers[1].dz, "dz[2]:", self.Layers[2].dz)
                    raise ValueError()
                idx = len(self.Layers) - 2
                while idx > 0:
                    temp = np.matmul(self.Layers[idx + 1].weight.T, self.Layers[idx + 1].dz)
                    self.Layers[idx].backward(temp, self.Layers[idx - 1].a, lr)
                    if np.isnan(self.Layers[idx].weight).any() or np.isinf(self.Layers[idx].weight).any():
                        print("Backprop")
                        print("Layer idx:", idx, "batch:", i, "iter:", iter, "weight0:", self.Layers[0].weight, "weight1:", self.Layers[1].weight, "weight2:", self.Layers[2].weight)
                        print("a[0]:", self.Layers[0].a, "a[1]:", self.Layers[1].a, "a[2]:", self.Layers[2].a)
                        print("dz[0]:", self.Layers[0].dz, "dz[1]:", self.Layers[1].dz, "dz[2]:", self.Layers[2].dz)
                        raise ValueError()
                    idx = idx - 1
                    
                temp = np.matmul(self.Layers[1].weight.T, self.Layers[1].dz)
                self.Layers[0].backward(temp, input, lr)
                if np.isnan(self.Layers[0].weight).any() or np.isinf(self.Layers[0].weight).any():
                    print("Backprop")
                    print("Layer idx:", idx, "batch:", i, "iter:", iter, "weight0:", self.Layers[0].weight, "weight1:", self.Layers[1].weight, "weight2:", self.Layers[2].weight)
                    print("a[0]:", self.Layers[0].a, "a[1]:", self.Layers[1].a, "a[2]:", self.Layers[2].a)
                    print("dz[0]:", self.Layers[0].dz, "dz[1]:", self.Layers[1].dz, "dz[2]:", self.Layers[2].dz)
                    raise ValueError()
                
                error = self.cost(self.Layers[-1].a, output)

            #print("iter:", iter, "weight0:", self.Layers[0].weight, "weight1:", self.Layers[1].weight, "weight2:", self.Layers[2].weight)
            #print("iter:", iter, "a[0]:", self.Layers[0].a, "a[1]:", self.Layers[1].a, "a[2]:", self.Layers[2].a)
            #print("iter:", iter, "dz[0]:", self.Layers[0].dz, "dz[1]:", self.Layers[1].dz, "dz[2]:", self.Layers[2].dz)

            if iter % (EPOCHS / 10) == 0:
                print("iter: {:4} error: {:10.4f}".format(iter, error))

    def test_accuracy(self, input, output):
        for layer in self.Layers:
            if layer == self.Layers[0]:
                hypo = layer.forward(input)
            else:
                hypo = layer.forward(hypo)
        prob = np.average((np.argmax(hypo, axis=0) == np.argmax(output, axis=0))) * 100
        print("Test Accuracy: {:10.4f}".format(prob))

In [None]:
import sys
import numpy as np
import tensorflow as tf

(images_train, labels_train), (images_test, labels_test) = tf.keras.datasets.mnist.load_data()
images_train = np.reshape(images_train / 255.0 / 10, (len(images_train), len(images_train[0]) * len(images_train[0][0]))).T
images_test = np.reshape(images_test / 255.0 / 10, (len(images_test), len(images_test[0]) * len(images_test[0][0]))).T

def one_hot_encoding(data):
    encoded = np.zeros((len(data), max(data) - min(data) + 1))
    for idx in range(len(data)):
        encoded[idx][data[idx]] = 1
    return encoded.T

labels_train, labels_test = one_hot_encoding(labels_train), one_hot_encoding(labels_test)
model = NeuralNetwork(len(images_train), len(labels_train), hiddenUnits=100, hiddenLayers=1, activation_func='relu')
model.train(images_train, labels_train, batch_size=100, lr=0.0001, EPOCHS=100)

In [None]:
model.test_accuracy(images_test, labels_test)

Test Accuracy:     9.9500


###Try2 => Success

In [None]:
# Multi-Layer NN(ReLU, Softmax) with batch
import numpy as np

class Func():
    def sigmoid(z):
        return 1 / (1 + np.exp(-z))
    def relu(z):
        return np.maximum(z, 0)
    def softmax(z):
        return np.exp(z) / np.exp(z).sum(axis=0)

class Layer():
    def __init__(self, is_output, units_before, units_after):
        self.weight = np.random.rand(units_after, units_before) * 0.01
        self.bias = np.random.rand(units_after, 1) * 0.01
        if is_output:
            self.act_func = 'softmax'
        else:
            self.act_func = 'relu'
        self.result = None
        self.linear = None
    
    def linear_func(self, input):
        self.linear = np.matmul(self.weight, input) + self.bias
        return self.linear
    def activation_func(self, input):
        if self.act_func == 'softmax':
            self.result = Func.softmax(input)
        elif self.act_func == 'relu':
            self.result = Func.relu(input)
        else:
            raise ValueError("Activation Function named " + self.act_func + " is not defined in Func class")
        return self.result
    def forward(self, input):
        return self.activation_func(self.linear_func(input))

class NeuralNetwork_MultiLayers():
    # input and hidden layers: relu, output layer: softmax, cost: cross-entropy
    def __init__(self, input_units, hidden_units, output_units, hidden_layers):
        self.Layers = [Layer(False, input_units, hidden_units)]
        for i in range(hidden_layers):
            self.Layers.append(Layer(False, hidden_units, hidden_units))
        self.Layers.append(Layer(True, hidden_units, output_units))
    def cost(self, y, hypo):
        # cross entropy
        total = np.multiply(y, np.log(hypo)).sum(axis=0)
        return -np.average(total)
    def train(self, features, labels, lr, EPOCHS, batch_size, print_error):
        for iter in range(EPOCHS + 1):
            for batch in range(int(len(features[0]) / batch_size)):
                features_batch = features[:, batch * batch_size : (batch + 1) * batch_size]
                labels_batch = labels[:, batch * batch_size : (batch + 1) * batch_size]
                layer_num = len(self.Layers)
                # forward
                for i in range(layer_num):
                    if i == 0:
                        self.Layers[i].forward(features_batch)
                    else:
                        self.Layers[i].forward(self.Layers[i - 1].result)

                # gradient(BackProp)
                for idx in range(layer_num):
                    idx = layer_num - idx - 1
                    if idx == layer_num - 1:
                        dz = self.Layers[-1].result - labels_batch
                    else:
                        dz = np.multiply(np.matmul(self.Layers[idx + 1].weight.T, dz), np.where(self.Layers[idx].linear > 0, 1, 0))
                    if idx - 1 < 0:
                        dw = np.matmul(dz, features_batch.T)
                    else:
                        dw = np.matmul(dz, self.Layers[idx - 1].result.T)
                    db = np.reshape(np.average(dz, axis=1), (len(dz), 1))
                    self.Layers[idx].weight -= dw * lr
                    self.Layers[idx].bias -= db * lr

                error = self.cost(labels_batch, self.Layers[-1].result)
                if np.isnan(error).any() or np.isinf(error).any():
                    raise ValueError("Training Error: Cost value is NAN or INF")
            if print_error:
                if iter % (EPOCHS / 10) == 0:
                    print("iter: {:4} error: {:10.4f}".format(iter, error))
    def test_accuracy(self, features, labels):
        for i in range(len(self.Layers)):
            if i == 0:
                self.Layers[i].forward(features)
            else:
                self.Layers[i].forward(self.Layers[i - 1].result)
        prob = np.average((np.argmax(self.Layers[-1].result, axis=0) == np.argmax(labels, axis=0))) * 100
        print("Test Accuracy: {:10.4f}".format(prob))
        return prob

Load MNIST dataset

In [None]:
import numpy as np
import tensorflow as tf # imported for load MNIST data

(images_train, labels_train), (images_test, labels_test) = tf.keras.datasets.mnist.load_data()
images_train = np.reshape(images_train / 255.0, (len(images_train), len(images_train[0]) * len(images_train[0][0]))).T
images_test = np.reshape(images_test / 255.0, (len(images_test), len(images_test[0]) * len(images_test[0][0]))).T

def one_hot_encoding(data):
    encoded = np.zeros((len(data), max(data) - min(data) + 1))
    for idx in range(len(data)):
        encoded[idx][data[idx]] = 1
    return encoded.T

labels_train, labels_test = one_hot_encoding(labels_train), one_hot_encoding(labels_test)

Init a NN model, train, and test

In [None]:
model = NeuralNetwork_MultiLayers(len(images_train), hidden_units=100, output_units=len(labels_train), hidden_layers=2)
model.train(images_train, labels_train, 0.0001, EPOCHS=100, batch_size=100, print_error=True)
model.test_accuracy(images_test, labels_test)

iter:    0 error:     2.2817
iter:   10 error:     1.9758
iter:   20 error:     1.9078
iter:   30 error:     1.8648
iter:   40 error:     1.3606
iter:   50 error:     1.0760
iter:   60 error:     0.7264
iter:   70 error:     0.2138
iter:   80 error:     0.1784
iter:   90 error:     0.1526
iter:  100 error:     0.1362
Test Accuracy:    94.1900


94.19

Find Optimal hyper-parameters(learning_rate, batch_size) if the number of hidden layers and hidden units are changed

In [None]:
print("Finding optimal learning_rate")
learning_rates = [0.1 / (10 ** i) for i in range(5)]
results = []
for lr in learning_rates:
    model = NeuralNetwork_MultiLayers(len(images_train), hidden_units=100, output_units=len(labels_train), hidden_layers=2) # reset parameters
    try:
        model.train(images_train, labels_train, lr, EPOCHS=100, batch_size=100, print_error=False)
        result = model.test_accuracy(images_test, labels_test)
        print("lr:", lr, "accuracy:", result)
    except:
        result = 0
        print("lr:", lr, "Error in training")
    results.append(result)

best_lr = learning_rates[np.argmax(results)]
print("best_lr", best_lr, "test result:", np.max(results))

print("\nFinding optimal batch_size")
batch_sizes = [1 * (10 ** i) for i in range(4)]
results = []
for bs in batch_sizes:
    model = NeuralNetwork_MultiLayers(len(images_train), hidden_units=100, output_units=len(labels_train), hidden_layers=2) # reset parameters
    try:
        model.train(images_train, labels_train, best_lr, EPOCHS=100, batch_size=bs, print_error=False)
        result = model.test_accuracy(images_test, labels_test)
        print("bs:", bs, "accuracy:", result)
    except:
        result = 0
        print("bs:", bs, "Error in training")
    results.append(result)
best_bs = batch_sizes[np.argmax(results)]
print("best_bs", best_bs, "test result:", np.max(results))

Finding optimal learning_rate
lr: 0.1 Error in training
lr: 0.01 Error in training


  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.


lr: 0.001 Error in training
Test Accuracy:    93.9700
lr: 0.0001 accuracy: 93.97
Test Accuracy:    20.9400
lr: 1e-05 accuracy: 20.94
best_lr 0.0001 test result: 93.97

Finding optimal batch_size
bs: 1 Error in training
bs: 10 Error in training
bs: 100 Error in training


In [None]:
print(np.max(results))

93.4


#Using Tensorflow

In [40]:
import tensorflow as tf
import numpy as np

class Layer():
    def __init__(self, input_unit, output_unit, act_func):
        self.weight = tf.Variable(tf.random.normal([output_unit, input_unit]) * 0.1)
        self.bias = tf.Variable(tf.random.normal([output_unit, 1]) * 0.1)
        self.Variables = [self.weight, self.bias]
        self.act_func = act_func
        self.result = None
    def linear_func(self, input):
        return tf.matmul(self.weight, input) + self.bias
    def forward(self, input):
        linear = self.linear_func(input)
        if self.act_func == 'softmax':
            return tf.nn.softmax(linear)
        elif self.act_func == 'relu':
            return tf.nn.relu(linear)
        elif self.act_func == 'sigmoid':
            return tf.sigmoid(linear)
        else:
            raise ValueError("Activation Function Name Error")

class NeuralNetwork():
    def __init__(self):
        self.Layers = []
        self.Variables = []
    def forward(self, input):
        result = [self.Layers[0].forward(input)]
        for layer in self.Layers[1:]:
            result.append(layer.forward(result[-1]))
        return result[-1]
    def addLayer(self, input_unit, output_unit, act_func):
        self.Layers.append(Layer(input_unit, output_unit, act_func))
        self.Variables.extend(self.Layers[-1].Variables)
    def loss_fn(self, input, output):
        cost1 = tf.multiply(output, tf.math.log(self.forward(input)))
        cost = tf.reduce_sum(cost1, axis=0)
        return -tf.reduce_mean(cost)
    def grad(self, input, output):
        with tf.GradientTape() as tape:
            cost = self.loss_fn(input, output)
            return tape.gradient(cost, self.Variables)
    def train(self, input_, output_, learning_rate, EPOCHS, batch_size):
        optimizer = tf.keras.optimizers.SGD(learning_rate = learning_rate)
        for iter in range(EPOCHS + 1):
            for batch in range(int(len(input_[0]) / batch_size)):
                input = input_[:, batch * batch_size : (batch + 1) * batch_size]
                output = output_[:, batch * batch_size : (batch + 1) * batch_size]
                grads = self.grad(input, output)
                optimizer.apply_gradients(grads_and_vars=zip(grads, self.Variables))
                error = self.loss_fn(input, output)
            if iter % (EPOCHS / 10) == 0:
                print("iter: {:4} error: {:10.4f}".format(iter, error))
    def test_accuracy(self, input, output):
        prob = np.average((np.argmax(self.forward(input), axis=0) == np.argmax(output, axis=0))) * 100
        print("Test Accuracy: {:10.4f}".format(prob))
        return prob

In [41]:
import numpy as np
import tensorflow as tf # imported for load MNIST data

(images_train, labels_train), (images_test, labels_test) = tf.keras.datasets.mnist.load_data()
images_train = np.reshape(images_train / 255.0, (len(images_train), len(images_train[0]) * len(images_train[0][0]))).T
images_test = np.reshape(images_test / 255.0, (len(images_test), len(images_test[0]) * len(images_test[0][0]))).T

def one_hot_encoding(data):
    encoded = np.zeros((len(data), max(data) - min(data) + 1))
    for idx in range(len(data)):
        encoded[idx][data[idx]] = 1
    return encoded.T

labels_train, labels_test = one_hot_encoding(labels_train), one_hot_encoding(labels_test)
model = NeuralNetwork()
model.addLayer(len(images_train), 10, 'relu')
model.addLayer(10, 20, 'relu')
model.addLayer(20, 30, 'relu')
model.addLayer(30, len(labels_train), 'softmax')
model.train(images_train, labels_train, 0.01, 100, 100)
model.test_accuracy(images_test, labels_test)

iter:    0 error:     4.6003
iter:   10 error:     2.9146
iter:   20 error:     2.7718
iter:   30 error:     2.7084
iter:   40 error:     2.6837
iter:   50 error:     2.6561
iter:   60 error:     2.6375
iter:   70 error:     2.6219
iter:   80 error:     2.6072
iter:   90 error:     2.5995
iter:  100 error:     2.5910
Test Accuracy:    92.9100


92.91

In [34]:
# with no batch
import tensorflow as tf
import numpy as np

class Layer():
    def __init__(self, input_unit, output_unit, act_func):
        self.weight = tf.Variable(tf.random.normal([output_unit, input_unit]) * 0.1)
        self.bias = tf.Variable(tf.random.normal([output_unit, 1]) * 0.1)
        self.Variables = [self.weight, self.bias]
        self.act_func = act_func
        self.result = None
    def linear_func(self, input):
        return tf.matmul(self.weight, input) + self.bias
    def forward(self, input):
        linear = self.linear_func(input)
        if self.act_func == 'softmax':
            return tf.nn.softmax(linear)
        elif self.act_func == 'relu':
            return tf.nn.relu(linear)
        elif self.act_func == 'sigmoid':
            return tf.sigmoid(linear)
        else:
            raise ValueError("Activation Function Name Error")

class NeuralNetwork():
    def __init__(self):
        self.Layers = []
        self.Variables = []
    def forward(self, input):
        result = [self.Layers[0].forward(input)]
        for layer in self.Layers[1:]:
            result.append(layer.forward(result[-1]))
        return result[-1]
    def addLayer(self, input_unit, output_unit, act_func):
        self.Layers.append(Layer(input_unit, output_unit, act_func))
        self.Variables.extend(self.Layers[-1].Variables)
    def loss_fn(self, input, output):
        cost1 = tf.multiply(output, tf.math.log(self.forward(input)))
        cost = tf.reduce_sum(cost1, axis=0)
        return -tf.reduce_mean(cost)
    def grad(self, input, output):
        with tf.GradientTape() as tape:
            cost = self.loss_fn(input, output)
            return tape.gradient(cost, self.Variables)
    def train(self, input, output, learning_rate, EPOCHS):
        optimizer = tf.keras.optimizers.SGD(learning_rate = learning_rate)
        for iter in range(EPOCHS + 1):
            grads = self.grad(input, output)
            optimizer.apply_gradients(grads_and_vars=zip(grads, self.Variables))
            error = self.loss_fn(input, output)
            if iter % (EPOCHS / 10) == 0:
                print("iter: {:4} error: {:10.4f}".format(iter, error))
    def test_accuracy(self, input, output):
        prob = np.average((np.argmax(self.forward(input), axis=0) == np.argmax(output, axis=0))) * 100
        print("Test Accuracy: {:10.4f}".format(prob))
        return prob