All codes are based on "Neural Networks and Deep Learning" Course by Deeplearning.ai

In [None]:
# 2 Layer NN(ReLU, Softmax) with no batch
import sys
import numpy as np
import tensorflow as tf

(images_train, labels_train), (images_test, labels_test) = tf.keras.datasets.mnist.load_data()
images_train = np.reshape(images_train / 255.0, (len(images_train), len(images_train[0]) * len(images_train[0][0]))).T
images_test = np.reshape(images_test / 255.0, (len(images_test), len(images_test[0]) * len(images_test[0][0]))).T

def one_hot_encoding(data):
    encoded = np.zeros((len(data), max(data) - min(data) + 1))
    for idx in range(len(data)):
        encoded[idx][data[idx]] = 1
    return encoded.T

labels_train, labels_test = one_hot_encoding(labels_train), one_hot_encoding(labels_test)

class NeuralNetwork_2():
    def __init__(self, n0, n1, n2):
        self.weight1 = np.random.rand(n1, n0) * 0.01
        self.weight2 = np.random.rand(n2, n1) * 0.01
        self.bias1 = np.random.rand(n1)
        self.bias2 = np.random.rand(n2)
    def linear_hypo(self, w, x, b):
        return (np.matmul(w, x).T + b).T
    def sigmoid(self, z):
        return 1 / (1 + np.exp(-z))
    def relu(self, z):
        return np.maximum(z, 0)
    def softmax(self, z):
        return np.exp(z) / np.exp(z).sum(axis=0)
    def hypothesis(self, func, lh):
        if func == 'sigmoid':
            return self.sigmoid(lh)
        elif func == 'relu':
            return self.relu(lh)
        elif func == 'softmax':
            return self.softmax(lh)
        else:
            sys.exit('Error in hypothesis: There is no {} function'.format(func))
    def cost(self, y, hypo):
        # cross entropy
        sum = np.multiply(y, np.log(hypo)).sum(axis=0)
        return -np.average(sum)
    def train(self, features, labels, learning_rate, EPOCHS):
        for iter in range(EPOCHS + 1):
            # layer1: relu, layer2: softmax, cost: cross-entropy
            z1 = self.linear_hypo(self.weight1, features, self.bias1)
            layer1 = self.hypothesis('relu', z1)
            z2 = self.linear_hypo(self.weight2, layer1, self.bias2)
            layer2 = self.hypothesis('softmax', z2)
            # gradient
            dz2 = layer2 - labels
            dw2 = np.matmul(dz2, layer1.T)
            db2 = np.average(dz2, axis=1)

            dz1 = np.multiply(np.matmul(self.weight2.T, dz2), np.where(z1 > 0, 1, 0)) # relu gradient: if x > 0: 1 else: 0 => np.where(z1 > 0, 1, 0)
            dw1 = np.matmul(dz1, features.T)
            db1 = np.average(dz1, axis=1)

            self.weight2 = self.weight2 - dw2 * learning_rate
            self.weight1 = self.weight1 - dw1 * learning_rate
            self.bias2 = self.bias2 - db2 * learning_rate
            self.bias1 = self.bias1 - db1 * learning_rate

            error = self.cost(labels, layer2)
            if iter % (EPOCHS / 10) == 0:
                print("iter: {:4} error: {:10.4f}".format(iter, error))
model = NeuralNetwork_2(len(images_train), 5, 10)
model.train(images_train, labels_train, 0.001, 100)


iter:    0 error:     2.3499
iter:   10 error:     2.3504
iter:   20 error:     2.3503
iter:   30 error:     2.3502
iter:   40 error:     2.3501
iter:   50 error:     2.3500
iter:   60 error:     2.3499
iter:   70 error:     2.3499
iter:   80 error:     2.3498
iter:   90 error:     2.3497
iter:  100 error:     2.3496


In [None]:
# 2 Layer NN(ReLU, Softmax) with batch
import sys
import numpy as np
import tensorflow as tf

(images_train, labels_train), (images_test, labels_test) = tf.keras.datasets.mnist.load_data()
images_train = np.reshape(images_train / 255.0, (len(images_train), len(images_train[0]) * len(images_train[0][0]))).T
images_test = np.reshape(images_test / 255.0, (len(images_test), len(images_test[0]) * len(images_test[0][0]))).T

def one_hot_encoding(data):
    encoded = np.zeros((len(data), max(data) - min(data) + 1))
    for idx in range(len(data)):
        encoded[idx][data[idx]] = 1
    return encoded.T

labels_train, labels_test = one_hot_encoding(labels_train), one_hot_encoding(labels_test)

class NeuralNetwork_2():
    def __init__(self, n0, n1, n2):
        self.weight1 = np.random.rand(n1, n0) * 0.01
        self.weight2 = np.random.rand(n2, n1) * 0.01
        self.bias1 = np.random.rand(n1)
        self.bias2 = np.random.rand(n2)
    def linear_hypo(self, w, x, b):
        return (np.matmul(w, x).T + b).T
    def sigmoid(self, z):
        return 1 / (1 + np.exp(-z))
    def relu(self, z):
        return np.maximum(z, 0)
    def softmax(self, z):
        return np.exp(z) / np.exp(z).sum(axis=0) 
    def hypothesis(self, func, lh):
        if func == 'sigmoid':
            return self.sigmoid(lh)
        elif func == 'relu':
            return self.relu(lh)
        elif func == 'softmax':
            return self.softmax(lh)
        else:
            sys.exit('Error in hypothesis: There is no {} function'.format(func))
    def cost(self, y, hypo):
        # cross entropy
        sum = np.multiply(y, np.log(hypo)).sum(axis=0)
        return -np.average(sum)
    def train(self, features, labels, learning_rate, EPOCHS, batch_size):
        for iter in range(EPOCHS + 1):
            for batch in range(int(len(features[0]) / batch_size)):
                features_batch = features[:, batch * batch_size : (batch + 1) * batch_size]
                labels_batch = labels[:, batch * batch_size : (batch + 1) * batch_size]
                # layer1: relu, layer2: softmax, cost: cross-entropy
                z1 = self.linear_hypo(self.weight1, features_batch, self.bias1)
                layer1 = self.hypothesis('relu', z1)
                z2 = self.linear_hypo(self.weight2, layer1, self.bias2)
                layer2 = self.hypothesis('softmax', z2)
                # gradient
                dz2 = layer2 - labels_batch #this is same result when you use sigmoid and sigmoid's cost function(y*log(hypo) + (1-y)*log(1-hypo))
                dw2 = np.matmul(dz2, layer1.T)
                db2 = np.average(dz2, axis=1)

                dz1 = np.multiply(np.matmul(self.weight2.T, dz2), np.where(z1 > 0, 1, 0)) # relu gradient: if x > 0: 1 else: 0 => np.where(z1 > 0, 1, 0)
                dw1 = np.matmul(dz1, features_batch.T)
                db1 = np.average(dz1, axis=1)

                self.weight2 = self.weight2 - dw2 * learning_rate
                self.weight1 = self.weight1 - dw1 * learning_rate
                self.bias2 = self.bias2 - db2 * learning_rate
                self.bias1 = self.bias1 - db1 * learning_rate

                error = self.cost(labels_batch, layer2)
            #if iter % (EPOCHS / 10) == 0:
            #    print("iter: {:4} error: {:10.4f}".format(iter, error))
    def test_accuracy(self, features, labels):
        hypothesis = self.hypothesis('softmax', self.linear_hypo(self.weight2, self.hypothesis('relu', self.linear_hypo(self.weight1, features, self.bias1)), self.bias2))
        prob = np.average((np.argmax(hypothesis, axis=0) == np.argmax(labels, axis=0))) * 100
        print("Test Accuracy: {:10.4f}".format(prob))
model = NeuralNetwork_2(len(images_train), 100, 10)
model.train(images_train, labels_train, 0.0001, 100, 100)
model.test_accuracy(images_test, labels_test)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz
Test Accuracy:    96.5800


#Find best setting of model evaluated by Test Accuracy

In [None]:
# Test Accuracy comparison with changing learning_rate - 0.0001 is the best
model = NeuralNetwork_2(len(images_train), 5, 10)
model.train(images_train, labels_train, 0.1, 100, 50)
model.test_accuracy(images_test, labels_test)

model = NeuralNetwork_2(len(images_train), 5, 10)
model.train(images_train, labels_train, 0.01, 100, 50)
model.test_accuracy(images_test, labels_test)

model = NeuralNetwork_2(len(images_train), 5, 10)
model.train(images_train, labels_train, 0.001, 100, 50)
model.test_accuracy(images_test, labels_test)

model = NeuralNetwork_2(len(images_train), 5, 10)
model.train(images_train, labels_train, 0.0001, 100, 50)
model.test_accuracy(images_test, labels_test)

model = NeuralNetwork_2(len(images_train), 5, 10)
model.train(images_train, labels_train, 0.00001, 100, 50)
model.test_accuracy(images_test, labels_test)

model = NeuralNetwork_2(len(images_train), 5, 10)
model.train(images_train, labels_train, 0.000001, 100, 50)
model.test_accuracy(images_test, labels_test)

Test Accuracy:    10.2800
Test Accuracy:    86.4100
Test Accuracy:    88.7300
Test Accuracy:    89.4500
Test Accuracy:    86.4200
Test Accuracy:    12.1900


In [None]:
# Test Accuracy comparison with changing batch_size - 100 is the best
model = NeuralNetwork_2(len(images_train), 5, 10)
model.train(images_train, labels_train, 0.0001, 100, 10)
model.test_accuracy(images_test, labels_test)

model = NeuralNetwork_2(len(images_train), 5, 10)
model.train(images_train, labels_train, 0.0001, 100, 50)
model.test_accuracy(images_test, labels_test)

model = NeuralNetwork_2(len(images_train), 5, 10)
model.train(images_train, labels_train, 0.0001, 100, 100)
model.test_accuracy(images_test, labels_test)

model = NeuralNetwork_2(len(images_train), 5, 10)
model.train(images_train, labels_train, 0.0001, 100, 1000)
model.test_accuracy(images_test, labels_test)

Test Accuracy:    89.5300
Test Accuracy:    89.5200
Test Accuracy:    89.7100
Test Accuracy:    89.6400


In [None]:
# Test Accuracy comparison with changing number of hidden layer nodes - 100 is the best
model = NeuralNetwork_2(len(images_train), 20, 10)
model.train(images_train, labels_train, 0.0001, 100, 100)
model.test_accuracy(images_test, labels_test)

model = NeuralNetwork_2(len(images_train), 50, 10)
model.train(images_train, labels_train, 0.0001, 100, 100)
model.test_accuracy(images_test, labels_test)

model = NeuralNetwork_2(len(images_train), 100, 10)
model.train(images_train, labels_train, 0.0001, 100, 100)
model.test_accuracy(images_test, labels_test)

model = NeuralNetwork_2(len(images_train), 1000, 10) # too much time needed to calculate
model.train(images_train, labels_train, 0.0001, 100, 100)
model.test_accuracy(images_test, labels_test)

Test Accuracy:    95.2200
Test Accuracy:    95.9300
Test Accuracy:    96.3200
Test Accuracy:    95.4800


In [None]:
# Test Accuracy comparison with changing EPOCHS - as many as possible
# 100 nodes of hidden layer's calculation takes too much time so decreased it to 20
model = NeuralNetwork_2(len(images_train), 20, 10)
model.train(images_train, labels_train, 0.0001, 10, 100)
model.test_accuracy(images_test, labels_test)

model = NeuralNetwork_2(len(images_train), 20, 10)
model.train(images_train, labels_train, 0.0001, 100, 100)
model.test_accuracy(images_test, labels_test)

model = NeuralNetwork_2(len(images_train), 20, 10)
model.train(images_train, labels_train, 0.0001, 1000, 100)
model.test_accuracy(images_test, labels_test)

Test Accuracy:    90.9200
Test Accuracy:    94.8000
Test Accuracy:    95.8800


In [114]:
# 3 Layer NN(ReLU, Softmax) with no batch
import sys
import numpy as np
import tensorflow as tf

(images_train, labels_train), (images_test, labels_test) = tf.keras.datasets.mnist.load_data()
images_train = np.reshape(images_train / 255.0, (len(images_train), len(images_train[0]) * len(images_train[0][0]))).T
images_test = np.reshape(images_test / 255.0, (len(images_test), len(images_test[0]) * len(images_test[0][0]))).T

def one_hot_encoding(data):
    encoded = np.zeros((len(data), max(data) - min(data) + 1))
    for idx in range(len(data)):
        encoded[idx][data[idx]] = 1
    return encoded.T

labels_train, labels_test = one_hot_encoding(labels_train), one_hot_encoding(labels_test)

class NeuralNetwork_3():
    def __init__(self, n0, n1, n2, n3):
        self.weight1 = np.random.rand(n1, n0) * 0.01
        self.weight2 = np.random.rand(n2, n1) * 0.01
        self.weight3 = np.random.rand(n3, n2) * 0.01
        self.bias1 = np.random.rand(n1)
        self.bias2 = np.random.rand(n2)
        self.bias3 = np.random.rand(n3)
    def linear_hypo(self, w, x, b):
        return (np.matmul(w, x).T + b).T
    def sigmoid(self, z):
        return 1 / (1 + np.exp(-z))
    def relu(self, z):
        return np.maximum(z, 0)
    def softmax(self, z):
        return np.exp(z) / np.exp(z).sum(axis=0)
    def hypothesis(self, func, lh):
        if func == 'sigmoid':
            return self.sigmoid(lh)
        elif func == 'relu':
            return self.relu(lh)
        elif func == 'softmax':
            return self.softmax(lh)
        else:
            sys.exit('Error in hypothesis: There is no {} function'.format(func))
    def cost(self, y, hypo):
        # cross entropy
        sum = np.multiply(y, np.log(hypo)).sum(axis=0)
        return -np.average(sum)
    def train(self, features, labels, learning_rate, EPOCHS, batch_size):
        for iter in range(EPOCHS + 1):
            for batch in range(int(len(features[0]) / batch_size)):
                features_batch = features[:, batch * batch_size : (batch + 1) * batch_size]
                labels_batch = labels[:, batch * batch_size : (batch + 1) * batch_size]
                # layer1: relu, layer2: relu, layer3: softmax, cost: cross-entropy
                z1 = self.linear_hypo(self.weight1, features_batch, self.bias1)
                layer1 = self.hypothesis('relu', z1)
                z2 = self.linear_hypo(self.weight2, layer1, self.bias2)
                layer2 = self.hypothesis('relu', z2)
                z3 = self.linear_hypo(self.weight3, layer2, self.bias3)
                layer3 = self.hypothesis('softmax', z3)
                # gradient
                dz3 = layer3 - labels_batch
                dw3 = np.matmul(dz3, layer2.T)
                db3 = np.average(dz3, axis=1)

                dz2 = np.multiply(np.matmul(self.weight3.T, dz3), np.where(z2 > 0, 1, 0))
                dw2 = np.matmul(dz2, layer1.T)
                db2 = np.average(dz2, axis=1)

                dz1 = np.multiply(np.matmul(self.weight2.T, dz2), np.where(z1 > 0, 1, 0))
                dw1 = np.matmul(dz1, features_batch.T)
                db1 = np.average(dz1, axis=1)

                self.weight3 = self.weight3 - dw3 * learning_rate
                self.weight2 = self.weight2 - dw2 * learning_rate
                self.weight1 = self.weight1 - dw1 * learning_rate
                self.bias3 = self.bias3 - db3 * learning_rate
                self.bias2 = self.bias2 - db2 * learning_rate
                self.bias1 = self.bias1 - db1 * learning_rate

                error = self.cost(labels_batch, layer3)
            if iter % (EPOCHS / 10) == 0:
                print("iter: {:4} error: {:10.4f}".format(iter, error))
    def test_accuracy(self, features, labels):
        layer1 = self.linear_hypo(self.weight1, features, self.bias1)
        layer1 = self.hypothesis('relu', layer1)
        layer2 = self.linear_hypo(self.weight2, layer1, self.bias2)
        layer2 = self.hypothesis('relu', layer2)
        layer3 = self.linear_hypo(self.weight3, layer2, self.bias3)
        hypothesis = self.hypothesis('softmax', layer3)
        prob = np.average((np.argmax(hypothesis, axis=0) == np.argmax(labels, axis=0))) * 100
        print("Test Accuracy: {:10.4f}".format(prob))
model = NeuralNetwork_3(len(images_train), 100, 100, len(labels_train))
model.train(images_train, labels_train, 0.0001, 100, 100)
model.test_accuracy(images_test, labels_test)

iter:    0 error:     2.2805
iter:   10 error:     1.0050
iter:   20 error:     0.4509
iter:   30 error:     0.3243
iter:   40 error:     0.2571
iter:   50 error:     0.2117
iter:   60 error:     0.1912
iter:   70 error:     0.1804
iter:   80 error:     0.1713
iter:   90 error:     0.1640
iter:  100 error:     0.1593
Test Accuracy:    96.4900


#Multi Layers

In [106]:
import numpy as np

class Func():
    def sigmoid(z):
        return 1 / (1 + np.exp(-z))
    def relu(z):
        return np.maximum(z, 0)
    def softmax(z): #same column: one example => sum(axis=0): sum of all values of one example and divide each classes of one example by this sum
        return np.exp(z) / np.exp(z).sum(axis=0) #sum(axis=0) => add same column numbers and so #rows become 1
    def softmax_modified(z):
        #since softmax has exp and exp can make numbers too large very easily and it causes overflow in caculation
        z = (z - z.min()) / (z.max() - z.min())
        #print("z:", z, "softmax:", np.exp(z) / np.exp(z).sum(axis=0))
        return np.exp(z) / np.exp(z).sum(axis=0)

class Layer():
    def __init__(self, input_size, output_size, act_func):
        self.weight = np.random.rand(output_size, input_size) * 0.01
        self.bias = np.random.rand(output_size, 1) * 0.01
        self.act_func = act_func
        self.cache = None
    def linear(self, input):
        self.cache = np.matmul(self.weight, input) + self.bias #size: (n1, n0) * (n0, m) + (broadcasting)(n1, 1) => (n1, m)
        return self.cache
    def activate_func(self, input):
        if self.act_func == 'sigmoid':
            return Func.sigmoid(input)
        elif self.act_func == 'relu':
            return Func.relu(input)
        elif self.act_func == 'softmax':
            return Func.softmax_modified(input)
    def output(self, input):
        return self.forward(input)
    def forward(self, input):
        return self.activate_func(self.linear(input))
    def backward(self, gradient_next, hypo_before, lr): #next: next layer(outputlayer if this is hidden in 3layerNN), before: before layer(inputlayer if this is hidden in 3layerNN)
        if self.act_func == 'relu':
            dz = np.multiply(gradient_next, np.where(self.cache > 0, 1, 0))
        else: #sigmoid or softmax
            dz = gradient_next
        dw = np.matmul(dz, hypo_before.T)
        db = np.average(dz, axis=1)

        self.weight = self.weight - lr * dw
        if np.isnan(self.weight).any():
            print("BackProp Nan")
            print("dw:", dw, "dz", dz, "hypo_before", hypo_before)
            return None
        self.db = self.bias - lr * db
        return dz

class NeuralNetwork():
    def __init__(self, inputUnits, outputUnits, hiddenUnits, hiddenLayers, activation_func):
        self.Layers = [Layer(inputUnits, hiddenUnits, activation_func)]
        for i in range(hiddenLayers):
            self.Layers.append(Layer(hiddenUnits, hiddenUnits, activation_func))
        self.Layers.append(Layer(hiddenUnits, outputUnits, 'softmax'))

    def cost(self, hypo, label):
        #cross entropy
        total = np.multiply(label, np.log(hypo)).sum(axis=0)
        return -np.average(total)

    def train(self, input_, output_, batch_size, lr, EPOCHS):
        for iter in range(EPOCHS + 1):
            for i in range(int(len(input_[0]) / batch_size)):
                input = input_[:, i * batch_size : (i + 1) * batch_size]
                output = output_[:, i * batch_size : (i + 1) * batch_size]
                hypothesis = []

                #forward propagation
                for layer in self.Layers:
                    if layer == self.Layers[0]:
                        hypothesis.append(layer.forward(input))
                    else:
                        hypothesis.append(layer.forward(hypothesis[-1]))

                #Back propagation
                gradient = self.Layers[-1].backward(hypothesis[-1] - output, hypothesis[-2], lr)
                l = len(hypothesis)
                for idx in range(2, l):
                    temp = np.matmul(self.Layers[l - idx + 1].weight.T, gradient)
                    gradient = self.Layers[l - idx].backward(temp, hypothesis[l - idx - 1], lr)
                temp = np.matmul(self.Layers[1].weight.T, gradient)
                self.Layers[0].backward(temp, input, lr)
                
                error = self.cost(hypothesis[-1], output)
            if iter % (EPOCHS / 10) == 0:
                print("iter: {:4} error: {:10.4f}".format(iter, error))

    def test_accuracy(self, input, output):
        for layer in self.Layers:
            if layer == self.Layers[0]:
                hypo = layer.forward(input)
            else:
                hypo = layer.forward(hypo)
        prob = np.average((np.argmax(hypo, axis=0) == np.argmax(output, axis=0))) * 100
        print("Test Accuracy: {:10.4f}".format(prob))    


In [112]:
import sys
import numpy as np
import tensorflow as tf

(images_train, labels_train), (images_test, labels_test) = tf.keras.datasets.mnist.load_data()
images_train = np.reshape(images_train / 255.0, (len(images_train), len(images_train[0]) * len(images_train[0][0]))).T
images_test = np.reshape(images_test / 255.0, (len(images_test), len(images_test[0]) * len(images_test[0][0]))).T

def one_hot_encoding(data):
    encoded = np.zeros((len(data), max(data) - min(data) + 1))
    for idx in range(len(data)):
        encoded[idx][data[idx]] = 1
    return encoded.T

labels_train, labels_test = one_hot_encoding(labels_train), one_hot_encoding(labels_test)
model = NeuralNetwork(len(images_train), len(labels_train), hiddenUnits=100, hiddenLayers=10, activation_func='relu')
model.train(images_train, labels_train, batch_size=100, lr=0.0001, EPOCHS=100)

iter:    0 error:     2.3468
iter:   10 error:     2.2785
iter:   20 error:     2.2777
iter:   30 error:     2.2768
iter:   40 error:     2.2759
iter:   50 error:     2.2750
iter:   60 error:     2.2741
iter:   70 error:     2.2732
iter:   80 error:     2.2723
iter:   90 error:     2.2715
iter:  100 error:     2.2706


Test Accuracy is very low. Need to know why it is that low.

Guess1. Learning rate is very low. But if I increase lr, layer's weight becomes too large and so be Inf and makes error.

In [113]:
model.test_accuracy(images_test, labels_test)

Test Accuracy:    17.7100
