In [10]:
from keras.datasets import mnist
from keras.utils import to_categorical

(train_images, train_labels), (test_images, test_labels) = mnist.load_data()

train_images = train_images.reshape((60000, 28 * 28))
train_images = train_images.astype('float32') / 255
test_images = test_images.reshape((10000, 28 * 28))
test_images = test_images.astype('float32') / 255
train_labels = to_categorical(train_labels)
test_labels = to_categorical(test_labels)

In [79]:
import numpy as np


def ReLU(x):
    return np.maximum(x, 0)


def D_ReLU(x):
    return np.diag((x >= 0).astype(float))


def SoftMax(x):
    tmp = np.exp(x - np.max(x))
    return tmp / np.sum(tmp)


def D_SoftMax(x):
    tmp = SoftMax(x)
    return -1 * np.outer(tmp, tmp) + np.diag(tmp)


def CategoricalCrossEntropy(y_pred, y_true):
    tmp = np.clip(y_pred, 1e-8, 1-1e-8)
    return np.mean(-1 * np.sum(y_true * np.log2(tmp) + (1 - y_true) * np.log2(1 - tmp), axis=1), axis=0)


def D_CategoricalCrossEntropy(y_pred, y_true):
    return y_pred - y_true
 

def L2_loss(y_pred, y_true):
    return np.mean(np.sum((y_pred - y_true)**2, axis=1), axis=0)


def D_L2_loss(y_pred, y_true):
    return 2 * (y_pred - y_true) / y_pred.shape[0]


class DenseLayer:

    def __init__(self, n_inputs, n_outputs, activation, D_activation):
        self.weights = np.random.randn(n_outputs, n_inputs) / np.sqrt(n_inputs * n_outputs)
        self.bias = np.random.randn(n_outputs) / np.sqrt(n_outputs)
        self.activation = activation
        self.D_activation = D_activation
        self.x = None
        self.D_y = None
        self.weights_error_lst = []
        self.bias_error_lst = []
        self.weights_error_rp = 0
        self.bias_error_rp = 0
    
    def forward(self, x):
        self.x = x
        z = np.dot(self.weights, x) + self.bias
        self.D_y = self.D_activation(z)
        y = self.activation(z)
        return y

    def backward(self, output_error):
        bias_error = np.dot(self.D_y.T, output_error)
        self.bias_error_lst.append(bias_error)
        weights_error = np.outer(bias_error, self.x)
        self.weights_error_lst.append(weights_error)
        input_error = np.dot(self.weights.T, bias_error)
        return input_error
    
    def update(self, learning_rate, adaption_rate):
        weights_error = np.mean(np.array(self.weights_error_lst), axis=0)
        self.weights_error_rp = np.clip(adaption_rate * self.weights_error_rp + (1 - adaption_rate) * weights_error**2, 1e-9, np.inf)
        self.weights -= learning_rate / (np.sqrt(self.weights_error_rp)) * weights_error
        self.weights_error_lst = []
        bias_error = np.mean(np.array(self.bias_error_lst), axis=0)
        self.bias_error_rp = np.clip(adaption_rate * self.bias_error_rp + (1 - adaption_rate) * bias_error**2, 1e-9, np.inf)
        self.bias -= learning_rate / (np.sqrt(self.bias_error_rp)) * bias_error
        self.bias_error_lst = []

    def reset(self):
        self.x = None
        self.D_y = None
        self.weights_error_lst = []
        self.bias_error_lst = []
        self.weights_error_rp = 0
        self.bias_error_rp = 0


class Network:

    def __init__(self):
        self.layer_lst = []

    def predict(self, X):
        y_pred_lst = []
        for x in X:
            y_pred_lst.append(self._forward(x))
        y_pred = np.array(y_pred_lst)
        return y_pred

    def _forward(self, x):
        y_pred = x
        for layer in self.layer_lst:
            y_pred = layer.forward(y_pred)
        return y_pred
    
    def _backward(self, y):
        y_pred = y
        for layer in reversed(self.layer_lst):
            y_pred = layer.backward(y_pred)
        return y_pred

    def _update(self, learning_rate, adaption_rate):
        for layer in self.layer_lst:
            layer.update(learning_rate, adaption_rate)

    def _reset(self):
        for layer in self.layer_lst:
            layer.reset()

    def fit(self, X_train, y_train, learning_rate=1e-1, adaption_rate=0.9, N_epochs=1000, N_batch=100):
        self._reset()
        for index_epoch in range(N_epochs):
            batch_indices = np.random.randint(0, X_train.shape[0] - 1, [N_batch])
            X_batch = X_train[batch_indices]
            y_batch = y_train[batch_indices]
            y_pred = np.full_like(y_batch, fill_value=np.nan)
            for index_batch, (x, y) in enumerate(zip(X_batch, y_batch)):
                y_pred[index_batch, :] = self._forward(x)
                self._backward(D_CategoricalCrossEntropy(y_pred[index_batch, :], y))
            self._update(learning_rate, adaption_rate)
            loss = CategoricalCrossEntropy(y_pred, y_batch)
            print("Epoch {}/{} Loss: {}".format(int(index_epoch + 1), N_epochs, loss))


N_epochs = 500
N_batch = 100
N_inputs = train_images.shape[1]
N_outputs = train_labels.shape[1]
N_hidden_layer = 2
learning_rate = 1e-3
adaption_rate = 0.9

network = Network()
network.layer_lst.append(DenseLayer(N_inputs, N_inputs, ReLU, D_ReLU))
network.layer_lst.extend([DenseLayer(N_inputs, N_inputs, ReLU, D_ReLU) for _ in range(N_hidden_layer)])
network.layer_lst.append(DenseLayer(N_inputs, N_outputs, SoftMax, D_SoftMax))
network.fit(train_images, train_labels, learning_rate, adaption_rate, N_epochs, N_batch)

Epoch 1/500 Loss: 4.748495578765869
Epoch 2/500 Loss: 4.861515045166016
Epoch 3/500 Loss: 4.797658443450928
Epoch 4/500 Loss: 4.752599239349365
Epoch 5/500 Loss: 4.68756628036499
Epoch 6/500 Loss: 4.7170305252075195
Epoch 7/500 Loss: 4.7786478996276855
Epoch 8/500 Loss: 4.741768836975098
Epoch 9/500 Loss: 4.740433216094971
Epoch 10/500 Loss: 4.701882839202881
Epoch 11/500 Loss: 4.652900218963623
Epoch 12/500 Loss: 4.663219451904297
Epoch 13/500 Loss: 4.629430294036865
Epoch 14/500 Loss: 4.675034046173096
Epoch 15/500 Loss: 4.615380764007568
Epoch 16/500 Loss: 4.614375114440918
Epoch 17/500 Loss: 4.617863655090332
Epoch 18/500 Loss: 4.549555778503418
Epoch 19/500 Loss: 4.483547210693359
Epoch 20/500 Loss: 4.527151584625244
Epoch 21/500 Loss: 4.393606185913086
Epoch 22/500 Loss: 4.721968173980713
Epoch 23/500 Loss: 4.490753173828125
Epoch 24/500 Loss: 4.353170871734619
Epoch 25/500 Loss: 4.179644584655762
Epoch 26/500 Loss: 4.254596710205078
Epoch 27/500 Loss: 4.249398231506348
Epoch 28/

  return np.mean(-1 * np.sum(y_true * np.log2(tmp) + (1 - y_true) * np.log2(1 - tmp), axis=1), axis=0)
  return np.mean(-1 * np.sum(y_true * np.log2(tmp) + (1 - y_true) * np.log2(1 - tmp), axis=1), axis=0)


Epoch 151/500 Loss: nan
Epoch 152/500 Loss: 1.848251461982727
Epoch 153/500 Loss: 1.880061149597168
Epoch 154/500 Loss: 2.217270612716675
Epoch 155/500 Loss: 1.5706329345703125
Epoch 156/500 Loss: 1.9271140098571777
Epoch 157/500 Loss: 1.8588247299194336
Epoch 158/500 Loss: 1.5861353874206543
Epoch 159/500 Loss: 1.3591517210006714
Epoch 160/500 Loss: 1.7333853244781494
Epoch 161/500 Loss: 1.44783616065979
Epoch 162/500 Loss: 1.832228422164917
Epoch 163/500 Loss: 1.5498133897781372
Epoch 164/500 Loss: 1.8467499017715454
Epoch 165/500 Loss: 1.7699203491210938
Epoch 166/500 Loss: 1.3292169570922852
Epoch 167/500 Loss: 1.5200821161270142
Epoch 168/500 Loss: 1.9574588537216187
Epoch 169/500 Loss: 1.4956345558166504
Epoch 170/500 Loss: 1.3851425647735596
Epoch 171/500 Loss: 1.1459887027740479
Epoch 172/500 Loss: nan
Epoch 173/500 Loss: 1.8185843229293823
Epoch 174/500 Loss: 2.2549004554748535
Epoch 175/500 Loss: 1.6822094917297363
Epoch 176/500 Loss: 1.6098151206970215
Epoch 177/500 Loss: 1.

In [80]:
predicted_labels = network.predict(test_images)
accuracy = np.sum(np.argmax(predicted_labels, axis=1) == np.argmax(test_labels, axis=1)) / predicted_labels.shape[0] * 100
print("Accuracy: {:.1f}%".format(accuracy))

Accuracy: 93.9%
