In [1]:
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from tqdm import tqdm

In [2]:
def relu(x):
    """Rectified linear unit: element-wise maximum of ``x`` and zero.

    Returns a new array; ``x`` itself is left untouched.
    """
    rectified = np.maximum(x, 0)
    return rectified

def deriv_relu(x):
    """Element-wise derivative of ReLU.

    Yields 1 where ``x`` is strictly positive and 0 elsewhere, cast to the
    dtype of ``x`` (so ``x`` must expose a ``dtype`` attribute).
    """
    positive_mask = np.greater(x, 0)
    return positive_mask.astype(x.dtype)

def softmax(x):
    """Row-wise softmax of a 2-D array.

    Each row is shifted by its own maximum before exponentiation so that
    large logits do not overflow; the shift does not change the result.

    Args:
        x: array indexed as (row, column); normalisation runs along axis 1.

    Returns:
        A new array of the same shape whose rows sum to 1.
    """
    # Build a shifted copy instead of using `x -= ...`, which would silently
    # overwrite the caller's array (e.g. a layer's cached pre-activation).
    shifted = x - x.max(axis=1, keepdims=True)
    x_exp = np.exp(shifted)
    return x_exp / np.sum(x_exp, axis=1, keepdims=True)

def deriv_softmax(x):
    """Element-wise derivative of softmax with respect to its own input.

    Computes ``s * (1 - s)`` with ``s = softmax(x)``, i.e. the diagonal terms
    d s_i / d x_i of the softmax Jacobian (cross terms are not included).

    Args:
        x: 2-D array of logits, as accepted by ``softmax``.

    Returns:
        Array with the same shape as ``softmax(x)``.
    """
    # Evaluate softmax once; the previous code called it a second time with
    # no argument, which raised TypeError whenever this function was used.
    s = softmax(x)
    return s * (1 - s)

In [3]:
class Dense:
    """Fully connected layer computing ``function(x @ W + b)``.

    The layer caches its most recent input and pre-activation so that the
    backward pass (``b_prop`` followed by ``compute_grad``) can reuse them.

    Attributes:
        W: weight matrix of shape (in_dim, out_dim), float64.
        b: bias vector of shape (out_dim,), float64.
        x: input passed to the last ``__call__`` (None before the first call).
        u: pre-activation ``x @ W + b`` from the last ``__call__``.
        delta: error signal for this layer; set by ``b_prop`` or assigned
            directly by the caller for the output layer.
        dW, db: batch-averaged gradients filled in by ``compute_grad``.
    """

    def __init__(self, in_dim, out_dim, function, deriv_function):
        """Create the layer.

        Args:
            in_dim: number of input features.
            out_dim: number of output units.
            function: activation applied to the pre-activation.
            deriv_function: element-wise derivative of ``function``.
        """
        # Weights are drawn uniformly from [-0.1, 0.1); biases start at zero.
        self.W = np.random.uniform(low=-0.1, high=0.1, size=(in_dim, out_dim)).astype('float64')
        self.b = np.zeros(out_dim).astype('float64')
        self.function = function
        self.deriv_function = deriv_function

        # Forward-pass cache.
        self.x = None
        self.u = None

        # Backward-pass state. `delta` is declared here alongside the other
        # cached attributes so it always exists on a freshly built layer.
        self.delta = None
        self.dW = None
        self.db = None

    def __call__(self, x):
        """Run the forward pass on ``x`` and return the activated output."""
        self.x = x
        self.u = np.matmul(self.x, self.W) + self.b
        return self.function(self.u)

    def b_prop(self, delta, W):
        """Propagate the error signal back through this layer.

        Args:
            delta: error signal of the following layer.
            W: weight matrix of the following layer.

        Returns:
            This layer's error signal, also stored in ``self.delta``.
        """
        self.delta = self.deriv_function(self.u) * np.matmul(delta, W.T)
        return self.delta

    def compute_grad(self):
        """Compute batch-averaged gradients ``dW`` and ``db`` from ``delta``.

        Requires that ``__call__`` has cached ``x`` and that ``delta`` has
        been set (by ``b_prop`` or by the caller).
        """
        batch_size = self.delta.shape[0]
        self.dW = np.matmul(self.x.T, self.delta) / batch_size
        # Summing over the batch axis equals multiplying by a ones vector,
        # without allocating that vector.
        self.db = self.delta.sum(axis=0) / batch_size

In [4]:
def foward_props(layers, x):
    """Feed ``x`` through every layer in order and return the final output.

    Each element of ``layers`` is called with the previous element's output;
    with an empty ``layers`` the input is returned unchanged.
    """
    activation = x
    for current_layer in layers:
        activation = current_layer(activation)
    return activation

In [5]:
def back_props(layers, delta):
    """Back-propagate an output error signal and compute all layer gradients.

    Layers are visited from last to first. The last layer receives ``delta``
    directly; every earlier layer derives its own error signal through
    ``b_prop`` from the signal and weights of the layer that follows it.
    ``compute_grad`` is called on each layer once its ``delta`` is known.

    Args:
        layers: sequence of layer objects exposing ``W``, ``delta``,
            ``b_prop(delta, W)`` and ``compute_grad()``.
        delta: error signal at the output of the last layer.

    Returns:
        None. Gradients are stored on the layers themselves.
    """
    # Weights of the layer handled in the previous iteration (the one closer
    # to the output). Bound up front so the variable never depends on a
    # prior loop pass having run.
    W = None

    for i, layer in enumerate(reversed(layers)):
        if i == 0:
            # Output layer: the caller supplies its error signal directly.
            layer.delta = delta
        else:
            delta = layer.b_prop(delta, W)
        layer.compute_grad()
        W = layer.W

In [6]:
def update_params(layers, lr):
    """Apply one gradient-descent step to every layer, in place.

    For each layer, ``W`` and ``b`` are decreased by ``lr`` times the stored
    gradients ``dW`` and ``db``.
    """
    for layer in layers:
        weight_step = lr * layer.dW
        layer.W -= weight_step
        bias_step = lr * layer.db
        layer.b -= bias_step

In [7]:
# Network definition: one (in_dim, out_dim, activation, derivative) entry per
# layer, listed from input to output. Two ReLU hidden layers of 100 units are
# followed by a 10-unit softmax output.
layer_specs = [
    (784, 100, relu, deriv_relu),
    (100, 100, relu, deriv_relu),
    (100, 10, softmax, deriv_softmax),
]
layers = [Dense(*spec) for spec in layer_specs]

In [8]:
# Fetch the 'mnist_784' dataset from OpenML.
mnist = fetch_openml('mnist_784', version=1)

# Depending on the scikit-learn version, `data`/`target` may be NumPy arrays
# or a pandas DataFrame/Series. np.asarray normalises both cases to plain
# ndarrays, which the row-by-row training loop relies on (iterating a
# DataFrame would yield column labels, not samples).
# Inputs are scaled by 1/255; targets become one-hot rows of length 10.
x_mnist = np.asarray(mnist.data, dtype='float32') / 255.
t_mnist = np.eye(10)[np.asarray(mnist.target).astype('int32')]

In [9]:
# Hold out 10,000 samples for validation. A fixed random_state keeps the
# validation set identical across kernel restarts, so reported validation
# metrics are comparable between runs.
x_train_mnist, x_valid_mnist, t_train_mnist, t_valid_mnist = train_test_split(
    x_mnist, t_mnist, test_size=10000, random_state=0
)

In [10]:
def train(x, t, lr = 0.001):
    """Run one gradient-descent step on the global ``layers``.

    Args:
        x: input batch, one sample per row.
        t: target batch with the same number of rows as ``x``.
        lr: learning rate for the parameter update.

    Returns:
        Mean cross-entropy loss of the batch, measured before the update.
    """
    y = foward_props(layers, x)
    # Clip before the log: a probability that underflows to exactly 0 would
    # give log(0) = -inf, and 0 * -inf = nan would poison the reported loss.
    loss = (- t * np.log(np.clip(y, 1e-12, None))).sum(axis = 1).mean()
    # Output-layer error signal fed to back-propagation. The clipping above
    # only affects the reported loss, not this gradient.
    delta = y - t
    back_props(layers, delta)
    update_params(layers, lr)
    return loss

In [11]:
def val(x, t):
    """Evaluate the global ``layers`` on a batch without updating them.

    Args:
        x: input batch, one sample per row.
        t: target batch with the same number of rows as ``x``.

    Returns:
        Tuple ``(loss, y)``: the mean cross-entropy loss and the raw network
        outputs for every row of ``x``.
    """
    y = foward_props(layers, x)
    # Clip before the log: a probability that underflows to exactly 0 would
    # give log(0) = -inf, and 0 * -inf = nan would poison the reported loss.
    loss = (-t * np.log(np.clip(y, 1e-12, None))).sum(axis = 1).mean()
    return loss, y

In [12]:
# Training driver: 3 epochs of per-sample gradient descent, with a validation
# report printed at the end of each epoch.
for epoch in range(3):
    # Reshuffle inputs and targets together so each epoch sees a new order.
    x_train_mnist, t_train_mnist = shuffle(x_train_mnist, t_train_mnist)
    for x, t in zip(tqdm(x_train_mnist), t_train_mnist):
        # [None, :] adds a leading axis so each sample is passed as a
        # one-row 2-D batch (softmax reduces over axis=1).
        # The per-step loss is not used; `loss` is overwritten below.
        loss = train(x[None, :], t[None, :], lr = 0.001)
    
    # Evaluate on the whole validation set in a single forward pass.
    loss, y_pred = val(x_valid_mnist, t_valid_mnist)
    # Compare class indices: argmax of the one-hot targets vs. argmax of the outputs.
    accuracy =  accuracy_score(t_valid_mnist.argmax(axis=1), y_pred.argmax(axis=1))
    print('EPOCH: {}, Valid Cost: {:.3f}, Valid Accuracy: {:.3f}'.format(epoch + 1, loss, accuracy))

100%|██████████| 60000/60000 [01:14<00:00, 809.07it/s] 


EPOCH: 1, Valid Cost: 0.279, Valid Accuracy: 0.918


100%|██████████| 60000/60000 [01:05<00:00, 918.55it/s] 
  0%|          | 0/60000 [00:00<?, ?it/s]

EPOCH: 2, Valid Cost: 0.200, Valid Accuracy: 0.941


100%|██████████| 60000/60000 [00:53<00:00, 1115.29it/s]


EPOCH: 3, Valid Cost: 0.161, Valid Accuracy: 0.952
