In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf

In [2]:
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()

x_train = x_train/255
x_test = x_test/255
y_train_1h = pd.get_dummies(pd.DataFrame(y_train)[0]).to_numpy()
print(x_train.shape, y_train.shape, y_train_1h.shape, x_test.shape, y_test.shape)

(60000, 28, 28) (60000,) (60000, 10) (10000, 28, 28) (10000,)


In [3]:
def adamOptimizer(dxs, first_moments, second_moments, t):
    dxc = []
    fm = []
    sm = []
    for dx, first_moment, second_moment in zip(dxs, first_moments, second_moments):
        first_moment = 0.9 * first_moment + (1-0.9) * dx
        second_moment = 0.999 * second_moment + (1-0.999) * dx * dx
        first_unbias = first_moment / (1 - 0.9 ** t)
        second_unbias = second_moment/ (1 - 0.999 ** t)
        dxc.append(first_unbias / (tf.sqrt(second_unbias) + 1e-7))
        fm.append(first_moment)
        sm.append(second_moment)
    return dxc, fm, sm

def batchNorm(x):
    return (x - (tf.reduce_mean(x, axis=0))) / tf.math.sqrt((tf.math.reduce_variance(x, axis=0))+1e-7)

In [4]:
x = tf.constant(x_train, dtype=tf.float32)
y = tf.constant(y_train, dtype=tf.float32)

In [5]:
initializer = tf.initializers.GlorotUniform()
Wx_0 = tf.Variable(initializer((28, 128), dtype = tf.float32))
Bx_0 = tf.Variable(initializer((1, 128), dtype = tf.float32))
Wh_0 = tf.Variable(initializer((128, 128), dtype = tf.float32))
Bh_0 = tf.Variable(initializer((1, 128), dtype = tf.float32))
W1 = tf.Variable(initializer((128, 10), dtype = tf.float32))
B1 = tf.Variable(initializer((1, 10), dtype = tf.float32))

fm = list(np.zeros(6)); sm = list(np.zeros(6))

metric = tf.metrics.Accuracy()

In [6]:
lr = 1e-4
itr = 10000
for j in range(itr+1):
  with tf.GradientTape() as g:
    g.watch([Wx_0, Bx_0, Wh_0, Bh_0, W1, B1])
    
    #Forward Pass
    x_p = tf.zeros((60000,128))
    for i in range(28):
      x_c = x[:,:,i]
      o_a = tf.matmul(x_c, Wx_0) + Bx_0
      o_b = tf.matmul(x_p, Wh_0) + Bh_0
      o_c = tf.nn.tanh(o_a + o_b)
      x_p = o_c
    a1 = tf.nn.softmax(tf.matmul(x_p, W1) + B1)
    ce = tf.keras.losses.sparse_categorical_crossentropy(y, a1)
    loss = tf.reduce_mean(ce)
    
    #Gradient Calc
    dWx_0, dBx_0, dWh_0, dBh_0, dW1, dB1 = g.gradient(loss, [Wx_0, Bx_0, Wh_0, Bh_0, W1, B1])

    #Adam
    (dWx_0, dBx_0, dWh_0, dBh_0, dW1, dB1), fm, sm = adamOptimizer([dWx_0, dBx_0, dWh_0, dBh_0, dW1, dB1], fm, sm, j+1)

    #Gradient Update
    Wx_0.assign_sub(lr*dWx_0)
    Bx_0.assign_sub(lr*dBx_0)
    Wh_0.assign_sub(lr*dWh_0)
    Bh_0.assign_sub(lr*dBh_0)
    W1.assign_sub(lr*dW1)
    B1.assign_sub(lr*dB1)

    #Accuracy
    metric.update_state(y, tf.argmax(a1,1))
    acc = metric.result().numpy()
    
    #Print
    if j%(itr/20)==0:
      print(j, loss.numpy(), acc)

0 2.4521005 0.1045
500 0.32458463 0.7273989
1000 0.16839373 0.83209646
1500 0.117210194 0.8749983
2000 0.08953927 0.89914435
2500 0.07349394 0.91482615
3000 0.061373618 0.9259552
3500 0.05182281 0.93431455
4000 0.04437688 0.9408682
4500 0.04011797 0.94611776
5000 0.03492346 0.950538
5500 0.031102499 0.954262
6000 0.026891295 0.95746595
6500 0.025616245 0.96004087
7000 0.023745822 0.9624682
7500 0.02171744 0.96461594
8000 0.019442976 0.96654254
8500 0.016892988 0.9682981
9000 0.016669238 0.96960956
9500 0.015586088 0.9710289
10000 0.014809362 0.97231805


In [7]:
x_p = tf.zeros((y_test.shape[0],128))
for i in range(28):
  x_c = tf.constant(x_test, dtype=tf.float32)[:,:,i]
  o_a = tf.matmul(x_c, Wx_0) + Bx_0
  o_b = tf.matmul(x_p, Wh_0) + Bh_0
  o_c = tf.nn.tanh(o_a + o_b)
  x_p = o_c
a1 = tf.nn.softmax(tf.matmul(x_p, W1) + B1)
metric.update_state(tf.constant(y_test, dtype=tf.float32), tf.argmax(a1,1))
"Test Accuracy", metric.result().numpy()

('Test Accuracy', 0.97231805)