In [1]:
import tensorflow as tf
from tensorflow.keras.datasets import mnist
import math
import numpy as np

In [2]:
# Load MNIST and flatten every 28x28 image into a 784-element float32 vector,
# then divide by 255 (pixel values are presumably 0-255, giving a [0, 1] range).
# -1 lets NumPy infer the number of samples instead of hard-coding 60000/10000,
# so the cell keeps working if the dataset is subsampled or swapped.
(train_images, train_labels), (test_images, test_labels) = mnist.load_data()
train_images = train_images.reshape((-1, 28 * 28)).astype("float32") / 255
test_images = test_images.reshape((-1, 28 * 28)).astype("float32") / 255

In [3]:
class SimpleDense:
    """Minimal fully connected layer computing ``activation(x @ w + b)``.

    Args:
        n_in: Size of the last axis of the input.
        n_out: Number of output units.
        activation: Callable applied element-wise to the affine output.
    """

    def __init__(self, n_in, n_out, activation):
        # Kernel starts as small uniform values in [0, 0.1); bias starts at zero.
        initial_w = tf.random.uniform((n_in, n_out), minval=0, maxval=1e-1)
        initial_b = tf.zeros((n_out,))
        self.w = tf.Variable(initial_w)
        self.b = tf.Variable(initial_b)
        self.activation = activation

    def __call__(self, x):
        """Apply the affine transform followed by the activation."""
        pre_activation = tf.matmul(x, self.w) + self.b
        return self.activation(pre_activation)

    @property
    def weights(self):
        """The layer's variables as a list: kernel first, then bias."""
        return [self.w, self.b]

In [4]:
# Smoke test: an identity-activation layer maps a (5, 100) input to (5, 10)
# and exposes exactly two variables (kernel and bias).
d = SimpleDense(100, 10, lambda x: x)
assert d(tf.Variable(tf.random.uniform((5, 100)))).shape == [5, 10]
assert len(d.weights) == 2

In [5]:
class SimpleSequential:
    """Chain of layers applied one after another.

    Each layer must be callable and expose a ``weights`` iterable.
    """

    def __init__(self, layers):
        self.layers = layers

    def __call__(self, x):
        """Feed ``x`` through every layer in order and return the final output."""
        out = x
        for apply_layer in self.layers:
            out = apply_layer(out)
        return out

    @property
    def weights(self):
        """Flat list of every layer's weights, in layer order."""
        return [w for layer in self.layers for w in layer.weights]

In [6]:
# Smoke test: two dense layers contribute two variables each (kernel + bias).
s = SimpleSequential([
    SimpleDense(10, 5, lambda x: x),
    SimpleDense(5, 2, lambda x: x),
])
assert len(s.weights) == 4

In [7]:
# Two-layer classifier: 784 inputs -> 512 ReLU units -> 10 softmax outputs.
model = SimpleSequential([
    SimpleDense(28 * 28, 512, tf.nn.relu),
    SimpleDense(512, 10, tf.nn.softmax),
])

In [8]:
class BatchGenerator:
    """Serve consecutive mini-batches of ``(images, labels)``.

    The first pass walks the data in its original order. Once the cursor has
    moved past the end, the next call to :meth:`next` reshuffles images and
    labels together (using NumPy's global random state) and restarts from the
    beginning. The caller's input containers are never modified; shuffling
    stores reordered copies on the instance.

    Args:
        images: Sliceable sequence of samples (NumPy array, list or tuple).
        labels: Sliceable sequence of the same length as ``images``.
        bs: Batch size; must be at least 1. The last batch of a pass may be
            smaller.

    Attributes set on the instance: ``i`` (cursor position, which overshoots
    the data length after the last batch of a pass) and ``n_batches`` (number
    of ``next()`` calls that make up one pass).

    Raises:
        AssertionError: If ``images`` and ``labels`` differ in length.
        ValueError: If ``bs`` is smaller than 1.
    """

    def __init__(self, images, labels, bs=128):
        assert len(images)==len(labels)
        # A zero batch size would divide by zero and a negative one would give
        # a negative n_batches, so reject both explicitly.
        if bs < 1:
            raise ValueError(f"bs must be a positive integer, got {bs!r}")
        self.images, self.labels, self.bs = images, labels, bs
        self.i = 0
        self.n_batches = math.ceil(len(images) / bs)

    @staticmethod
    def _reorder(data, order):
        """Return ``data`` rearranged by the index array ``order``.

        Lists and tuples cannot be indexed with an index array, so they are
        rebuilt element by element and keep their container type. Anything
        else (e.g. a NumPy array) is indexed with ``order`` directly.
        """
        if isinstance(data, list):
            return [data[j] for j in order.tolist()]
        if isinstance(data, tuple):
            return tuple(data[j] for j in order.tolist())
        return data[order]

    def next(self):
        """Return the next ``(images, labels)`` batch, reshuffling on wrap-around."""
        if self.i >= len(self.images):
            self.i = 0
            # One shared permutation keeps every image paired with its label.
            order = np.random.permutation(len(self.images))
            self.images = self._reorder(self.images, order)
            self.labels = self._reorder(self.labels, order)
        stop = self.i + self.bs
        images, labels = self.images[self.i:stop], self.labels[self.i:stop]
        self.i = stop
        return images, labels

In [9]:
# Walk through one full pass over the 10000 test samples with the default
# batch size of 128: 78 full batches followed by a final batch of 16.
batch_gen = BatchGenerator(test_images, test_labels)
first_batch = batch_gen.next()
assert first_batch[1].shape == (128,)
for i in range(batch_gen.n_batches - 2):
    batch_gen.next()
last_batch = batch_gen.next()
assert last_batch[1].shape == (16,)
# The cursor overshoots the data length after the last batch (79 * 128).
assert batch_gen.i == 10112
# The next call wraps around and must again yield a full batch. The check
# targets first_batch_again; it previously re-tested first_batch by mistake.
first_batch_again = batch_gen.next()
assert first_batch_again[1].shape == (128,)
assert batch_gen.i == 128

In [10]:
# Inspect the container type of the training images.
type(train_images)

numpy.ndarray

In [11]:
# Toy data: string "images" paired with integer labels, served in batches of 2.
# The cell displays the first batch together with the cursor position.
images = [f'a{i}' for i in range(10)]
labels = list(range(10))
batch_gen = BatchGenerator(images, labels, 2)
batch_gen.next(), batch_gen.i

((['a0', 'a1'], [0, 1]), 2)

In [12]:
# Replace the toy generator with one over the real training set (default batch size).
batch_gen = BatchGenerator(train_images, train_labels)

In [13]:
# Forward pass of the not-yet-trained model on one batch of images.
# Note: this consumes a batch from batch_gen.
model(batch_gen.next()[0])

<tf.Tensor: shape=(128, 10), dtype=float32, numpy=
array([[7.7961124e-02, 8.3780307e-01, 1.2872348e-03, ..., 1.5569431e-02,
        2.6265509e-06, 3.2489405e-07],
       [6.0034063e-02, 8.8943893e-01, 5.1579502e-04, ..., 9.6332002e-03,
        3.6768196e-07, 6.2927931e-08],
       [1.5948652e-01, 6.4298207e-01, 6.7167785e-03, ..., 4.1663308e-02,
        6.4725791e-05, 2.2078531e-05],
       ...,
       [8.5260063e-02, 8.1263644e-01, 1.9896722e-03, ..., 1.9133260e-02,
        3.2622779e-06, 6.6956204e-07],
       [6.8221211e-02, 8.6073768e-01, 1.2657690e-03, ..., 1.4047344e-02,
        1.3826368e-06, 2.7396595e-07],
       [1.2595120e-01, 7.0753396e-01, 3.7248274e-03, ..., 2.8168768e-02,
        1.8573486e-05, 3.9177071e-06]], dtype=float32)>

In [14]:
def one_training_step(model, images_batch, labels_batch):
    """Run one gradient step on a single batch and return its mean loss.

    Args:
        model: Callable producing predictions, exposing a ``weights`` list.
        images_batch: Inputs passed to ``model``.
        labels_batch: Integer class labels for the sparse cross-entropy loss.

    Returns:
        The batch-averaged loss computed before the weights were updated.
    """
    with tf.GradientTape() as tape:
        # The forward pass and the loss are computed inside the tape so that
        # gradients can be taken with respect to the model weights.
        batch_losses = tf.keras.losses.sparse_categorical_crossentropy(
            labels_batch, model(images_batch)
        )
        average_loss = tf.reduce_mean(batch_losses)
    grads = tape.gradient(average_loss, model.weights)
    update_weights(grads, model.weights)
    return average_loss

In [15]:
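# Manual SGD update, kept commented out for reference only; the notebook uses
# the Keras optimizer-based `update_weights` instead.
# learning_rate = 1e-3

# def update_weights(gradients, weights):
#     for g, w in zip(gradients, weights):
#         w.assign_sub(g * learning_rate)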

In [16]:
from tensorflow.keras import optimizers

# Keras SGD optimizer constructed with 1e-3 as its first argument (the learning rate).
optim = optimizers.SGD(1e-3)


def update_weights(gradients, weights):
    """Apply one optimizer step, pairing each gradient with its weight by position."""
    grads_and_vars = zip(gradients, weights)
    optim.apply_gradients(grads_and_vars)

In [17]:
# Fetch the next batch and run a single training step on it;
# the cell displays the returned average loss.
ib, lb = batch_gen.next()
one_training_step(model, ib, lb)

<tf.Tensor: shape=(), dtype=float32, numpy=5.7799664>

In [21]:
# Train a freshly initialised model for 5 epochs, then report test accuracy.
model = SimpleSequential([
    SimpleDense(28 * 28, 512, tf.nn.relu),
    SimpleDense(512, 10, tf.nn.softmax)])
batch_gen = BatchGenerator(train_images, train_labels)
for epoch in range(5):
    print('epoch', epoch)
    # n_batches calls to next() make up one pass over the training data.
    # Iterate the range directly: the former enumerate(range(...)) produced
    # the same index twice and left an unused `batch` variable behind.
    for i in range(batch_gen.n_batches):
        images_batch, labels_batch = batch_gen.next()
        average_loss = one_training_step(model, images_batch, labels_batch)
        if i % 100 == 0:
            print(i, 'average_loss', average_loss)

# Evaluate: the predicted class is the index of the largest softmax output.
predictions = model(test_images).numpy()
predicted_labels = predictions.argmax(axis=1)
print('test acc', (predicted_labels == test_labels).mean())

epoch 0
0 average_loss tf.Tensor(8.067356, shape=(), dtype=float32)
100 average_loss tf.Tensor(2.1875548, shape=(), dtype=float32)
200 average_loss tf.Tensor(2.1630116, shape=(), dtype=float32)
300 average_loss tf.Tensor(2.0339882, shape=(), dtype=float32)
400 average_loss tf.Tensor(2.1619883, shape=(), dtype=float32)
epoch 1
0 average_loss tf.Tensor(1.9044305, shape=(), dtype=float32)
100 average_loss tf.Tensor(1.8781254, shape=(), dtype=float32)
200 average_loss tf.Tensor(1.9063525, shape=(), dtype=float32)
300 average_loss tf.Tensor(1.7369076, shape=(), dtype=float32)
400 average_loss tf.Tensor(1.6588811, shape=(), dtype=float32)
epoch 2
0 average_loss tf.Tensor(1.9068744, shape=(), dtype=float32)
100 average_loss tf.Tensor(1.7090285, shape=(), dtype=float32)
200 average_loss tf.Tensor(1.5193205, shape=(), dtype=float32)
300 average_loss tf.Tensor(1.4650822, shape=(), dtype=float32)
400 average_loss tf.Tensor(1.3945763, shape=(), dtype=float32)
epoch 3
0 average_loss tf.Tensor(1.463

Initial test acc 0.7627

Test acc 0.7861 with shuffling, but I think accuracy varies by about +/- 2% between runs anyway, so this difference may just be noise.