### Training a keras model using the MNIST dataset

Using the MNIST dataset & training a multiclass classification model.
This is loosely adapted from this [keras exercise](https://github.com/GoogleCloudPlatform/training-data-analyst/blob/master/courses/machine_learning/deepdive2/production_ml/labs/keras.ipynb)
and is for my own understanding. 

#### The objectives of this Notebook for me
* using tfds (tensorflow_datasets) library
* using tf.data.Dataset object in training (instead of arrays)
* using distributed training strategies

#### Note to self for some outside-of-notebook operations
```bash
conda activate ml
jupyter notebook
# now select the jupyter server to run from VSCode
tensorboard --logdir=logs   
# should launch tensorboard visible from browser
```

### Load dependencies

In [118]:
import tensorflow_datasets as tfds
import tensorflow as tf
import os
# Seems that Adam optimizer with tf 2.10 onwards has some quirk that doesn't work with the below. 
# Should figure out, but in the meanwhile this worked
from tensorflow.keras.optimizers.legacy import Adam


In [119]:
# This notebook used 2.11.0
print(tf.__version__)

2.11.0


### Load data using tfds as a tf.data.Dataset

In [120]:
# Load mnist dataset using tfds as tf.data.Dataset
datasets = tfds.load('mnist', with_info=True, as_supervised=True)
# Note that if I don't use as_supervised, I'll need to convert the dataset into arrays/iterables before feeding into .fit
info = datasets[1]
mnist_train, mnist_test = datasets[0]['train'], datasets[0]['test']

In [121]:
# play with data
mnist_test
info


tfds.core.DatasetInfo(
    name='mnist',
    full_name='mnist/3.0.1',
    description="""
    The MNIST database of handwritten digits.
    """,
    homepage='http://yann.lecun.com/exdb/mnist/',
    data_path='/Users/emmamatsubara/tensorflow_datasets/mnist/3.0.1',
    file_format=tfrecord,
    download_size=11.06 MiB,
    dataset_size=21.00 MiB,
    features=FeaturesDict({
        'image': Image(shape=(28, 28, 1), dtype=uint8),
        'label': ClassLabel(shape=(), dtype=int64, num_classes=10),
    }),
    supervised_keys=('image', 'label'),
    disable_shuffling=False,
    splits={
        'test': <SplitInfo num_examples=10000, num_shards=1>,
        'train': <SplitInfo num_examples=60000, num_shards=1>,
    },
    citation="""@article{lecun2010mnist,
      title={MNIST handwritten digit database},
      author={LeCun, Yann and Cortes, Corinna and Burges, CJ},
      journal={ATT Labs [Online]. Available: http://yann.lecun.com/exdb/mnist},
      volume={2},
      year={2010}
    }""",


### Set up distributed training & model arch

In [122]:
# Set up distributed training strategy
strategy = tf.distribute.MirroredStrategy()

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0',)


INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0',)


In [123]:
# Checking hardware
print(tf.config.experimental.list_physical_devices('GPU'))
print(tf.config.experimental.list_physical_devices('CPU'))
# Checking what my Strategy has picked up on - I haven't done anything for it to be more than 1
print(strategy.num_replicas_in_sync)

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')]
1


In [124]:
# scale and pre-batch the dataset according to replica availability

BUFFER_SIZE = 10000

BATCH_SIZE_PER_REPLICA = 64
BATCH_SIZE = BATCH_SIZE_PER_REPLICA * strategy.num_replicas_in_sync

def scale(image, label):
  # Note that the signature of the scale method is different if you don't load with as_supervised=True
  # Instead of 2 args (feature, label) it comes as a single dict you need to manipulate yourself
  image = tf.cast(image, tf.float32)
  image /= 255
  return image, label

# Note this shuffles within the allocated buffersize
train_dataset = mnist_train.map(scale).cache().shuffle(BUFFER_SIZE).batch(BATCH_SIZE)
eval_dataset = mnist_test.map(scale).batch(BATCH_SIZE)

In [125]:
# Define the model architecture
def create_model(strategy: tf.distribute.MirroredStrategy) -> tf.keras.Sequential:
    with strategy.scope():
        model = tf.keras.Sequential([
            tf.keras.layers.Conv2D(32, 3, activation='relu', input_shape=(28, 28, 1)),
            tf.keras.layers.MaxPooling2D(),
            tf.keras.layers.Flatten(),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(10)
        ])
        model.compile(
            # I made the mistake of setting my own learning rate for Adam - that performed TERRIBLY
            # worth exploring more but seems Adam's default is desirable for this
            optimizer=Adam(),
            # I forgot from_logits first
            # We either need the last layer to be a softmax (?) or from_logits to be true
            loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
            metrics=['accuracy'])
        return model
    
model = create_model(strategy)

In [126]:
# save dir for model checkpoints
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Example of decaying learning rate -- optimizer should take care of this ? but interesting to see
def decay(epoch):
  if epoch < 3:
    return 1e-3
  elif epoch >= 3 and epoch < 7:
    return 1e-4
  else:
    return 1e-5
  
class PrintLR(tf.keras.callbacks.Callback):
  def on_epoch_end(self, epoch, logs=None):
    # should I be referencing the global model here? Should look into how callbacks can reference model object better
    print('\nLearning rate for epoch {} is {}'.format(epoch + 1,
                                                      model.optimizer.lr.numpy()))

# Defining all the callbacks to hook into the model.fit step
callbacks = [
    tf.keras.callbacks.TensorBoard(log_dir='./logs'),
    tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_prefix,
                                       save_weights_only=True),
    tf.keras.callbacks.LearningRateScheduler(decay),
    PrintLR()
]

In [127]:
# Train the model with the callback hooks passed in
EPOCHS=5
history = model.fit(train_dataset, epochs=EPOCHS, callbacks=callbacks)

Epoch 1/5
Learning rate for epoch 1 is 0.0010000000474974513
Epoch 2/5
Learning rate for epoch 2 is 0.0010000000474974513
Epoch 3/5
Learning rate for epoch 3 is 0.0010000000474974513
Epoch 4/5
Learning rate for epoch 4 is 9.999999747378752e-05
Epoch 5/5
Learning rate for epoch 5 is 9.999999747378752e-05


### Load the model weights from the last checkpoint and run on test dataset

In [128]:
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))

eval_loss, eval_acc = model.evaluate(eval_dataset)

print('Eval loss: {}, Eval Accuracy: {}'.format(eval_loss, eval_acc))

Eval loss: 0.03820014372467995, Eval Accuracy: 0.9857000708580017


### Export the model in SavedModel format

In [129]:
model.save(os.path.join('./model_location', 'my_model'))



INFO:tensorflow:Assets written to: ./model_location/my_model/assets


INFO:tensorflow:Assets written to: ./model_location/my_model/assets


### Try loading the model from the SavedModel format & predicting with it

In [130]:
loaded_model = tf.keras.models.load_model(os.path.join('./model_location', 'my_model'))

loaded_model.predict

<bound method Model.predict of <keras.engine.sequential.Sequential object at 0x2c730cfd0>>

In [131]:
# the STRUGGLE to get a single example as tf.Tensor from PrefetchDataset object haha
test_data = datasets[0]['test']
type(test_data)
foo = list(test_data)

import numpy as np
record_at_index = 49
record: tf.Tensor = foo[record_at_index][0]
# Ok now we have a tf.Tensor with shape=(28, 28, 1) but with dtype uint8
print(record.get_shape())
print(record.dtype)
# Cast dtype, note that it's not in-place
record = tf.cast(record, dtype=tf.float32)
print(record.dtype)
print(record.get_shape())
# Nice! But now the predict wants this in (n, 28, 28, 1) shape so let's figure out how to nest this once
bar = record.numpy()
foo = np.array([bar])
print(foo.shape) # Nice, managed to nest it
# predict expects a (n, 28, 28, 1) shape, where n is 1 when predicting on 1 example
result = loaded_model.predict(foo)
result

(28, 28, 1)
<dtype: 'uint8'>
<dtype: 'float32'>
(28, 28, 1)
(1, 28, 28, 1)


array([[ -513.6899 ,  2027.4171 ,  -295.14014, -1389.0393 ,  -828.27435,
        -1820.869  ,  -871.8772 ,  -949.46375,  1239.144  , -1507.3413 ]],
      dtype=float32)

In [132]:
# Let's take a look at what the input data looked like, hopefully looks aligned with the prediction 

import pandas as pd
pd.options.display.max_rows = 30
pd.options.display.max_columns = 30
pd.options.display.float_format = "{:.1f}".format

# drop from (28, 28, 1) to (28, 28) so we can fit it in a dataframe
baz = np.squeeze(bar) 
beep = pd.DataFrame(baz)
beep

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,218.0,253.0,170.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,94.0,247.0,252.0,252.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21.0,212.0,252.0,246.0,132.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,144.0,253.0,252.0,215.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16.0,191.0,253.0,252.0,153.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21.0,181.0,252.0,253.0,231.0,51.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
