# Chapter 12: Custom Models and Trainig with TensorFlow

### 12. Implement a custom layer that performs layer normalization (we will use this type of layer in Chapter 15):

a. The `build()` method should define two trainable weights $\alpha$ and $\beta$, both of shape `input_shape[-1:]` and data type `tf.float32`. $\alpha$ should be initialized with 1s, and $\beta$ with 0s.

b. The `call()` method should compute the mean $\mu$ and standard deviation $\sigma$ of each instance’s features. For this, you can use `tf.nn.moments(inputs, axes=-1, keepdims=True)`, which returns the mean $\mu$ and the variance $\sigma^2$ of all instances (compute the square root of the variance to get the standard deviation). Then the function should compute and return $\alpha \otimes (X-\mu)/(\sigma+\epsilon) + \beta$, where $\otimes$ represents itemwise multiplication (*) and $\epsilon$ is a smoothing term (a small constant to avoid division by zero, e.g., 0.001).

c. Ensure that your custom layer produces the same (or very nearly the same) output as the `tf.keras.layers.LayerNormalization` layer.


In [1]:
import os

import numpy as np

import tensorflow as tf
from tensorflow import keras
from keras import (
    activations,
    datasets,
    layers,
    losses,
    metrics,
    optimizers,
    regularizers,
    Sequential,
)

In [2]:
class NormLayer(tf.keras.layers.Layer):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.epsilon = 1e-9

    def build(self, input_shape):
        self.alpha = self.add_weight(
            name="alpha", shape=input_shape[-1:], initializer="ones", dtype=tf.float32
        )
        self.beta = self.add_weight(
            name="beta", shape=input_shape[-1:], initializer="zeros", dtype=tf.float32
        )
    
    def call(self, x):
        mean, var = tf.nn.moments(x, axes=-1, keepdims=True)
        std_var = tf.sqrt(var)
        x_normalized = tf.math.multiply(self.alpha, (x - mean))/((std_var + self.epsilon) + self.beta)
        return x_normalized


In [3]:
x = [
    [40, 50, 55, 78, 345, 324, 22],
    [99, 22, 15, 88, 44, 5, 567],
    [4, 324, 2, 44, 77, 9234, 6],
]
x = tf.Variable(x, dtype=tf.float32)

normalizer = NormLayer()
x_normalized = normalizer(x)

norm_layer = layers.LayerNormalization()
x_normalized_keras = norm_layer(x)

x_normalized - x_normalized_keras

Metal device set to: Apple M1 Pro

systemMemory: 32.00 GB
maxCacheSize: 10.67 GB



<tf.Tensor: shape=(3, 7), dtype=float32, numpy=
array([[-5.9604645e-08, -5.9604645e-08,  0.0000000e+00, -2.9802322e-08,
         0.0000000e+00,  1.1920929e-07,  0.0000000e+00],
       [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00,  0.0000000e+00,
         0.0000000e+00,  0.0000000e+00,  0.0000000e+00],
       [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00,  0.0000000e+00,
         0.0000000e+00,  0.0000000e+00,  0.0000000e+00]], dtype=float32)>

*The difference between the two results is really low $10^{-8}$ or $10^{-7}$. We can say that the implementation of the layer has been successful. In cases where this difference may be a problem (i.e., when we need the maximum precision) it would be better to use the keras implementation.* 

### 13. Train a model using a custom training loop to tackle the Fashion MNIST dataset (see Chapter 10):

a. Display the epoch, iteration, mean training loss, and mean accuracy over each epoch (updated at each iteration), as well as the validation loss and accuracy at the end of each epoch.


b. Try using a different optimizer with a different learning rate for the upper layers and the lower layers.

*Let's first download the data of the **Fashion MNIST** dataset:*

In [4]:
gpus = len(tf.config.list_physical_devices("GPU"))
text_gpus = f"{gpus} GPUs" if gpus > 1 else f"{gpus} GPU"
device_name = tf.test.gpu_device_name()
if gpus > 0:
    print(f"{text_gpus} found at {device_name}")
    BATCH_SIZE = 1024
else:
    raise SystemError("No GPU found")
    BATCH_SIZE = 32

tf.random.set_seed(1992)

MODEL_PATH = os.path.join('..', '..', 'models', 'chapter_12')
os.makedirs(MODEL_PATH, exist_ok=True)

1 GPU found at /device:GPU:0


In [5]:
@tf.function
def random_batch(x, y, batch_size):
    idx = tf.random.uniform(shape=[batch_size,], maxval=tf.shape(x)[0], dtype=tf.int32)
    x_tf = tf.gather(x, indices=idx)
    y_tf = tf.gather(y, indices=idx)
    return x_tf, y_tf


def print_status_bar(step, total, loss, metrics=None):
    metrics = " - ".join([f"{m.name}: {m.result()}" for m in [loss] + (metrics or [])])
    end = "" if step < total else "\n"
    tf.print(f"\r{step}/{total} - " + metrics, end=end)


def training_loop(model, x_train, y_train, n_epochs, batch_size=BATCH_SIZE, training=True):
    
    n_steps = len(x_train)// batch_size
    optimizer_low = optimizers.Adam(learning_rate=1e-4)
    optimizer_up = optimizers.Nadam(learning_rate=3e-4)
    loss_fn = losses.sparse_categorical_crossentropy
    mean_loss = keras.metrics.Mean(name='mean_loss')
    metrics = [keras.metrics.SparseCategoricalAccuracy()]
    
    for epoch in range(1, n_epochs + 1):
        tf.print(f"Epoch {epoch}/{n_epochs}")
        for step in tf.range(1, n_steps + 1, 1):
            x_train_batch, y_train_batch = random_batch(x_train, y_train, batch_size)
            with tf.GradientTape() as tape:
                y_pred = model(x_train_batch, training=training)
                main_loss = tf.reduce_mean(loss_fn(y_train_batch, y_pred))
                loss = tf.add_n([main_loss] + model.losses)
            
            gradients = tape.gradient(loss, model.trainable_variables)
            optimizer_low.apply_gradients(zip(gradients, model.trainable_variables))
            mean_loss(loss)
            for metric in metrics:
                metric(y_train_batch, y_pred)

            print_status_bar(step, n_steps, mean_loss, metrics)
        
        for metric in [mean_loss] + metrics:
            metric.reset_states()
    
    return model

@tf.function
def load_fashion_mnist():
    (x_train, y_train), (x_test, y_test) = datasets.fashion_mnist.load_data()
    x_train, x_valid = x_train[:50000], x_train[50000:]
    y_train, y_valid = y_train[:50000], y_train[50000:]
    
    return {
        "x_train": x_train,
        "y_train": y_train,
        "x_valid": x_valid,
        "y_valid": y_valid,
        "x_test": x_test,
        "y_test": y_test,
    }

In [6]:
def training_model(n_epochs=10, callbacks=False):
    
    # obtain the data
    data = load_fashion_mnist()
    
    regularizer = regularizers.l1_l2()

    model_name = 'fashion_mnist_classifier'
    model = Sequential(
        [
            layers.Flatten(input_shape=(28, 28)),
            layers.BatchNormalization(),
        ],
        name=model_name
    )

    for i in tf.range(1, 5, 1):
        model.add(
            layers.Dense(
                100,
                kernel_initializer="he_normal",
                kernel_regularizer=regularizer,
            )
        )
        model.add(layers.BatchNormalization())
        model.add(layers.Activation("elu"))

    model.add(layers.Dense(10, activation="softmax"))
    
    model_trained = training_loop(model, data['x_train'], data['y_train'], n_epochs=n_epochs)
    
    

In [8]:
training_model(n_epochs=50)

Epoch 1/50
48/48 - mean_loss: 74.74301147460938 - sparse_categorical_accuracy: 0.44689941406256564
Epoch 2/50
48/48 - mean_loss: 69.30262756347656 - sparse_categorical_accuracy: 0.6154785156255916
Epoch 3/50
48/48 - mean_loss: 64.22531127929688 - sparse_categorical_accuracy: 0.6595458984375325
Epoch 4/50
48/48 - mean_loss: 59.453487396240234 - sparse_categorical_accuracy: 0.6914062525036628
Epoch 5/50
48/48 - mean_loss: 54.98728942871094 - sparse_categorical_accuracy: 0.71840417385101327
Epoch 6/50
48/48 - mean_loss: 50.82316589355469 - sparse_categorical_accuracy: 0.73406982421875962
Epoch 7/50
48/48 - mean_loss: 46.95559310913086 - sparse_categorical_accuracy: 0.75046795606613167
Epoch 8/50
48/48 - mean_loss: 43.37821960449219 - sparse_categorical_accuracy: 0.76430261135101321
Epoch 9/50
48/48 - mean_loss: 40.068580627441406 - sparse_categorical_accuracy: 0.7766317129135132
Epoch 10/50
48/48 - mean_loss: 37.022850036621094 - sparse_categorical_accuracy: 0.7848714590072632
Epoch 11/50