# Custom Layers and Models via Subclassing

In [1]:
# Import packages
import tensorflow as tf

from tensorflow.keras import layers

tf.__version__ # 2.x

'2.7.0'

## The Layer Class

One of the central abstraction in TF is the Layer class.
A layer encapsulates both a state (the layer's "weights") and a transformation from inputs to outputs (a "call", the layer's forward pass).

Weights are created using `Variable` method and then passing a initializer to it.

In [2]:
# Linear dense (without any activation) layer implementation
class Linear(tf.keras.layers.Layer):
    
    def __init__(self, units=32, input_dim=32):
        super(Linear, self).__init__()
        # Initialize weights
        w_init = tf.random_uniform_initializer()
        self.w = tf.Variable(
            initial_value=w_init(shape=(input_dim, units), dtype="float32"), trainable=True,
        )
        # Initialize bias
        b_init = tf.zeros_initializer()
        self.b = tf.Variable(
            initial_value=b_init(shape=(units,), dtype="float32"), trainable=True
        )
    
    def call(self, inputs):
        # Forward pass
        return tf.matmul(inputs, self.w) + self.b

In [3]:
Linear()

2021-12-05 09:26:55.649840: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-12-05 09:26:55.653917: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-12-05 09:26:55.654380: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-12-05 09:26:55.655245: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

<__main__.Linear at 0x7fc1e1c6b160>

You would use a layer by calling it on some input tensor, much like a Python function.

In [4]:
linear_layer = Linear(input_dim=2)
y = linear_layer(tf.ones((2, 2)))

print("Output:", y)

Output: tf.Tensor(
[[ 0.03256869 -0.0267356   0.02597683 -0.05120901 -0.01899035  0.00940387
  -0.03205577  0.00081595  0.0530762  -0.04714317 -0.02021838  0.05250065
   0.06428746 -0.02179533 -0.07483177 -0.0186244   0.00369168  0.02791958
   0.00781361 -0.05093931 -0.0177935  -0.00920062 -0.07213768  0.03296357
   0.03095433  0.03066542 -0.02003314  0.02657256 -0.06937231  0.05224498
  -0.01098096 -0.03049957]
 [ 0.03256869 -0.0267356   0.02597683 -0.05120901 -0.01899035  0.00940387
  -0.03205577  0.00081595  0.0530762  -0.04714317 -0.02021838  0.05250065
   0.06428746 -0.02179533 -0.07483177 -0.0186244   0.00369168  0.02791958
   0.00781361 -0.05093931 -0.0177935  -0.00920062 -0.07213768  0.03296357
   0.03095433  0.03066542 -0.02003314  0.02657256 -0.06937231  0.05224498
  -0.01098096 -0.03049957]], shape=(2, 32), dtype=float32)


Better way to create weights for each layer is by using `add_weight`.

In [5]:
class Linear(tf.keras.layers.Layer):
    def __init__(self, units=32, input_dim=32):
        super(Linear, self).__init__()
        # Add weights in constructor
        self.w = self.add_weight(
            shape=(input_dim, units), initializer=tf.random_uniform_initializer(), trainable=True
        )
        self.b = self.add_weight(shape=(units,), initializer=tf.zeros_initializer(), trainable=True)

    def call(self, inputs):
        return tf.matmul(inputs, self.w) + self.b

In [6]:
Linear()

<__main__.Linear at 0x7fc1e1c18880>

In [7]:
linear_layer = Linear(input_dim=2)
y = linear_layer(tf.ones((2, 2)))

print("Output:", y)

Output: tf.Tensor(
[[ 0.01752564  0.0009395  -0.06829813  0.03295275  0.04466343  0.03496991
  -0.03903058  0.03270707 -0.0148867   0.03014386 -0.00034757 -0.04917326
   0.06051666 -0.01691652 -0.01125877 -0.03261441 -0.01653082  0.03802218
   0.00921635 -0.02513259 -0.00793837  0.07657116  0.02421763 -0.01144796
  -0.07792601 -0.06050552  0.06634013 -0.06091205  0.01640324 -0.00594087
  -0.0485903  -0.07270332]
 [ 0.01752564  0.0009395  -0.06829813  0.03295275  0.04466343  0.03496991
  -0.03903058  0.03270707 -0.0148867   0.03014386 -0.00034757 -0.04917326
   0.06051666 -0.01691652 -0.01125877 -0.03261441 -0.01653082  0.03802218
   0.00921635 -0.02513259 -0.00793837  0.07657116  0.02421763 -0.01144796
  -0.07792601 -0.06050552  0.06634013 -0.06091205  0.01640324 -0.00594087
  -0.0485903  -0.07270332]], shape=(2, 32), dtype=float32)


Layers can have non-trainable weights as well. Set `trainable` to False for such weights.

Such weights are ignored during backpropagation, when you are training the layer.

In [8]:
class ComputeSum(tf.keras.layers.Layer):
    def __init__(self, input_dim=32):
        super(ComputeSum, self).__init__()
        # Create non-trainable weights
        self.total = self.add_weight(
            shape=(input_dim,), initializer=tf.zeros_initializer(), trainable=False
        )
    
    def call(self, inputs):
        self.total.assign_add(tf.reduce_sum(inputs, axis=0))
        return self.total

In [9]:
ComputeSum()

<__main__.ComputeSum at 0x7fc1e0196520>

In [10]:
sum_layer = ComputeSum(input_dim=2)
y = sum_layer(tf.ones((2, 2)))

print("Output:", y)

Output: <tf.Variable 'Variable:0' shape=(2,) dtype=float32, numpy=array([2., 2.], dtype=float32)>


Like trainable weights it is part of layer.weights, but it gets categorized as a non-trainable weight:

In [11]:
print("Weights:", sum_layer.weights)
print("Non-trainable weights:", sum_layer.non_trainable_weights)
print("Trainable_weights:", sum_layer.trainable_weights)

Weights: [<tf.Variable 'Variable:0' shape=(2,) dtype=float32, numpy=array([2., 2.], dtype=float32)>]
Non-trainable weights: [<tf.Variable 'Variable:0' shape=(2,) dtype=float32, numpy=array([2., 2.], dtype=float32)>]
Trainable_weights: []


In [12]:
print("Weights:", linear_layer.weights)
print("Non-trainable weights:", linear_layer.non_trainable_weights)
print("Trainable_weights:", linear_layer.trainable_weights)

Weights: [<tf.Variable 'Variable:0' shape=(2, 32) dtype=float32, numpy=
array([[ 0.03472043, -0.00285698, -0.04966266,  0.02834609,  0.01665084,
         0.01965741,  0.0091069 ,  0.01256463, -0.04892944,  0.04348368,
        -0.03333709, -0.0265271 ,  0.02079332, -0.02796942,  0.03265506,
        -0.00083668,  0.01536772,  0.01702425,  0.02468638, -0.00701565,
        -0.02936511,  0.04261743,  0.01108792, -0.03035083, -0.0371431 ,
        -0.02894319,  0.0247562 , -0.02849113,  0.03462094, -0.01732335,
        -0.01706024, -0.04409164],
       [-0.0171948 ,  0.00379648, -0.01863547,  0.00460666,  0.02801258,
         0.0153125 , -0.04813747,  0.02014244,  0.03404274, -0.01333982,
         0.03298953, -0.02264616,  0.03972334,  0.0110529 , -0.04391383,
        -0.03177773, -0.03189854,  0.02099793, -0.01547004, -0.01811694,
         0.02142674,  0.03395373,  0.01312971,  0.01890286, -0.04078291,
        -0.03156233,  0.04158392, -0.03242092, -0.01821771,  0.01138248,
        -0.031530

In many cases, you may not know in advance the size of your inputs, and you would like to lazily create weights when that value becomes known, some time after instantiating the layer.

Use `build(self, inputs_shape)` method in such scenarios.

In [13]:
class Linear(tf.keras.layers.Layer):
    def __init__(self, units=32):
        super(Linear, self).__init__()
        self.units = units

    def build(self, input_shape):
        # Initialize weights here rather than in __init__
        self.w = self.add_weight(
            shape=(input_shape[-1], self.units),
            initializer="random_uniform",
            trainable=True,
        )
        self.b = self.add_weight(
            shape=(self.units,), initializer="random_uniform", trainable=False
        )

    def call(self, inputs):
        return tf.matmul(inputs, self.w) + self.b

In [14]:
Linear()

<__main__.Linear at 0x7fc1e1c18b50>

The __call__() method of your layer will automatically run build the first time it is called.

In [15]:
# At instantiation, we don't know on what inputs this is going to get called
linear_layer = Linear(units=32)

# The layer's weights are created dynamically the first time the layer is called
y = linear_layer(tf.ones((100, 10)))

print(y)

tf.Tensor(
[[ 0.01411444 -0.15406373 -0.15456314 ...  0.07136944 -0.05941238
  -0.08913658]
 [ 0.01411444 -0.15406373 -0.15456314 ...  0.07136944 -0.05941238
  -0.08913658]
 [ 0.01411444 -0.15406373 -0.15456314 ...  0.07136944 -0.05941238
  -0.08913658]
 ...
 [ 0.01411444 -0.15406373 -0.15456314 ...  0.07136944 -0.05941238
  -0.08913658]
 [ 0.01411444 -0.15406373 -0.15456314 ...  0.07136944 -0.05941238
  -0.08913658]
 [ 0.01411444 -0.15406373 -0.15456314 ...  0.07136944 -0.05941238
  -0.08913658]], shape=(100, 32), dtype=float32)


Layers are recursively composable:

- If you assign a Layer instance as attribute of another Layer, the outer layer will start tracking the weights of the inner layer.

We recommend creating such sublayers in the __init__() method (since the sublayers will typically have a build method, they will be built when the outer layer gets built).

In [16]:
class MLPBlock(tf.keras.layers.Layer):
    def __init__(self):
        super(MLPBlock, self).__init__()
        self.linear_1 = Linear(32)
        self.linear_2 = Linear(32)
        self.linear_3 = Linear(1)

    def call(self, inputs):
        x = self.linear_1(inputs)
        x = tf.nn.relu(x)
        x = self.linear_2(x)
        x = tf.nn.relu(x)
        y = self.linear_3(x)
        return y

In [17]:
perceptron = MLPBlock()
y = perceptron(tf.ones(shape=(100, 64)))

print("Output:", y.shape)
print("Weights:", perceptron.weights)

Output: (100, 1)
Weights: [<tf.Variable 'mlp_block/linear_6/Variable:0' shape=(64, 32) dtype=float32, numpy=
array([[ 0.00449582,  0.01607111,  0.04930012, ..., -0.04768664,
         0.02447351, -0.00308717],
       [ 0.00379281, -0.01250118,  0.01786853, ...,  0.00247435,
        -0.03157356, -0.03763156],
       [-0.02351732, -0.00710692,  0.02447944, ...,  0.01211698,
        -0.01160783, -0.02966465],
       ...,
       [-0.0346539 ,  0.03762254, -0.00535514, ...,  0.02984199,
        -0.01391277,  0.03444905],
       [ 0.02291944,  0.01971172, -0.0252885 , ..., -0.04969681,
        -0.03328296,  0.00262451],
       [ 0.04289018,  0.03272624,  0.04079911, ...,  0.01104084,
         0.02344097, -0.02211528]], dtype=float32)>, <tf.Variable 'mlp_block/linear_7/Variable:0' shape=(32, 32) dtype=float32, numpy=
array([[ 0.03645417,  0.00677035,  0.02738023, ...,  0.03061763,
        -0.04291555,  0.00419666],
       [ 0.0130839 ,  0.03364328,  0.04196881, ..., -0.00578443,
        -0.003

Layers developed in such fashion (via subclassing) are not serializable.

If you need your custom layers to be serializable, you can optionally implement a `get_config()` method.

In [18]:
class Linear(tf.keras.layers.Layer):
    def __init__(self, units=32, **kwargs):
        super(Linear, self).__init__(**kwargs)
        self.units = units

    def build(self, input_shape):
        self.w = self.add_weight(
            shape=(input_shape[-1], self.units),
            initializer="random_normal",
            trainable=True,
        )
        self.b = self.add_weight(
            shape=(self.units,), initializer="random_normal", trainable=True
        )

    def call(self, inputs):
        return tf.matmul(inputs, self.w) + self.b

    def get_config(self):
        config = super(Linear, self).get_config()
        config.update({"units": self.units})
        return config

In [19]:
layer = Linear(64)
print("Layers:", layer)

config = layer.get_config()
print("Config:", config)

new_layer = Linear.from_config(config)
print("New Layer from config:", new_layer)

Layers: <__main__.Linear object at 0x7fc1e0139430>
Config: {'name': 'linear_9', 'trainable': True, 'dtype': 'float32', 'units': 64}
New Layer from config: <__main__.Linear object at 0x7fc1e0153a90>


### Differently Behaving Layers

Some layers, in particular the `BatchNormalization` layer and the `Dropout` layer, have different behaviors during training and inference.

For such layers, it is standard practice to expose a `training` (boolean) argument in the `call()` method.

In [20]:
# Sample dropout wrapper
class CustomDropout(tf.keras.layers.Layer):
    def __init__(self, rate, **kwargs):
        super(CustomDropout, self).__init__(**kwargs)
        self.rate = rate

    def call(self, inputs, training=None):
        if training:
            return tf.nn.dropout(inputs, rate=self.rate)
        return inputs

In [21]:
class MLPBlock(tf.keras.layers.Layer):
    def __init__(self):
        super(MLPBlock, self).__init__()
        self.linear_1 = Linear(32)
        self.linear_2 = Linear(32)
        self.linear_3 = Linear(1)
        self.dp = CustomDropout(rate=0.2)

    def call(self, inputs):
        x = self.linear_1(inputs)
        x = tf.nn.relu(x)
        x = self.linear_2(x)
        x = tf.nn.relu(x)
        x = self.dp(x)
        y = self.linear_3(x)
        return y

In [22]:
perceptron = MLPBlock()
y = perceptron(tf.ones(shape=(100, 64)))

print("Weights:", perceptron.weights)

Weights: [<tf.Variable 'mlp_block_1/linear_10/Variable:0' shape=(64, 32) dtype=float32, numpy=
array([[ 0.02888012,  0.01941005,  0.03883037, ...,  0.00694392,
         0.05469983, -0.0375773 ],
       [ 0.03368874, -0.03961131,  0.01829086, ...,  0.0191736 ,
        -0.02835887, -0.03824697],
       [-0.0118189 , -0.03702451, -0.06405965, ..., -0.10803524,
        -0.08172522,  0.05049881],
       ...,
       [ 0.01918423,  0.00332862, -0.01296864, ...,  0.00072299,
        -0.01678587,  0.08928937],
       [ 0.06503984,  0.06318551, -0.09207523, ..., -0.03971914,
         0.06888562, -0.10040945],
       [-0.01152491, -0.02570651,  0.06771914, ..., -0.10177924,
         0.01783559, -0.03269385]], dtype=float32)>, <tf.Variable 'mlp_block_1/linear_10/Variable:0' shape=(32,) dtype=float32, numpy=
array([ 1.2617893e-02,  3.6020290e-02, -1.4786268e-02,  1.7590590e-02,
        7.1845882e-02, -4.8739787e-02, -5.6671735e-02, -2.9385079e-02,
        9.6995480e-02,  1.6119916e-02, -7.9049490e-

The other privileged argument supported by call() is the `mask_zero` argument.

TF automatically passes the correct boolean in suhch cases to __call__() for layers that support it.

## The Model Class

The Model class has the same API as Layer, with the following differences:

- It exposes built-in training, evaluation, and prediction loops (model.fit(), model.evaluate(), model.predict()).
- It exposes the list of its inner layers, via the model.layers property.
- It exposes saving and serialization APIs (save(), save_weights()...)

In general, you will use the Layer class to define inner computation blocks, and will use the Model class to define the outer model -- the object you will train.

In [23]:
class SampleModel(tf.keras.Model):
    
    def __init__(self, num_classes):
        super(SampleModel, self).__init__()
        self.b1 = MLPBlock()
        self.b2 = MLPBlock()
        self.classifier = tf.keras.layers.Dense(num_classes)
    
    def call(self, model_input):
        x = self.b1(model_input)
        x = self.b2(x)
        return self.classifier(x)

In [24]:
model = SampleModel(num_classes=2)
print(model)

<__main__.SampleModel object at 0x7fc1e0139670>


## Putting it all together: an end-to-end example

- A Layer encapsulate a state (created in __init__() or build()) and some computation (defined in call()).
- Layers can be recursively nested to create new, bigger computation blocks.
- Layers can create and track losses (typically regularization losses) as well as metrics, via add_loss() and add_metric()
- The outer container, the thing you want to train, is a Model. A Model is just like a Layer, but with added training and serialization utilities.

In [25]:
from tensorflow.keras import layers


class Sampling(layers.Layer):
    """Uses (z_mean, z_log_var) to sample z, the vector encoding a digit."""

    def call(self, inputs):
        z_mean, z_log_var = inputs
        batch = tf.shape(z_mean)[0]
        dim = tf.shape(z_mean)[1]
        epsilon = tf.keras.backend.random_normal(shape=(batch, dim))
        return z_mean + tf.exp(0.5 * z_log_var) * epsilon


class Encoder(layers.Layer):
    """Maps MNIST digits to a triplet (z_mean, z_log_var, z)."""

    def __init__(self, latent_dim=32, intermediate_dim=64, name="encoder", **kwargs):
        super(Encoder, self).__init__(name=name, **kwargs)
        self.dense_proj = layers.Dense(intermediate_dim, activation="relu")
        self.dense_mean = layers.Dense(latent_dim)
        self.dense_log_var = layers.Dense(latent_dim)
        self.sampling = Sampling()

    def call(self, inputs):
        x = self.dense_proj(inputs)
        z_mean = self.dense_mean(x)
        z_log_var = self.dense_log_var(x)
        z = self.sampling((z_mean, z_log_var))
        return z_mean, z_log_var, z


class Decoder(layers.Layer):
    """Converts z, the encoded digit vector, back into a readable digit."""

    def __init__(self, original_dim, intermediate_dim=64, name="decoder", **kwargs):
        super(Decoder, self).__init__(name=name, **kwargs)
        self.dense_proj = layers.Dense(intermediate_dim, activation="relu")
        self.dense_output = layers.Dense(original_dim, activation="sigmoid")

    def call(self, inputs):
        x = self.dense_proj(inputs)
        return self.dense_output(x)

In [26]:
class VariationalAutoEncoder(tf.keras.Model):
    """Combines the encoder and decoder into an end-to-end model for training."""

    def __init__(
        self,
        original_dim,
        intermediate_dim=64,
        latent_dim=32,
        name="autoencoder",
        **kwargs
    ):
        super(VariationalAutoEncoder, self).__init__(name=name, **kwargs)
        self.original_dim = original_dim
        self.encoder = Encoder(latent_dim=latent_dim, intermediate_dim=intermediate_dim)
        self.decoder = Decoder(original_dim, intermediate_dim=intermediate_dim)

    def call(self, inputs):
        z_mean, z_log_var, z = self.encoder(inputs)
        reconstructed = self.decoder(z)
        # Add KL divergence regularization loss.
        kl_loss = -0.5 * tf.reduce_mean(
            z_log_var - tf.square(z_mean) - tf.exp(z_log_var) + 1
        )
        self.add_loss(kl_loss)
        return reconstructed

In [27]:
# Simple training loop on MNIST data
original_dim = 784
vae = VariationalAutoEncoder(original_dim, 64, 32)

optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
mse_loss_fn = tf.keras.losses.MeanSquaredError()

loss_metric = tf.keras.metrics.Mean()

(x_train, _), _ = tf.keras.datasets.mnist.load_data()
x_train = x_train.reshape(60000, 784).astype("float32") / 255

train_dataset = tf.data.Dataset.from_tensor_slices(x_train)
train_dataset = train_dataset.shuffle(buffer_size=1024).batch(64)

epochs = 2

# Iterate over epochs.
for epoch in range(1, epochs+1):
    print("Start of epoch %d" % (epoch,))

    # Iterate over the batches of the dataset.
    for step, x_batch_train in enumerate(train_dataset):
        with tf.GradientTape() as tape:
            reconstructed = vae(x_batch_train)
            # Compute reconstruction loss
            loss = mse_loss_fn(x_batch_train, reconstructed)
            loss += sum(vae.losses)  # Add KLD regularization loss

        grads = tape.gradient(loss, vae.trainable_weights)
        optimizer.apply_gradients(zip(grads, vae.trainable_weights))

        loss_metric(loss)

        if step % 100 == 0:
            print("step %d: mean loss = %.4f" % (step, loss_metric.result()))

Start of epoch 1
step 0: mean loss = 0.3538
step 100: mean loss = 0.1257
step 200: mean loss = 0.0992
step 300: mean loss = 0.0892
step 400: mean loss = 0.0843
step 500: mean loss = 0.0809
step 600: mean loss = 0.0788
step 700: mean loss = 0.0772
step 800: mean loss = 0.0760
step 900: mean loss = 0.0750
Start of epoch 2
step 0: mean loss = 0.0747
step 100: mean loss = 0.0740
step 200: mean loss = 0.0735
step 300: mean loss = 0.0731
step 400: mean loss = 0.0727
step 500: mean loss = 0.0723
step 600: mean loss = 0.0720
step 700: mean loss = 0.0717
step 800: mean loss = 0.0715
step 900: mean loss = 0.0712


Also, it can be trained using the built-in training loops like any other model.

In [28]:
vae = VariationalAutoEncoder(784, 64, 32)

optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)

vae.compile(optimizer, loss=tf.keras.losses.MeanSquaredError())
vae.fit(x_train, x_train, epochs=2, batch_size=64)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fc1d00878e0>