### AlexNet

In [1]:
import tensorflow as tf
from utils import util_functions as utils
from d2l import tensorflow as d2l

In [2]:
class AlexNet(utils.Classifier):
  """AlexNet-style classifier: five ReLU convolutions with three max-pools, then a dense head.

  Args:
    lr: Learning rate handed to `save_hyperparameters()`.
    num_classes: Number of units in the final Dense layer.
  """

  def __init__(self, lr=0.1, num_classes=10):
    super().__init__()
    # Kept ahead of every helper local below.
    # NOTE(review): presumably records the constructor arguments -- confirm in utils.
    self.save_hyperparameters()

    layers = tf.keras.layers

    def same_conv(filters, kernel_size):
      # Same-padded ReLU convolution shared by conv layers 2-5.
      return layers.Conv2D(filters=filters, kernel_size=kernel_size, activation='relu', padding='same')

    def pool():
      # 3x3 max-pool with stride 2, used after conv 1, conv 2 and conv 5.
      return layers.MaxPool2D(pool_size=3, strides=2)

    feature_layers = [
      layers.Conv2D(filters=96, kernel_size=11, strides=4, activation='relu'),
      pool(),
      same_conv(256, 5),
      pool(),
      same_conv(384, 3),
      same_conv(384, 3),
      same_conv(256, 3),
      pool(),
    ]
    head_layers = [
      layers.Flatten(),
      layers.Dense(units=4096, activation='relu'),
      layers.Dropout(0.5),
      layers.Dense(units=4096, activation='relu'),
      layers.Dropout(0.5),
      layers.Dense(units=num_classes),
    ]
    self.net = tf.keras.models.Sequential(feature_layers + head_layers)

In [2]:
class AlexNet(d2l.Classifier):
  """AlexNet-style classifier built on `d2l.Classifier`.

  NOTE(review): the preceding cell defines an `AlexNet` deriving from
  `utils.Classifier`; running this cell rebinds the name, so later cells use
  this d2l-based class -- confirm that is intended.

  Args:
    lr: Learning rate handed to `save_hyperparameters()`.
    num_classes: Number of units in the final Dense layer.
  """

  def __init__(self, lr=0.1, num_classes=10):
    super().__init__()
    self.save_hyperparameters()

    L = tf.keras.layers
    # Keyword bundles shared by several layers.
    relu = dict(activation='relu')
    same_relu = dict(padding='same', activation='relu')
    pool = dict(pool_size=3, strides=2)

    net_layers = [
      L.Conv2D(filters=96, kernel_size=11, strides=4, **relu),
      L.MaxPool2D(**pool),
      L.Conv2D(filters=256, kernel_size=5, **same_relu),
      L.MaxPool2D(**pool),
      L.Conv2D(filters=384, kernel_size=3, **same_relu),
      L.Conv2D(filters=384, kernel_size=3, **same_relu),
      L.Conv2D(filters=256, kernel_size=3, **same_relu),
      L.MaxPool2D(**pool),
      L.Flatten(),
      L.Dense(4096, **relu),
      L.Dropout(0.5),
      L.Dense(4096, **relu),
      L.Dropout(0.5),
      L.Dense(num_classes),
    ]
    self.net = tf.keras.models.Sequential(net_layers)

In [None]:
# Summarise a freshly built AlexNet for one 224x224 single-channel input.
AlexNet().layer_summary(X_shape=(1, 224, 224, 1))

In [None]:
# Train AlexNet for 3 epochs with the ops created here placed on the CPU.
with tf.device('/CPU:0'):
  # NOTE(review): presumably Fashion-MNIST batches of 128 with images resized to 224x224 -- confirm in utils.
  data = utils.FashionMNISTData(batch_size=128, resize=(224, 224))
  trainer = utils.Trainer(max_epochs=3)
  model = AlexNet(lr=0.1, num_classes=10)
  trainer.fit(model, data)

In [None]:
# Show the 'train_loss' entry stored on the model's board.
# NOTE(review): presumably populated by trainer.fit -- confirm in utils.
model.board.data['train_loss']

### VGG Net

In [8]:
def vgg_block(num_convs, num_channels):
  """Build one VGG block: `num_convs` 3x3 same-padded ReLU convolutions, then a 2x2 stride-2 max-pool.

  Args:
    num_convs: Number of Conv2D layers in the block.
    num_channels: Filter count used by every Conv2D layer.

  Returns:
    A `tf.keras.models.Sequential` holding the convolutions followed by the pooling layer.
  """
  conv_layers = [
    tf.keras.layers.Conv2D(filters=num_channels, kernel_size=3, padding='same', activation='relu')
    for _ in range(num_convs)
  ]
  pool_layer = tf.keras.layers.MaxPool2D(pool_size=2, strides=2)
  return tf.keras.models.Sequential(conv_layers + [pool_layer])

In [9]:
class VGGNet(utils.Classifier):
  """VGG-style classifier: a chain of `vgg_block`s followed by a three-layer dense head.

  Args:
    arch: Iterable of `(num_convs, num_channels)` pairs, one per VGG block.
    lr: Learning rate handed to `save_hyperparameters()`.
    num_classes: Number of units in the final Dense layer.
  """

  def __init__(self, arch, lr=0.1, num_classes=10):
    super().__init__()
    self.save_hyperparameters()

    self.net = tf.keras.models.Sequential()
    for block_spec in arch:
      n_convs, n_channels = block_spec
      self.net.add(vgg_block(n_convs, n_channels))

    # Dense head: Flatten, two (Dense 4096 + Dropout 0.5) pairs, then the output layer.
    head = tf.keras.models.Sequential()
    head.add(tf.keras.layers.Flatten())
    for _ in range(2):
      head.add(tf.keras.layers.Dense(4096, activation='relu'))
      head.add(tf.keras.layers.Dropout(0.5))
    head.add(tf.keras.layers.Dense(num_classes))
    self.net.add(head)

In [None]:
# Summarise the full-width VGG configuration (64 to 512 channels) for one 224x224 single-channel input.
VGGNet(arch=((1, 64), (1, 128), (2, 256), (2, 512), (2, 512))).layer_summary(X_shape=(1, 224, 224, 1))

In [None]:
# Train a narrower VGG (16 to 128 channels, a quarter of the summarised widths) for 10 epochs on the CPU.
with tf.device('/CPU:0'):
  data = utils.FashionMNISTData(batch_size=128, resize=(224, 224))
  trainer = utils.Trainer(max_epochs=10)
  # lr and num_classes fall back to the VGGNet defaults (0.1 and 10).
  model = VGGNet(arch=((1, 16), (1, 32), (2, 64), (2, 128), (2, 128)))
  trainer.fit(model, data)

### Network in Network Architecture

In [13]:
def nin_block(out_channels, kernel_size, strides, padding):
  """Build a Network-in-Network block: one spatial ReLU convolution followed by two 1x1 ReLU convolutions.

  Args:
    out_channels: Filter count for all three convolutions.
    kernel_size: Kernel size of the first (spatial) convolution.
    strides: Stride of the first convolution.
    padding: Padding mode of the first convolution.

  Returns:
    A `tf.keras.models.Sequential` with the three convolutions in order.
  """
  spatial_conv = tf.keras.layers.Conv2D(
    filters=out_channels, kernel_size=kernel_size, strides=strides, padding=padding, activation='relu')
  pointwise_convs = [
    tf.keras.layers.Conv2D(filters=out_channels, kernel_size=1, activation='relu')
    for _ in range(2)
  ]
  return tf.keras.models.Sequential([spatial_conv, *pointwise_convs])

In [14]:
class NiN(utils.Classifier):
  """Network-in-Network classifier: NiN blocks with max-pooling, ending in global average pooling.

  Args:
    lr: Learning rate handed to `save_hyperparameters()`.
    num_classes: Output channels of the last NiN block, i.e. the number of outputs after global pooling.
  """

  def __init__(self, lr=0.1, num_classes=10):
    super().__init__()
    self.save_hyperparameters()

    def downsample():
      return tf.keras.layers.MaxPool2D(pool_size=3, strides=2)

    # (out_channels, kernel_size, strides, padding) for the three pooled NiN stages.
    pooled_stages = ((96, 11, 4, 'valid'), (256, 5, 1, 'same'), (384, 3, 1, 'same'))

    stages = []
    for channels, kernel, stride, pad in pooled_stages:
      stages.append(nin_block(out_channels=channels, kernel_size=kernel, strides=stride, padding=pad))
      stages.append(downsample())
    stages.append(tf.keras.layers.Dropout(0.5))
    # The last block emits one channel per class; global average pooling reduces each channel to a single value.
    stages.append(nin_block(out_channels=num_classes, kernel_size=3, strides=1, padding='same'))
    stages.append(tf.keras.layers.GlobalAvgPool2D())
    stages.append(tf.keras.layers.Flatten())
    self.net = tf.keras.models.Sequential(stages)

In [None]:
# Summarise a default NiN for one 224x224 single-channel input.
NiN().layer_summary(X_shape=(1, 224, 224, 1))

In [None]:
# Train NiN with its default hyperparameters (lr=0.1, num_classes=10) for 10 epochs on the CPU.
with tf.device('/CPU:0'):
  data = utils.FashionMNISTData(batch_size=128, resize=(224, 224))
  trainer = utils.Trainer(max_epochs=10)
  model = NiN()
  trainer.fit(model, data)

### GoogleNet (Inception Network)

In [17]:
class Inception(tf.keras.Model):
  """Inception block: four parallel branches whose outputs are concatenated.

  Args:
    c1: Filters of the 1x1 convolution in branch 1.
    c2: Pair `(reduce, out)`: filters of the 1x1 and the 3x3 convolution in branch 2.
    c3: Pair `(reduce, out)`: filters of the 1x1 and the 5x5 convolution in branch 3.
    c4: Filters of the 1x1 convolution that follows the 3x3 max-pool in branch 4.
  """

  def __init__(self, c1, c2, c3, c4):
    super().__init__()
    conv = tf.keras.layers.Conv2D

    # Branch 1: 1x1 convolution.
    self.b1_1 = conv(filters=c1, kernel_size=1, activation='relu')

    # Branch 2: 1x1 reduction, then a same-padded 3x3 convolution.
    self.b2_1 = conv(filters=c2[0], kernel_size=1, activation='relu')
    self.b2_2 = conv(filters=c2[1], kernel_size=3, activation='relu', padding='same')

    # Branch 3: 1x1 reduction, then a same-padded 5x5 convolution.
    self.b3_1 = conv(filters=c3[0], kernel_size=1, activation='relu')
    self.b3_2 = conv(filters=c3[1], kernel_size=5, activation='relu', padding='same')

    # Branch 4: same-padded 3x3 max-pool with stride 1, then a 1x1 convolution.
    self.b4_1 = tf.keras.layers.MaxPool2D(pool_size=3, strides=1, padding='same')
    self.b4_2 = conv(filters=c4, kernel_size=1, activation='relu')

  def call(self, x):
    """Run all four branches on `x` and concatenate their outputs (Concatenate's default axis)."""
    branches = [
      self.b1_1(x),
      self.b2_2(self.b2_1(x)),
      self.b3_2(self.b3_1(x)),
      self.b4_2(self.b4_1(x)),
    ]
    return tf.keras.layers.Concatenate()(branches)

In [21]:
class GoogleNet(utils.Classifier):
  """GoogLeNet-style classifier: a two-part stem, three stages of Inception blocks, global average pooling and a Dense output.

  Args:
    lr: Learning rate handed to `save_hyperparameters()`.
    num_classes: Number of units in the final Dense layer.
  """

  def b1(self):
    """Stem part 1: 7x7 stride-2 convolution, then a 3x3 stride-2 max-pool."""
    return tf.keras.models.Sequential([
      tf.keras.layers.Conv2D(filters=64, kernel_size=7, strides=2, padding='same', activation='relu'),
      tf.keras.layers.MaxPool2D(pool_size=3, strides=2, padding='same') ## dim/4, dim/4, 64
    ])

  def b2(self):
    """Stem part 2: 1x1 convolution, 3x3 convolution, then a 3x3 stride-2 max-pool."""
    return tf.keras.models.Sequential([
      tf.keras.layers.Conv2D(filters=64, kernel_size=1, activation='relu'),
      tf.keras.layers.Conv2D(filters=192, kernel_size=3, padding='same', activation='relu'),
      tf.keras.layers.MaxPool2D(pool_size=3, strides=2, padding='same') ## dim/8, dim/8, 192
    ])

  def b3(self):
    """Stage 3: two Inception blocks, then a 3x3 stride-2 max-pool."""
    return tf.keras.models.Sequential([
      Inception(c1=64, c2=(96, 128), c3=(16, 32), c4=32), ## Output Channels: 64 + 128 + 32 + 32
      Inception(c1=128, c2=(128, 192), c3=(32, 96), c4=64), ## Output Channels: 128 + 192 + 96 + 64
      tf.keras.layers.MaxPool2D(pool_size=3, strides=2, padding='same') ## dim/16, dim/16, 480
    ])

  def b4(self):
    """Stage 4: five Inception blocks, then a 3x3 stride-2 max-pool."""
    return tf.keras.models.Sequential([
      Inception(c1=192, c2=(96, 208), c3=(16, 48), c4=64),
      Inception(c1=160, c2=(112, 224), c3=(24, 64), c4=64),
      Inception(c1=128, c2=(128, 256), c3=(24, 64), c4=64),
      Inception(c1=112, c2=(144, 288), c3=(32, 64), c4=64),
      Inception(c1=256, c2=(160, 320), c3=(32, 128), c4=128),
      tf.keras.layers.MaxPool2D(pool_size=3, strides=2, padding='same') ## dim/32, dim/32, 832
    ])

  def b5(self):
    """Stage 5: two Inception blocks, then global average pooling."""
    return tf.keras.models.Sequential([
      # The 5x5 reduction is 32 channels, matching inception (5a) in the GoogLeNet paper
      # and the identically configured last block of b4 (the notebook previously had 24 here).
      Inception(c1=256, c2=(160, 320), c3=(32, 128), c4=128),
      Inception(c1=384, c2=(192, 384), c3=(48, 128), c4=128), ## dim/32, dim/32, 1024
      tf.keras.layers.GlobalAvgPool2D(), ## 1024
      tf.keras.layers.Flatten()
    ])

  def __init__(self, lr=0.1, num_classes=10):
    super().__init__()
    self.save_hyperparameters()
    self.net = tf.keras.models.Sequential([
      self.b1(), self.b2(), self.b3(), self.b4(), self.b5(),
      tf.keras.layers.Dense(num_classes)
    ])

In [None]:
# Keep `model` bound to the network itself rather than to whatever layer_summary returns,
# following the build-then-summarise pattern of the LeNet cell.
model = GoogleNet()
model.layer_summary(X_shape=(1, 96, 96, 1))

### Batch Norm and Layer Norm

Batch Norm
  - improves the convergence of neural networks
  - Batch normalization is applied to individual layers, or optionally, to all of them: In each training iteration, we first normalize the inputs (of batch normalization) by subtracting their mean and dividing by their standard deviation, where both are estimated based on the statistics of the current minibatch. Next, we apply a scale coefficient and an offset to recover the lost degrees of freedom. It is precisely due to this normalization based on batch statistics that batch normalization derives its name.
  - Once the model is trained, we can calculate the means and variances of each layer's variables based on the entire dataset. Indeed, this is standard practice for models employing batch normalization; thus batch normalization layers function differently in training mode (normalizing by minibatch statistics) than in prediction mode (normalizing by dataset statistics). In this respect they closely resemble dropout regularization (Section 5.6 of Dive into Deep Learning), where noise is only injected during training.
  - Dense Layers: Wx + b has dims (n_samples, n_units)
    Compute mean and variance over n_samples: dims (1, n_units)
  - Conv Layers: output of the conv layer has dims (n_samples, n_width, n_height, n_channels)
    Compute mean and variance over n_samples, n_width & n_height: dims (1, 1, 1, n_channels)
  
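Layer Norm:
  - normalization within a single example, across its features
  - Dense Layers: Wx + b has dims (n_samples, n_units)
    Compute mean and variance across n_units: dims (n_samples, 1)
  - Conv Layers: output of the conv layer has dims (n_samples, n_width, n_height, n_channels)
    Compute mean and variance over n_width, n_height & n_channels: dims (n_samples, 1, 1, 1)
    (Reducing over n_width & n_height only, which gives dims (n_samples, 1, 1, n_channels), is instance normalization rather than layer normalization.)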

Implementation Details:
Putting aside the algorithmic details, note the design pattern underlying our implementation of the layer. Typically, we define the mathematics in a separate function, say batch_norm. We then integrate this functionality into a custom layer, whose code mostly addresses bookkeeping matters, such as moving data to the right device context, allocating and initializing any required variables, keeping track of moving averages (here for mean and variance), and so on. This pattern enables a clean separation of mathematics from boilerplate code. Also note that the TensorFlow layer below does not need the number of features to be specified by hand: its `build` method derives the weight shape from the last dimension of `input_shape`. All modern deep learning frameworks likewise offer automatic detection of size and shape in their high-level batch normalization APIs (in practice we will use those instead).

In [2]:
import tensorflow as tf
from utils import util_functions as utils

In [3]:
def batch_norm(X, gamma, beta, moving_mean, moving_var, eps):
  """Normalize `X` with the supplied statistics, then apply the learned scale and shift.

  Computes `gamma * (X - moving_mean) / sqrt(moving_var + eps) + beta`.

  Args:
    X: Input tensor.
    gamma: Scale parameter.
    beta: Shift parameter.
    moving_mean: Mean to subtract (batch or running statistics, as chosen by the caller).
    moving_var: Variance to divide by.
    eps: Small constant added to the variance before the reciprocal square root.

  Returns:
    The normalized, scaled and shifted tensor.
  """
  # Reciprocal standard deviation, cast to X's dtype and folded together with the scale.
  inv_std = tf.cast(tf.math.rsqrt(x=moving_var + eps), dtype=X.dtype)
  scale = inv_std * gamma
  centered = X - moving_mean
  return scale * centered + beta

In [4]:
class BatchNorm(tf.keras.layers.Layer):
  """Batch normalization over every axis except the last (feature/channel) axis.

  In training mode the layer normalizes with the statistics of the current
  batch and folds them into running averages; otherwise it normalizes with
  the running averages.
  """

  def __init__(self, **kwargs):
    super(BatchNorm, self).__init__(**kwargs)

  def build(self, input_shape):
    """Create the scale/shift weights and the running statistics, one value per feature of the last axis."""
    weight_shape = [input_shape[-1], ]
    # Trainable scale (starts at 1) and shift (starts at 0).
    self.gamma = self.add_weight(name='gamma', shape=weight_shape, initializer=tf.ones_initializer(), trainable=True)
    self.beta = self.add_weight(name='beta', shape=weight_shape, initializer=tf.zeros_initializer(), trainable=True)
    # Non-trainable running mean (starts at 0) and variance (starts at 1).
    self.moving_mean = self.add_weight(name='moving_mean', shape=weight_shape, initializer=tf.zeros_initializer(), trainable=False)
    self.moving_var = self.add_weight(name='moving_var', shape=weight_shape, initializer=tf.ones_initializer(), trainable=False)
    super(BatchNorm, self).build(input_shape)

  def assign_moving_average(self, variable, value):
    """Update `variable` in place to `0.9 * variable + 0.1 * value` and return the result of the assign."""
    momentum = 0.1
    delta = (1.0 - momentum) * variable + momentum * value
    return variable.assign(delta)

  @tf.function
  def call(self, inputs, training):
    """Normalize `inputs` with batch statistics when `training` is truthy, otherwise with the running statistics."""
    if training:
      # Reduce over every axis but the last. Use the tensor's rank here:
      # `len(inputs)` would be the batch size, or an error for a symbolic tensor.
      axis = list(range(len(inputs.shape) - 1))
      # No keepdims: the statistics come out with shape (n_features,), which matches
      # moving_mean/moving_var so `assign` accepts them, and still broadcasts
      # against `inputs` along the last axis.
      batch_mean = tf.reduce_mean(input_tensor=inputs, axis=axis)
      batch_var = tf.reduce_mean(tf.math.squared_difference(inputs, tf.stop_gradient(batch_mean)), axis=axis)
      mean_update = self.assign_moving_average(self.moving_mean, batch_mean)
      variance_update = self.assign_moving_average(self.moving_var, value=batch_var)
      self.add_update(mean_update)
      self.add_update(variance_update)
      mean, variance = batch_mean, batch_var
    else:
      mean, variance = self.moving_mean, self.moving_var

    output = batch_norm(X=inputs, gamma=self.gamma, beta=self.beta, moving_mean=mean, moving_var=variance, eps=1e-5)
    return output

In [8]:
class LeNet(utils.Classifier):
  """LeNet-style classifier with a `BatchNorm` layer between every conv/dense layer and its ReLU.

  The shape comments below assume a 28x28 single-channel input, as used by the
  layer-summary and training cells that follow.

  Args:
    lr: Learning rate handed to `save_hyperparameters()`.
    num_classes: Number of units in the final Dense layer.
  """

  def __init__(self, lr=0.01, num_classes=10):
    super().__init__()
    self.save_hyperparameters()
    self.net = tf.keras.models.Sequential([
      tf.keras.layers.Conv2D(filters=6, kernel_size=5, padding="same", strides=1),
      BatchNorm(),
      tf.keras.layers.Activation('relu'),
      ## Output: (28, 28, 6) Params: (5, 5, 1, 6) + 6
      tf.keras.layers.AvgPool2D(pool_size=2, strides=2),
      ## Output: (14, 14, 6)
      tf.keras.layers.Conv2D(filters=16, kernel_size=5, padding="valid", strides=1),
      BatchNorm(),
      tf.keras.layers.Activation('relu'),
      ## Output: (10, 10, 16) Params: (5, 5, 6, 16) + 16
      tf.keras.layers.AvgPool2D(pool_size=2, strides=2),
      ## Output: (5, 5, 16)
      tf.keras.layers.Flatten(),
      ## Output: (400)
      tf.keras.layers.Dense(units=120),
      BatchNorm(),
      tf.keras.layers.Activation('relu'),
      ## Output: (120) Params: (400, 120) + 120
      tf.keras.layers.Dense(units=84),
      BatchNorm(),
      tf.keras.layers.Activation('relu'),
      ## Output: (84) Params: (120, 84) + 84
      # Honour num_classes instead of a hard-coded 10 (the default keeps the old size).
      tf.keras.layers.Dense(units=num_classes),
      ## Output: (num_classes) Params: (84, num_classes) + num_classes
    ])

In [None]:
# Build the batch-normalized LeNet and summarise it for one 28x28 single-channel input.
model = LeNet(lr=0.1)
model.layer_summary(X_shape=(1, 28, 28, 1))

In [None]:
# Train a fresh LeNet (lr=0.1) for 10 epochs on the CPU; `model` from the summary cell is replaced here.
with tf.device('/CPU:0'):
  # Images are requested at 28x28, matching the shape used in the layer summary.
  data = utils.FashionMNISTData(batch_size=128, resize=(28, 28))
  trainer = utils.Trainer(max_epochs=10)
  model = LeNet(lr=0.1)
  trainer.fit(model, data)

### ResNet Models

In [27]:
class Residual(tf.keras.Model):
  """Residual block: two 3x3 conv + batch-norm layers with a skip connection added before the final ReLU.

  Args:
    num_channels: Filter count of every convolution in the block.
    use_1x1conv: When True, the skip path goes through a 1x1 convolution (with `strides`).
    strides: Stride of the first 3x3 convolution and of the optional 1x1 convolution.
  """

  def __init__(self, num_channels, use_1x1conv=False, strides=1):
    super().__init__()

    def conv(kernel_size, conv_strides):
      return tf.keras.layers.Conv2D(
        filters=num_channels, kernel_size=kernel_size, padding='same', strides=conv_strides)

    self.conv1 = conv(3, strides)
    self.conv2 = conv(3, 1)
    # Optional projection on the skip path; None means the input is added as is.
    self.conv3 = conv(1, strides) if use_1x1conv else None

    self.bn1 = tf.keras.layers.BatchNormalization()
    self.bn2 = tf.keras.layers.BatchNormalization()

  def call(self, X):
    """Return relu(main_path(X) + skip(X))."""
    relu = tf.keras.activations.relu
    main = relu(self.bn1(self.conv1(X)))
    main = self.bn2(self.conv2(main))
    skip = self.conv3(X) if self.conv3 is not None else X
    return relu(main + skip)

In [None]:
# Shape check for a downsampling block: stride 2 with the 1x1 skip convolution
# should turn a (4, 6, 6, 3) input into a (4, 3, 3, 6) output.
X = tf.random.normal(shape=(4, 6, 6, 3))
blk = Residual(6, use_1x1conv=True, strides=2)
Y = blk(X)
Y.shape

In [32]:
class ResNet(utils.Classifier):
  """ResNet classifier: a conv stem, one stage of residual blocks per `arch` entry, then global pooling and a Dense output.

  Args:
    arch: Iterable of `(num_residuals, num_channels)` pairs, one per stage.
    lr: Learning rate handed to `save_hyperparameters()`.
    num_classes: Number of units in the final Dense layer.
  """

  def b1(self):
    """Stem: 7x7 stride-2 convolution, batch norm, ReLU, then a 3x3 stride-2 max-pool."""
    return tf.keras.models.Sequential([
      tf.keras.layers.Conv2D(filters=64, kernel_size=7, strides=2, padding='same'),
      tf.keras.layers.BatchNormalization(),
      tf.keras.layers.Activation('relu'),
      tf.keras.layers.MaxPool2D(pool_size=3, strides=2, padding='same')
    ])

  def block(self, num_residuals, num_channels, first_block=False):
    """Build one stage of `num_residuals` Residual blocks.

    Unless `first_block` is True, the stage's first block uses stride 2 with a 1x1 skip convolution.
    """
    blk = tf.keras.models.Sequential()
    for i in range(num_residuals):
      if i == 0 and not first_block:
        blk.add(Residual(num_channels=num_channels, use_1x1conv=True, strides=2))
      else:
        blk.add(Residual(num_channels=num_channels))
    return blk

  def __init__(self, arch, lr=0.1, num_classes=10):
    super(ResNet, self).__init__()
    self.save_hyperparameters()
    # Pass the stem inside a list: `Sequential` takes a list of layers, and handing it
    # a bare model only works where Keras wraps non-list arguments itself.
    self.net = tf.keras.models.Sequential([self.b1()])
    for i, (num_residual, num_channels) in enumerate(arch):
      # The stem has already downsampled, so only later stages start with a stride-2 block.
      self.net.add(self.block(num_residuals=num_residual, num_channels=num_channels, first_block=(i == 0)))
    self.net.add(tf.keras.models.Sequential([
      tf.keras.layers.GlobalAvgPool2D(),
      tf.keras.layers.Dense(units=num_classes)
    ]))

In [33]:
class ResNet18(ResNet):
  """ResNet with four stages of two residual blocks each, at 64, 128, 256 and 512 channels.

  Args:
    lr: Learning rate forwarded to `ResNet`.
    num_classes: Number of output classes forwarded to `ResNet`.
  """

  def __init__(self, lr=0.1, num_classes=10):
    # (num_residuals, num_channels) for each of the four stages.
    stage_spec = ((2, 64), (2, 128), (2, 256), (2, 512))
    super().__init__(arch=stage_spec, lr=lr, num_classes=num_classes)

In [None]:
# Summarise a default ResNet18 for one 224x224 single-channel input.
ResNet18().layer_summary(X_shape=(1, 224, 224, 1))

In [None]:
# Walk the ResNet-18 stage spec, printing each stage index on one line and its
# (num_residual, num_channels) pair on the next.
for i, b in enumerate(((2, 64), (2, 128), (2, 256), (2, 512))):
  num_residual, num_channels = b
  print(i, f"{num_residual} {num_channels}", sep="\n")

### ResNeXt

### DenseNet

In [1]:
import tensorflow as tf
from utils import util_functions as utils

In [6]:
class ConvBlock(tf.keras.layers.Layer):
  """DenseNet conv block: batch norm, ReLU and a 3x3 convolution, with the input concatenated to the result.

  Args:
    num_channels: Filter count of the convolution, i.e. how many channels the block appends to its input.
  """

  def __init__(self, num_channels):
    super(ConvBlock, self).__init__()
    self.bn = tf.keras.layers.BatchNormalization()
    self.relu = tf.keras.layers.ReLU()
    self.conv = tf.keras.layers.Conv2D(filters=num_channels, kernel_size=(3, 3), padding='same')
    self.listLayers = [self.bn, self.relu, self.conv]

  def call(self, x):
    """Apply bn -> relu -> conv to `x` and return `x` concatenated with the result along the last axis."""
    y = x
    # Iterate the list itself; `.layers` is not a list attribute and only exists on
    # the wrapper object TensorFlow substitutes for list-valued layer attributes.
    for layer in self.listLayers:
      y = layer(y)
    y = tf.keras.layers.concatenate([x, y], axis=-1)
    return y

In [7]:
class DenseBlock(tf.keras.layers.Layer):
  """DenseNet dense block: `num_convs` ConvBlocks applied in sequence.

  Args:
    num_convs: Number of ConvBlocks in the block.
    num_channels: Filter count passed to every ConvBlock.
  """

  def __init__(self, num_convs, num_channels):
    super(DenseBlock, self).__init__()
    self.listLayers = []
    for _ in range(num_convs):
      self.listLayers.append(ConvBlock(num_channels=num_channels))

  def call(self, x):
    """Feed `x` through each ConvBlock in order and return the final output."""
    # Iterate the list itself; `.layers` is not a list attribute and only exists on
    # the wrapper object TensorFlow substitutes for list-valued layer attributes.
    for layer in self.listLayers:
      x = layer(x)
    return x

In [8]:
# Shape check: each of the 2 ConvBlocks concatenates 10 new channels onto its input,
# so 3 input channels become 3 + 2 * 10 = 23 (see the recorded output below).
blk = DenseBlock(num_convs=2, num_channels=10)
X = tf.random.uniform(shape=(4, 8, 8, 3))
Y = blk(X)
Y.shape

TensorShape([4, 8, 8, 23])

In [9]:
class TransitionBlock(tf.keras.layers.Layer):
  """DenseNet transition: batch norm, ReLU, a 1x1 convolution to `num_channels`, then 2x2 stride-2 average pooling.

  Args:
    num_channels: Filter count of the 1x1 convolution.
    **kwargs: Forwarded to `tf.keras.layers.Layer`.
  """

  def __init__(self, num_channels, **kwargs):
    super(TransitionBlock, self).__init__(**kwargs)
    keras_layers = tf.keras.layers
    self.batch_norm = keras_layers.BatchNormalization()
    self.relu = keras_layers.ReLU()
    self.conv = keras_layers.Conv2D(filters=num_channels, kernel_size=1)
    self.avg_pool = keras_layers.AvgPool2D(pool_size=2, strides=2)

  def call(self, x):
    """Return avg_pool(conv(relu(batch_norm(x))))."""
    activated = self.relu(self.batch_norm(x))
    return self.avg_pool(self.conv(activated))

In [10]:
# Shape check: the 1x1 convolution sets the channel count to 10 and the 2x2 stride-2
# average pool halves the 8x8 spatial size (see the recorded output below).
blk = TransitionBlock(num_channels=10)
X = tf.random.uniform(shape=(4, 8, 8, 3))
Y = blk(X)
Y.shape

TensorShape([4, 4, 4, 10])

In [13]:
class DenseNet(utils.Classifier):
  """DenseNet classifier: a conv stem, dense blocks separated by channel-halving transition blocks, and a pooled Dense head.

  Args:
    num_channels: Channel count assumed to enter the first dense block, used to size the
      transition blocks. The stem in `b1` always emits 64 channels, so this should stay 64.
    growth_rate: Channels added by every ConvBlock inside a dense block.
    arch: Number of ConvBlocks in each dense block.
    lr: Learning rate handed to `save_hyperparameters()`.
    num_classes: Number of units in the final Dense layer.
  """

  def b1(self):
    """Stem: 7x7 stride-2 convolution, batch norm, ReLU, then a 3x3 stride-2 max-pool."""
    return tf.keras.models.Sequential([
      tf.keras.layers.Conv2D(filters=64, kernel_size=7, strides=2, padding='same'),
      tf.keras.layers.BatchNormalization(),
      tf.keras.layers.ReLU(),
      tf.keras.layers.MaxPool2D(pool_size=3, strides=2, padding='same')
    ])

  def __init__(self, num_channels=64, growth_rate=32, arch=(4, 4, 4, 4), lr=0.1, num_classes=10):
    super(DenseNet, self).__init__()
    self.save_hyperparameters()
    # Pass the stem inside a list: `Sequential` takes a list of layers, and handing it
    # a bare model only works where Keras wraps non-list arguments itself.
    self.net = tf.keras.models.Sequential([self.b1()])
    last_index = len(arch) - 1
    for i, num_convs in enumerate(arch):
      self.net.add(DenseBlock(num_convs=num_convs, num_channels=growth_rate))
      # Every ConvBlock concatenates growth_rate channels onto its input.
      num_channels += num_convs * growth_rate
      # A transition block that halves the channel count follows every dense block except the last.
      if i != last_index:
        num_channels //= 2
        self.net.add(TransitionBlock(num_channels=num_channels))

    self.net.add(tf.keras.models.Sequential([
      tf.keras.layers.BatchNormalization(),
      tf.keras.layers.ReLU(),
      tf.keras.layers.GlobalAvgPool2D(),
      tf.keras.layers.Flatten(),
      tf.keras.layers.Dense(units=num_classes)
    ]))

In [14]:
# Summarise a default DenseNet for one 224x224 three-channel input; the recorded output below
# lists the output shape of each top-level layer.
DenseNet().layer_summary(X_shape=(1, 224, 224, 3))

Sequential output shape:	 (1, 56, 56, 64)
DenseBlock output shape:	 (1, 56, 56, 192)
TransitionBlock output shape:	 (1, 28, 28, 96)
DenseBlock output shape:	 (1, 28, 28, 224)
TransitionBlock output shape:	 (1, 14, 14, 112)
DenseBlock output shape:	 (1, 14, 14, 240)
TransitionBlock output shape:	 (1, 7, 7, 120)
DenseBlock output shape:	 (1, 7, 7, 248)
Sequential output shape:	 (1, 10)
