# Regularization: Avoiding overfitting

This session investigates different methods of regularization. There are many more methods than those shown here, but these are common/classical methods that come to mind when trying to make a model generalize better without acquiring additional training data or undertaking pre-training (both of which are preferable options when possible). The general aim is to obtain models that generalize better to data not used in training. This is a topic of central importance because neural networks tend to overfit strongly and typically need large amounts of data to generalize.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow_datasets as tfds
import keras_cv # install keras_cv with pip for data augmentation (soon to be integrated into keras core)

# Load cifar10 data

We load the cifar 10 data set and map some functions to its elements such that the labels are float32 one-hot vectors and the images are normalized to have float32 values between 0 and 1. We include the option for data augmentation, which is only applied to the training data.

In [None]:
def get_cifar10(batch_size, augmentation_model=None):
    """
    Load and prepare CIFAR-10 as a pair of tensorflow datasets.

    Images are cast to float32 and divided by 255; labels are turned into
    float32 one-hot vectors of length 10. Both datasets are cached after this
    preprocessing, batched and prefetched. Only the training set is shuffled
    and (optionally) augmented.

    Args:
        batch_size (int): number of examples per batch.
        augmentation_model (callable, optional): called on every batch of
            training images to produce the augmented batch. It is never
            applied to the validation data. None (default) disables
            augmentation.

    Returns:
        tuple: (train_ds, val_ds), each yielding (images, one_hot_labels)
        batches. The validation dataset is built from the 'test' split.
    """
    train_ds, val_ds = tfds.load('cifar10', split=['train', 'test'], shuffle_files=True)

    def preprocess(example):
        # tfds yields feature dicts; turn them into (image, label) tuples.
        image = tf.cast(example["image"], dtype=tf.float32) / 255.
        label = tf.cast(tf.one_hot(example["label"], 10), tf.float32)
        return image, label

    # Cache after the deterministic preprocessing so that it only runs once;
    # shuffling and augmentation come after the cache and therefore differ
    # from epoch to epoch.
    train_ds = train_ds.map(preprocess).cache()
    val_ds = val_ds.map(preprocess).cache()

    train_ds = train_ds.shuffle(4096).batch(batch_size)
    # The validation metrics are averages over the whole set, so the order of
    # the validation examples is irrelevant and no shuffle buffer is needed.
    val_ds = val_ds.batch(batch_size)

    if augmentation_model is not None:
        train_ds = train_ds.map(lambda x, y: (augmentation_model(x), y),
                                num_parallel_calls=tf.data.AUTOTUNE)

    return (train_ds.prefetch(tf.data.AUTOTUNE), val_ds.prefetch(tf.data.AUTOTUNE))

# Define Keras model with options for regularization

- Dropout layers randomly set some units of their input to zero during training and scale up the remaining units, so that the expected sum of the activations stays the same


- Batch normalization or other normalization rescale the input (in the case of batch norm, according to statistics over the batch dimension)


- L1 and L2 regularization add a penalty to the loss function for high parameter values (usually a sign of overfitting, which is why we track the Frobenius norm of the weights, which is the Euclidean norm for matrices)

In [None]:
class ConvModel(tf.keras.Model):
    """
    Small convolutional classifier with switchable regularization.

    The network consists of four 3x3 convolutions with 32 filters and ReLU
    activation, a Flatten layer and a 10-unit softmax Dense layer. The custom
    train and test steps track, besides loss and accuracy, the summed
    Euclidean norms of all trainable variables ("total_frobenius_norm").

    Args:
        L2_reg (float): L2 regularization factor for the kernels of all
            Conv2D and Dense layers. A falsy value (default 0) disables it.
        dropout_rate (float): rate of the dropout applied after every layer
            except the output layer. A falsy value (default 0) disables it.
        batch_norm (bool): if True, a BatchNormalization layer follows each
            convolution.
    """

    def __init__(self, L2_reg=0, dropout_rate=0, batch_norm=False):
        super().__init__()

        kernel_regularizer = tf.keras.regularizers.L2(L2_reg) if L2_reg else None

        self.dropout_rate = dropout_rate
        if self.dropout_rate:
            # A single Dropout layer is shared by all positions in the stack.
            self.dropout_layer = tf.keras.layers.Dropout(dropout_rate)

        # Build the stack once in a local list, so that only the layers that
        # are actually used get created and assigned to the model.
        layers = []
        for _ in range(4):
            layers.append(tf.keras.layers.Conv2D(32, 3, activation="relu",
                                                 kernel_regularizer=kernel_regularizer))
            if batch_norm:
                layers.append(tf.keras.layers.BatchNormalization())
        layers.append(tf.keras.layers.Flatten())
        layers.append(tf.keras.layers.Dense(10, activation="softmax",
                                            kernel_regularizer=kernel_regularizer))
        self.layer_list = layers

        # metrics to update
        self.frobenius_metric = tf.keras.metrics.Mean(name="total_frobenius_norm")
        self.loss_metric = tf.keras.metrics.Mean(name="loss")
        self.accuracy_metric = tf.keras.metrics.CategoricalAccuracy(name="accuracy")

    def call(self, x, training=False):
        """
        Forward pass.

        Every layer except the last one is followed by dropout when a
        dropout rate was given. NOTE: with batch_norm=True this means that
        dropout is applied after the BatchNormalization layers as well.
        """
        for layer in self.layer_list[:-1]:
            x = layer(x, training=training)
            if self.dropout_rate:
                x = self.dropout_layer(x, training=training)

        return self.layer_list[-1](x)

    def reset_metrics(self):
        """Reset the state of every metric in self.metrics."""
        for metric in self.metrics:
            # reset_state() replaces the deprecated reset_states().
            metric.reset_state()

    def compute_frobenius(self):
        """
        Return the sum of the Euclidean norms of all trainable variables
        as a tensor of shape (1,).
        """
        frobenius_norm = tf.zeros((1,))
        for var in self.trainable_variables:
            frobenius_norm += tf.norm(var, ord="euclidean")
        return frobenius_norm

    def _log_step_metrics(self, loss, target, prediction):
        """Update the tracked metrics and return the results keyed by metric name."""
        self.frobenius_metric.update_state(self.compute_frobenius())
        self.loss_metric.update_state(loss)
        self.accuracy_metric.update_state(target, prediction)

        return {metric.name: metric.result() for metric in self.metrics}

    @tf.function
    def train_step(self, data):
        """
        Run one optimization step on a (x, target) batch and return the
        current metric results.
        """
        x, target = data
        with tf.GradientTape() as tape:
            prediction = self(x, training=True)
            # self.losses is handed over as the regularization losses, so
            # the value that is differentiated includes them.
            loss = self.compiled_loss(target, prediction, regularization_losses=self.losses)
        gradients = tape.gradient(loss, self.trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))

        return self._log_step_metrics(loss, target, prediction)

    @tf.function
    def test_step(self, data):
        """
        Evaluate one (x, target) batch in inference mode and return the
        current metric results.
        """
        x, target = data
        prediction = self(x, training=False)
        loss = self.compiled_loss(target, prediction, regularization_losses=self.losses)

        return self._log_step_metrics(loss, target, prediction)

## Training the model without any extra regularization

In [None]:
# Baseline run: no L2 penalty, no dropout, no batch norm, no augmentation.
train_ds, val_ds = get_cifar10(32)
model = ConvModel()
model.compile(loss=tf.keras.losses.CategoricalCrossentropy(), optimizer="adam")
history_of = model.fit(train_ds, validation_data=val_ds, epochs=25)

# The weight norm lives on a different scale than the losses, so it is
# rescaled such that its maximum equals the maximum validation loss; only the
# shape of that curve is meaningful in the plot.
frobenius_curve = np.asarray(history_of.history["total_frobenius_norm"])
val_loss_curve = history_of.history["val_loss"]
train_loss_curve = history_of.history["loss"]
scaled_frobenius = frobenius_curve / np.max(frobenius_curve) * np.max(val_loss_curve)

for curve in (scaled_frobenius, val_loss_curve, train_loss_curve):
    plt.plot(curve)
plt.legend(labels=["Total Frobenius Norm", "Validation Loss", "Loss"])
plt.savefig("convnet_no_reg.svg")
plt.show()

We see that the model overfits the data, that is, the performance on the training set gets increasingly better than on the validation set. Together with this, the total norm of the weights keeps increasing - a sign of memorization. Regularization methods can either directly address this by adding a penalty to the loss function for high parameter coefficients, or indirectly by adding noise somewhere in learning (e.g. dropout, batch norm, data augmentation).

## Training the same model with L2 regularization

L2 regularization adds a fraction of the squared Euclidean norm of the weight matrices to the loss function, such that lower weight values lead to a lower loss, discouraging memorization.

In [None]:
# Same architecture as before, but with an L2 penalty (factor 0.001) on the
# kernels of the convolutional and dense layers.
train_ds, val_ds = get_cifar10(32)
model = ConvModel(L2_reg=0.001)
model.compile(loss=tf.keras.losses.CategoricalCrossentropy(), optimizer="adam")
history_reg = model.fit(train_ds, validation_data=val_ds, epochs=25)

# Rescale the weight-norm curve so that its maximum equals the maximum
# validation loss and all three curves fit on one axis.
frobenius_curve = np.asarray(history_reg.history["total_frobenius_norm"])
val_loss_curve = history_reg.history["val_loss"]
train_loss_curve = history_reg.history["loss"]
scaled_frobenius = frobenius_curve / np.max(frobenius_curve) * np.max(val_loss_curve)

for curve in (scaled_frobenius, val_loss_curve, train_loss_curve):
    plt.plot(curve)
plt.legend(labels=["Total Frobenius Norm", "Validation Loss", "Loss"])
plt.savefig("convnet_l2_reg.svg")
plt.show()

## Training the same model with only data augmentation

There are many different kinds of data augmentation: random cropping, randomly shifting the hue, rotating, zooming, flipping axes, cutting up images and mixing them across a batch, adding noise, introducing jpeg artifacts of different sorts, changing the contrast and brightness, etc. etc.

Just as having a larger data set leads to better generalization, data augmentation does the same, albeit to a lesser degree.

In [None]:
# Augmentation pipeline for the training images: a single RandAugment layer.
# value_range is [0, 1] because get_cifar10 divides the images by 255.
augmentation_model = tf.keras.Sequential(
    [
        keras_cv.layers.RandAugment(value_range=[0, 1], magnitude=0.1),
    ]
)

In [None]:
# Unregularized model, trained on augmented data. The keyword has to be
# `augmentation_model`, the name of the parameter in get_cifar10's signature;
# `augmentation=` raises a TypeError.
train_ds, val_ds = get_cifar10(32, augmentation_model=augmentation_model)
model = ConvModel()
model.compile(loss=tf.keras.losses.CategoricalCrossentropy(), optimizer="adam")
history_augment = model.fit(train_ds, validation_data=val_ds, epochs=25)

# Rescale the weight-norm curve so that its maximum equals the maximum
# validation loss and all three curves fit on one axis.
frobenius_curve = np.asarray(history_augment.history["total_frobenius_norm"])
scaled_frobenius = frobenius_curve / np.max(frobenius_curve) * np.max(history_augment.history["val_loss"])

plt.plot(scaled_frobenius)
plt.plot(history_augment.history["val_loss"])
plt.plot(history_augment.history["loss"])
plt.legend(labels=["Total Frobenius Norm", "Validation Loss", "Loss"])
plt.savefig("convnet_augment_reg.svg")
plt.show()

## Training the same model with only dropout between layers

Dropout drops activation values in the preceding layer, preventing over-reliance on specific individual units, which is sometimes thought to be a hallmark of memorization.

In [None]:
# Model with dropout (rate 0.5) as the only regularizer, trained on
# non-augmented data. The keyword has to be `augmentation_model`, the name of
# the parameter in get_cifar10's signature; `augmentation=` raises a TypeError.
train_ds, val_ds = get_cifar10(32, augmentation_model=None)
model = ConvModel(L2_reg=0, dropout_rate=0.5)
model.compile(loss=tf.keras.losses.CategoricalCrossentropy(), optimizer="adam")
history_dropout = model.fit(train_ds, validation_data=val_ds, epochs=25)

# Rescale the weight-norm curve so that its maximum equals the maximum
# validation loss and all three curves fit on one axis.
frobenius_curve = np.asarray(history_dropout.history["total_frobenius_norm"])
scaled_frobenius = frobenius_curve / np.max(frobenius_curve) * np.max(history_dropout.history["val_loss"])

plt.plot(scaled_frobenius)
plt.plot(history_dropout.history["val_loss"])
plt.plot(history_dropout.history["loss"])
plt.legend(labels=["Total Frobenius Norm", "Validation Loss", "Loss"])
plt.savefig("convnet_dropout_reg.svg")
plt.show()