# Using neural networks for L96 parameterization

In [None]:
%matplotlib inline
import math
from IPython.display import HTML, Image

import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn.functional as F
import torch.utils.data as Data
import torchvision
from torch import nn, optim
from torch.autograd import Variable

from L96_model import L96, RK2, RK4, EulerFwd, L96_eq1_xdot, integrate_L96_2t

In [None]:
# Fix the NumPy and PyTorch random seeds so that repeated runs of the
# notebook produce the same random numbers
SEED = 14
np.random.seed(SEED)
torch.manual_seed(SEED);

To recap, [Lorenz (1996)](https://www.ecmwf.int/en/elibrary/10829-predictability-problem-partly-solved) describes a two-time scale dynamical system using two equations which are:
\begin{align}
\frac{d}{dt} X_k
&= - X_{k-1} \left( X_{k-2} - X_{k+1} \right) - X_k + F - \left( \frac{hc}{b} \right) \sum_{j=0}^{J-1} Y_{j,k}
\\
\frac{d}{dt} Y_{j,k}
&= - cbY_{j+1,k} \left( Y_{j+2,k} - Y_{j-1,k} \right) - c Y_{j,k} + \frac{hc}{b} X_k
\end{align}


## Defining the GCM Classes

In all the GCMs, we set time stepping using the fourth order Runge-Kutta method.

### *Without* Neural Network Parameterization

#### No Parameterization

In [None]:
class GCM_without_parameterization:
    """GCM that integrates the resolved X equation with no closure term

    Args:
        F: Forcing term
        time_stepping: Time stepping method (RK4 unless another is given)
    """

    def __init__(self, F, time_stepping=RK4):
        self.time_stepping = time_stepping
        self.F = F

    def rhs(self, X, _):
        """Compute the right hand side of the GCM equations

        The second argument only exists so that the signature matches what the
        time stepping method passes in; it is ignored.
        """
        return L96_eq1_xdot(X, self.F)

    def __call__(self, X0, dt, nt, param=[0]):
        """Run GCM

        Args:
            X0: Initial conditions of X
            dt: Time increment
            nt: Number of forward steps to take
            param: Parameters of closure (forwarded to the time stepping
                method; `rhs` of this class does not use them)

        Returns:
            Model output for all variables of X at each timestep, with shape
            (nt + 1, len(X0)), along with the corresponding time units
        """
        n_levels = nt + 1
        time = dt * np.arange(n_levels)
        # NaN-filled so that any row left unwritten is easy to spot
        hist = np.full((n_levels, len(X0)), np.nan)

        X = X0.copy()
        hist[0] = X
        for level in range(1, n_levels):
            X = self.time_stepping(self.rhs, dt, X, param)
            hist[level] = X
            time[level] = dt * level
        return hist, time

#### Linear Parameterization in RHS of Equation for Tendency

In [None]:
class GCM_linear_parameterization:
    """GCM whose tendency is corrected by a user-supplied parameterization

    Args:
        F: Forcing term
        parameterization: Parameterization function, called as
            `parameterization(param, X)`
        time_stepping: Time stepping method (RK4 unless another is given)
    """

    def __init__(self, F, parameterization, time_stepping=RK4):
        self.time_stepping = time_stepping
        self.parameterization = parameterization
        self.F = F

    def rhs(self, X, param):
        """Compute the right hand side of the GCM equations

        The parameterized term is subtracted from the resolved tendency.
        """
        resolved_tendency = L96_eq1_xdot(X, self.F)
        closure = self.parameterization(param, X)
        return resolved_tendency - closure

    def __call__(self, X0, dt, nt, param=[0]):
        """Run GCM

        Args:
            X0: Initial conditions of X
            dt: Time increment
            nt: Number of forward steps to take
            param: Parameters of closure (forwarded to the time stepping method)

        Returns:
            Model output for all variables of X at each timestep, with shape
            (nt + 1, len(X0)), along with the corresponding time units
        """
        n_levels = nt + 1
        time = dt * np.arange(n_levels)
        # NaN-filled so that any row left unwritten is easy to spot
        hist = np.full((n_levels, len(X0)), np.nan)

        X = X0.copy()
        hist[0] = X
        for level in range(1, n_levels):
            X = self.time_stepping(self.rhs, dt, X, param)
            hist[level] = X
            time[level] = dt * level
        return hist, time

### *With* Neural Network Parameterization

In [None]:
class GCM_network:
    """GCM with neural network parameterization

    Args:
        F: Forcing term
        network: Neural network. Its first layer must be exposed as `linear1`;
            the layer's `in_features` selects how the state is fed to it
        time_stepping: Time stepping method
    """

    def __init__(self, F, network, time_stepping=RK4):
        self.F = F
        self.network = network
        self.time_stepping = time_stepping

    def rhs(self, X, _):
        """Compute the right hand side of the GCM equations

        The network output is added to the resolved tendency. The second
        argument is ignored.
        """
        first_layer = self.network.linear1

        # Cast to the dtype of the network weights so that e.g. a float64
        # state does not fail with a dtype mismatch inside the network
        X_torch = torch.as_tensor(X, dtype=first_layer.weight.dtype)
        if first_layer.in_features == 1:
            # Local network: every element of X is its own sample
            X_torch = X_torch.unsqueeze(1)
        else:
            # Non-local network: the whole state is a single sample
            X_torch = X_torch.unsqueeze(0)

        # Pure inference: no autograd graph is needed while integrating
        with torch.no_grad():
            nn_tendency = self.network(X_torch)

        # Adding NN parameterization
        return L96_eq1_xdot(X, self.F) + np.squeeze(nn_tendency.numpy())

    def __call__(self, X0, dt, nt, param=[0]):
        """Run GCM

        Args:
            X0: Initial conditions of X
            dt: Time increment
            nt: Number of forward steps to take
            param: Parameters of closure (forwarded to the time stepping
                method; `rhs` of this class does not use them)

        Returns:
            Model output for all variables of X at each timestep, with shape
            (nt + 1, len(X0)), along with the corresponding time units
        """
        time = dt * np.arange(nt + 1)
        # NaN-filled so that any row left unwritten is easy to spot
        hist = np.full((nt + 1, len(X0)), np.nan)
        X = X0.copy()
        hist[0] = X

        for n in range(nt):
            X = self.time_stepping(self.rhs, dt, X, param)
            hist[n + 1], time[n + 1] = X, dt * (n + 1)
        return hist, time

### Build the *Real World* to Generate the Ground Truth Dataset

We initialise the L96 two time-scale model using $K$ (set to 8) values of $X$ and $J$ (set to 32) values of $Y$ for each $X$. The model is run for 20,000 timesteps to generate the dataset for the neural network.

In [None]:
# Run configuration: number of steps, forcing, time increment and the
# resulting total integration time
time_steps = 20000
forcing = 18
dt = 0.01
T = dt * time_steps

# The "real world": a two time-scale L96 model with K=8 and J=32
W = L96(8, 32, F=forcing)

### Getting Training Data

Using the *real world* model created above we generate the training data (input and output pairs) for the neural network by running the true state and outputting subgrid tendencies.

In [None]:
# Run the real world and keep the X history together with `xy_true`,
# the effect of Y on X
X_true, _, _, xy_true = W.run(dt, T, store=True, return_coupling=True)

# Cast once to `float32` here so that no type conversions are needed later on
X_true = X_true.astype(np.float32)
xy_true = xy_true.astype(np.float32)

### Split the Data to obtain the Training and Test (Validation) Set

In [None]:
# Number of time steps held out at the end of the run for validation
val_size = 4000

# Training data: everything except the last `val_size` time steps
# (all columns are kept; any flattening is done when the datasets are built)
X_true_train = X_true[:-val_size]
subgrid_tend_train = xy_true[:-val_size]

# Test data: the last `val_size` time steps
X_true_test = X_true[-val_size:]
subgrid_tend_test = xy_true[-val_size:]

### Create Data Loaders 

- The `Dataset` and `DataLoader` classes provide a very convenient way of iterating over a dataset while training a deep learning model.

- We need to iterate over the data because it is very slow and memory-intensive to hold all the data and to use gradient descent over all the data simultaneously (see more details [here](https://machinelearningmastery.com/gentle-introduction-mini-batch-gradient-descent-configure-batch-size/) and [here](https://pytorch.org/tutorials/beginner/data_loading_tutorial.html)).

In [None]:
# Number of samples in each mini-batch produced by the data loaders
BATCH_SIZE = 1024

Define the X (state), Y (subgrid tendency) pairs for the linear regression local network.

In [None]:
# Flatten both arrays so that every single X value, paired with its subgrid
# tendency, becomes one training sample
local_inputs = torch.from_numpy(X_true_train.reshape(-1))
local_targets = torch.from_numpy(subgrid_tend_train.reshape(-1))
local_dataset = Data.TensorDataset(local_inputs, local_targets)

# Mini-batches are drawn in a new random order on every pass
local_loader = Data.DataLoader(local_dataset, batch_size=BATCH_SIZE, shuffle=True)

Define the dataloader for the test set.

In [None]:
# Same flattening as for the training set, applied to the held-out data
local_inputs_test = torch.from_numpy(X_true_test.reshape(-1))
local_targets_test = torch.from_numpy(subgrid_tend_test.reshape(-1))
local_dataset_test = Data.TensorDataset(local_inputs_test, local_targets_test)

local_loader_test = Data.DataLoader(
    local_dataset_test, batch_size=BATCH_SIZE, shuffle=True
)

Display a batch of samples from the dataset.

In [None]:
# Pull the first mini-batch out of the training loader
data_iterator = iter(local_loader)
X_iter, subgrid_tend_iter = next(data_iterator)

print("X (State):\n", X_iter)
print("\nY (Subgrid Tendency):\n", subgrid_tend_iter)

# Scatter of subgrid tendency against state for the samples of this batch
fig, ax = plt.subplots(dpi=150)
ax.plot(X_iter, subgrid_tend_iter, ".")
ax.set_xlabel("State", fontsize=20)
ax.set_ylabel("Subgrid tendency", fontsize=20);

## Building Neural Networks in PyTorch

The **Universal Approximation Theorem** states that neural networks can approximate any continuous function. A visual demonstration that neural nets can compute any function can be seen on [this page](http://neuralnetworksanddeeplearning.com/chap4.html).

In this notebook, we give a brief overview of neural networks and how to build them using PyTorch. If you want to go through it in depth, check out these resources:
- [Deep Learning With Pytorch: A 60 Minute Blitz](https://pytorch.org/tutorials/beginner/deep_learning_60min_blitz.html)
- [Neural Networks](https://pytorch.org/tutorials/beginner/blitz/neural_networks_tutorial.html)

### Neural Network Architectures

We will try to understand the fully connected networks with the help of Linear regression (and gradient descent).

<center>
  <img
    src="https://miro.medium.com/max/720/1*VHOUViL8dHGfvxCsswPv-Q.png"
    width=400
  />
</center>

### Building a Linear Regression Network

First, we will build a linear regression "network" and later see how to generalize the linear regression in order to use fully connected neural networks.

In [None]:
class LinearRegression(nn.Module):
    """Linear regression written as a network with a single linear layer"""

    def __init__(self):
        super().__init__()
        # One input feature mapped to one output feature
        self.linear1 = nn.Linear(in_features=1, out_features=1)

    def forward(self, x):
        """Apply the linear layer to `x`

        This method runs automatically when an object of this class is called.
        """
        return self.linear1(x)

In [None]:
# Create the (untrained) linear regression network; the bare name on the last
# line makes the notebook display the module's layers
linear_network = LinearRegression()
linear_network

### Obtaining Predictions from the Network

In [None]:
# Feed one random sample of shape (1, 1), i.e. a batch of one scalar input,
# through the network
net_input = torch.randn(1, 1)
out = linear_network(net_input)
print(f"The output of the random input is: {out.item():.4f}")

### Defining the Loss Function

In order to check how well our network is modeling the dataset, we need to define a loss function. For our task, we choose the *Mean Squared Error* metric as our loss function.

In [None]:
# Mean squared error between predictions and targets
criterion = nn.MSELoss()

# One (input, output) mini-batch from the training loader
X_tmp = next(iter(local_loader))
batch_inputs, batch_targets = X_tmp

# Forward pass; a trailing feature dimension is added because the network
# expects inputs of shape (batch, 1)
y_tmp = linear_network(batch_inputs.unsqueeze(1))

# MSE between the predictions and the equally reshaped targets
loss = criterion(y_tmp, batch_targets.unsqueeze(1))
print(f"MSE Loss: {loss.item():.4f}")

### Calculating gradients

In [None]:
bias = linear_network.linear1.bias

# Reset the gradient buffers of all parameters
linear_network.zero_grad()
print("Gradients before backward:")
print(bias.grad)

# Backpropagate the loss; `retain_graph=True` keeps the graph alive so that
# `loss.backward` can be called again afterwards
loss.backward(retain_graph=True)
print("\nGradients after backward:")
print(bias.grad)

### Updating the Weights using an Optimizer

Now in order to make the network learn, we need an algorithm that will update its weights depending on the loss function. This is achieved by using an optimizer. The implementation of almost every optimizer that we'll ever need can be found in PyTorch itself. The choice of which optimizer we choose might be very important as it will determine how fast the network will be able to learn.

In the example below, we show one of the popular optimizers `SGD`.

In [None]:
# SGD with momentum acting on the parameters of the linear network
optimizer = optim.SGD(linear_network.parameters(), lr=0.003, momentum=0.9)
print("Before backward pass: \n", list(linear_network.parameters())[0].detach().numpy())

# Gradients accumulate across `backward` calls, so clear whatever earlier
# calls left behind; otherwise the step below would use a summed gradient
optimizer.zero_grad()
loss.backward(retain_graph=True)
optimizer.step()

print("\nAfter backward pass: \n", list(linear_network.parameters())[0].detach().numpy())

An optimizer usually consists of two major hyperparameters called the **learning rate** and **momentum**. The **learning rate** determines the magnitude with which the weights of the network update thus making it crucial to choose the correct learning rate ($LR$) otherwise the network will either fail to train, or take much longer to converge. To read about **momentum**, check out this [blog post](https://towardsdatascience.com/stochastic-gradient-descent-with-momentum-a84097641a5d).

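The effective value of the gradient $V$ at step $t$ in SGD with momentum ($\beta$) is commonly written as the exponentially weighted average below (note that `torch.optim.SGD` by default uses the slightly different form $V_t = \beta V_{t-1} + \nabla_w L(W,X,y)$, i.e. without the $(1-\beta)$ factor):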

\begin{equation}
V_t = \beta V_{t-1} + (1-\beta) \nabla_w L(W,X,y)
\end{equation}

and the updates to the weights will be

\begin{equation}
w^{new} = w^{old} - LR * V_t
\end{equation}

#### Adam Optimizer

Another popular optimizer that is used in many neural networks is the Adam optimizer. It is an adaptive learning rate method that computes individual learning rates for different parameters. For further reading, check out this [post](https://towardsdatascience.com/adam-latest-trends-in-deep-learning-optimization-6be9a291375c) about Adam, and this [post](https://www.ruder.io/optimizing-gradient-descent/) about other optimizers.


## Combining it all Together: Training the Whole Network

### Define the Training and Test Functions

In [None]:
def train_model(network, criterion, loader, optimizer):
    """Train the network for one epoch

    Args:
        network: Network to train; it is switched to training mode
        criterion: Loss function, called as `criterion(prediction, target)`
        loader: Iterable of `(batch_x, batch_y)` pairs that supports `len()`
        optimizer: Optimizer used to update the parameters of `network`

    Returns:
        Average of the per-batch losses over the epoch
    """
    network.train()

    train_loss = 0
    for batch_x, batch_y in loader:
        # Get predictions
        if batch_x.dim() == 1:
            # 1D inputs hold one scalar sample per element: add the feature
            # dimension the network expects and remove it from the output.
            # Only the last dimension is squeezed, so a batch holding a single
            # sample keeps its batch dimension and matches `batch_y` in shape.
            prediction = network(batch_x.unsqueeze(1)).squeeze(-1)
        else:
            prediction = network(batch_x)

        # Compute the loss
        loss = criterion(prediction, batch_y)
        train_loss += loss.item()

        # Clear the gradients left over from the previous batch
        optimizer.zero_grad()

        # Backpropagation to compute the gradients and update the weights
        loss.backward()
        optimizer.step()

    return train_loss / len(loader)

In [None]:
def test_model(network, criterion, loader):
    """Test the network"""
    network.eval()  # Evaluation mode (important when having dropout layers)

    test_loss = 0
    with torch.no_grad():
        for batch_x, batch_y in loader:
            # Get predictions
            if len(batch_x.shape) == 1:
                # This if block is needed to add a dummy dimension if our inputs are 1D
                # (where each number is a different sample)
                prediction = torch.squeeze(network(torch.unsqueeze(batch_x, 1)))
            else:
                prediction = network(batch_x)

            # Compute the loss
            loss = criterion(prediction, batch_y)
            test_loss += loss.item()

        # Get an average loss for the entire dataset
        test_loss /= len(loader)

    return test_loss

In [None]:
def fit_model(network, criterion, optimizer, train_loader, val_loader, n_epochs):
    """Train and validate the network

    Every epoch consists of one training pass over `train_loader` followed by
    one evaluation pass over `val_loader`; both losses are printed.

    Returns:
        Two lists with the training and the validation loss of every epoch
    """
    train_losses = []
    val_losses = []
    for epoch_index in range(n_epochs):
        print(f"Epoch {epoch_index + 1}:")
        epoch_train_loss = train_model(network, criterion, train_loader, optimizer)
        epoch_val_loss = test_model(network, criterion, val_loader)

        train_losses.append(epoch_train_loss)
        val_losses.append(epoch_val_loss)
        print(
            f"Training Loss: {epoch_train_loss:.6f} | Validation Loss: {epoch_val_loss:.6f}\n"
        )

    return train_losses, val_losses

### Set the Number of Epochs and the Optimizer

Epochs refer to the number of times we iterate over the entire training data during training.

In [None]:
# Number of passes over the training data
n_epochs = 3
# Adam optimizer acting on the linear network's parameters; this rebinds
# `optimizer`, replacing the SGD instance used in the demonstration above
optimizer = optim.Adam(linear_network.parameters(), lr=0.03)

### Train the Network

In [None]:
# Train the linear network; the returned loss histories are not needed here
_, _ = fit_model(
    network=linear_network,
    criterion=criterion,
    optimizer=optimizer,
    train_loader=local_loader,
    val_loader=local_loader_test,
    n_epochs=n_epochs,
)

### Show the Weights of the Trained Network

In [None]:
# Fitted slope and intercept of the linear network, stored slope first
slope = linear_network.linear1.weight.detach().numpy()[0, 0]
intercept = linear_network.linear1.bias.detach().numpy()[0]
weights = np.array([slope, intercept])
print(weights)

### Compare Predictions with Ground Truth

In [None]:
# Predict the subgrid tendency from the X values in column 1 of the test set;
# the network expects inputs of shape (samples, 1)
test_inputs = torch.from_numpy(X_true_test[:, 1]).unsqueeze(1)
predictions = linear_network(test_inputs)

# Compare the first 1000 predictions with the true subgrid tendency
predicted_values = predictions.detach().numpy()[:1000]
true_values = subgrid_tend_test[:1000, 1]
plt.figure(dpi=150)
plt.plot(predicted_values, label="Predicted Values")
plt.plot(true_values, label="True Values")
plt.legend(fontsize=7);

### Putting the Simple Linear Parameterization back to the GCM

In [None]:
T_test = 10
# Every model below is integrated for the same number of steps so that the
# trajectories share one time axis
nt_test = int(T_test / dt)

# Full L96 model
# NOTE(review): presumably `W.run` continues from the state reached at the end
# of the run that produced `X_true`, which is why `X_true[-1]` is used as the
# initial condition of the GCMs below - confirm against `L96.run`
X_full, _, _ = W.run(dt, T_test)
X_full = X_full.astype(np.float32)

init_conditions = X_true[-1, :]

# GCM parameterized by the linear network
gcm_net = GCM_network(forcing, linear_network)
Xnn_1layer, t = gcm_net(init_conditions, dt, nt_test, linear_network)

# GCM without parameterization
gcm_no_param = GCM_without_parameterization(forcing)
X_no_param, t = gcm_no_param(init_conditions, dt, nt_test)


# GCM with naive parameterization: a polynomial in X with coefficients `param`
def naive_parameterization(param, X):
    return np.polyval(param, X)


gcm = GCM_linear_parameterization(forcing, naive_parameterization)
# `GCM_linear_parameterization` subtracts the parameterization, hence the
# negated weights; run for `T_test` (not the full training length `T`)
X_param, t = gcm(init_conditions, dt, nt_test, param=-weights)

### Compare Results

In [None]:
# Number of time levels to show and the trajectories to compare
# (entries: data, extra positional plot arguments, legend label)
time_i = 200
curves = (
    (X_full, (), "Full L96"),
    (Xnn_1layer, (".",), "NN 1 layer"),
    (X_no_param, (), "No parameterization"),
    (X_param, (), "linear parameterization"),
)

# Plot column 4 of every trajectory against time
plt.figure(dpi=150)
for trajectory, fmt_args, curve_label in curves:
    plt.plot(t[:time_i], trajectory[:time_i, 4], *fmt_args, label=curve_label)
plt.legend(loc="upper left", fontsize=7);

# Using Deeper Networks for Lorenz 96 (with Non-Local Features)

Now we'll increase the complexity of our neural network by adding a few more linear layers to it.

<center>
  <img
    src="https://www.researchgate.net/publication/319201436/figure/fig1/AS:869115023589376@1584224577926/Visualisation-of-a-two-scale-Lorenz-96-system-with-J-8-and-K-6-Global-scale-values.png"
    width=400
  />
</center>

<span> <center> *Fig. 1: Visualisation of a two-scale Lorenz '96 system with J = 8 and K = 6. Global-scale variables ($X_k$) are updated based on neighbouring variables and on the local-scale variables ($Y_{j,k}$) associated with the corresponding global-scale variable. Local-scale variables are updated based on neighbouring variables and the associated global-scale variable. The neighbourhood topology of both local and global-scale variables is circular. Image from [Exploiting the chaotic behaviour of atmospheric models with reconfigurable architectures - Scientific Figure on ResearchGate.](https://www.researchgate.net/figure/Visualisation-of-a-two-scale-Lorenz-96-system-with-J-8-and-K-6-Global-scale-values_fig1_319201436)* </center> </span>

## Create non-local Training and Test Dataset

The datasets will have *8 inputs* and *8 outputs*.

In [None]:
# Number of samples in each mini-batch produced by the data loaders
BATCH_SIZE = 1024

Create the training dataset and data loader

In [None]:
# Each sample is one full row of X paired with the matching row of subgrid
# tendencies (no flattening this time)
nlocal_inputs = torch.from_numpy(X_true_train)
nlocal_targets = torch.from_numpy(subgrid_tend_train)
nlocal_data = Data.TensorDataset(nlocal_inputs, nlocal_targets)

loader = Data.DataLoader(nlocal_data, batch_size=BATCH_SIZE, shuffle=True)

Create test dataset and data loader

In [None]:
# Held-out rows of X and subgrid tendencies, one row per sample
nlocal_inputs_test = torch.from_numpy(X_true_test)
nlocal_targets_test = torch.from_numpy(subgrid_tend_test)
nlocal_data_test = Data.TensorDataset(nlocal_inputs_test, nlocal_targets_test)

loader_test = Data.DataLoader(nlocal_data_test, batch_size=BATCH_SIZE, shuffle=True)

## Creating a class of a 3 layer fully-connected network with ReLU

We now build a 3 layer neural network consisting of two hidden layers and an output layer. This time we use an activation function called `ReLU` (a very common choice) after every hidden layer.

In [None]:
class NetANN(nn.Module):
    """Fully connected network (8 -> 16 -> 16 -> 8) with ReLU after each hidden layer"""

    def __init__(self):
        super().__init__()
        n_features, n_hidden = 8, 16  # 8 inputs and 8 outputs
        self.linear1 = nn.Linear(n_features, n_hidden)
        self.linear2 = nn.Linear(n_hidden, n_hidden)
        self.linear3 = nn.Linear(n_hidden, n_features)

        self.relu = nn.ReLU()

    def forward(self, x):
        """Apply both hidden layers with ReLU, then the linear output layer"""
        hidden = x
        for hidden_layer in (self.linear1, self.linear2):
            hidden = self.relu(hidden_layer(hidden))
        # No activation on the output layer
        return self.linear3(hidden)

### Need for Activation Functions

**If layers contain only matrix multiplications, everything would be linear.**

For example, if we have an input $x$ along with 2 layers of weight matrices $A$ and $B$ then the neural network would compute the output as $A(Bx)$, which is linear (in $x$). Thus, in order to introduce some non-linearity we use activation functions.

Now the same neural network as above with an activation function $\phi$ would compute the output as $A(\phi(Bx))$.


The `ReLU` activation function used in the `NetANN` network above is just a $max(0,X)$ function. Even a function as simple as this enables a typical NN to be a nonlinear function of the inputs!

In [None]:
# ReLU keeps positive inputs unchanged and maps negative inputs to zero
x = np.linspace(-2, 2, 50)
relu_of_x = np.maximum(x, 0)

plt.figure(dpi=150)
plt.plot(x, relu_of_x)
plt.title("ReLU", fontsize=20);

In [None]:
# Initializing the 3-layer network (weights start from PyTorch's default
# random initialization)
nn_3l = NetANN()

## Training the Network

Set the Number of Epochs and the Optimizer

In [None]:
# Number of passes over the training data
n_epochs = 50
# Adam optimizer acting on the parameters of the 3-layer network
optimizer = optim.Adam(nn_3l.parameters(), lr=0.003)

Start training

In [None]:
# Train the 3-layer network and keep the per-epoch losses for plotting
train_loss, val_loss = fit_model(
    network=nn_3l,
    criterion=criterion,
    optimizer=optimizer,
    train_loader=loader,
    val_loader=loader_test,
    n_epochs=n_epochs,
)

## Visualizing Results

### Training and Validation Loss Curves

In [None]:
# Per-epoch training (blue) and validation (red) losses
loss_curves = (
    (train_loss, "b", "Training loss"),
    (val_loss, "r", "Validation loss"),
)
plt.figure(dpi=150)
for losses, line_format, curve_label in loss_curves:
    plt.plot(losses, line_format, label=curve_label)
plt.legend();

### Comparing Predictions with the Ground Truths

In [None]:
# Predict the subgrid tendencies for the whole test set
test_inputs = torch.from_numpy(X_true_test)
predictions = nn_3l(test_inputs)

# Compare column 1 of the first 1000 predictions with the true values
predicted_values = predictions.detach().numpy()[:1000, 1]
true_values = subgrid_tend_test[:1000, 1]
plt.figure(dpi=150)
plt.plot(predicted_values, label="NN Predicted values")
plt.plot(true_values, label="True values")
plt.legend(fontsize=7);

## Putting the Neural Network Parameterization back to the GCM

In [None]:
T_test = 5
# Number of forward steps shared by both runs
n_steps = int(T_test / dt)

# GCM parameterized by the global 3-layer network
gcm_net_3layers = GCM_network(forcing, nn_3l)
Xnn_3layer, t = gcm_net_3layers(init_conditions, dt, n_steps, nn_3l)

# GCM parameterized by the linear network
gcm_net_1layers = GCM_network(forcing, linear_network)
Xnn_1layer, t = gcm_net_1layers(init_conditions, dt, n_steps, linear_network)

### Compare Results

In [None]:
# Number of time levels to show and the X variable (column) to plot
time_i = 240
channel = 1

# Trajectories to compare (entries: data, extra positional plot arguments,
# legend label)
curves = (
    (X_full, (), "Full L96"),
    (Xnn_1layer, (".",), "NN 1 layer local"),
    (Xnn_3layer, (".",), "NN 3 layer global"),
)
plt.figure(dpi=150)
for trajectory, fmt_args, curve_label in curves:
    plt.plot(t[:time_i], trajectory[:time_i, channel], *fmt_args, label=curve_label)
plt.legend(fontsize=7);

### Checking over 100 Different Initial Conditions

In [None]:
err_1l, err_3l = [], []
T_test = 1
# Number of forward steps per run; also fixes how many reference states are
# compared, so the slice below stays correct if `dt` or `T_test` changes
nt_test = int(T_test / dt)

# Running a GCM does not modify the GCM object, so both are built once
# GCM parameterized by the global 3-layer network
gcm_net_3layers = GCM_network(forcing, nn_3l)
# GCM parameterized by the linear network
gcm_net_1layers = GCM_network(forcing, linear_network)

for i in range(100):
    # Initial conditions are taken every 10 stored states of the true run
    start = i * 10
    init_conditions_i = X_true[start, :]

    # True trajectory over the same nt_test + 1 time levels
    # (assumes `X_true` is stored every `dt` - TODO confirm against `L96.run`)
    X_true_i = X_true[start : start + nt_test + 1]

    Xnn_3layer_i, t = gcm_net_3layers(init_conditions_i, dt, nt_test, nn_3l)
    Xnn_1layer_i, t = gcm_net_1layers(init_conditions_i, dt, nt_test, linear_network)

    # Sum of absolute errors over all variables and time levels
    err_1l.append(np.sum(np.abs(X_true_i - Xnn_1layer_i)))
    err_3l.append(np.sum(np.abs(X_true_i - Xnn_3layer_i)))

print(f"Sum of errors for 1 layer local: {sum(err_1l):.2f}")
print(f"Sum of errors for 3 layer global: {sum(err_3l):.2f}")

## Training further to improve performance

In [None]:
# Continue training the already trained 3-layer network for another 100
# epochs, using whatever `optimizer` is currently bound to
n_epochs = 100
train_loss, val_loss = fit_model(
    network=nn_3l,
    criterion=criterion,
    optimizer=optimizer,
    train_loader=loader,
    val_loader=loader_test,
    n_epochs=n_epochs,
)

### Plotting the Training and Validation Loss Curves

In [None]:
# Per-epoch training (blue) and validation (red) losses of the extra epochs
loss_curves = (
    (train_loss, "b", "Training loss"),
    (val_loss, "r", "Validation loss"),
)
plt.figure(dpi=150)
for losses, line_format, curve_label in loss_curves:
    plt.plot(losses, line_format, label=curve_label)
plt.legend();

## Saving the Network

Let's save the weights of the trained network so that we don't have to train it again if we want to use it in future.

In [None]:
import os

# Save network
save_path = "./networks/network_3_layers_100_epoches.pt"
# `torch.save` does not create missing directories, so make sure the target
# directory exists before writing
os.makedirs(os.path.dirname(save_path), exist_ok=True)
# Only the weights (state dict) are stored, not the network class itself
torch.save(nn_3l.state_dict(), save_path)

# Regularization and Overfitting

One of the most common issues that happen while training a neural network is when the model memorizes the training dataset. It causes the model to perform very accurately on the training set but shows very poor performance on the validation set. This phenomenon is termed overfitting. One of the ways to prevent overfitting is to add regularization to our model as described below.

In [None]:
# The figure below is taken from Python Machine Learning book by Sebastian Raschka
# It is read from a local file, so `figs/overfitting.png` must exist relative
# to the working directory
Image(filename="figs/overfitting.png", width=700)

The curve on the far right of the plot above predicts perfectly on the given set, yet it's not the best choice. This is because if you were to gather some new data points, they most likely would not be on that curve. Instead, those new points would be closer to the curve in the middle graph since it generalizes better to the dataset.

All ML algorithms have some form of regularization.

## Regularization Intuition

Regularization can be thought of as **putting constraints on the model** to obtain better generalizability i.e. *avoiding remembering* the training data.

One of the ways to achieve this can be by adding a term to the loss function such that:
> Loss = Training Loss + Regularization

This puts a penalty for making the model more complex.

Very broadly speaking (just to gain intuition) - if we want to reduce the training loss (reduce bias) we should try using a more complex model (if we have enough data) and if we want to reduce overfitting (reduce variance) we should simplify or constrain the model (increase regularization).

## Regularization of Neural Networks

Some of the ways to add regularization in neural networks are

- Dropout (added in the definition of the network). 
- Early stopping
- Weight decay (added in the optimizer part - see `optim.Adam` in PyTorch)
- Data augmentation (usually for images)

### Weight decay (L2 norm)

Weight decay is usually defined as a term that’s added directly to the update rule.
Namely, to update a certain weight $w$ in the $i+1$ iteration, we would use a modified rule:

$w_{i+1} = w_{i} - \gamma ( \frac{\partial L}{\partial w} + A w_{i})$

In practice, this is almost identical to L2 regularization, though there is some difference (e.g., see [here](https://bbabenko.github.io/weight-decay/))

Weight decay is one of the parameters of the optimizer - see `torch.optim.SGD`

#### Using Weight Decay

Now we try to train our `NetANN` model again but this time by adding a weight decay to it.

In [None]:
# Fresh, untrained copy of the 3-layer architecture
nn_3l_decay = NetANN()

# Number of passes over the training data
n_epochs = 10
# Same optimizer type and learning rate as for `nn_3l`, now with weight decay
optimizer = optim.Adam(nn_3l_decay.parameters(), lr=0.003, weight_decay=0.1)

In [None]:
# Train the weight-decay network and keep the per-epoch losses for plotting
train_loss, val_loss = fit_model(
    network=nn_3l_decay,
    criterion=criterion,
    optimizer=optimizer,
    train_loader=loader,
    val_loader=loader_test,
    n_epochs=n_epochs,
)

Plotting the training and validation loss curves

In [None]:
# Per-epoch training (blue) and validation (red) losses with weight decay
loss_curves = (
    (train_loss, "b", "Training loss"),
    (val_loss, "r", "Validation loss"),
)
plt.figure(dpi=150)
for losses, line_format, curve_label in loss_curves:
    plt.plot(losses, line_format, label=curve_label)
plt.legend();

### Dropout

Dropout means randomly deactivating or temporarily removing some units from a layer of the network while training, along with all its incoming and outgoing connections. See more details [here](http://jmlr.org/papers/v15/srivastava14a.html).
It is usually the most useful regularization that we can do in fully connected layers.

In convolutional layers dropout makes less sense - see more discussion [here](https://www.kdnuggets.com/2018/09/dropout-convolutional-networks.html)


In [None]:
# Image taken from: http://www.jmlr.org/papers/volume15/srivastava14a/srivastava14a.pdf
# It is read from a local file, so `figs/Dropout_layer.png` must exist relative
# to the working directory
Image(filename="figs/Dropout_layer.png", width=700)

In the network defined below, we add dropout with a default probability of 20% after each hidden layer. This means that during each training step, a random 20% of the units within each hidden layer will be deactivated.

In [None]:
class NetANNDropout(nn.Module):
    """Fully connected network (8 -> 16 -> 16 -> 8) with dropout after each hidden layer

    Args:
        dropout_rate: Probability with which a hidden unit is dropped
    """

    def __init__(self, dropout_rate=0.2):
        super().__init__()
        n_features, n_hidden = 8, 16
        self.linear1 = nn.Linear(n_features, n_hidden)
        self.linear2 = nn.Linear(n_hidden, n_hidden)
        self.linear3 = nn.Linear(n_hidden, n_features)

        self.dropout = nn.Dropout(p=dropout_rate)  # Dropout regularization
        self.relu = nn.ReLU()

    def forward(self, x):
        """Apply ReLU and dropout after each hidden layer, then the output layer"""
        for hidden_layer in (self.linear1, self.linear2):
            x = self.dropout(self.relu(hidden_layer(x)))
        # Neither activation nor dropout on the output layer
        return self.linear3(x)

In [None]:
# Network with very high dropout (80% instead of the class default of 20%)
nn_3l_drop = NetANNDropout(dropout_rate=0.8)

# Number of passes over the training data
n_epochs = 10
# Adam optimizer acting on the parameters of the dropout network
optimizer = optim.Adam(nn_3l_drop.parameters(), lr=0.01)

In [None]:
# Train the dropout network and keep the per-epoch losses for plotting
train_loss, val_loss = fit_model(
    network=nn_3l_drop,
    criterion=criterion,
    optimizer=optimizer,
    train_loader=loader,
    val_loader=loader_test,
    n_epochs=n_epochs,
)

Plotting the training and validation loss curves.

In [None]:
# Per-epoch training (blue) and validation (red) losses with dropout
loss_curves = (
    (train_loss, "b", "Training loss"),
    (val_loss, "r", "Validation loss"),
)
plt.figure(dpi=150)
for losses, line_format, curve_label in loss_curves:
    plt.plot(losses, line_format, label=curve_label)
plt.legend();

# Choosing the Learning Rate

While training a neural network **selecting a good learning rate (LR) is essential for both fast convergence and a lower error**. A high learning rate can cause the training loss to never converge while a too small learning rate will cause the model to converge extremely slowly.

## Finding the Optimal Learning Rate

To choose the optimal learning rate for our network, we can use an LR finding algorithm. The objective of a LR Finder is to find the highest LR which still minimises the loss and does not make the loss diverge/explode. This is done by first starting with an extremely small LR and then increasing the LR after each batch until the corresponding loss starts to explode. To read more about learning rate finders, read [this blog](https://towardsdatascience.com/speeding-up-neural-net-training-with-lr-finder-c3b401a116d0).

For our use case, we use the LR finder from the `torch-lr-finder` package to find the best learning rate for our neural network.

In [None]:
from torch_lr_finder import LRFinder

Define the model and the optimizer. The optimizer is **initialized with a very small learning rate**.

In [None]:
# Fresh, untrained network for the learning rate search
nn_3l_lr = NetANN()
# The very small learning rate given here is the starting point of the search
optimizer = optim.Adam(nn_3l_lr.parameters(), lr=1e-7)

Now we set up the LR finder and make it run for 200 iterations during which the learning rate varies from 1e-7 to 100.

In [None]:
# The finder works on the network, optimizer and loss function defined above
lr_finder = LRFinder(nn_3l_lr, optimizer, criterion)
# Range test on the training loader: 200 iterations, ending at a learning rate of 100
lr_finder.range_test(loader, end_lr=100, num_iter=200)

Now we plot the LR vs the loss curve to find the best learning rate.

In [None]:
# Plot the lr vs the loss curve recorded during the range test
lr_finder.plot()

# Reset the model and optimizer to their initial state, so that the training
# below starts from the untrained network again
lr_finder.reset()

From the curve, we see that at a learning rate of approximately 0.01 we get the steepest gradient. So we choose 0.01 as the learning rate for our neural network.

In [None]:
# Train with the learning rate suggested by the LR finder
n_epochs = 20
optimizer = optim.Adam(nn_3l_lr.parameters(), lr=0.01)

train_loss, val_loss = fit_model(
    network=nn_3l_lr,
    criterion=criterion,
    optimizer=optimizer,
    train_loader=loader,
    val_loader=loader_test,
    n_epochs=n_epochs,
)

Plotting the training and validation loss curves.

In [None]:
# Per-epoch training (blue) and validation (red) losses at the chosen learning rate
loss_curves = (
    (train_loss, "b", "Training loss"),
    (val_loss, "r", "Validation loss"),
)
plt.figure(dpi=150)
for losses, line_format, curve_label in loss_curves:
    plt.plot(losses, line_format, label=curve_label)
plt.legend();

From the loss curves we can see that **the loss has converged much faster** than before.

# Recommended Reading

## BatchNormalization 

Batch normalization normalizes the activation values so that the hidden representations don’t vary drastically, which also helps to improve the training speed.

## Cyclic learning rate

The cyclic learning rate policy, introduced in [Cyclical Learning Rates for Training Neural Networks](https://arxiv.org/abs/1506.01186), cycles the learning rate between two boundaries with a constant frequency in a triangular fashion. To read more about the cyclic learning rates and the one cycle policy, read [here](https://sgugger.github.io/the-1cycle-policy.html).

In PyTorch, cyclic learning rate can be used from `optim.lr_scheduler.CyclicLR`.

In [None]:
# Image taken from - https://pyimagesearch.com/2019/07/29/cyclical-learning-rates-with-keras-and-deep-learning/
# It is read from a local file, so `figs/cyclic_lr.png` must exist relative
# to the working directory
Image(filename="figs/cyclic_lr.png", width=500)