# Using Neural Networks for L96 Parameterization

In this notebook, we'll build upon the concepts learned in the [previous notebook](https://m2lines.github.io/L96_demo/notebooks/Universal_approximation.html) by using deep neural networks for Lorenz 96 parameterization.

In [None]:
%matplotlib inline
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import torch
from torch import nn, optim
from torch.utils.data import TensorDataset, DataLoader

from L96_model import L96, RK4, L96_eq1_xdot

In [None]:
# Ensuring reproducibility: seed the global random number generators of
# PyTorch and NumPy with one fixed value
SEED = 14
torch.manual_seed(SEED)
np.random.seed(SEED)

## Setting up the Dataset, Network, and the Training Code

First, we set up all the code required to build the dataset, create our linear network, and train the network on the dataset.

```{note}
The dataset, the linear network, and the training and evaluation functions that we use in this notebook are the same as the ones defined in [Introduction to Neural Networks](https://m2lines.github.io/L96_demo/notebooks/Universal_approximation.html).
```

### Create the Dataset

In [None]:
# Generating the Ground Truth
# ---------------------------

# Run settings: number of time steps, forcing, time increment and total time
time_steps = 20000
forcing = 18
dt = 0.01
T = dt * time_steps

W = L96(8, 32, F=forcing)

# Keep the X history and the coupling term of the run, both cast to float32
# (the default dtype of the PyTorch layers they are fed to later on)
X_true, _, _, xy_true = W.run(dt, T, store=True, return_coupling=True)
X_true = X_true.astype(np.float32)
xy_true = xy_true.astype(np.float32)


# Splitting the Data into Training and Test Datasets
# --------------------------------------------------

val_size = 4000  # number of trailing time samples that are held out

# Training Data: everything except the last `val_size` samples
X_true_train, subgrid_tend_train = X_true[:-val_size], xy_true[:-val_size]

# Test Data: the last `val_size` samples
X_true_test, subgrid_tend_test = X_true[-val_size:], xy_true[-val_size:]


# Building the Dataset and the Dataloaders
# ----------------------------------------
# The arrays are flattened, so every scalar entry of X (paired with the
# matching entry of the subgrid tendency) is a sample of its own.

BATCH_SIZE = 1024

# Training Data
local_data_train = TensorDataset(
    torch.from_numpy(X_true_train.reshape(-1)),
    torch.from_numpy(subgrid_tend_train.reshape(-1)),
)
local_loader_train = DataLoader(local_data_train, batch_size=BATCH_SIZE, shuffle=True)


# Test Data
local_data_test = TensorDataset(
    torch.from_numpy(X_true_test.reshape(-1)),
    torch.from_numpy(subgrid_tend_test.reshape(-1)),
)
local_loader_test = DataLoader(local_data_test, batch_size=BATCH_SIZE, shuffle=True)

### Define Functions to Train and Evaluate Neural Networks

In [None]:
def train_model(network, criterion, loader, optimizer):
    """Train the network for one epoch.

    Args:
        network: ``torch.nn.Module`` to train; it is switched to training mode.
        criterion: Loss function called as ``criterion(prediction, target)``.
        loader: Iterable yielding ``(batch_x, batch_y)`` tensor pairs.
        optimizer: Optimizer that updates the parameters of ``network``.

    Returns:
        Sum of the per-batch losses divided by ``len(loader)``.
    """
    network.train()

    train_loss = 0
    for batch_x, batch_y in loader:
        # Get predictions
        if batch_x.dim() == 1:
            # 1D inputs hold one scalar sample per entry, so a dummy feature
            # dimension is added for the network and removed again afterwards.
            # Only that last dimension is squeezed: a bare `squeeze()` would
            # also drop the batch dimension of a single-sample batch, leaving
            # a prediction whose shape no longer matches `batch_y`.
            prediction = network(batch_x.unsqueeze(1)).squeeze(-1)
        else:
            prediction = network(batch_x)

        # Compute the loss
        loss = criterion(prediction, batch_y)
        train_loss += loss.item()

        # Clear the gradients
        optimizer.zero_grad()

        # Backpropagation to compute the gradients and update the weights
        loss.backward()
        optimizer.step()

    return train_loss / len(loader)


def test_model(network, criterion, loader):
    """Test the network"""
    network.eval()  # Evaluation mode (important when having dropout layers)

    test_loss = 0
    with torch.no_grad():
        for batch_x, batch_y in loader:
            # Get predictions
            if len(batch_x.shape) == 1:
                # This if block is needed to add a dummy dimension if our inputs are 1D
                # (where each number is a different sample)
                prediction = torch.squeeze(network(torch.unsqueeze(batch_x, 1)))
            else:
                prediction = network(batch_x)

            # Compute the loss
            loss = criterion(prediction, batch_y)
            test_loss += loss.item()

        # Get an average loss for the entire dataset
        test_loss /= len(loader)

    return test_loss


def fit_model(network, criterion, optimizer, train_loader, val_loader, n_epochs):
    """Run `n_epochs` epochs of training, validating after every epoch.

    Prints the elapsed wall-clock time (in whole seconds) once all epochs are done.

    Returns:
        Two lists holding the training loss and the validation loss of each epoch.
    """
    history = {"train": [], "val": []}
    tic = time.time()
    for _ in range(n_epochs):
        history["train"].append(
            train_model(network, criterion, train_loader, optimizer)
        )
        history["val"].append(test_model(network, criterion, val_loader))
    elapsed = time.time() - tic
    print(f"Training completed in {int(elapsed)} seconds.")

    return history["train"], history["val"]

### Create and Train a Linear Network

In [None]:
# Define the Network Class
# ------------------------
class LinearRegression(nn.Module):
    """Network made of one linear layer with a single input and a single output."""

    def __init__(self):
        super().__init__()
        self.linear1 = nn.Linear(in_features=1, out_features=1)

    def forward(self, x):
        """Apply the linear layer to `x`.

        This method is executed automatically when an object of this class
        is called.
        """
        return self.linear1(x)


# Initialize the network
# ----------------------
linear_network = LinearRegression()


# Train the linear model
# ----------------------
# Mean-squared-error loss minimized with Adam for 3 epochs
n_epochs = 3
criterion = nn.MSELoss()
optimizer = optim.Adam(params=linear_network.parameters(), lr=0.03)
_, _ = fit_model(
    network=linear_network,
    criterion=criterion,
    optimizer=optimizer,
    train_loader=local_loader_train,
    val_loader=local_loader_test,
    n_epochs=n_epochs,
)


# Get the weights of the model
# ----------------------------
# Stored as [weight, bias] of the single linear layer
linear_layer = linear_network.linear1
linear_network_weights = np.array(
    [linear_layer.weight.data.numpy()[0, 0], linear_layer.bias.data.numpy()[0]]
)

## Adding Simple Linear Parameterization to GCM

In all the GCMs, we set time stepping using the fourth order Runge-Kutta method.

To recap, {cite}`Lorenz1995` describes a two-time scale dynamical system using two equations which are:

\begin{align*}
\frac{d}{dt} X_k
&= - X_{k-1} \left( X_{k-2} - X_{k+1} \right) - X_k + F - \left( \frac{hc}{b} \right) \sum_{j=0}^{J-1} Y_{j,k}
\end{align*}

\begin{align*}
\frac{d}{dt} Y_{j,k}
&= - cbY_{j+1,k} \left( Y_{j+2,k} - Y_{j-1,k} \right) - c Y_{j,k} + \frac{hc}{b} X_k
\end{align*}

```{note}
All the GCM networks used in this notebook have been introduced earlier in notebooks [Key aspects of GCMs parameterizations](https://m2lines.github.io/L96_demo/notebooks/gcm-parameterization-problem.html) and [Tuning GCM Parameterizations](https://m2lines.github.io/L96_demo/notebooks/estimating-gcm-parameters.html). To see the definition of those networks, expand the cells in the respective GCM sections below.
```

In [None]:
T_test = 10

# Every GCM below is started from the last stored state of the training run
init_conditions = X_true[-1, :]

# Full L96 model, used as the reference
# NOTE(review): presumably continues from the state kept by the earlier
# `store=True` run — confirm in L96_model
X_full, _, _ = W.run(dt, T_test)
X_full = X_full.astype(np.float32)

### GCM *Without* Neural Network Parameterization

In [None]:
class GCM_without_parameterization:
    """GCM whose tendency is `L96_eq1_xdot` alone, with no closure term

    Args:
        F: Forcing term
        time_stepping: Time stepping method
    """

    def __init__(self, F, time_stepping=RK4):
        self.time_stepping = time_stepping
        self.F = F

    def rhs(self, X, _):
        """Compute right hand side of the GCM equations

        The second argument (the closure parameters handed to the time
        stepper) is ignored.
        """
        tendency = L96_eq1_xdot(X, self.F)
        return tendency

    def __call__(self, X0, dt, nt, param=[0]):
        """Run GCM

        Args:
            X0: Initial conditions of X
            dt: Time increment
            nt: Number of forward steps to take
            param: Parameters of closure

        Returns:
            Model output for all variables of X at each timestep
            along with the corresponding time units
        """
        n_points = nt + 1
        time = dt * np.arange(n_points)
        # NaN-filled history, so rows that were never written stand out
        hist = np.full((n_points, len(X0)), np.nan)
        X = X0.copy()
        hist[0] = X

        for step in range(1, n_points):
            X = self.time_stepping(self.rhs, dt, X, param)
            hist[step] = X
            time[step] = dt * step
        return hist, time

In [None]:
# Integrate the unparameterized GCM over the test window
gcm_no_param = GCM_without_parameterization(F=forcing)
X_no_param, t = gcm_no_param(X0=init_conditions, dt=dt, nt=int(T_test / dt))

### GCM Linear Parameterization in RHS of Equation for Tendency

In [None]:
class GCM_linear_parameterization:
    """GCM whose tendency is `L96_eq1_xdot` minus a parameterization term

    Args:
        F: Forcing term
        parameterization: Parameterization function, called as
            `parameterization(param, X)`
        time_stepping: Time stepping method
    """

    def __init__(self, F, parameterization, time_stepping=RK4):
        self.time_stepping = time_stepping
        self.parameterization = parameterization
        self.F = F

    def rhs(self, X, param):
        """Compute right hand side of the GCM equations"""
        resolved = L96_eq1_xdot(X, self.F)
        closure = self.parameterization(param, X)
        # The closure is subtracted from the resolved tendency
        return resolved - closure

    def __call__(self, X0, dt, nt, param=[0]):
        """Run GCM

        Args:
            X0: Initial conditions of X
            dt: Time increment
            nt: Number of forward steps to take
            param: Parameters of closure

        Returns:
            Model output for all variables of X at each timestep
            along with the corresponding time units
        """
        n_points = nt + 1
        time = dt * np.arange(n_points)
        # NaN-filled history, so rows that were never written stand out
        hist = np.full((n_points, len(X0)), np.nan)
        X = X0.copy()
        hist[0] = X

        for step in range(1, n_points):
            X = self.time_stepping(self.rhs, dt, X, param)
            hist[step] = X
            time[step] = dt * step
        return hist, time

In [None]:
def naive_parameterization(param, X):
    """Evaluate the polynomial with coefficients `param` at `X`."""
    return np.polyval(param, X)


gcm = GCM_linear_parameterization(forcing, naive_parameterization)
# `rhs` subtracts the parameterization, so the fitted weights are negated in
# order to add the learned tendency. The run covers the test window `T_test`,
# like the other GCM runs it is compared with, instead of the full length `T`
# of the training run.
X_param, t = gcm(init_conditions, dt, int(T_test / dt), param=-linear_network_weights)

### GCM *With* Neural Network Parameterization

In [None]:
class GCM_network:
    """GCM with neural network parameterization

    Args:
        F: Forcing term
        network: Neural network; it must expose its first layer as `linear1`
        time_stepping: Time stepping method
    """

    def __init__(self, F, network, time_stepping=RK4):
        self.F = F
        self.network = network
        self.time_stepping = time_stepping

    def rhs(self, X, _):
        """Compute right hand side of the GCM equations

        Args:
            X: Current state as a NumPy array (expected to be 1D)
            _: Closure parameters; ignored, the closure is the network itself

        Returns:
            `L96_eq1_xdot(X, F)` plus the squeezed output of the network
        """
        first_layer = self.network.linear1
        # Cast to the dtype of the network weights, so that a float64 state
        # (NumPy's default) does not fail with a dtype mismatch in the layers
        X_torch = torch.from_numpy(X).to(first_layer.weight.dtype)
        if first_layer.in_features == 1:
            # Single-input network: every entry of X is a separate sample
            X_torch = torch.unsqueeze(X_torch, 1)
        else:
            # Otherwise the whole state vector is one sample
            X_torch = torch.unsqueeze(X_torch, 0)

        # Inference only: do not record an autograd graph at every stage
        with torch.no_grad():
            correction = self.network(X_torch).numpy()

        # Adding NN parameterization
        return L96_eq1_xdot(X, self.F) + np.squeeze(correction)

    def __call__(self, X0, dt, nt, param=[0]):
        """Run GCM

        Args:
            X0: Initial conditions of X
            dt: Time increment
            nt: Number of forward steps to take
            param: Parameters of closure (passed to the time stepper; `rhs`
                ignores them)

        Returns:
            Model output for all variables of X at each timestep
            along with the corresponding time units
        """
        time, hist, X = (
            dt * np.arange(nt + 1),
            np.full((nt + 1, len(X0)), np.nan),
            X0.copy(),
        )
        hist[0] = X

        for n in range(nt):
            X = self.time_stepping(self.rhs, dt, X, param)
            hist[n + 1], time[n + 1] = X, dt * (n + 1)
        return hist, time

In [None]:
# GCM whose closure is the trained linear network; the `param` argument is
# ignored by `GCM_network.rhs`
gcm_net = GCM_network(F=forcing, network=linear_network)
Xnn_1layer, t = gcm_net(
    X0=init_conditions, dt=dt, nt=int(T_test / dt), param=linear_network
)

### Comparing Results

Comparing the predictions of GCM with different parameterizations.

In [None]:
time_i = 200  # number of leading time samples that are shown
shown = slice(None, time_i)

# Column 4 of X for every model run, over the same time window
fig, ax = plt.subplots(dpi=150)
ax.plot(t[shown], X_full[shown, 4], label="Full L96")
ax.plot(t[shown], Xnn_1layer[shown, 4], ".", label="NN 1 layer")
ax.plot(t[shown], X_no_param[shown, 4], label="No parameterization")
ax.plot(t[shown], X_param[shown, 4], label="linear parameterization")
ax.legend(loc="upper left", fontsize=7);

## Using Deeper Networks for Lorenz 96 (with Non-Local Features)

Now we'll increase the complexity of our neural network by adding a few more linear layers to it.

### Create Non-Local Training and Test Dataset

We first start by generating the dataset which has *8 inputs* and *8 outputs*.

In [None]:
# Training Dataset
# ----------------
# Unlike the local dataset, the arrays are not flattened: every sample is a
# complete row of X paired with the matching row of subgrid tendencies
nlocal_data_train = TensorDataset(
    *(torch.from_numpy(array) for array in (X_true_train, subgrid_tend_train))
)
loader_train = DataLoader(nlocal_data_train, batch_size=BATCH_SIZE, shuffle=True)


# Test Dataset
# ------------
nlocal_data_test = TensorDataset(
    *(torch.from_numpy(array) for array in (X_true_test, subgrid_tend_test))
)
loader_test = DataLoader(nlocal_data_test, batch_size=BATCH_SIZE, shuffle=True)

### Creating a 3 layer Neural Network with ReLU Activation

We now build a 3 layer neural network consisting of two hidden layers and an output layer. This time we use an activation function called `ReLU` (a very common choice) after every hidden layer.

In [None]:
class NetANN(nn.Module):
    """Fully-connected network: 8 inputs -> 16 -> 16 -> 8 outputs.

    ReLU is applied after each of the two hidden layers; the output layer
    is left linear.
    """

    def __init__(self):
        super().__init__()
        self.linear1 = nn.Linear(in_features=8, out_features=16)  # 8 inputs
        self.linear2 = nn.Linear(in_features=16, out_features=16)
        self.linear3 = nn.Linear(in_features=16, out_features=8)  # 8 outputs

        self.relu = nn.ReLU()

    def forward(self, x):
        """Pass `x` through both hidden layers and the linear output layer."""
        hidden = x
        for layer in (self.linear1, self.linear2):
            hidden = self.relu(layer(hidden))
        return self.linear3(hidden)

```{admonition} Need for Activation Functions

**If a neural network contains only fully-connected layers (matrix multiplications), the whole network is linear.**

For example, if we have an input $x$ along with 2 layers of weight matrices $A$ and $B$ then the neural network would compute the output as $A(Bx)$, which is linear (in $x$). Thus, in order to introduce some non-linearity we use activation functions.

Now the same neural network as above with an activation function $\phi$ would compute the output as $A(\phi(Bx))$.
```

`````{admonition} ReLU Activation Function

```{figure} figs/relu_activation_function.png
:name: relu-activation
```

The `ReLU` activation function is just the function $\max(0, x)$ (the image is taken from [this blog](https://analyticsindiamag.com/most-common-activation-functions-in-neural-networks-and-rationale-behind-it/)). Even a function as simple as this enables a typical NN to be a nonlinear function of its inputs!
`````

### Training the 3-Layer Network

In [None]:
# Initialize the model
nn_3l = NetANN()

# Setup the optimizer and loss: Adam on a mean-squared-error loss for 50 epochs
n_epochs = 50
optimizer = optim.Adam(params=nn_3l.parameters(), lr=0.003)
criterion = nn.MSELoss()

# Train the model
train_loss, val_loss = fit_model(
    network=nn_3l,
    criterion=criterion,
    optimizer=optimizer,
    train_loader=loader_train,
    val_loader=loader_test,
    n_epochs=n_epochs,
)

### Visualizing Results

#### Training and Validation Loss Curves

In [None]:
# Per-epoch training (blue) and validation (red) losses of the 3-layer network
fig, ax = plt.subplots(dpi=150)
ax.plot(train_loss, "b", label="Training loss")
ax.plot(val_loss, "r", label="Validation loss")
ax.legend();

#### Comparing Predictions with the Ground Truths

In [None]:
# Network output for every test input; column 1 of the first 1000 samples is
# drawn on top of the true subgrid tendencies
predictions = nn_3l(torch.from_numpy(X_true_test))
predicted_values = predictions.detach().numpy()

fig, ax = plt.subplots(dpi=150)
ax.plot(predicted_values[:1000, 1], label="NN Predicted values")
ax.plot(subgrid_tend_test[:1000, 1], label="True values")
ax.legend(fontsize=7);

## Adding Deep Neural Network Parameterization to GCM

In [None]:
T_test = 5

# GCM parameterized by the global 3-layer network; the `param` argument is
# ignored by `GCM_network.rhs`
gcm_net_3layers = GCM_network(F=forcing, network=nn_3l)
Xnn_3layer, t = gcm_net_3layers(
    X0=init_conditions, dt=dt, nt=int(T_test / dt), param=nn_3l
)

### Comparing Results with Linear Network Parameterized GCM

In [None]:
# GCM parameterized by the linear network
gcm_net_1layers = GCM_network(forcing, linear_network)
Xnn_1layer, t = gcm_net_1layers(init_conditions, dt, int(T_test / dt), linear_network)

In [None]:
time_i = 240  # number of leading time samples that are shown
channel = 1  # column of X that is plotted
shown = slice(None, time_i)

fig, ax = plt.subplots(dpi=150)
ax.plot(t[shown], X_full[shown, channel], label="Full L96")
ax.plot(t[shown], Xnn_1layer[shown, channel], ".", label="NN 1 layer local")
ax.plot(t[shown], Xnn_3layer[shown, channel], ".", label="NN 3 layer global")
ax.legend(fontsize=7);

### Checking over 100 Different Initial Conditions

In [None]:
err_1l, err_3l = [], []
T_test = 1
# Number of forward steps of every rollout. The reference slice of `X_true`
# below is sized from the same value, so the two stay consistent if `dt` or
# `T_test` changes.
n_steps = int(T_test / dt)
stride = 10  # offset, in rows of X_true, between consecutive initial conditions

# The GCMs do not depend on the initial condition, so they are built only once
# GCM parameterized by the global 3-layer network
gcm_net_3layers = GCM_network(forcing, nn_3l)
# GCM parameterized by the linear network
gcm_net_1layers = GCM_network(forcing, linear_network)

for i in range(100):
    start = i * stride
    init_conditions_i = X_true[start, :]
    # Reference rows for the n_steps + 1 rollout times
    # (assumes X_true holds one row per `dt` — verify against W.run)
    X_ref = X_true[start : start + n_steps + 1]

    Xnn_3layer_i, t = gcm_net_3layers(init_conditions_i, dt, n_steps, nn_3l)
    Xnn_1layer_i, t = gcm_net_1layers(init_conditions_i, dt, n_steps, linear_network)

    # Absolute error summed over all times and all components of X
    err_1l.append(np.sum(np.abs(X_ref - Xnn_1layer_i)))
    err_3l.append(np.sum(np.abs(X_ref - Xnn_3layer_i)))

print(f"Sum of errors for 1 layer local: {sum(err_1l):.2f}")
print(f"Sum of errors for 3 layer global: {sum(err_3l):.2f}")

## Training the Model Further to Improve Performance

In [None]:
# Continue training the same network with the same optimizer for 100 more
# epochs; the loss histories are replaced by those of this second run
n_epochs = 100
train_loss, val_loss = fit_model(
    network=nn_3l,
    criterion=criterion,
    optimizer=optimizer,
    train_loader=loader_train,
    val_loader=loader_test,
    n_epochs=n_epochs,
)

### Plotting the Training and Validation Loss Curves

In [None]:
# Per-epoch training (blue) and validation (red) losses of the additional epochs
fig, ax = plt.subplots(dpi=150)
ax.plot(train_loss, "b", label="Training loss")
ax.plot(val_loss, "r", label="Validation loss")
ax.legend();

### Saving the Network

Let's save the weights of the trained network so that we don't have to train it again if we want to use it in the future.

In [None]:
# Save network
# NOTE: at this point the network has been trained for 50 + 100 epochs in
# total, although the file name only mentions 100.
save_path = "./networks/network_3_layers_100_epoches.pt"
# torch.save does not create missing parent directories, so make sure the
# target directory exists before writing
os.makedirs(os.path.dirname(save_path), exist_ok=True)
torch.save(nn_3l.state_dict(), save_path)