In [1]:
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim

import application_examples.helpers.training as train
import timeit

from torchdiffeq import odeint_adjoint
from torchdiffeq import odeint

In [2]:
device = torch.device('cuda:'+str(0) if torch.cuda.is_available() else 'cpu')
print("Using device:", device)

Using device: cpu


# Comparison between the use of the continuous adjoint method and the discrete-then-optimse method
In this notebook we will look at the difference in time when training a model with or without the adjoint method. We will use a simple system to illustrate this, keep in mind that this is not a general result.

Apart from the optimisation algorithm, all other parameters will remain the same.
## Hyperparameters

In [3]:
args = {
    'method': 'dopri5',
    'data_size': 5000,
    'batch_time': 20,
    'batch_size': 2500,
    'niters': 100000,
    'test_freq': 1000,
    'terminal_time': 25.,
    'learning_rate': 1e-4,
    'eps': 1e-2,
    'tol': 1
}

## Data and real system

In [4]:
t = torch.linspace(0., args['terminal_time'], args['data_size'])


In [5]:
true_y0 = torch.tensor([[2., 0.]])
true_A = torch.tensor([[-0.1, 3.0], [-3.0, -0.1]])

In [6]:
true_vector_field = lambda t, y: torch.mm(y, true_A)
with torch.no_grad():
    true_y = odeint(true_vector_field, true_y0, t, method='dopri5')

## Model

In [7]:
# Define a Neural Network to learn the vector field
class ODEnnR(nn.Module):

    def __init__(self):
        super(ODEnnR, self).__init__()

        self.net = nn.Sequential(
            # ODE-Net's neural network architecture: 1 hidden layer with 50 neurons, tanh activation, 2 output neurons
            nn.Linear(2, 50),
            nn.ReLU(),
            nn.Linear(50, 2),
        )

        # Initialize the weights and biases of the network for better convergence
        for m in self.net.modules():
            if isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, mean=0, std=0.1)
                nn.init.constant_(m.bias, val=0)

    # Define the forward pass of the network
    def forward(self, t, y):
        return self.net(y)
    
func = ODEnnR()

In [8]:
# Reset parameters
def reset_parameters(model):
    for layer in model.children():
        if hasattr(layer, 'reset_parameters'):
            layer.reset_parameters()

## Adjoint

In [9]:
optimizer = optim.RMSprop(func.parameters(), lr=args['learning_rate'])

# Time training
time = timeit.timeit(train.train_model(func, optimizer, true_y0, true_y, t, args, adjoint=True), number=10)
print("Training with adjoint has lasted {}s".format(time))

Iter 0000 | Total Loss 111.620453
Iter 1000 | Total Loss 104.259430
Iter 2000 | Total Loss 93.061569


KeyboardInterrupt: 

## No adjoint

In [12]:
optimizer = optim.RMSprop(func.parameters(), lr=args['learning_rate'])

reset_parameters(func)

# Time training
time = timeit.timeit(train.train_model(func, optimizer, true_y0, true_y, t, args, adjoint=False), number=10)
print("Training with adjoint has lasted {}s".format(time))

Iter 0000 | Total Loss 0.405955


KeyboardInterrupt: 