In [9]:
import sys
sys.path.append("../")

import numpy as np
import torch
import torch.utils.data
import torch.nn as nn
import torch.optim as optim

import torchdiffeq

from tensorboard_utils import Tensorboard
from tensorboard_utils import tensorboard_event_accumulator

import transformer.Constants as Constants
from transformer.Layers import EncoderLayer, DecoderLayer
from transformer.Modules import ScaledDotProductAttention
from transformer.Models import Decoder, get_attn_key_pad_mask, get_non_pad_mask, get_sinusoid_encoding_table
from transformer.SubLayers import PositionwiseFeedForward

import dataset

import model_process
import checkpoints
from node_transformer import NodeTransformer

import matplotlib
import numpy as np
import matplotlib.pyplot as plt
#%matplotlib notebook  
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

print("Torch Version", torch.__version__)

%load_ext autoreload
%autoreload 2

Torch Version 1.1.0
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [10]:
# Seed PyTorch's RNG so repeated runs start from the same random state.
seed = 1
torch.manual_seed(seed)

# Prefer the GPU, but fall back to the CPU when CUDA is not available so the
# later `.to(device)` calls do not fail on a machine without a GPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("device", device)

device cuda


In [11]:
# Serialized Multi30k data file (absolute, machine-specific location).
multi30k_path = "/home/mandubian/datasets/multi30k/multi30k.atok.low.pt"
data = torch.load(multi30k_path)

In [12]:
# Read `max_token_seq_len` from the settings stored with the data and show it.
settings = data['settings']
max_token_seq_len = settings.max_token_seq_len
print(max_token_seq_len)

52


In [13]:
# Build the training and validation data loaders with a batch size of 128.
loaders = dataset.prepare_dataloaders(data, batch_size=128)
train_loader, val_loader = loaders

### Create an experiment with a name and a unique ID

In [6]:
# Experiment name and run identifier. Both are passed to the Tensorboard
# logger, to the checkpoint restore call and to `model_process.train`.
exp_name = "node_transformer_dopri5_multi30k"
unique_id = "2019-06-15_0830"

# ---------------------------------------------------------------------------
# Notes on earlier runs, kept for reference (one entry per former unique_id).
# ---------------------------------------------------------------------------

# unique_id = "2019-06-10_1300"
# node-decoder only
# d_word_vec=128, d_model=128, d_inner=512,
# n_head=4, method='dopri5-ext', rtol=1e-2, atol=1e-2,
# batch 128
# rtol=1e-2, atol=1e-2
# lr=1e-5
# dopri5 6 (0-10) then 12

# unique_id = "2019-06-11_0000"
# node-decoder only
# d_word_vec=256, d_model=256, d_inner=1024,
# n_head=4, method='dopri5-ext', rtol=1e-2, atol=1e-2,
# batch 128
# rtol=1e-2, atol=1e-2
# lr=1e-5
# dopri5 2 then 10

# unique_id = "2019-06-12_2300"
# node-decoder only

# unique_id = "2019-06-15_0100"
# node-encoder + node-decoder
# catastrophic forgetting
# d_word_vec=256, d_model=256, d_inner=1024,
# n_head=4, method='dopri5-ext', rtol=1e-2, atol=1e-2,
# Adam(model.parameters(), lr=1e-3, betas=(0.9, 0.995), eps=1e-9)


### Create Model

In [7]:
# Define `model` up front so the `if model:` check in the model-creation cell
# has a name to test the first time that cell runs.
model = None

In [8]:
# NOTE(review): star import kept as-is; which names (or import side effects)
# of `odeint_ext_adams` the notebook relies on is not visible from this cell.
from odeint_ext_adams import *

# Vocabulary sizes are read from the training dataset.
src_vocab_sz = train_loader.dataset.src_vocab_size
print("src_vocab_sz", src_vocab_sz)
tgt_vocab_sz = train_loader.dataset.tgt_vocab_size
print("tgt_vocab_sz", tgt_vocab_sz)

# Release a model left over from a previous run of this cell before
# building a new one.
if model:
    del model

# Source and target sides are both sized to the larger of the two vocabularies.
shared_vocab_sz = max(src_vocab_sz, tgt_vocab_sz)

model = NodeTransformer(
    n_src_vocab=shared_vocab_sz,
    n_tgt_vocab=shared_vocab_sz,
    len_max_seq=max_token_seq_len,
    # Options left disabled:
    #emb_src_tgt_weight_sharing=False,
    #d_word_vec=256, d_model=256, d_inner=1024,
    n_head=8,
    method='dopri5-ext',
    rtol=1e-2,
    atol=1e-2,
    has_node_encoder=True,
    has_node_decoder=True,
)

# Move the freshly built model to the selected device.
model = model.to(device)

src_vocab_sz 9795
tgt_vocab_sz 17989


### Create Tensorboard metrics logger

In [40]:
# Metrics logger for this experiment/run pair, writing under ../runs.
tb = Tensorboard(
    exp_name,
    unique_name=unique_id,
    output_dir="../runs",
)

Writing TensorBoard events locally to ../runs/node_transformer_dopri5_multi30k_2019-06-15_0830


### Create basic optimizer

In [None]:
# SGD with momentum is the optimizer in use; the Adam configuration below is
# kept, disabled, for reference.
#optimizer = optim.Adam(model.parameters(), lr=1e-3, betas=(0.9, 0.995), eps=1e-9)
sgd_settings = dict(lr=0.001, momentum=0.9)
optimizer = optim.SGD(model.parameters(), **sgd_settings)


### Train

In [None]:
# Continuous space discretization: two time points, 0 and 1, as a float32 tensor.
timesteps = np.linspace(0., 1, num=2)
timesteps = torch.from_numpy(timesteps).float()

EPOCHS = 100
LOG_INTERVAL = 5

# This run resumes at epoch 26 and needs the accuracy stored in `state`, which
# is only created by the checkpoint-restore cell
# (`checkpoints.restore_best_checkpoint`). Fail with an actionable message
# instead of a bare NameError when that cell has not been run yet.
try:
    best_valid_accu = state["acc"]
except NameError:
    raise RuntimeError(
        "`state` is not defined: run the checkpoint-restore cell "
        "(checkpoints.restore_best_checkpoint) before resuming training."
    ) from None

#from torch import autograd
#with autograd.detect_anomaly():
model_process.train(
    exp_name, unique_id,
    model,
    train_loader, val_loader, timesteps,
    optimizer, device,
    epochs=EPOCHS, tb=tb, log_interval=LOG_INTERVAL,
    start_epoch=26, best_valid_accu=best_valid_accu
)

In [None]:
# Set both solver tolerances on the inner decoder object to 1e-3.
node_decoder = model.decoder.decoder
node_decoder.rtol = 1e-3
node_decoder.atol = 1e-3

In [None]:
# Restore the best "validation" checkpoint of this experiment/run; the model
# and optimizer are handed to the restore call.
state = checkpoints.restore_best_checkpoint(
    exp_name,
    unique_id,
    "validation",
    model,
    optimizer,
)

# Report the metrics stored with the checkpoint.
for label, key in (("accuracy", "acc"), ("loss", "loss")):
    print(label, state[key])

# Make sure the model lives on the selected device after restoring.
model = model.to(device)

In [None]:
# Continuous space discretization: time points 0 and 1, cast to float32.
timesteps = torch.from_numpy(np.linspace(0., 1, num=2)).float()

EPOCHS, LOG_INTERVAL = 100, 5

# Resume training at epoch 51, using the accuracy of the restored checkpoint
# as the best validation accuracy so far.
train_options = dict(
    epochs=EPOCHS,
    tb=tb,
    log_interval=LOG_INTERVAL,
    start_epoch=51,
    best_valid_accu=state["acc"],
)

# Disabled debugging aid, kept for reference:
#from torch import autograd
#with autograd.detect_anomaly():
model_process.train(
    exp_name,
    unique_id,
    model,
    train_loader,
    val_loader,
    timesteps,
    optimizer,
    device,
    **train_options
)

### Restore best checkpoint (to restart past training)