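# Sequence-to-Sequence Modeling With nn.Transformer and TorchText

This notebook trains the project's `TransformerModel` on the WikiText2 dataset and reports validation and test loss/perplexity.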

In [2]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell executes, so edits to project modules take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
import sys

# Put the parent directory on the import path so the `src` package used by the
# following cells can be imported from this notebook's working directory.
sys.path += ['..']

In [28]:
import copy
import math
import time

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchtext

from src.config import *
from src.models import TransformerModel
from src.train_model import train
from src.evaluate import evaluate

In [11]:
# Load the WikiText2 train/validation/test splits via TEXT and build the
# vocabulary from the training split only.
# NOTE(review): TEXT, batchify, batch_size and eval_batch_size are not defined
# in this notebook; presumably they arrive through `from src.config import *`.
train_txt, val_txt, test_txt = torchtext.datasets.WikiText2.splits(TEXT)
TEXT.build_vocab(train_txt)

# Batchify every split; both evaluation splits use eval_batch_size.
train_data, val_data, test_data = (
    batchify(corpus, size)
    for corpus, size in (
        (train_txt, batch_size),
        (val_txt, eval_batch_size),
        (test_txt, eval_batch_size),
    )
)

In [17]:
# --- Model hyperparameters ---------------------------------------------------
ntokens = len(TEXT.vocab.stoi)  # vocabulary size
emsize = 200                    # embedding dimension
nhid = 200                      # feedforward network dimension in nn.TransformerEncoder
nlayers = 2                     # number of nn.TransformerEncoderLayer in nn.TransformerEncoder
nhead = 2                       # number of heads in the multi-head attention models
dropout = 0.2                   # dropout value

# --- Model -------------------------------------------------------------------
# Construct the network, then move it to `device` (not defined in this
# notebook; presumably provided by the star import from src.config).
model = TransformerModel(ntokens, emsize, nhead, nhid, nlayers, dropout)
model = model.to(device)

In [18]:
# Training objective: cross-entropy loss.
criterion = nn.CrossEntropyLoss()

# Plain SGD with a large initial learning rate that is decayed by the scheduler.
lr = 5.0 # learning rate
optimizer = torch.optim.SGD(model.parameters(), lr=lr)

# StepLR multiplies the learning rate by `gamma` every `step_size` calls to
# scheduler.step(). `step_size` is documented as an int, so pass 1 (not the
# float 1.0) and name both arguments; the resulting schedule is unchanged.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.95)

In [29]:
# Train for a fixed number of epochs, evaluating on the validation split after
# each one and keeping a snapshot of the model with the lowest validation loss.
best_val_loss = float("inf")
epochs = 3 # The number of epochs
best_model = None

for epoch in range(1, epochs + 1):
    epoch_start_time = time.time()
    train(model, train_data, optimizer, scheduler, criterion, epoch)
    val_loss = evaluate(model, val_data, criterion)
    print('-' * 89)
    # The reported time covers both the training pass and the validation pass.
    print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
          'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                     val_loss, math.exp(val_loss)))
    print('-' * 89)

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        # Take an independent copy of the model. Assigning `model` directly
        # would only alias the live object, which keeps being updated by later
        # epochs, so `best_model` would always end up as the last-epoch weights.
        best_model = copy.deepcopy(model)

    # Advance the learning-rate schedule once per epoch.
    scheduler.step()



| epoch   1 |   200/ 2981 batches | lr 5.00 | ms/batch 322.66 | loss  5.57 | ppl   261.80
| epoch   1 |   400/ 2981 batches | lr 5.00 | ms/batch 335.55 | loss  5.70 | ppl   298.53
| epoch   1 |   600/ 2981 batches | lr 5.00 | ms/batch 329.63 | loss  5.54 | ppl   255.08
| epoch   1 |   800/ 2981 batches | lr 5.00 | ms/batch 316.98 | loss  5.60 | ppl   269.50
| epoch   1 |  1000/ 2981 batches | lr 5.00 | ms/batch 318.27 | loss  5.56 | ppl   258.75
| epoch   1 |  1200/ 2981 batches | lr 5.00 | ms/batch 322.79 | loss  5.60 | ppl   270.56
| epoch   1 |  1400/ 2981 batches | lr 5.00 | ms/batch 318.16 | loss  5.62 | ppl   274.99
| epoch   1 |  1600/ 2981 batches | lr 5.00 | ms/batch 319.96 | loss  5.65 | ppl   283.90
| epoch   1 |  1800/ 2981 batches | lr 5.00 | ms/batch 320.28 | loss  5.57 | ppl   263.35
| epoch   1 |  2000/ 2981 batches | lr 5.00 | ms/batch 322.99 | loss  5.61 | ppl   272.08
| epoch   1 |  2200/ 2981 batches | lr 5.00 | ms/batch 349.55 | loss  5.50 | ppl   244.37
| epoch   

## Evaluate the model with the test dataset

In [30]:
# Score `best_model` on the test split with the same criterion used in training.
test_loss = evaluate(best_model, test_data, criterion)

# Print the summary line framed by two 89-character rules; perplexity is
# exp(loss).
print(
    '=' * 89,
    '| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
        test_loss, math.exp(test_loss)),
    '=' * 89,
    sep='\n',
)

| End of training | test loss  5.41 | test ppl   224.63
