In [1]:
from src.seq2seq import *
from src.attention import *
from src.utils import *
from src.layers import MaskedCrossEntropyLoss
import torch 
import torch.optim as optim

# Setup

In [2]:
# OPTIONS: 
# ENGLISH - en, 
# GERMAN - de, 
# FRENCH - fr, 
# CZECH - cs

lang1 = 'de'
lang2 = 'en'

train_sentences, test_sentences = load_data(lang1, lang2)
train_sentences = (train_sentences[0][:3000], train_sentences[1][:3000])

In [3]:
TEST_SIZE=0.2
BATCH_SIZE=64
VALID_BATCH_SIZE=128
MAX_VOCAB=20000

src_vocab, tgt_vocab, train_loader, valid_loader = make_dataset(train_sentences, test_sentences, BATCH_SIZE, VALID_BATCH_SIZE, MAX_VOCAB)

In [4]:
print(f"Number of training examples: {len(train_loader.dataset)}")
print(f"Number of testing examples: {len(valid_loader.dataset)}")
print(f"Training Batches {len(train_loader)}\tValidation Batches {len(valid_loader)}")

Number of training examples: 3000
Number of testing examples: 1014
Training Batches 47	Validation Batches 8


In [5]:
print(f"Unique tokens in source ({lang1}) vocabulary: {len(src_vocab)}")
print(f"Unique tokens in target ({lang2}) vocabulary: {len(tgt_vocab)}")

Unique tokens in source (de) vocabulary: 4389
Unique tokens in target (en) vocabulary: 3446


# Make the Model

In [6]:
# ENCODER ARGS
ENC_UNITS = 128
ENC_EMBEDDING = 128
SRC_VOCAB_SIZE = len(src_vocab)
ENC_NUM_LAYERS = 1

# DECODER ARGS
DEC_UNITS = ENC_UNITS
DEC_EMBEDDING = ENC_EMBEDDING
TGT_VOCAB_SIZE = len(tgt_vocab)
DEC_NUM_LAYERS = ENC_NUM_LAYERS

# SEQ2SEQ ARGS
TEACHER_FORCING = 1.0
MAX_LENGTH = train_loader.dataset.tensors[1].size(-1) + 1
SOS_TOKEN = tgt_vocab.SOS_token

In [7]:
encoder = Encoder(ENC_UNITS, ENC_EMBEDDING, SRC_VOCAB_SIZE, ENC_NUM_LAYERS)
decoder = Decoder(DEC_UNITS, DEC_EMBEDDING, TGT_VOCAB_SIZE, DEC_NUM_LAYERS)

seq2seq = Seq2Seq(encoder, decoder, TEACHER_FORCING, MAX_LENGTH, SOS_TOKEN)

print(f'The model has {count_parameters(seq2seq):,} trainable parameters')

The model has 1,645,558 trainable parameters


In [8]:
print(seq2seq)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(4389, 128)
    (gru): GRU(128, 128, batch_first=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(3446, 128)
    (gru): GRU(128, 128, batch_first=True)
    (fc): Linear(in_features=128, out_features=3446, bias=True)
  )
)


In [9]:
criterion = MaskedCrossEntropyLoss(pad_tok=tgt_vocab.PAD_token)
optimizer = optim.Adam(seq2seq.parameters())

# Train

In [10]:
tgt_vocab.SOS_token, tgt_vocab.EOS_token

(3, 4)

In [13]:
valid_loss = evaluate(seq2seq, train_loader, criterion)

100%|██████████| 47/47 [00:05<00:00,  9.24it/s]


In [15]:
valid_loss

8.154829877488156

In [10]:
N_EPOCHS = 10
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    print(f'Epoch: {epoch+1:02}')
    
    train_loss = train(seq2seq, train_loader, optimizer, criterion, CLIP, src_vocab.PAD_token)
    valid_loss = evaluate(seq2seq, valid_loader, criterion)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(seq2seq.state_dict(), 'models/seq2seq.pt')
    
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')


Epoch: 01
100%|██████████| 47/47 [00:36<00:00,  1.31it/s]
100%|██████████| 8/8 [00:02<00:00,  3.87it/s]
	Train Loss: 6.132 | Train PPL: 460.402
	 Val. Loss: 5.972 |  Val. PPL: 392.134
Epoch: 02
100%|██████████| 47/47 [00:35<00:00,  1.32it/s]
100%|██████████| 8/8 [00:02<00:00,  3.88it/s]
	Train Loss: 4.750 | Train PPL: 115.632
	 Val. Loss: 5.763 |  Val. PPL: 318.387
Epoch: 03
100%|██████████| 47/47 [00:36<00:00,  1.31it/s]
100%|██████████| 8/8 [00:02<00:00,  3.87it/s]
	Train Loss: 4.431 | Train PPL:  84.056
	 Val. Loss: 5.998 |  Val. PPL: 402.788
Epoch: 04
100%|██████████| 47/47 [00:36<00:00,  1.31it/s]
100%|██████████| 8/8 [00:02<00:00,  3.65it/s]
	Train Loss: 4.181 | Train PPL:  65.452
	 Val. Loss: 6.017 |  Val. PPL: 410.359
Epoch: 05
100%|██████████| 47/47 [00:36<00:00,  1.32it/s]
100%|██████████| 8/8 [00:02<00:00,  3.81it/s]
	Train Loss: 3.983 | Train PPL:  53.652
	 Val. Loss: 6.176 |  Val. PPL: 480.838
Epoch: 06
100%|██████████| 47/47 [00:37<00:00,  1.28it/s]
100%|██████████| 8/8 [

# Translate

In [43]:
seq2seq.load_state_dict(torch.load('models/seq2seq.pt'))

<All keys matched successfully>

In [10]:
idx = 80

src_sentence = train_loader.dataset.tensors[0][idx:idx+1]
tgt_sentence = train_loader.dataset.tensors[1][idx:idx+1]

src_sentence = src_vocab.to_string(src_sentence, remove_special=True)[0]
tgt_sentence = tgt_vocab.to_string(tgt_sentence, remove_special=True)[0]

In [11]:
translation, attention = translate(src_sentence, seq2seq, src_vocab, tgt_vocab, src_vocab.PAD_token)

tensor([[   2,   17,   43,   32, 1766,  125,   67,   21,    7, 1767,   75,    4,
            3]])
tensor([[3445, 3299,  644,  369, 2197,  125, 1753,  166, 2735, 2706, 2551, 2706,
         2768, 2768,  422,  713,  713, 3022, 1201, 3293, 2665,  121, 2727,  997,
         2958,  240, 3225, 2200,   23, 3052, 1393,   28, 2872, 2138,  755, 2158,
         2388, 2715, 2715]])


In [12]:
print(f"> {src_sentence}")
print(f"= {tgt_sentence}")
print(f"< {translation}")

> zwei gro e lohfarbene hunde spielen an einem sandigen strand .
= two large tan dogs play along a sandy beach .
< snowdrift snaps station couch adjusting hair tae older college sparrow measured sparrow code code dancing practicing practicing waited biking hulk pointed holds slushy moving gardens sleeping conditioner slope young raise doors dog mop-pad 12 tied wakeboarding flea culture culture
