Load modules

In [1]:
import spacy
from torchtext.datasets import Multi30k
from torchtext.data import Field
import torch
from torch import nn
from torchtext.data import BucketIterator
import torch.optim as optim
import random
import time

Dataset


Code adapted from: https://tutorials.pytorch.kr/beginner/torchtext_translation_tutorial.html

In [2]:
!python -m spacy download en
!python -m spacy download de
SRC = Field(tokenize = "spacy",
            init_token = '<sos>',
            eos_token = '<eos>',
            lower = True)

TRG = Field(tokenize = "spacy",
            init_token = '<sos>',
            eos_token = '<eos>',
            lower = True)

# Download dataset
train_data, valid_data, test_data = Multi30k.splits(exts = ('.de', '.en'),
                                                    fields = (SRC, TRG))

# Use vocab with minimum frequency 2
SRC.build_vocab(train_data, min_freq = 2)
TRG.build_vocab(train_data, min_freq = 2)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
BATCH_SIZE = 128

# Construct Iterater
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size = BATCH_SIZE,
    device = device)

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.6/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')
Collecting de_core_news_sm==2.2.5
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-2.2.5/de_core_news_sm-2.2.5.tar.gz (14.9MB)
[K     |████████████████████████████████| 14.9MB 7.8MB/s 
Building wheels for collected packages: de-core-news-sm
  Building wheel for de-core-news-sm (setup.py) ... [?25l[?25hdone
  Created wheel for de-core-news-sm: filename=de_core_news_sm-2.2.5-cp36-none-any.whl size=14907057 sha256=1fa4dfb4f172fc76bf8932af5909e3cbc0c9d8fdbb9c92f5d9e866df8fda5b0c
  Stored in directory: /tmp/pip-ephem-wheel-cache-oowh7v9w/wheels/ba/3f/ed/d4aa8e45e7191b7f32db4bfad565e7da1edbf05c916ca7a1ca
Successfully built de-core-news-sm
Inst

training.tar.gz: 100%|██████████| 1.21M/1.21M [00:01<00:00, 648kB/s]


downloading validation.tar.gz


validation.tar.gz: 100%|██████████| 46.3k/46.3k [00:00<00:00, 165kB/s]


downloading mmt_task1_test2016.tar.gz


mmt_task1_test2016.tar.gz: 100%|██████████| 66.2k/66.2k [00:00<00:00, 167kB/s]


Inspect the dataset

In [3]:
# Show only the first training example: its tokenized source and target.
sample = next(iter(train_data), None)
if sample is not None:
  print(f'SRC : {sample.src}')
  print(f'TRG : {sample.trg}')

SRC : ['zwei', 'junge', 'weiße', 'männer', 'sind', 'i', 'm', 'freien', 'in', 'der', 'nähe', 'vieler', 'büsche', '.']
TRG : ['two', 'young', ',', 'white', 'males', 'are', 'outside', 'near', 'many', 'bushes', '.']


In [4]:
# Pull a single batch from the training iterator and report the tensor shapes.
batch = next(iter(train_iterator), None)
if batch is not None:
  print(f'SRC shape : {batch.src.shape}')
  print(f'TRG shape : {batch.trg.shape}')

SRC shape : torch.Size([32, 128])
TRG shape : torch.Size([31, 128])


Hyperparameters

In [5]:
# BATCH_SIZE (128) is already set in the cell that builds the iterators.

# --- optimisation settings ---
N_EPOCH, LR, CLIP = 20, 1e-4, 1

# --- vocabulary sizes, taken from the fields built on the training split ---
INPUT_DIM, OUTPUT_DIM = len(SRC.vocab), len(TRG.vocab)

# --- network shape (encoder and decoder use matching sizes) ---
ENCODER_EMBED_DIM = DECODER_EMBED_DIM = 256
HIDDEN_DIM = 512
N_LAYERS = 2
ENC_DROPOUT_RATIO = DEC_DROPOUT_RATIO = 0.5

Encoder

In [6]:
class Encoder(nn.Module):
  """Embed a source token sequence and summarise it with a stacked LSTM.

  Args:
    in_dim: number of rows in the embedding table (source vocabulary size).
    emb_dim: width of each token embedding.
    hid_dim: LSTM hidden size.
    n_layers: number of stacked LSTM layers.
    drop_rate: dropout probability, used on the embeddings and between
      LSTM layers.
  """

  def __init__(self,in_dim, emb_dim, hid_dim, n_layers, drop_rate):
    super().__init__()
    # keep the constructor arguments available as plain attributes
    self.in_dim, self.emb_dim, self.hid_dim = in_dim, emb_dim, hid_dim
    self.n_layers, self.drop_rate = n_layers, drop_rate
    # sub-modules (sequence-first layout: batch_first is False)
    self.emb = nn.Embedding(num_embeddings = in_dim, embedding_dim = emb_dim)
    self.lstm = nn.LSTM(emb_dim, hid_dim, n_layers,
                        batch_first = False,
                        dropout = drop_rate)
    self.dropout = nn.Dropout(p = drop_rate)

  def forward(self, src):
    """Encode `src` and return the LSTM's final (hidden, cell) states.

    The per-step LSTM outputs ([num words, batch size, hidden dim]) are
    discarded; only the final states, each shaped
    [n layers, batch size, hidden dim], are returned as the context.
    """
    src_emb = self.emb(src)
    src_emb = self.dropout(src_emb)
    # an LSTM returns a cell state in addition to the hidden state
    _, final_states = self.lstm(src_emb)
    hidden, cell = final_states
    return hidden, cell

Decoder

In [7]:
class Decoder(nn.Module):
  """Single-step LSTM decoder that scores the next target token.

  Args:
    out_dim: size of the embedding table and of the output score vector
      (target vocabulary size).
    emb_dim: width of each token embedding.
    hid_dim: LSTM hidden size.
    n_layers: number of stacked LSTM layers.
    drop_rate: dropout probability, used on the embeddings and between
      LSTM layers.
  """

  def __init__(self, out_dim, emb_dim, hid_dim, n_layers, drop_rate):
    super().__init__()
    # keep the constructor arguments available as plain attributes
    self.out_dim, self.emb_dim, self.hid_dim = out_dim, emb_dim, hid_dim
    self.n_layers, self.drop_rate = n_layers, drop_rate

    # sub-modules (sequence-first layout: batch_first is False)
    self.emb = nn.Embedding(num_embeddings = out_dim, embedding_dim = emb_dim)
    self.lstm = nn.LSTM(emb_dim, hid_dim, n_layers,
                        batch_first = False,
                        dropout = drop_rate)
    self.dense = nn.Linear(in_features = hid_dim, out_features = out_dim)
    self.dropout = nn.Dropout(p = drop_rate)

  def forward(self, before_trg, hidden, cell):
    """Run one decoding step.

    `before_trg` holds the previous token of every sequence ([batch size]).
    Returns the scores over the output vocabulary for the next token,
    together with the updated hidden and cell states.
    """
    # add a leading length-1 time axis: [batch size] -> [1, batch size]
    step_tokens = before_trg.unsqueeze(0)
    step_emb = self.dropout(self.emb(step_tokens))
    step_out, (hidden, cell) = self.lstm(step_emb, (hidden, cell))
    # drop the time axis again before projecting to vocabulary scores
    scores = self.dense(step_out.squeeze(0))
    return scores, hidden, cell

Seq2Seq

In [8]:
class Seq2Seq(nn.Module):
  """Encoder-decoder wrapper that decodes the target one step at a time.

  Args:
    encoder: module called as `encoder(src)` returning `(hidden, cell)`.
    decoder: module called as `decoder(tokens, hidden, cell)` returning
      `(scores, hidden, cell)`; it must expose an `out_dim` attribute.
    device: device on which the output buffer is placed.
  """

  def __init__(self, encoder, decoder, device):
    super().__init__()
    self.encoder = encoder
    self.decoder = decoder
    self.device = device

  def forward(self, src, trg,teacher_forcing_ratio = 0.5):
    """Return decoder scores shaped [trg len, batch size, decoder.out_dim].

    Row 0 of the result is never written and stays all zeros.  At every
    later step the next decoder input is the ground-truth token with
    probability `teacher_forcing_ratio`, otherwise the decoder's own
    highest-scoring token.
    """
    hidden, cell = self.encoder(src)
    n_steps, n_batch = trg.shape[0], trg.shape[1]
    all_scores = torch.zeros(n_steps, n_batch, self.decoder.out_dim).to(self.device)
    # decoding starts from the first target row
    step_input = trg[0, :]
    for t in range(1, n_steps):
      step_scores, hidden, cell = self.decoder(step_input, hidden, cell)
      all_scores[t] = step_scores
      # one random draw per step decides between teacher forcing and free running
      if random.random() < teacher_forcing_ratio:
        step_input = trg[t]
      else:
        step_input = step_scores.argmax(1)
    return all_scores

In [10]:
def init_uniform_weights(m, lower_bound = -0.08, upper_bound = 0.08):
  """Overwrite every parameter of `m` with uniform random values.

  All parameters reachable from `m` (nested sub-modules included, weights
  and biases alike) are re-drawn in place from the range
  [lower_bound, upper_bound].
  """
  for weight in m.parameters():
    nn.init.uniform_(weight.data, a = lower_bound, b = upper_bound)

# Assemble the translation model from the configured sizes.
encoder = Encoder(in_dim = INPUT_DIM,
                  emb_dim = ENCODER_EMBED_DIM,
                  hid_dim = HIDDEN_DIM,
                  n_layers = N_LAYERS,
                  drop_rate = ENC_DROPOUT_RATIO)
decoder = Decoder(out_dim = OUTPUT_DIM,
                  emb_dim = DECODER_EMBED_DIM,
                  hid_dim = HIDDEN_DIM,
                  n_layers = N_LAYERS,
                  drop_rate = DEC_DROPOUT_RATIO)
model = Seq2Seq(encoder = encoder, decoder = decoder, device = device)
model = model.to(device)
# Re-initialise the parameters; kept as the last expression so the cell
# displays the module summary.
model.apply(init_uniform_weights)

Seq2Seq(
  (encoder): Encoder(
    (emb): Embedding(7873, 256)
    (lstm): LSTM(256, 512, num_layers=2, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (emb): Embedding(5893, 256)
    (lstm): LSTM(256, 512, num_layers=2, dropout=0.5)
    (dense): Linear(in_features=512, out_features=5893, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

Training Loop

In [11]:
class training:
  """Bundle a model with its loss and optimizer and run the training loop.

  Args:
    SRC, TRG: source and target fields; `TRG` supplies the padding index
      that the loss ignores.
    encoder, decoder: stored on the instance as given.
    model: module called as `model(src, trg)`; its parameters are optimised.
    lr: learning rate handed to Adam.
  """

  def __init__(self, SRC, TRG, encoder, decoder, model, lr):
    self.SRC, self.TRG = SRC, TRG
    self.encoder, self.decoder = encoder, decoder
    self.model = model
    self.lr = lr

    # cross-entropy loss that skips positions holding the padding token
    pad_idx = TRG.vocab.stoi[TRG.pad_token]
    self.criterion = nn.CrossEntropyLoss(ignore_index = pad_idx)
    # Adam over all model parameters
    self.opt = optim.Adam(model.parameters(), lr = lr)

  def train(self, iterator, clip, n_epoch = 10):
    """Train for `n_epoch` passes over `iterator`, logging after every batch.

    Args:
      iterator: iterable of batches exposing `.src` and `.trg` tensors.
      clip: maximum gradient norm applied before each optimizer step.
      n_epoch: number of passes over `iterator`.
    """
    self.model.train()
    start_time = time.time()
    for epoch in range(n_epoch):
      for idx, batch in enumerate(iterator):
        self.opt.zero_grad()
        scores = self.model(batch.src, batch.trg)
        # drop the first time step of both tensors and flatten time x batch
        # so every remaining position is scored as one classification
        vocab_size = scores.shape[-1]
        flat_scores = scores[1:].view(-1, vocab_size)
        flat_gold = batch.trg[1:].view(-1)
        loss = self.criterion(flat_scores, flat_gold)
        loss.backward()
        # rescale gradients whose overall norm exceeds `clip`
        nn.utils.clip_grad_norm_(self.model.parameters(), clip)
        self.opt.step()
        # elapsed time is measured from the start of the whole run
        end_time = time.time()
        print(f'==== {epoch+1}/{n_epoch} epoch  {idx+1}-th batch =====')
        print(f'loss : {loss.item()}  time passed : {end_time - start_time}')

In [12]:
# NOTE(review): N_EPOCH is defined with the other hyperparameters but is not
# used here; the epoch count is fixed at 10 - confirm which value is intended.
train_func = training(SRC = SRC,
                      TRG = TRG,
                      encoder = encoder,
                      decoder = decoder,
                      model = model,
                      lr = LR)
train_func.train(train_iterator, CLIP, n_epoch = 10)

==== 1/10 epoch  1-th batch =====
loss : 8.680526733398438  time passed : 0.3377058506011963
==== 1/10 epoch  2-th batch =====
loss : 8.669748306274414  time passed : 0.5310235023498535
==== 1/10 epoch  3-th batch =====
loss : 8.656637191772461  time passed : 0.6668870449066162
==== 1/10 epoch  4-th batch =====
loss : 8.64591121673584  time passed : 0.8214325904846191
==== 1/10 epoch  5-th batch =====
loss : 8.638562202453613  time passed : 0.979020357131958
==== 1/10 epoch  6-th batch =====
loss : 8.617661476135254  time passed : 1.1389868259429932
==== 1/10 epoch  7-th batch =====
loss : 8.60334300994873  time passed : 1.3093860149383545
==== 1/10 epoch  8-th batch =====
loss : 8.594685554504395  time passed : 1.4627079963684082
==== 1/10 epoch  9-th batch =====
loss : 8.57580280303955  time passed : 1.597916603088379
==== 1/10 epoch  10-th batch =====
loss : 8.561692237854004  time passed : 1.7397375106811523
==== 1/10 epoch  11-th batch =====
loss : 8.541417121887207  time passed :