# **Installation**

In this section, we install `pyvi`, a well-known Vietnamese NLP library. 


We also install `vi_spacy`, a Vietnamese language model for spaCy (spacy.io). (Author: Viet-Trung Tran)

In [None]:
# torchtext is pinned because the notebook relies on the `torchtext.data`
# Field / Iterator / TabularDataset / batch API imported below.
!pip -q install torchtext==0.6.0
# NOTE(review): pyvi is not version-pinned; pin it for reproducible runs.
!pip -q install pyvi 
# Vietnamese spaCy model, installed straight from the author's repository.
!pip -q install https://github.com/trungtv/vi_spacy/raw/master/packages/vi_spacy_model-0.2.1/dist/vi_spacy_model-0.2.1.tar.gz
# Register the package under the name later passed to spacy.load('vi_spacy_model').
!python -m spacy link vi_spacy_model vi_spacy_model

[K     |████████████████████████████████| 71kB 5.4MB/s 
[K     |████████████████████████████████| 1.2MB 15.2MB/s 
[K     |████████████████████████████████| 8.5MB 6.3MB/s 
[K     |████████████████████████████████| 747kB 43.8MB/s 
[K     |████████████████████████████████| 2.7MB 7.6MB/s 
[?25h  Building wheel for vncorenlp (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 42.3MB 72kB/s 
[?25h  Building wheel for vi-spacy-model (setup.py) ... [?25l[?25hdone
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.7/dist-packages/vi_spacy_model -->
/usr/local/lib/python3.7/dist-packages/spacy/data/vi_spacy_model
You can now load the model via spacy.load('vi_spacy_model')


In [None]:
import os
import nltk
import re
nltk.download('wordnet')
from nltk.corpus import wordnet
import numpy as np
import pandas as pd
import math
import requests
import tarfile
import copy
import spacy
import dill as pickle
import torch
import torch.nn as nn
from torchtext import data
from torchtext.data.metrics import bleu_score
from torch.autograd import Variable
import torch.nn.functional as F

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


#**Dataset** 

In [None]:

url = 'https://nlp.stanford.edu/projects/nmt/data/iwslt15.en-vi/'

def iwslt15(train_test):
  """Download one IWSLT'15 English-Vietnamese archive and unpack it into ./iwslt15.

  Args:
    train_test: archive prefix, e.g. 'train' or 'test-2013'; the file
      '<prefix>-en-vi.tgz' is fetched from the stefan-it/nmt-en-vi mirror.

  Raises:
    requests.HTTPError: if the server answers with an error status.
  """
  url = 'https://github.com/stefan-it/nmt-en-vi/raw/master/data/'
  r = requests.get(url + train_test + '-en-vi.tgz')
  # Fail loudly instead of saving an HTML error page as a ".tar.gz".
  r.raise_for_status()
  filename = train_test + '-en-vi.tar.gz'
  with open(filename, 'wb') as f:
    f.write(r.content)
  # Extract only after the write handle is closed so the archive is fully
  # flushed to disk, and close the tarfile handle when done.
  # NOTE(review): extractall() trusts the member paths inside the archive;
  # only use it on archives from a source you trust.
  with tarfile.open(filename, 'r:gz') as archive:
    archive.extractall('iwslt15')
iwslt15('train')
iwslt15('test-2013')

#**Parameter**

In this section, we set up some parameters that are used many times to solve Problem 3.

In [None]:
# File that train() writes with torch.save and test() reads back with torch.load.
MODEL_NAME = 'transfomer.model'
# Number of full passes over train_iter performed by train().
EPOCH = 20
# Passed as batch_size to MyIterator together with batch_size_fn, which counts
# padded tokens - so this is a per-batch token budget, not a sentence count.
BATCHSIZE = 1500
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
# Data locations and model hyper-parameters shared by the cells below.
# NOTE(review): the data paths are absolute ('/content/...') while iwslt15()
# extracts into the relative directory 'iwslt15' - this presumably assumes the
# working directory is /content (Colab); confirm before running elsewhere.
opt = {
    'train_src_data':'/content/iwslt15/train.en',
    'train_trg_data':'/content/iwslt15/train.vi',
    'valid_src_data':'/content/iwslt15/tst2013.en',
    'valid_trg_data':'/content/iwslt15/tst2013.vi',
    # Names handed to spacy.load() by the tokenize class.
    'src_lang':'en',
    'trg_lang': 'vi_spacy_model',#'vi'
    # Upper bound on spaces per sentence in create_dataset, and the maximum
    # decoded length passed to translate_sentence in test().
    'max_strlen':160,
    # Transformer(...) constructor arguments used in train().
    'd_model': 512,
    'n_layers': 6,
    'heads': 8,
    'dropout': 0.1,
    # Beam width passed as `k` to translate_sentence / beam_search.
    'k':5,
}

# **Preprocessing**

In [None]:
# read_data loads the parallel corpus files downloaded in the Dataset section
# and splits each one into a list of sentences (one per line).
def read_data(src_file, trg_file):
    """Read a pair of line-aligned text files.

    Args:
        src_file: path of the source-language file.
        trg_file: path of the target-language file.

    Returns:
        (src_data, trg_data): two lists of sentence strings, obtained by
        stripping the whole file and splitting it on newlines.
    """
    # Explicit UTF-8 keeps Vietnamese diacritics intact whatever the platform's
    # default encoding is; `with` closes both handles deterministically.
    with open(src_file, encoding='utf-8') as f:
        src_data = f.read().strip().split('\n')
    with open(trg_file, encoding='utf-8') as f:
        trg_data = f.read().strip().split('\n')
    return src_data, trg_data

In [None]:
# Pre-processing: blank out punctuation and special characters, squeeze
# repeated marks, lowercase the text, then tokenise it with spaCy.
class tokenize(object):
    """Sentence cleaner + spaCy tokenizer for one language.

    `lang` is whatever spacy.load() accepts (e.g. 'en', 'vi_spacy_model').
    """

    # Characters that are replaced by a single blank before tokenising.
    _SPECIALS = re.compile(r"[\*\"“”\n\\…\+\-\/\=\(\)‘•:\[\]\|’\!;]")
    # (pattern, replacement) pairs applied in order after the blanking step.
    # The "!" rule can never match because "!" is already blanked above; it is
    # kept so the processing steps stay identical.
    _COLLAPSE = (
        (re.compile(r"[ ]+"), " "),
        (re.compile(r"\!+"), "!"),
        (re.compile(r"\,+"), ","),
        (re.compile(r"\?+"), "?"),
    )

    def __init__(self, lang):
        self.nlp = spacy.load(lang)

    def tokenizer(self, sentence):
        """Return the list of lower-cased token strings for `sentence`."""
        text = self._SPECIALS.sub(" ", str(sentence))
        for pattern, replacement in self._COLLAPSE:
            text = pattern.sub(replacement, text)
        text = text.lower()

        tokens = []
        for tok in self.nlp.tokenizer(text):
            # Drop tokens that are exactly one blank.
            if tok.text != " ":
                tokens.append(tok.text)
        return tokens

In [None]:
train_src_data, train_trg_data = read_data(opt['train_src_data'], opt['train_trg_data'])
valid_src_data, valid_trg_data = read_data(opt['valid_src_data'], opt['valid_trg_data'])
# Load the spaCy pipeline once; building it inside the loop reloaded the
# model for every previewed sentence.
k = tokenize('en')
# Preview the tokenisation of the first ten validation source sentences.
for i in range (0, 10):
  print (k.tokenizer(valid_src_data[i]))

['when', 'i', 'was', 'little', ',', 'i', 'thought', 'my', 'country', 'was', 'the', 'best', 'on', 'the', 'planet', ',', 'and', 'i', 'grew', 'up', 'singing', 'a', 'song', 'called', '&', 'quot', 'nothing', 'to', 'envy', '.', '&', 'quot']
['and', 'i', 'was', 'very', 'proud', '.']
['in', 'school', ',', 'we', 'spent', 'a', 'lot', 'of', 'time', 'studying', 'the', 'history', 'of', 'kim', 'il', 'sung', ',', 'but', 'we', 'never', 'learned', 'much', 'about', 'the', 'outside', 'world', ',', 'except', 'that', 'america', ',', 'south', 'korea', ',', 'japan', 'are', 'the', 'enemies', '.']
['although', 'i', 'often', 'wondered', 'about', 'the', 'outside', 'world', ',', 'i', 'thought', 'i', 'would', 'spend', 'my', 'entire', 'life', 'in', 'north', 'korea', ',', 'until', 'everything', 'suddenly', 'changed', '.']
['when', 'i', 'was', 'seven', 'years', 'old', ',', 'i', 'saw', 'my', 'first', 'public', 'execution', ',', 'but', 'i', 'thought', 'my', 'life', 'in', 'north', 'korea', 'was', 'normal', '.']
['my', '

In [None]:
# Load the Vietnamese spaCy pipeline once instead of once per sentence.
k = tokenize('vi_spacy_model')
# Preview the tokenisation of the first ten validation target sentences.
for i in range (0, 10):
  print (k.tokenizer(valid_trg_data[i]))

['khi', 'tôi', 'còn', 'nhỏ', ',', 'tôi', 'nghĩ', 'rằng', 'bắctriều', 'tiên', 'là', 'đất', 'nước', 'tốt', 'nhất', 'trên', 'thế', 'giới', 'và', 'tôi', 'thường', 'hát', 'bài', '&', 'quot', 'chúng', 'ta', 'chẳng', 'có', 'gì', 'phải', 'ghen', 'tị', '.', '&', 'quot']
['tôi', 'đã', 'rất', 'tự', 'hào', 'về', 'đất', 'nước', 'tôi', '.']
['ở', 'trường', ',', 'chúng', 'tôi', 'dành', 'rất', 'nhiều', 'thời', 'gian', 'để', 'học', 'về', 'cuộc', 'đời', 'của', 'chủ', 'tịch', 'kim', 'ii', 'sung', ',', 'nhưng', 'lại', 'không', 'học', 'nhiều', 'về', 'thế', 'giới', 'bên', 'ngoài', ',', 'ngoại', 'trừ', 'việc', 'hoa', 'kỳ', ',', 'hàn', 'quốc', 'và', 'nhật', 'bản', 'là', 'kẻ', 'thù', 'của', 'chúng', 'tôi', '.']
['mặc', 'dù', 'tôi', 'đã', 'từng', 'tự', 'hỏi', 'không', 'biết', 'thế', 'giới', 'bên', 'ngoài', 'kia', 'như', 'thế', 'nào', ',', 'nhưng', 'tôi', 'vẫn', 'nghĩ', 'rằng', 'mình', 'sẽ', 'sống', 'cả', 'cuộc', 'đời', 'ở', 'bắctriều', 'tiên', ',', 'cho', 'tới', 'khi', 'tất', 'cả', 'mọi', 'thứ', 'đột', 'nhiên',

In [None]:
# We use torchtext to load and batch the data; grouping sentences of similar
# length reduces padding and therefore training time.
class MyIterator(data.Iterator):
    """torchtext Iterator that builds its batches from length-sorted examples."""
    def create_batches(self):
        """Populate self.batches for the next pass over the dataset.

        Training: a lazy generator that sorts within large chunks and shuffles
        the resulting batches. Otherwise: a list of batches, each sorted by
        sort_key.
        """
        if self.train:
            def pool(d, random_shuffler):
                # Cut the stream into chunks of batch_size * 100 (sized by
                # data.batch without batch_size_fn) ...
                for p in data.batch(d, self.batch_size * 100):
                    # ... sort each chunk by sort_key, then cut it into real
                    # batches sized by batch_size_fn ...
                    p_batch = data.batch(
                        sorted(p, key=self.sort_key),
                        self.batch_size, self.batch_size_fn)
                    # ... and yield those batches in shuffled order.
                    for b in random_shuffler(list(p_batch)):
                        yield b
            self.batches = pool(self.data(), self.random_shuffler)
            
        else:
            # Evaluation: keep the stream order, only sort inside each batch.
            self.batches = []
            for b in data.batch(self.data(), self.batch_size,
                                          self.batch_size_fn):
                self.batches.append(sorted(b, key=self.sort_key))

global max_src_in_batch, max_tgt_in_batch
def batch_size_fn(new, count, sofar):
    """Effective size of the batch after adding `new`: tokens plus padding.

    Tracks, in two module-level globals, the longest source and the longest
    target (+2) seen in the batch being built; they are reset whenever
    `count` is 1. `sofar` is accepted for the torchtext callback signature
    and is not used.

    Returns:
        count * longest length, taken over source and target.
    """
    global max_src_in_batch, max_tgt_in_batch
    if count == 1:
        # First example of a new batch: forget the previous batch's maxima.
        max_src_in_batch, max_tgt_in_batch = 0, 0
    max_src_in_batch = max(max_src_in_batch, len(new.src))
    # +2 on the target side (original constant).
    max_tgt_in_batch = max(max_tgt_in_batch, len(new.trg) + 2)
    padded_src_tokens = count * max_src_in_batch
    padded_tgt_tokens = count * max_tgt_in_batch
    if padded_src_tokens >= padded_tgt_tokens:
        return padded_src_tokens
    return padded_tgt_tokens

In [None]:
def create_fields(src_lang, trg_lang):
    """Create the torchtext fields for the source and target languages.

    Both fields lowercase their text and tokenise it with a `tokenize`
    instance built for the given spaCy language name. Only the target field
    adds '<sos>' / '<eos>' markers.

    data.Field parameters used here:
        init_token: token prepended to every example (None for no token).
        eos_token: token appended to every example (None for no token).
        lower: whether to lowercase the text.
        tokenize: function that splits a string into tokens; a
            non-serializable function makes the field non-serializable.

    Returns:
        (SRC, TRG) fields.
    """
    source_tokenizer = tokenize(src_lang)
    target_tokenizer = tokenize(trg_lang)

    TRG = data.Field(
        lower=True,
        tokenize=target_tokenizer.tokenizer,
        init_token='<sos>',
        eos_token='<eos>',
    )
    SRC = data.Field(
        lower=True,
        tokenize=source_tokenizer.tokenizer,
    )
    return SRC, TRG
"""
    MyIterator function: Defines an iterator that loads batches of data from a Dataset
    batch_size: Batch size.
    batch_size_fn: Function of three arguments (new example to add, current
        count of examples in the batch, and current effective batch size)
        that returns the new effective batch size resulting from adding
        that example to a batch. This is useful for dynamic batching, where
        this function would add to the current effective batch size the
        number of tokens in the new example.
    shuffle: Whether to shuffle examples between epochs.
    sort_key: A key to use for sorting examples in order to batch together
            examples with similar lengths and minimize padding. The sort_key
            provided to the Iterator constructor overrides the sort_key
            attribute of the Dataset, or defers to it if None.
"""
def create_dataset(src_data, trg_data, max_strlen, batchsize, device, SRC, TRG, istrain=True):
    """Build a MyIterator over the sentence pairs, and the vocabularies when training.

    Args:
        src_data, trg_data: line-aligned lists of source / target sentences.
        max_strlen: pairs are kept only if both sides contain fewer than this
            many spaces.
        batchsize: batch size handed to MyIterator (measured by batch_size_fn).
        device: device on which batches are created.
        SRC, TRG: torchtext fields; their vocabularies are (re)built from this
            dataset when `istrain` is true.
        istrain: forwarded as `train=` to MyIterator and controls vocab building.

    Returns:
        The MyIterator instance.

    Side effects:
        Writes 'translate_transformer_temp.csv' to the working directory and
        leaves it there.
    """
    print("creating dataset and iterator... ")
    raw_data = {'src': list(src_data), 'trg': list(trg_data)}
    df = pd.DataFrame(raw_data, columns=["src", "trg"])
    # Drop over-long pairs (length approximated by the number of spaces).
    mask = (df['src'].str.count(' ') < max_strlen) & (df['trg'].str.count(' ') < max_strlen)
    df = df.loc[mask]
    df.to_csv("translate_transformer_temp.csv", index=False)
    data_fields = [('src', SRC), ('trg', TRG)]
    # to_csv writes a "src,trg" header line; without skip_header=True that
    # line is loaded as a bogus sentence pair and leaks into the vocabulary.
    train = data.TabularDataset('./translate_transformer_temp.csv', format='csv',
                                fields=data_fields, skip_header=True)
    train_iter = MyIterator(train, batch_size=batchsize, device=device,
                        repeat=False, sort_key=lambda x: (len(x.src), len(x.trg)),
                        batch_size_fn=batch_size_fn, train=istrain, shuffle=True)
    #os.remove('translate_transformer_temp.csv')
    if istrain:
        SRC.build_vocab(train)
        TRG.build_vocab(train)
    return train_iter

In [None]:
SRC, TRG = create_fields(opt['src_lang'], opt['trg_lang'])
train_iter = create_dataset(train_src_data, train_trg_data, opt['max_strlen'], BATCHSIZE, DEVICE, SRC, TRG, istrain=True)
valid_iter = create_dataset(valid_src_data, valid_trg_data, opt['max_strlen'], BATCHSIZE, DEVICE, SRC, TRG, istrain=False)

creating dataset and iterator... 
creating dataset and iterator... 


#**Model**

In [None]:
class Embedder(nn.Module):
    """Lookup table mapping token indices to d_model-dimensional vectors."""
    def __init__(self, vocab_size, d_model):
        """
        vocab_size: number of rows in the embedding table
        d_model: size of each embedding vector
        """
        super().__init__()
        self.vocab_size = vocab_size
        self.d_model = d_model
        # The width must follow d_model; it used to be hard-coded to 512, which
        # broke every configuration with d_model != 512.
        self.embed = nn.Embedding(vocab_size, d_model)
    def forward(self, x):
        """x: LongTensor of token indices -> tensor of shape x.shape + (d_model,)."""
        return self.embed(x)

In [None]:
class PositionalEncoder(nn.Module):
    """Scale embeddings by sqrt(d_model) and add sinusoidal position encodings.

    The table follows "Attention Is All You Need":
        PE(pos, 2j)   = sin(pos / 10000^(2j / d_model))
        PE(pos, 2j+1) = cos(pos / 10000^(2j / d_model))
    It is stored as the non-trainable buffer `pe` of shape
    (1, max_seq_length, d_model), so it follows the module across devices.
    """
    def __init__(self, d_model, max_seq_length=200, dropout=0.1):
        super().__init__()
        self.d_model = d_model
        self.dropout = nn.Dropout(dropout)

        # Positions as a column vector, one frequency per sin/cos pair.
        position = torch.arange(max_seq_length, dtype=torch.float).unsqueeze(1)
        inv_freq = torch.exp(
            torch.arange(0, d_model, 2, dtype=torch.float) * (-math.log(10000.0) / d_model))

        # Each sin/cos pair shares one frequency. The previous loop doubled
        # the exponent and gave the cosine a different frequency from its sine.
        pe = torch.zeros(max_seq_length, d_model)
        pe[:, 0::2] = torch.sin(position * inv_freq)
        # For odd d_model there is one cosine column fewer than sine columns.
        pe[:, 1::2] = torch.cos(position * inv_freq[: d_model // 2])
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """
        x: batch_size x seq_length x d_model (seq_length <= max_seq_length)
        output: same shape as x
        """
        x = x * math.sqrt(self.d_model)
        # Add the position table to the embedding vectors. `pe` is a buffer,
        # so it is already on the module's device.
        x = x + self.pe[:, :x.size(1)]
        return self.dropout(x)

In [None]:
def attention(q, k, v, mask=None, dropout=None):
    """Scaled dot-product attention.

    q, k, v: batch_size x head x seq_length x d_k
    mask: optional; a head axis is inserted at dim 1 and positions where the
        mask equals 0 are excluded from the softmax
    dropout: optional callable applied to the attention weights

    Returns (output, weights): the weighted sum of `v` and the weights
    (after dropout, when given).
    """
    head_dim = q.size(-1)
    # Similarity of every query with every key, scaled by sqrt(d_k).
    weights = (q @ k.transpose(-2, -1)) / math.sqrt(head_dim)

    if mask is not None:
        # A very negative score becomes a zero weight after the softmax.
        weights = weights.masked_fill(mask.unsqueeze(1) == 0, -1e9)

    # Normalise over the key axis.
    weights = weights.softmax(dim=-1)

    if dropout is not None:
        weights = dropout(weights)

    return torch.matmul(weights, v), weights

In [None]:
class MultiHeadAttention(nn.Module):
    """Multi-head attention: project, split into heads, attend, merge, project."""

    def __init__(self, heads, d_model, dropout=0.1):
        super().__init__()
        assert d_model % heads == 0
        self.d_model = d_model
        self.d_k = d_model // heads
        self.h = heads
        # Attention weights of the most recent forward pass.
        self.attn = None

        # Input projections for queries, keys and values.
        self.q_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)

        self.dropout = nn.Dropout(dropout)
        # Output projection applied to the concatenated heads.
        self.out = nn.Linear(d_model, d_model)

    def _split_heads(self, projected, batch):
        """batch x seq x d_model -> batch x heads x seq x d_k."""
        return projected.view(batch, -1, self.h, self.d_k).transpose(1, 2)

    def forward(self, q, k, v, mask=None):
        """
        q: batch_size x seq_length x d_model
        k: batch_size x seq_length x d_model
        v: batch_size x seq_length x d_model
        mask: batch_size x 1 x seq_length
        output: batch_size x seq_length x d_model
        """
        batch = q.size(0)
        q_heads = self._split_heads(self.q_linear(q), batch)
        k_heads = self._split_heads(self.k_linear(k), batch)
        v_heads = self._split_heads(self.v_linear(v), batch)

        # Per-head attention; keep the weights for inspection.
        context, self.attn = attention(q_heads, k_heads, v_heads, mask, self.dropout)

        # batch x heads x seq x d_k -> batch x seq x d_model
        merged = context.transpose(1, 2).contiguous().view(batch, -1, self.d_model)
        return self.out(merged)

In [None]:
class Norm(nn.Module):
    """Normalisation over the last dimension with a learnable gain and bias."""

    def __init__(self, d_model, eps = 1e-6):
        super().__init__()
        self.size = d_model
        # Learnable scale (initialised to 1) and shift (initialised to 0).
        self.alpha = nn.Parameter(torch.ones(self.size))
        self.bias = nn.Parameter(torch.zeros(self.size))
        # Added to the standard deviation to avoid division by zero.
        self.eps = eps

    def forward(self, x):
        """Return alpha * (x - mean) / (std + eps) + bias along the last axis."""
        centered = x - x.mean(dim=-1, keepdim=True)
        spread = x.std(dim=-1, keepdim=True) + self.eps
        return self.alpha * centered / spread + self.bias

In [None]:
class FeedForward(nn.Module):
    """Position-wise feed-forward block: Linear -> ReLU -> Dropout -> Linear."""

    def __init__(self, d_model, d_ff=2048, dropout = 0.1):
        super().__init__()
        # d_ff is the width of the hidden layer (2048 by default).
        self.linear_1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        """x: ... x d_model -> ... x d_model."""
        hidden = F.relu(self.linear_1(x))
        hidden = self.dropout(hidden)
        return self.linear_2(hidden)

In [None]:
class EncoderLayer(nn.Module):
    """One encoder block: pre-norm self-attention and pre-norm feed-forward,
    each wrapped in dropout and a residual connection."""

    def __init__(self, d_model, heads, dropout=0.1):
        super().__init__()
        self.norm_1 = Norm(d_model)
        self.norm_2 = Norm(d_model)
        self.attn = MultiHeadAttention(heads, d_model, dropout=dropout)
        self.ff = FeedForward(d_model, dropout=dropout)
        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)

    def forward(self, x, mask):
        """
        x: batch_size x seq_length x d_model
        mask: batch_size x 1 x seq_length
        output: batch_size x seq_length x d_model
        """
        # Self-attention sub-layer (queries, keys and values are all the
        # normalised input).
        normed = self.norm_1(x)
        attended = self.attn(normed, normed, normed, mask)
        x = x + self.dropout_1(attended)

        # Feed-forward sub-layer.
        transformed = self.ff(self.norm_2(x))
        return x + self.dropout_2(transformed)

In [None]:
class DecoderLayer(nn.Module):
    """One decoder block: masked self-attention, attention over the encoder
    output, and a feed-forward layer - each pre-normalised and wrapped in
    dropout plus a residual connection."""

    def __init__(self, d_model, heads, dropout=0.1):
        super().__init__()
        self.norm_1 = Norm(d_model)
        self.norm_2 = Norm(d_model)
        self.norm_3 = Norm(d_model)

        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)
        self.dropout_3 = nn.Dropout(dropout)

        self.attn_1 = MultiHeadAttention(heads, d_model, dropout=dropout)
        self.attn_2 = MultiHeadAttention(heads, d_model, dropout=dropout)
        self.ff = FeedForward(d_model, dropout=dropout)

    def forward(self, x, e_outputs, src_mask, trg_mask):
        """
        x: batch_size x trg_length x d_model
        e_outputs: batch_size x src_length x d_model
        src_mask: batch_size x 1 x src_length
        trg_mask: mask for the target self-attention (create_masks in this
            notebook produces batch_size x trg_length x trg_length)
        output: batch_size x trg_length x d_model
        """
        # 1) Self-attention over the target prefix.
        normed = self.norm_1(x)
        self_attended = self.attn_1(normed, normed, normed, trg_mask)
        x = x + self.dropout_1(self_attended)

        # 2) Attention over the encoder output (keys and values come from it).
        normed = self.norm_2(x)
        cross_attended = self.attn_2(normed, e_outputs, e_outputs, src_mask)
        x = x + self.dropout_2(cross_attended)

        # 3) Position-wise feed-forward.
        transformed = self.ff(self.norm_3(x))
        return x + self.dropout_3(transformed)

In [None]:
def get_clones(module, N):
    """Return an nn.ModuleList holding N independent deep copies of `module`."""
    clones = nn.ModuleList()
    for _ in range(N):
        clones.append(copy.deepcopy(module))
    return clones

class Encoder(nn.Module):
    """
    Embedding + positional encoding followed by a stack of N encoder layers
    and a final normalisation.
    """
    def __init__(self, vocab_size, d_model, N, heads, dropout):
        super().__init__()
        self.N = N
        self.embed = Embedder(vocab_size, d_model)
        self.pe = PositionalEncoder(d_model, dropout=dropout)
        self.layers = get_clones(EncoderLayer(d_model, heads, dropout), N)
        self.norm = Norm(d_model)

    def forward(self, src, mask):
        """
        src: batch_size x seq_length
        mask: batch_size x 1 x seq_length
        output: batch_size x seq_length x d_model
        """
        hidden = self.pe(self.embed(src))
        # self.layers holds exactly the N layers created in __init__.
        for layer in self.layers:
            hidden = layer(hidden, mask)
        return self.norm(hidden)

In [None]:
class Decoder(nn.Module):
    """
    Embedding + positional encoding followed by a stack of N decoder layers
    and a final normalisation.
    """
    def __init__(self, vocab_size, d_model, N, heads, dropout):
        super().__init__()
        self.N = N
        self.embed = Embedder(vocab_size, d_model)
        self.pe = PositionalEncoder(d_model, dropout=dropout)
        self.layers = get_clones(DecoderLayer(d_model, heads, dropout), N)
        self.norm = Norm(d_model)

    def forward(self, trg, e_outputs, src_mask, trg_mask):
        """
        trg: batch_size x trg_length
        e_outputs: batch_size x src_length x d_model
        src_mask: batch_size x 1 x src_length
        trg_mask: mask for the target self-attention
        output: batch_size x trg_length x d_model
        """
        hidden = self.pe(self.embed(trg))
        # self.layers holds exactly the N layers created in __init__.
        for layer in self.layers:
            hidden = layer(hidden, e_outputs, src_mask, trg_mask)
        return self.norm(hidden)

In [None]:
class Transformer(nn.Module):
    """
    Full encoder-decoder model: Encoder, Decoder and a final linear layer that
    projects decoder states onto the target vocabulary.
    """
    def __init__(self, src_vocab, trg_vocab, d_model, N, heads, dropout):
        """
        src_vocab / trg_vocab: vocabulary sizes
        d_model: model width; N: layers per stack; heads: attention heads
        """
        super().__init__()
        self.encoder = Encoder(src_vocab, d_model, N, heads, dropout)
        self.decoder = Decoder(trg_vocab, d_model, N, heads, dropout)
        self.out = nn.Linear(d_model, trg_vocab)

    def forward(self, src, trg, src_mask, trg_mask):
        """
        src: batch_size x src_length
        trg: batch_size x trg_length
        src_mask: batch_size x 1 x src_length
        trg_mask: mask for the target self-attention
        output: batch_size x trg_length x vocab_size (unnormalised scores)
        """
        e_outputs = self.encoder(src, src_mask)
        d_output = self.decoder(trg, e_outputs, src_mask, trg_mask)
        return self.out(d_output)

    # Declared static because the signature has no `self`: as a plain function
    # in the class body, `model.evaluate(...)` bound the model to
    # `valid_src_data` and failed. `Transformer.evaluate(...)` keeps working.
    @staticmethod
    def evaluate(valid_src_data, valid_trg_data, model, SRC, TRG, device, k, max_strlen):
        """Translate every source sentence and return the tokenised predictions.

        `valid_trg_data` is accepted for signature compatibility but not used.
        Returns a list with one token list (TRG.preprocess output) per sentence.
        """
        pred_sents = [
            translate_sentence(sentence, model, SRC, TRG, device, k, max_strlen)
            for sentence in valid_src_data
        ]
        return [TRG.preprocess(sent) for sent in pred_sents]

In [None]:
def nopeak_mask(size, device):
    """
    Build the "no peeking" mask used by the decoder so that a position cannot
    attend to later positions.

    Returns a bool tensor of shape (1, size, size) on `device`; entry [0, i, j]
    is True when j <= i.
    """
    # Ones strictly above the diagonal mark the future positions ...
    future = torch.triu(torch.ones(1, size, size), diagonal=1)
    # ... and the mask is True everywhere else.
    allowed = (future == 0)
    return allowed.to(device)

def create_masks(src, trg, src_pad, trg_pad, device):
    """
    Create the attention masks for a batch.

    src_mask hides the PAD positions of the source so the model ignores them.
    trg_mask (when `trg` is given) hides both the PAD positions of the target
    and every future position.

    src: batch_size x src_length, trg: batch_size x trg_length or None
    src_pad / trg_pad: padding indices of the two vocabularies
    device: kept for backward compatibility; the look-ahead mask is created on
        trg's own device
    Returns (src_mask, trg_mask):
        src_mask: batch_size x 1 x src_length
        trg_mask: batch_size x trg_length x trg_length, or None
    """
    src_mask = (src != src_pad).unsqueeze(-2)
    if trg is None:
        return src_mask, None

    trg_mask = (trg != trg_pad).unsqueeze(-2)
    # Build the mask directly on trg's device. The old `np_mask.cuda()` call
    # discarded its result, so a mismatching `device` argument made the `&`
    # below fail.
    np_mask = nopeak_mask(trg.size(1), trg.device)
    return src_mask, trg_mask & np_mask

In [None]:
def get_synonym(word, SRC):
    """Return the source-vocabulary index of the first WordNet lemma of `word`
    whose index is not 0, or 0 when no such lemma exists."""
    lemma_indices = (
        SRC.vocab.stoi[lemma.name()]
        for synset in wordnet.synsets(word)
        for lemma in synset.lemmas()
    )
    # First non-zero index in synset/lemma order; 0 is the fallback.
    return next((index for index in lemma_indices if index != 0), 0)

def multiple_replace(dict, text):
  """Replace every occurrence of each key of `dict` in `text` by its value.

  All keys are matched in a single pass (earlier keys win when several could
  match at the same position). The parameter name `dict` shadows the builtin
  and is kept only for compatibility with existing callers.
  """
  # An empty mapping would compile to "()", which matches the empty string
  # everywhere and then fails on the lookup of ''. Nothing to replace.
  if not dict:
    return text

  # Create a regular expression from the (escaped) dictionary keys
  regex = re.compile("(%s)" % "|".join(map(re.escape, dict.keys())))

  # For each match, look up the corresponding value in the dictionary
  return regex.sub(lambda mo: dict[mo.group(0)], text)

In [None]:
def init_vars(src, model, SRC, TRG, device, k, max_len):
    """Compute the tensors needed to start beam search with a trained model.

    Encodes `src`, runs the decoder once on '<sos>' and keeps the k most
    probable first tokens.

    Returns:
        outputs: k x max_len LongTensor on `device`; column 0 is '<sos>',
            column 1 holds the k first tokens, the rest is zero.
        e_outputs: the encoder output of the single source sentence repeated
            k times (k x src_length x d_model) on `device`.
        log_scores: 1 x k tensor with the log-probabilities of the k first
            tokens (built with torch.Tensor(...), i.e. without a device argument).
    """
    init_tok = TRG.vocab.stoi['<sos>']
    src_mask = (src != SRC.vocab.stoi['<pad>']).unsqueeze(-2)
 
    e_output = model.encoder(src, src_mask)
    
    # Decoder input: a batch of one sequence containing only '<sos>'.
    outputs = torch.LongTensor([[init_tok]])
    
    outputs = outputs.to(device)
    
    trg_mask = nopeak_mask(1, device)
    #  predict the first word
    out = model.out(model.decoder(outputs,
    e_output, src_mask, trg_mask))
    out = F.softmax(out, dim=-1)
    
    # k best candidates for the first word and their log-probabilities.
    probs, ix = out[:, -1].data.topk(k)
    log_scores = torch.Tensor([math.log(prob) for prob in probs.data[0]]).unsqueeze(0)
    
    # One row per beam.
    outputs = torch.zeros(k, max_len).long()
    outputs = outputs.to(device)
    outputs[:, 0] = init_tok
    outputs[:, 1] = ix[0]
    
    # Every beam attends to the same encoded source sentence.
    e_outputs = torch.zeros(k, e_output.size(-2),e_output.size(-1))
   
    e_outputs = e_outputs.to(device)
    e_outputs[:, :] = e_output[0]
    
    return outputs, e_outputs, log_scores

def k_best_outputs(outputs, out, log_scores, i, k):
    """Advance the beams by one position.

    outputs: k x max_len token matrix (modified in place and returned)
    out: decoder probabilities for the current prefixes (softmax already applied)
    log_scores: 1 x k cumulative log-probabilities of the current beams
    i: column to fill; k: beam width

    Returns (outputs, log_scores) for the k best extended hypotheses.
    """
    
    # k best next tokens for each beam, scored by beam score + token log-prob.
    probs, ix = out[:, -1].data.topk(k)
    log_probs = torch.Tensor([math.log(p) for p in probs.data.view(-1)]).view(k, -1) + log_scores.transpose(0,1)
    # Best k of the k*k candidates, as indices into the flattened matrix.
    k_probs, k_ix = log_probs.view(-1).topk(k)
    
    # Recover (beam, candidate) coordinates from the flat index.
    # NOTE(review): the captured notebook output shows a PyTorch deprecation
    # warning for floor division on tensors, apparently triggered here.
    row = k_ix // k
    col = k_ix % k

    # Copy the surviving prefixes and append the chosen tokens.
    outputs[:, :i] = outputs[row, :i]
    outputs[:, i] = ix[row, col]

    log_scores = k_probs.unsqueeze(0)
    
    return outputs, log_scores

def beam_search(src, model, SRC, TRG, device, k, max_len):
    """Translate one encoded source sentence with beam search.

    src: 1 x src_length LongTensor of source indices
    model: trained Transformer; SRC / TRG: fields holding the vocabularies
    device: device used for the intermediate tensors
    k: beam width; max_len: maximum number of target positions (incl. '<sos>')

    Returns the best hypothesis as a space-joined string, without '<sos>' and
    without '<eos>' and anything after it.
    """
    eos_tok = TRG.vocab.stoi['<eos>']
    src_mask = (src != SRC.vocab.stoi['<pad>']).unsqueeze(-2)
    ind = None

    # Pure inference: no autograd graph is needed.
    with torch.no_grad():
        outputs, e_outputs, log_scores = init_vars(src, model, SRC, TRG, device, k, max_len)

        for i in range(2, max_len):
            trg_mask = nopeak_mask(i, device)

            out = model.out(model.decoder(outputs[:, :i],
                                          e_outputs, src_mask, trg_mask))
            out = F.softmax(out, dim=-1)

            outputs, log_scores = k_best_outputs(outputs, out, log_scores, i, k)

            # (beam, position) of every end symbol currently in the beams.
            eos_positions = (outputs == eos_tok).nonzero()
            # Created on the beams' own device instead of a hard-coded .cuda(),
            # so decoding also works on CPU-only machines.
            sentence_lengths = torch.zeros(len(outputs), dtype=torch.long, device=outputs.device)
            for vec in eos_positions:
                # `beam`, not `i`: the old loop reused the outer loop variable.
                beam = vec[0]
                if sentence_lengths[beam] == 0:  # first end symbol of this beam
                    sentence_lengths[beam] = vec[1]

            num_finished_sentences = int((sentence_lengths > 0).sum())

            if num_finished_sentences == k:
                # All beams ended: pick the best length-normalised score.
                alpha = 0.7
                div = 1 / (sentence_lengths.type_as(log_scores) ** alpha)
                _, ind = torch.max(log_scores * div, 1)
                ind = ind.data[0]
                break

    if ind is None:
        # Not every beam finished within max_len: fall back to beam 0.
        eos_hits = (outputs[0] == eos_tok).nonzero()
        # Without an end symbol keep every generated token; the previous -1
        # sentinel silently dropped the last one.
        length = int(eos_hits[0, 0]) if len(eos_hits) > 0 else outputs.size(1)
        return ' '.join([TRG.vocab.itos[tok] for tok in outputs[0][1:length]])

    length = int((outputs[ind] == eos_tok).nonzero()[0, 0])
    return ' '.join([TRG.vocab.itos[tok] for tok in outputs[ind][1:length]])

In [None]:
class ScheduledOptim():
    '''Wraps an optimizer and sets its learning rate before every step
    (linear warm-up followed by inverse square-root decay).'''

    # Scalar attributes saved by state_dict / restored by load_state_dict.
    _SCALARS = ('init_lr', 'd_model', 'n_warmup_steps', 'n_steps')

    def __init__(self, optimizer, init_lr, d_model, n_warmup_steps):
        self._optimizer = optimizer
        self.init_lr = init_lr
        self.d_model = d_model
        self.n_warmup_steps = n_warmup_steps
        self.n_steps = 0

    def step_and_update_lr(self):
        "Update the learning rate, then step the inner optimizer"
        self._update_learning_rate()
        self._optimizer.step()

    def zero_grad(self):
        "Clear the gradients held by the inner optimizer"
        self._optimizer.zero_grad()

    def _get_lr_scale(self):
        "d_model^-0.5 * min(step^-0.5, step * warmup^-1.5)"
        decay = self.n_steps ** (-0.5)
        warmup = self.n_steps * self.n_warmup_steps ** (-1.5)
        return (self.d_model ** -0.5) * min(decay, warmup)

    def state_dict(self):
        "Schedule settings plus the inner optimizer's state, as one dict"
        state = {name: getattr(self, name) for name in self._SCALARS}
        state['_optimizer'] = self._optimizer.state_dict()
        return state

    def load_state_dict(self, state_dict):
        "Restore what state_dict() produced"
        for name in self._SCALARS:
            setattr(self, name, state_dict[name])
        self._optimizer.load_state_dict(state_dict['_optimizer'])

    def _update_learning_rate(self):
        ''' Count one more step and write the new rate to every param group '''
        self.n_steps += 1
        new_lr = self.init_lr * self._get_lr_scale()
        for group in self._optimizer.param_groups:
            group['lr'] = new_lr

In [None]:
class LabelSmoothingLoss(nn.Module):
    """Cross-entropy against a label-smoothed target distribution.

    The true class receives `1 - smoothing`; `smoothing` is spread over
    `classes - 2` entries; the padding column and every row whose target is
    the padding index are zeroed.
    """

    def __init__(self, classes, padding_idx, smoothing=0.0, dim=-1):
        super(LabelSmoothingLoss, self).__init__()
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing
        self.cls = classes
        self.dim = dim
        self.padding_idx = padding_idx

    def forward(self, pred, target):
        """
        pred: (positions x classes) unnormalised scores
        target: (positions,) class indices
        Returns the mean over all positions of -sum(true_dist * log_softmax(pred)).
        """
        log_probs = pred.log_softmax(dim=self.dim)
        with torch.no_grad():
            # Smoothing mass everywhere, confidence on the gold class.
            true_dist = torch.full_like(log_probs, self.smoothing / (self.cls - 2))
            true_dist.scatter_(1, target.data.unsqueeze(1), self.confidence)
            # Never assign probability to the padding symbol ...
            true_dist[:, self.padding_idx] = 0
            # ... and ignore positions whose target is padding.
            is_pad = target.data == self.padding_idx
            true_dist.masked_fill_(is_pad.unsqueeze(1), 0.0)

        per_position = torch.sum(-true_dist * log_probs, dim=self.dim)
        return torch.mean(per_position)

In [None]:
# Indices of the '<pad>' token in each vocabulary; train() passes them to
# create_masks and uses trg_pad as the padding_idx of the loss.
src_pad = SRC.vocab.stoi['<pad>']
trg_pad = TRG.vocab.stoi['<pad>']

#**Predict**

In [None]:
def translate_sentence(sentence, model, SRC, TRG, device, k, max_len):
    """
    Translate one raw source sentence with beam search.

    sentence: source text; model: trained Transformer (switched to eval mode)
    SRC / TRG: fields with built vocabularies; device: where to decode
    k: beam width; max_len: maximum number of target positions

    Source words missing from the vocabulary are replaced by the index of a
    WordNet synonym when one is known (see get_synonym).
    Returns the translated string ('' for an input without tokens).
    """
    model.eval()
    tokens = SRC.preprocess(sentence)
    if not tokens:
        # Nothing to encode; an empty source tensor cannot be decoded.
        return ''

    # Compare against the real unknown-word index. The previous check used
    # stoi['<eos>'], which only worked because '<eos>' is absent from the
    # source vocabulary and therefore mapped to the unknown index too.
    unk_idx = SRC.vocab.stoi[SRC.unk_token]
    indexed = []
    for tok in tokens:
        tok_idx = SRC.vocab.stoi[tok]
        if tok_idx != unk_idx:
            indexed.append(tok_idx)
        else:
            indexed.append(get_synonym(tok, SRC))

    batch = torch.LongTensor([indexed]).to(device)
    with torch.no_grad():
        translation = beam_search(batch, model, SRC, TRG, device, k, max_len)

    # Re-attach punctuation that tokenisation separated from the previous word.
    return  multiple_replace({' ?' : '?',' !':'!',' .':'.','\' ':'\'',' ,':','}, translation)

# **Train**

In [None]:
def train():
    """Train a Transformer on train_iter for EPOCH epochs and save it to MODEL_NAME.

    Reads the notebook-level SRC, TRG, opt, DEVICE, train_iter, src_pad and
    trg_pad. Prints the batch loss every 200 steps and the summed loss of each
    epoch, then saves the whole model object with torch.save.
    """
    model = Transformer(len(SRC.vocab), len(TRG.vocab), opt['d_model'], opt['n_layers'], opt['heads'], opt['dropout'])
    # Xavier initialisation for every weight matrix (biases and gains keep
    # their defaults).
    for p in model.parameters():
      if p.dim() > 1:
        nn.init.xavier_uniform_(p)
    model = model.to(DEVICE)
    optimizer = ScheduledOptim(
        torch.optim.Adam(model.parameters(), betas=(0.9, 0.98), eps=1e-09),
        0.2, opt['d_model'], 4000)
    criterion = LabelSmoothingLoss(len(TRG.vocab), padding_idx=trg_pad, smoothing=0.1)
    for epoch in range(EPOCH):
        model.train()
        loss = 0
        for step, batch in enumerate(train_iter):
            # torchtext batches are (seq, batch); the model expects (batch, seq).
            # .to(DEVICE) instead of .cuda() so CPU-only runs work too.
            src = batch.src.transpose(0,1).to(DEVICE)
            trg = batch.trg.transpose(0,1).to(DEVICE)
            # Teacher forcing: feed all but the last token, predict all but the first.
            trg_input = trg[:, :-1]
            src_mask, trg_mask = create_masks(src, trg_input, src_pad, trg_pad, DEVICE)
            preds = model(src, trg_input, src_mask, trg_mask)
            ys = trg[:, 1:].contiguous().view(-1)
            optimizer.zero_grad()

            batchloss = criterion(preds.view(-1, preds.size(-1)), ys)
            batchloss.backward()
            optimizer.step_and_update_lr()
            loss = loss + batchloss.item()
            if step % 200 == 0:
                print("step {}, batchloss = {}".format(step, batchloss.item()))
        print("Epoch {} with loss = {}".format(epoch, loss))
    torch.save(model, MODEL_NAME)

In [None]:
train()

step 0, batchloss = 9.724188804626465
step 200, batchloss = 9.25448989868164
step 400, batchloss = 8.050572395324707
step 600, batchloss = 6.950458526611328
step 800, batchloss = 6.637948036193848
step 1000, batchloss = 6.522841453552246
step 1200, batchloss = 6.379954814910889
step 1400, batchloss = 6.058230876922607
step 1600, batchloss = 5.565131187438965
step 1800, batchloss = 5.842733383178711
step 2000, batchloss = 4.962292194366455
step 2200, batchloss = 5.184659481048584
step 2400, batchloss = 5.210153102874756
Epoch 0 with loss = 16199.504356503487
step 0, batchloss = 5.459299564361572
step 200, batchloss = 5.389187335968018
step 400, batchloss = 5.249549388885498
step 600, batchloss = 4.884586811065674
step 800, batchloss = 4.963166236877441
step 1000, batchloss = 4.6999311447143555
step 1200, batchloss = 4.738482475280762
step 1400, batchloss = 4.971249103546143
step 1600, batchloss = 4.57490348815918
step 1800, batchloss = 4.358055114746094
step 2000, batchloss = 4.89203929

# **Test**

In [None]:
from torchtext.data.metrics import bleu_score
def test():
  """Translate the validation set with the saved model and print the BLEU score.

  For every validation pair, prints the tokenised input, the tokenised
  reference and the model translation, then the corpus size and BLEU.
  """
  # NOTE(review): torch.load unpickles arbitrary objects - only load
  # checkpoints produced by this notebook. map_location lets a model trained
  # on GPU be evaluated on a CPU-only machine.
  model = torch.load(MODEL_NAME, map_location=DEVICE)
  model.eval()
  pred_sents = []
  trg_sents = []
  for i in range(len(valid_trg_data)):
    # Reuse the fields' tokenizers rather than loading two spaCy models for
    # every sentence.
    en = SRC.preprocess(valid_src_data[i])
    vi = TRG.preprocess(valid_trg_data[i])
    print ('INPUT', en)
    print ('REF', vi)
    pred_trg = translate_sentence(valid_src_data[i], model, SRC, TRG, DEVICE, opt['k'], opt['max_strlen'])
    pred_sents.append(pred_trg)
    # References get the same tokenisation as the hypotheses. Splitting the
    # raw text on whitespace kept the casing and glued punctuation to words,
    # so matching n-grams were missed.
    trg_sents.append([vi])
    print ('MT', [pred_trg.split()])
  pred_sents = [TRG.preprocess(sent) for sent in pred_sents]
  bleu = bleu_score(pred_sents, trg_sents)
  print ('total: ', len(valid_trg_data))
  print ('bleu:', bleu)
test()

INPUT ['when', 'i', 'was', 'little', ',', 'i', 'thought', 'my', 'country', 'was', 'the', 'best', 'on', 'the', 'planet', ',', 'and', 'i', 'grew', 'up', 'singing', 'a', 'song', 'called', '&', 'quot', 'nothing', 'to', 'envy', '.', '&', 'quot']
REF ['khi', 'tôi', 'còn', 'nhỏ', ',', 'tôi', 'nghĩ', 'rằng', 'bắctriều', 'tiên', 'là', 'đất', 'nước', 'tốt', 'nhất', 'trên', 'thế', 'giới', 'và', 'tôi', 'thường', 'hát', 'bài', '&', 'quot', 'chúng', 'ta', 'chẳng', 'có', 'gì', 'phải', 'ghen', 'tị', '.', '&', 'quot']


To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at  /pytorch/aten/src/ATen/native/BinaryOps.cpp:467.)
  return torch.floor_divide(self, other)


MT [['khi', 'tôi', 'còn', 'nhỏ,', 'tôi', 'nghĩ', 'đất', 'nước', 'của', 'tôi', 'là', 'nơi', 'tốt', 'nhất', 'trên', 'hành', 'tinh,', 'và', 'tôi', 'lớn', 'lên', 'hát', 'một', 'bài', 'hát', 'có', 'tên', 'là', '&', 'quot', 'không', 'có', 'gì', 'phải', 'ghen', 'tị.', '&', 'quot']]
INPUT ['and', 'i', 'was', 'very', 'proud', '.']
REF ['tôi', 'đã', 'rất', 'tự', 'hào', 'về', 'đất', 'nước', 'tôi', '.']
MT [['và', 'tôi', 'rất', 'tự', 'hào.']]
INPUT ['in', 'school', ',', 'we', 'spent', 'a', 'lot', 'of', 'time', 'studying', 'the', 'history', 'of', 'kim', 'il', 'sung', ',', 'but', 'we', 'never', 'learned', 'much', 'about', 'the', 'outside', 'world', ',', 'except', 'that', 'america', ',', 'south', 'korea', ',', 'japan', 'are', 'the', 'enemies', '.']
REF ['ở', 'trường', ',', 'chúng', 'tôi', 'dành', 'rất', 'nhiều', 'thời', 'gian', 'để', 'học', 'về', 'cuộc', 'đời', 'của', 'chủ', 'tịch', 'kim', 'ii', 'sung', ',', 'nhưng', 'lại', 'không', 'học', 'nhiều', 'về', 'thế', 'giới', 'bên', 'ngoài', ',', 'ngoại', '