In [32]:
# refer: github.com/pbcquoc
import numpy as np
import datasets
from torch.utils.data import Dataset, DataLoader
import torch, os, math, copy
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

# Positional Encoder
(This trick is a large part of what makes the Transformer and its variants work so well, but how does it work?)

In [2]:
class Embedder(nn.Module):
    """Learned lookup table that turns integer token ids into dense vectors.

    Parameters
    ----------
    vocab_size: number of rows in the embedding table
    d_model: size of each embedding vector
    """

    def __init__(self, vocab_size, d_model):
        super().__init__()
        self.vocab_size, self.d_model = vocab_size, d_model
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=d_model)

    def forward(self, x):
        """Look up the embedding of every token id in ``x``.

        The result has the shape of ``x`` with a trailing axis of size
        ``d_model`` appended.
        """
        return self.embedding(x)

class PosisionalEncoder(nn.Module):
    """Scale token embeddings and add fixed sinusoidal positional encodings.

    The table follows "Attention Is All You Need":

        PE(pos, 2k)   = sin(pos / 10000 ** (2k / d_model))
        PE(pos, 2k+1) = cos(pos / 10000 ** (2k / d_model))

    Parameters
    ----------
    d_model: size of the embedding vectors
    max_seq_len: number of positions to precompute
    dropout: dropout probability applied to the sum of embedding and encoding
    """

    def __init__(self, d_model=768, max_seq_len=256, dropout=0.1):
        super(PosisionalEncoder, self).__init__()
        self.d_model = d_model
        self.max_seq_len = max_seq_len
        self.dropout = nn.Dropout(dropout)

        # Computed in float64 and stored as float32.
        position = torch.arange(max_seq_len, dtype=torch.float64).unsqueeze(1)
        # `even_dims` already holds 2k (0, 2, 4, ...), so the exponent is
        # even_dims / d_model -- it must NOT be multiplied by 2 a second time.
        even_dims = torch.arange(0, d_model, 2, dtype=torch.float64)
        angle = position / torch.pow(10000.0, even_dims / d_model)

        pe = torch.zeros(max_seq_len, d_model)
        pe[:, 0::2] = torch.sin(angle)
        # An odd d_model has one cosine column fewer than sine columns.
        pe[:, 1::2] = torch.cos(angle[:, : d_model // 2])

        # A buffer is saved with the module and moved by .to()/.cuda(), but it is
        # not a parameter, so the optimizer never updates it.
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return ``dropout(x * sqrt(d_model) + pe[:, :seq_len])``.

        Parameters
        ----------
        x: tensor shape `(batch_size, seq_len, d_model)`

        Raises
        ------
        ValueError: if `seq_len` is larger than `max_seq_len`
        """
        seq_len = x.size(1)
        if seq_len > self.max_seq_len:
            raise ValueError(
                "sequence length %d exceeds max_seq_len %d" % (seq_len, self.max_seq_len))
        # Scale the embeddings up so the added encoding does not dominate them.
        x = x * math.sqrt(self.d_model)
        # The buffer already lives on the module's device; no manual transfer needed.
        return self.dropout(x + self.pe[:, :seq_len])


PosisionalEncoder(512)(torch.rand(5, 30, 512)).shape

torch.Size([5, 30, 512])

# Multihead Attention operator
(awesome feature extractor)

In [3]:
class MultiheadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    Parameters
    ----------
    n_heads: number of attention heads; must divide `d_model`
    d_model: model (embedding) dimension
    dropout: dropout probability applied to the attention weights;
        a falsy value (`None` or `0`) disables attention dropout
    """

    def __init__(self, n_heads, d_model, dropout=None):
        super(MultiheadAttention, self).__init__()
        assert d_model % n_heads == 0

        self.d_model = d_model
        self.n_heads = n_heads
        self.d_k = d_model // n_heads
        self.dropout = nn.Dropout(dropout) if dropout else None

        # projection matrices for query, key and value
        self.q_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)

        # final projection applied to the concatenated heads
        self.out = nn.Linear(d_model, d_model)

    def _split_heads(self, x, batch_size):
        """Reshape `(batch, len, d_model)` into `(batch, n_heads, len, d_k)`."""
        return x.view(batch_size, -1, self.n_heads, self.d_k).transpose(1, 2)

    def forward(self, q, k, v, mask=None):
        """
        Parameters:
        -----------
        q: tensor shape `(batch_size, q_len, d_model)`
        k: tensor shape `(batch_size, k_len, d_model)`
        v: tensor shape `(batch_size, k_len, d_model)`
        mask: optional tensor broadcastable to `(batch_size, q_len, k_len)`,
            e.g. `(batch_size, 1, k_len)`; positions equal to 0 are masked out
        Return:
        -------
        output: tensor shape `(batch_size, q_len, d_model)`
        """
        batch_size = q.size(0)
        q = self._split_heads(self.q_linear(q), batch_size)
        k = self._split_heads(self.k_linear(k), batch_size)
        v = self._split_heads(self.v_linear(v), batch_size)

        # scaled dot-product scores: (batch, n_heads, q_len, k_len)
        score = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k)
        if mask is not None:
            # add a head axis so the same mask is shared by every head
            mask = mask.unsqueeze(1)
            score = score.masked_fill(mask == 0, -1e9)
        score = F.softmax(score, -1)
        if self.dropout is not None:
            # Dropout has to act on the attention weights that are multiplied
            # with `v`; the result used to be stored in `output` and then
            # overwritten, so it had no effect.
            score = self.dropout(score)

        output = torch.matmul(score, v)
        # merge the heads back: (batch, q_len, d_model)
        output = output.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)
        return self.out(output)


MultiheadAttention(8, 512, 0.1)(torch.rand(8, 30, 512), torch.rand(8, 30, 512), torch.rand(8, 30, 512)).shape

torch.Size([8, 30, 512])

# Residual connection and Layer normalization
(faster convergence, and avoids losing information)

In [4]:
class Norm(nn.Module):
    """Normalisation over the last axis with a learnable scale and shift.

    Each vector along the last axis is centred on its mean, divided by
    `std + eps` (the default `torch.Tensor.std`), multiplied by `alpha`
    and shifted by `bias`.
    """

    def __init__(self, d_model, eps=1e-6):
        super().__init__()
        # learnable scale (starts at 1) and shift (starts at 0)
        self.alpha = nn.Parameter(torch.ones(d_model))
        self.bias = nn.Parameter(torch.zeros(d_model))
        # added to the standard deviation to avoid a division by zero
        self.eps = eps

    def forward(self, x):
        """Normalise `x` along its last axis; the shape is unchanged."""
        mean = x.mean(dim=-1, keepdim=True)
        spread = x.std(dim=-1, keepdim=True) + self.eps
        scaled = self.alpha * (x - mean) / spread
        return scaled + self.bias


Norm(512)(torch.randn(8, 128, 512)).shape

torch.Size([8, 128, 512])

In [5]:
class FeedForward(nn.Module):
    """Position-wise feed-forward network: Linear -> ReLU -> Dropout -> Linear.

    Parameters
    ----------
    d_model: input and output feature size
    d_ff: size of the hidden layer
    dropout: dropout probability applied after the ReLU
    """

    def __init__(self, d_model=512, d_ff=2048, dropout=0.1):
        super().__init__()
        self.d_model, self.d_ff = d_model, d_ff
        expand = nn.Linear(d_model, d_ff)
        project = nn.Linear(d_ff, d_model)
        self.ff = nn.Sequential(expand, nn.ReLU(), nn.Dropout(dropout), project)

    def forward(self, x):
        """Apply the network to the last axis of `x`; the shape is preserved."""
        return self.ff(x)


FeedForward()(torch.randn(8,128,512)).shape

torch.Size([8, 128, 512])

# Encoder, Decoder block

In [7]:
class EncoderBlock(nn.Module):
    """One pre-norm encoder layer: self-attention then feed-forward.

    Each sub-layer is wrapped as `x + dropout(sublayer(norm(x)))`.
    """

    def __init__(self, d_model, n_heads, d_ff, dropout=0.1):
        super(EncoderBlock, self).__init__()
        self.norm_1 = Norm(d_model)
        self.norm_2 = Norm(d_model)
        self.attn = MultiheadAttention(n_heads, d_model, dropout)
        self.ff = FeedForward(d_model, d_ff, dropout)
        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)

    def forward(self, x, mask):
        """
        Parameters:
        -----------
        x: tensor shape `(batch_size, seq_len, model_dim)`
        mask: self-attention mask passed to the attention layer unchanged,
            shape `(batch_size, 1, seq_len)`; positions equal to 0 are masked
        Return:
        -------
        out: tensor shape `(batch_size, seq_len, model_dim)`
        """
        # self-attention sub-layer with residual connection
        x_norm = self.norm_1(x)
        x = x + self.dropout_1(self.attn(x_norm, x_norm, x_norm, mask))

        # feed-forward sub-layer; the residual `x +` was missing here
        # (`x = x = ...`), which discarded the attention output path.
        x_norm = self.norm_2(x)
        x = x + self.dropout_2(self.ff(x_norm))
        return x
# Shape check for a single encoder block.
net = EncoderBlock(d_model=512, n_heads=8, d_ff=2048)
net(
    torch.randn(8, 30, 512),  # x: (batch_size, seq_len, d_model)
    torch.randn(8, 1, 30),    # mask: random floats are practically never 0, so nothing is masked here
).shape

torch.Size([8, 30, 512])

In [8]:
class DecoderBlock(nn.Module):
    """One pre-norm decoder layer.

    Three sub-layers, each wrapped as `x + dropout(sublayer(norm(x)))`:
    masked self-attention, attention over the encoder output, feed-forward.
    """

    def __init__(self, d_model, n_heads, d_ff, dropout=0.1):
        super().__init__()
        # one normalisation per sub-layer
        self.norm_1 = Norm(d_model)
        self.norm_2 = Norm(d_model)
        self.norm_3 = Norm(d_model)

        # attn_1: self-attention over the target, attn_2: attention over the encoder output
        self.attn_1 = MultiheadAttention(n_heads, d_model, dropout)
        self.attn_2 = MultiheadAttention(n_heads, d_model, dropout)
        self.ff = FeedForward(d_model, d_ff, dropout)

        # one dropout per residual branch
        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)
        self.dropout_3 = nn.Dropout(dropout)

    def forward(self, x, encoder_output, src_mask, tgt_mask):
        """
        Parameters:
        -----------
        x: target-side representation,
            shape `(batch_size, tgt_len, d_model)`
        encoder_output: contextual embedding produced by the encoder,
            shape `(batch_size, src_len, d_model)`
        src_mask: mask over encoder positions, used by the second attention,
            shape `(batch_size, 1, src_len)`
        tgt_mask: mask for the target self-attention, used to hide future
            positions; must broadcast to `(batch_size, tgt_len, tgt_len)`
        Return:
        -------
        out: tensor shape `(batch_size, tgt_len, d_model)`
        """
        # 1) self-attention over the target sequence
        queries = self.norm_1(x)
        self_attended = self.attn_1(queries, queries, queries, tgt_mask)
        x = x + self.dropout_1(self_attended)

        # 2) every target position attends over all encoder positions
        queries = self.norm_2(x)
        cross_attended = self.attn_2(queries, encoder_output, encoder_output, src_mask)
        x = x + self.dropout_2(cross_attended)

        # 3) position-wise feed-forward
        transformed = self.ff(self.norm_3(x))
        return x + self.dropout_3(transformed)
# Shape check for a single decoder block.
net = DecoderBlock(d_model=512, n_heads=8, d_ff=2048)
net(
    torch.randn(8, 30, 512),  # x: target-side input
    torch.randn(8, 30, 512),  # encoder_output
    torch.randn(8, 1, 30),    # src_mask (random floats: effectively masks nothing)
    torch.randn(8, 1, 30),    # tgt_mask (random floats: effectively masks nothing)
).shape

torch.Size([8, 30, 512])

# Build Transformers

In [9]:
def get_clones(module, N):
    """Return an `nn.ModuleList` holding `N` independent deep copies of `module`."""
    clones = nn.ModuleList()
    for _ in range(N):
        clones.append(copy.deepcopy(module))
    return clones

In [10]:
class Encoder(nn.Module):
    """Transformer encoder: embedding, positional encoding, `num_layer`
    encoder blocks and a final normalisation."""

    def __init__(self, vocab_size, max_seq_len, d_model, n_heads, d_ff, num_layer, dropout=0.1):
        super().__init__()
        self.N = num_layer
        self.embed = Embedder(vocab_size, d_model)
        self.pe = PosisionalEncoder(d_model, max_seq_len, dropout)
        # one block is built and then deep-copied `num_layer` times
        prototype = EncoderBlock(d_model, n_heads, d_ff, dropout)
        self.layers = get_clones(prototype, num_layer)
        self.norm = Norm(d_model)

    def forward(self, x, mask):
        """
        Parameters:
        -----------
        x: tensor of token indices of the input sentences,
            shape `(batch_size, seq_len)`
        mask: tensor handed to every encoder block,
            shape `(batch_size, 1, seq_len)`
        Return:
        -------
        out: tensor, shape `(batch_size, seq_len, d_model)`
        """
        hidden = self.pe(self.embed(x))
        for depth in range(self.N):
            hidden = self.layers[depth](hidden, mask)
        return self.norm(hidden)
# Shape check for the full encoder stack.
en_vocab_size = 256
max_seq_len = 30
d_model = 512
n_heads = 8
d_ff = 2048
num_layer = 6

net = Encoder(
    vocab_size=en_vocab_size,
    max_seq_len=max_seq_len,
    d_model=d_model,
    n_heads=n_heads,
    d_ff=d_ff,
    num_layer=num_layer,
)
net(
    torch.LongTensor(8, max_seq_len).random_(0, en_vocab_size),  # random token ids
    torch.rand(8, 1, max_seq_len),                               # mask
).shape

torch.Size([8, 30, 512])

In [11]:
class Decoder(nn.Module):
    """Transformer decoder: embedding, positional encoding, `num_layer`
    decoder blocks and a final normalisation."""

    def __init__(self, vocab_size, max_seq_len, d_model, n_heads, d_ff, num_layer, dropout=0.1):
        super().__init__()
        self.N = num_layer
        self.embed = Embedder(vocab_size, d_model)
        self.pe = PosisionalEncoder(d_model, max_seq_len, dropout)
        # one block is built and then deep-copied `num_layer` times
        prototype = DecoderBlock(d_model, n_heads, d_ff, dropout)
        self.layers = get_clones(prototype, num_layer)
        self.norm = Norm(d_model)

    def forward(self, x, encoder_output, src_mask, tgt_mask):
        """
        Parameters:
        -----------
        x: tensor of token indices of the target sentences,
            shape `(batch_size, tgt_len)`
        encoder_output: contextual embedding of the source sentences,
            shape `(batch_size, src_len, d_model)`
        src_mask: mask over encoder positions, shape `(batch_size, 1, src_len)`
        tgt_mask: mask for the target self-attention; must broadcast to
            `(batch_size, tgt_len, tgt_len)`
        Return:
        -------
        out: contextual embedding of the target sentences,
            shape `(batch_size, tgt_len, d_model)`
        """
        hidden = self.pe(self.embed(x))
        for depth in range(self.N):
            hidden = self.layers[depth](hidden, encoder_output, src_mask, tgt_mask)
        return self.norm(hidden)
# Shape check for the full decoder stack.
de_vocab_size = 256
max_seq_len = 30
d_model = 512
n_heads = 8
d_ff = 2048
num_layer = 6

net = Decoder(
    vocab_size=de_vocab_size,
    max_seq_len=max_seq_len,
    d_model=d_model,
    n_heads=n_heads,
    d_ff=d_ff,
    num_layer=num_layer,
)
net(
    torch.LongTensor(8, max_seq_len).random_(0, de_vocab_size),  # random target token ids
    torch.rand(8, max_seq_len, d_model),                         # stand-in encoder output
    torch.randn(8, 1, max_seq_len),                              # src_mask
    torch.randn(8, 1, max_seq_len),                              # tgt_mask
).shape

torch.Size([8, 30, 512])

In [12]:
class Transformer(nn.Module):
    """Encoder-decoder Transformer with a linear projection onto the target vocabulary.

    Parameters
    ----------
    en_config: keyword arguments for `Encoder`
    de_config: keyword arguments for `Decoder`; its `d_model` and
        `vocab_size` entries also size the output projection `fc`
    """

    def __init__(self, en_config, de_config):
        super().__init__()
        self.encoder = Encoder(**en_config)
        self.decoder = Decoder(**de_config)
        self.fc = nn.Linear(in_features=de_config["d_model"],
                            out_features=de_config["vocab_size"])

    def forward(self, src_sent, tgt_sent, src_mask, tgt_mask):
        """Encode the source, decode the target against it and project the result.

        Returns the raw output of `fc` (no softmax is applied), with one score
        per target-vocabulary entry along the last axis.
        """
        memory = self.encoder(src_sent, src_mask)
        hidden = self.decoder(tgt_sent, memory, src_mask, tgt_mask)
        return self.fc(hidden)

In [13]:
# Hyper-parameters of the encoder (source side).
en_config = dict(
    vocab_size=256,
    max_seq_len=30,
    d_model=512,
    n_heads=8,
    d_ff=2048,
    num_layer=6,
)
# The decoder (target side) differs only in vocabulary size and maximum length.
de_config = dict(en_config, vocab_size=128, max_seq_len=18)

In [14]:
# End-to-end shape check of the full model on random token ids.
batch_size = 8
en_seq_len, en_vocab_size = en_config["max_seq_len"], en_config["vocab_size"]
de_seq_len, de_vocab_size = de_config["max_seq_len"], de_config["vocab_size"]

net = Transformer(en_config, de_config)
# number of trainable parameters
print(sum(param.numel() for param in net.parameters() if param.requires_grad))

# NOTE: despite its name, `prob_map` holds the raw outputs of the final linear
# layer -- Transformer.forward applies no softmax.
prob_map = net(
    src_sent=torch.LongTensor(batch_size, en_seq_len).random_(0, en_vocab_size),
    tgt_sent=torch.LongTensor(batch_size, de_seq_len).random_(0, de_vocab_size),
    src_mask=torch.randn(batch_size, 1, en_seq_len),
    tgt_mask=torch.randn(batch_size, 1, de_seq_len),
)
prob_map.shape

44402816


torch.Size([8, 18, 128])

In [15]:
# Output scores over the target vocabulary for the first target position of the first sample.
prob_map[0, 0]

tensor([-0.9755,  0.2224,  0.9287,  0.0244,  0.2923, -0.2626,  0.1573,  0.2836,
        -0.1178,  0.0238, -0.1071, -0.2879, -0.8419,  0.3327, -1.4278, -0.0689,
        -0.3479, -0.0578,  0.8049,  0.0503, -0.2636, -0.1771, -0.2160, -0.1603,
        -0.8581,  0.7816, -0.2043, -0.2255,  0.1166,  0.2672, -0.4785,  1.2773,
        -0.5677, -0.8595, -0.6009, -0.3740,  0.5364, -1.1635, -1.0674,  0.2244,
         0.5846, -1.0198, -0.5836, -0.1003,  0.3574, -0.5686, -0.5007, -0.6308,
        -0.5512,  0.1604, -0.8893, -0.8032, -0.3839, -0.1801, -0.9138, -0.5978,
        -0.1915, -0.7484, -0.9814,  0.3790,  0.4067, -0.2287, -0.6264, -0.2831,
         0.7580,  1.2947, -0.4937, -1.1627, -0.1759, -0.0814,  0.0233, -0.3918,
         0.5818, -0.3665,  0.1917,  0.1112, -0.1125,  0.8809, -0.3027,  1.2463,
         0.9553,  1.2005, -0.0825, -0.7323, -0.7041,  0.3189, -0.5870,  0.7770,
         0.7898, -0.0901,  0.0709,  0.2313, -1.0618,  0.0791, -0.2431, -0.0104,
         0.6020,  0.1511, -0.4289,  0.09

# Load dataset

In [None]:
import datasets

# Load the "nam194/vietnews" dataset through `datasets.load_dataset`
# and display the returned object.
dataset = datasets.load_dataset(path="nam194/vietnews")
dataset

In [68]:
from torchtext.data import Iterator

class MyIterator(Iterator):
    """Iterator that groups examples of similar `sort_key` into batches."""

    def create_batches(self):
        """Populate `self.batches`.

        Training: examples are read in chunks of `batch_size * 100`, each chunk
        is sorted by `sort_key` and cut into batches, and the batches of a chunk
        are shuffled with `random_shuffler`. The result is a lazy generator.

        Otherwise: examples are batched in their given order and every batch is
        sorted by `sort_key`; the result is a list.
        """
        # The body used `data.batch`, but `data` was never imported (only
        # `Iterator` is), which raised NameError. Import the helper locally.
        from torchtext.data import batch as make_batches

        if self.train:
            def pool(examples, random_shuffler):
                for chunk in make_batches(examples, self.batch_size * 100):
                    chunk_batches = make_batches(
                        sorted(chunk, key=self.sort_key),
                        self.batch_size, self.batch_size_fn)
                    for b in random_shuffler(list(chunk_batches)):
                        yield b

            self.batches = pool(self.data(), self.random_shuffler)

        else:
            self.batches = []
            for b in make_batches(self.data(), self.batch_size,
                                  self.batch_size_fn):
                self.batches.append(sorted(b, key=self.sort_key))


# Running maxima of source / target lengths for the batch currently being built.
global max_src_in_batch, max_tgt_in_batch


def batch_size_fn(new, count, sofar):
    """Return the padded token count of the batch after adding `new`.

    Parameters
    ----------
    new: example with `src` and `trg` sequences
    count: number of examples in the batch including `new`;
        `count == 1` starts a new batch and resets the running maxima
    sofar: unused

    Returns
    -------
    `count` times the longest length seen in the batch, taking the larger of
    the source side and the target side (target lengths are counted with 2
    extra tokens).
    """
    global max_src_in_batch, max_tgt_in_batch
    if count == 1:
        max_src_in_batch = max_tgt_in_batch = 0

    src_len = len(new.src)
    tgt_len = len(new.trg) + 2
    if src_len > max_src_in_batch:
        max_src_in_batch = src_len
    if tgt_len > max_tgt_in_batch:
        max_tgt_in_batch = tgt_len

    return max(count * max_src_in_batch, count * max_tgt_in_batch)

In [69]:
def nopeak_mask(size, device):
    """Build the "no peek" mask used by the decoder during training so that a
    position cannot see the tokens that come after it.

    Parameters
    ----------
    size: sequence length
    device: device on which the mask is created

    Returns
    -------
    bool tensor of shape `(1, size, size)`; entry `[0, i, j]` is True when
    `j <= i` (position `i` may attend to position `j`) and False otherwise.
    """
    # Lower triangle (diagonal included) built directly on the target device,
    # without the numpy round-trip or the deprecated `Variable` wrapper.
    return torch.tril(torch.ones(1, size, size, device=device)).bool()

def create_masks(src, trg, src_pad, trg_pad, device):
    """Create the attention masks for a batch.

    Parameters
    ----------
    src: source token ids, shape `(batch_size, src_len)`
    trg: target token ids, shape `(batch_size, trg_len)`, or None
    src_pad, trg_pad: padding token ids of the source / target side
    device: device passed to `nopeak_mask`

    Returns
    -------
    src_mask: bool tensor `(batch_size, 1, src_len)`, False at padding positions
    trg_mask: None when `trg` is None; otherwise the target padding mask
        combined with the no-peek mask, shape `(batch_size, trg_len, trg_len)`
    """
    src_mask = (src != src_pad).unsqueeze(-2)
    if trg is None:
        return src_mask, None

    trg_pad_mask = (trg != trg_pad).unsqueeze(-2)
    # The no-peek mask must live on the same device as `trg`. The previous
    # `np_mask.cuda()` discarded its result and therefore did nothing.
    causal_mask = nopeak_mask(trg.size(1), device).to(trg.device)
    return src_mask, trg_pad_mask & causal_mask

In [70]:
from nltk.corpus import wordnet
import re

def get_synonym(word, SRC):
    """Return the vocabulary index of the first WordNet lemma of `word` that
    `SRC.vocab.stoi` maps to a non-zero index, or 0 when there is none.

    Synsets and their lemmas are visited in the order WordNet returns them.
    """
    candidate_ids = (
        SRC.vocab.stoi[lemma.name()]
        for synset in wordnet.synsets(word)
        for lemma in synset.lemmas()
    )
    # index 0 means "not found", so take the first non-zero index
    return next((idx for idx in candidate_ids if idx != 0), 0)

def multiple_replace(dict, text):
    """Replace every occurrence of a key of `dict` in `text` with its value,
    in a single pass over the text.

    Keys are matched literally (they are regex-escaped). An empty mapping
    returns `text` unchanged; without that guard the pattern would be `()`,
    which matches the empty string everywhere and raises `KeyError: ''`.
    """
    if not dict:
        return text

    # one alternation built from the escaped keys
    pattern = re.compile("(%s)" % "|".join(map(re.escape, dict.keys())))

    # look up the replacement for each matched key
    return pattern.sub(lambda mo: dict[mo.group(0)], text)

In [71]:
def init_vars(src, model, SRC, TRG, device, k, max_len):
    """Prepare the tensors needed by beam search once the model is trained.

    Runs the encoder once, decodes the first token after `<sos>` and seeds the
    `k` beams with the `k` most probable first tokens.

    Returns
    -------
    outputs: long tensor `(k, max_len)` on `device`; column 0 holds `<sos>`,
        column 1 the first token of each beam, the rest is 0
    e_outputs: encoder output of the (single) source sentence repeated `k`
        times, shape `(k, src_len, d_model)` on `device`
    log_scores: CPU float tensor `(1, k)` with the log-probability of each beam
    """
    init_tok = TRG.vocab.stoi['<sos>']
    src_mask = (src != SRC.vocab.stoi['<pad>']).unsqueeze(-2)

    # the encoder output is computed once and reused at every decoding step
    e_output = model.encoder(src, src_mask)

    outputs = torch.tensor([[init_tok]], dtype=torch.long, device=device)
    trg_mask = nopeak_mask(1, device)

    # `Transformer` in this notebook names its output projection `fc`;
    # fall back to `out` for models that use that name instead.
    generator = model.fc if hasattr(model, 'fc') else model.out

    # predict the first token
    out = generator(model.decoder(outputs, e_output, src_mask, trg_mask))
    out = F.softmax(out, dim=-1)

    probs, ix = out[:, -1].detach().topk(k)
    # torch.log yields -inf for a probability that underflowed to 0, where
    # math.log raised ValueError. Scores are kept on the CPU as before.
    log_scores = torch.log(probs[0]).float().cpu().unsqueeze(0)

    outputs = torch.zeros(k, max_len, dtype=torch.long, device=device)
    outputs[:, 0] = init_tok
    outputs[:, 1] = ix[0]

    e_outputs = torch.zeros(k, e_output.size(-2), e_output.size(-1), device=device)
    e_outputs[:, :] = e_output[0]

    return outputs, e_outputs, log_scores

def k_best_outputs(outputs, out, log_scores, i, k):
    """Advance every beam by one token and keep the `k` best continuations.

    Parameters
    ----------
    outputs: long tensor `(k, max_len)` with the tokens of each beam (updated in place)
    out: probabilities from the decoder, shape `(k, seq_len, vocab_size)`;
        only the last position is used
    log_scores: tensor `(1, k)` with the current log-probability of each beam
    i: position that receives the new token
    k: beam width

    Returns
    -------
    outputs: the same tensor, with columns `:i` re-ordered to the surviving
        beams and column `i` filled with the chosen tokens
    log_scores: tensor `(1, k)` with the log-probabilities of the surviving beams
    """
    probs, ix = out[:, -1].detach().topk(k)
    # torch.log maps a probability of exactly 0 to -inf, where math.log raised
    # ValueError; `.to(log_scores)` keeps the dtype/device of the running scores.
    log_probs = torch.log(probs).view(k, -1).to(log_scores) + log_scores.transpose(0, 1)
    k_probs, k_ix = log_probs.view(-1).topk(k)

    # decompose the flat index into (source beam, rank inside that beam)
    row = k_ix // k
    col = k_ix % k

    outputs[:, :i] = outputs[row, :i]
    outputs[:, i] = ix[row, col]

    log_scores = k_probs.unsqueeze(0)

    return outputs, log_scores

def beam_search(src, model, SRC, TRG, device, k, max_len):
    """Translate one source sentence with beam search of width `k`.

    Decoding stops as soon as all `k` beams contain `<eos>` (the beam with the
    best length-normalised score is returned) or when `max_len` tokens have
    been produced (the first beam is returned).

    Returns
    -------
    The target tokens of the chosen beam joined by spaces, without `<sos>`
    and cut before the first `<eos>`.
    """
    outputs, e_outputs, log_scores = init_vars(src, model, SRC, TRG, device, k, max_len)
    eos_tok = TRG.vocab.stoi['<eos>']
    src_mask = (src != SRC.vocab.stoi['<pad>']).unsqueeze(-2)
    # `Transformer` in this notebook names its output projection `fc`;
    # fall back to `out` for models that use that name instead.
    generator = model.fc if hasattr(model, 'fc') else model.out

    ind = None
    for i in range(2, max_len):
        trg_mask = nopeak_mask(i, device)

        out = generator(model.decoder(outputs[:, :i], e_outputs, src_mask, trg_mask))
        out = F.softmax(out, dim=-1)

        outputs, log_scores = k_best_outputs(outputs, out, log_scores, i, k)

        # Position of the first <eos> in every beam (0 = not finished yet).
        # Allocated next to `outputs` instead of a hard-coded `.cuda()`, so the
        # search also runs on the CPU.
        sentence_lengths = torch.zeros(len(outputs), dtype=torch.long, device=outputs.device)
        # nonzero() lists hits row by row in increasing position, so the first
        # hit seen for a beam is its first <eos>. Distinct names are used here
        # so that the outer loop index `i` is no longer shadowed.
        for beam, pos in (outputs == eos_tok).nonzero().tolist():
            if sentence_lengths[beam] == 0:
                sentence_lengths[beam] = pos

        num_finished_sentences = int((sentence_lengths > 0).sum())

        if num_finished_sentences == k:
            # length penalty: divide each log-score by length ** alpha
            alpha = 0.7
            div = 1 / (sentence_lengths.type_as(log_scores) ** alpha)
            _, ind = torch.max(log_scores * div, 1)
            ind = ind.data[0]
            break

    # Without a winner fall back to the first beam.
    best = outputs[0] if ind is None else outputs[ind]
    eos_positions = (best == eos_tok).nonzero()
    # If no <eos> was produced keep every token; slicing to -1 here used to
    # silently drop the last predicted token.
    end = int(eos_positions[0]) if len(eos_positions) > 0 else len(best)
    return ' '.join([TRG.vocab.itos[int(tok)] for tok in best[1:end]])

In [72]:
def translate_sentence(sentence, model, SRC, TRG, device, k, max_len):
    """Translate one sentence using beam search.

    The sentence is preprocessed with `SRC.preprocess`, mapped to vocabulary
    indices (an unknown token is replaced by the index of a WordNet synonym
    when `get_synonym` finds one), decoded with `beam_search`, and the spaces
    in front of punctuation are removed from the result.
    """
    model.eval()

    # Index 0 is what `get_synonym` treats as "not in the vocabulary". The
    # check used to compare against the index of '<eos>', which only gives the
    # same result when '<eos>' is itself missing from the source vocabulary.
    unk_idx = 0

    indexed = []
    for tok in SRC.preprocess(sentence):
        idx = SRC.vocab.stoi[tok]
        if idx != unk_idx:
            indexed.append(idx)
        else:
            indexed.append(get_synonym(tok, SRC))

    # batch of one sentence: shape (1, seq_len)
    src = torch.LongTensor([indexed]).to(device)

    translated = beam_search(src, model, SRC, TRG, device, k, max_len)

    return multiple_replace({' ?' : '?',' !':'!',' .':'.','\' ':'\'',' ,':','}, translated)

In [82]:
import spacy
import re


class tokenize(object):
    """Sentence cleaner and tokenizer backed by a spaCy pipeline."""

    def __init__(self, lang):
        # `lang` is handed to spacy.load as-is
        self.nlp = spacy.load(lang)

    def tokenizer(self, sentence):
        """Clean `sentence` and split it into lower-cased tokens.

        The substitutions below are applied in order, the text is lower-cased
        and then tokenised with the spaCy tokenizer; tokens that consist of a
        single space are dropped.
        """
        substitutions = (
            # symbols (including "!") are replaced by a space
            (r"[\*\"“”\n\\…\+\-\/\=\(\)‘•:\[\]\|’\!;]", " "),
            # collapse runs of spaces
            (r"[ ]+", " "),
            # collapse repeated "!" (every "!" was already replaced by the first rule)
            (r"\!+", "!"),
            # collapse repeated commas and question marks
            (r"\,+", ","),
            (r"\?+", "?"),
        )
        text = str(sentence)
        for pattern, replacement in substitutions:
            text = re.sub(pattern, replacement, text)
        text = text.lower()

        tokens = []
        for tok in self.nlp.tokenizer(text):
            if tok.text != " ":
                tokens.append(tok.text)
        return tokens

ImportError: cannot import name 'deprecated' from 'typing_extensions' (C:\Users\Admin\anaconda3\lib\site-packages\typing_extensions.py)

In [81]:
!pip install -U typing_extensions
!pip install -U deprecated

Collecting typing_extensions
  Obtaining dependency information for typing_extensions from https://files.pythonhosted.org/packages/ec/6b/63cc3df74987c36fe26157ee12e09e8f9db4de771e0f3404263117e75b95/typing_extensions-4.7.1-py3-none-any.whl.metadata
  Using cached typing_extensions-4.7.1-py3-none-any.whl.metadata (3.1 kB)
Using cached typing_extensions-4.7.1-py3-none-any.whl (33 kB)
Installing collected packages: typing_extensions
  Attempting uninstall: typing_extensions
    Found existing installation: typing_extensions 4.0.1
    Not uninstalling typing-extensions at c:\users\admin\appdata\local\programs\python\python39\lib\site-packages, outside environment C:\Users\Admin\Desktop\transformer_pointer_generator_network\venv
    Can't uninstall 'typing_extensions'. No files were found to uninstall.
Successfully installed typing_extensions-4.7.1


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
streamlit 1.10.0 requires protobuf<4,>=3.12, but you have protobuf 4.24.0 which is incompatible.
tensorflow-intel 2.13.0 requires typing-extensions<4.6.0,>=3.6.6, but you have typing-extensions 4.7.1 which is incompatible.
torchaudio 0.13.1 requires torch==1.13.1, but you have torch 2.0.1 which is incompatible.
torchvision 0.14.1 requires torch==1.13.1, but you have torch 2.0.1 which is incompatible.


