# Cài đặt các thư viện sử dụng

In [None]:
%pip install prenlp
%pip install torch
%pip install tqdm
%pip install requests
%pip install -q gradio
!pip install prenlp

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting prenlp
  Downloading prenlp-0.0.13-py3-none-any.whl (30 kB)
Collecting nltk==3.2.5 (from prenlp)
  Downloading nltk-3.2.5.tar.gz (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m23.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting konlpy (from prenlp)
  Downloading konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.4/19.4 MB[0m [31m80.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sentencepiece (from prenlp)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m84.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ijson (from prenlp)
  Downloading ijson-3.2.0.post0-cp310-cp310-manylinux_2_17_

### Mở 2 file chứa dataset vi_sents và en_sents, tạo một file mới en-to-vi.txt chứa dataset được xử lý từ 2 file trên

In [None]:
# Source corpora (one sentence per line) and the merged output file.
# UTF-8 is requested explicitly because the merged file is later read back
# with encoding='utf-8'; relying on the platform default could corrupt
# Vietnamese text. The output is opened in 'w' mode so that re-running the
# notebook recreates the file instead of appending duplicate pairs to it.
vi = open('./data/vi_sents', encoding='utf-8')
en = open('./data/en_sents', encoding='utf-8')
en2vi = open('./data/en-to-vi.txt', 'w', encoding='utf-8')

In [None]:
# Number of sentence pairs written to the merged file so far.
i = 0

### Nối từng dòng tương ứng của file vi_sents và file en_sents với nhau theo định dạng en_line => vi_line, ghi kết quả vào file en-to-vi.txt

In [None]:
# Merge the two corpora line by line as "<english> => <vietnamese>", keeping
# at most MAX_PAIRS sentence pairs.
MAX_PAIRS = 32768
try:
    for i, (vi_line, en_line) in enumerate(zip(vi, en), start=1):
        en2vi.write(f'{en_line.strip()} => {vi_line.strip()}\n')
        if i >= MAX_PAIRS:
            break
finally:
    # Closing flushes the write buffer; without it the tokenizer training
    # step could read a partially written en-to-vi.txt.
    for handle in (vi, en, en2vi):
        handle.close()

# Import các thư viện sử dụng

In [None]:
from typing import List
from collections import OrderedDict
import zipfile
import pathlib

from prenlp.tokenizer import SentencePiece
import os
import zipfile
import requests
from tqdm import tqdm
import random
import numpy as np
import time

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import torch.optim as optim
import pathlib

# Config

In [None]:
# --- Runtime ---------------------------------------------------------------
no_cuda = False     # True forces CPU even when CUDA is available
multi_gpu = False   # True wraps the model in nn.DataParallel

# --- Data ------------------------------------------------------------------
vocab_size = 10800  # size of the BPE vocabulary / output projection
max_seq_len = 192   # every example is truncated or padded to this length
batch_size = 128

# --- Model -----------------------------------------------------------------
d_model = 384       # embedding / hidden width
n_heads = 8         # attention heads per layer
n_layers = 12       # number of stacked decoder layers
d_ff = 1280         # inner width of the position-wise feed-forward network
dropout = 0.1

In [None]:
# Use the GPU only when one is present and the user has not opted out.
if torch.cuda.is_available() and not no_cuda:
  device = 'cuda'
else:
  device = 'cpu'

# Định nghĩa class Tokenizer

In [None]:
class Tokenizer:
  """Maps text to vocabulary ids and back.

  Args:
    tokenizer: callable turning a string into a list of token strings.
    vocab_file: path to a UTF-8 text file with one entry per line; the first
      whitespace-separated field of each line is the token and the line
      index is its id.
    pad_token, unk_token, bos_token, eos_token: surface forms of the special
      tokens as they appear in the vocabulary file.
  """

  def __init__(self, tokenizer, vocab_file: str, 
               pad_token: str = '[PAD]',
               unk_token: str = '[UNK]',
               bos_token: str = '[BOS]',
               eos_token: str = '[EOS]'):
    self.tokenizer = tokenizer
    self.pad_token = pad_token
    self.unk_token = unk_token
    self.bos_token = bos_token
    self.eos_token = eos_token
    self.vocab = OrderedDict()          # token -> id
    self.ids_to_tokens = OrderedDict()  # id -> token

    with open(vocab_file, 'r', encoding='utf-8') as reader:
      for i, line in enumerate(reader):
        token = line.split()[0]
        self.vocab[token] = i

    for token, id in self.vocab.items():
      self.ids_to_tokens[id] = token

  def tokenize(self, text: str) -> List[str]:
    """Split `text` into tokens with the wrapped tokenizer callable."""
    return self.tokenizer(text)

  def convert_token_to_id(self, token: str) -> int:
    """Return the id of `token`, falling back to the unknown-token id."""
    return self.vocab.get(token, self.vocab.get(self.unk_token))

  def convert_id_to_token(self, id: int) -> str:
    """Return the token for `id`, falling back to the unknown token."""
    # Dictionary lookup: the mapping is not callable, so `.get` is required.
    return self.ids_to_tokens.get(id, self.unk_token)

  def convert_tokens_to_ids(self, tokens: List[str]) -> List[int]:
    """Vectorised form of `convert_token_to_id`."""
    return [self.convert_token_to_id(token) for token in tokens]

  def convert_ids_to_tokens(self, ids: List[int]) -> List[str]:
    """Vectorised form of `convert_id_to_token`."""
    return [self.convert_id_to_token(id) for id in ids]


  @property
  def vocab_size(self) -> int:
    """Number of distinct tokens read from the vocabulary file."""
    return len(self.vocab)

  @property
  def pad_token_id(self) -> int:
    return self.convert_token_to_id(self.pad_token)

  @property
  def unk_token_id(self) -> int:
    return self.convert_token_to_id(self.unk_token)

  @property
  def bos_token_id(self) -> int:
    return self.convert_token_to_id(self.bos_token)

  @property
  def eos_token_id(self) -> int:
    return self.convert_token_to_id(self.eos_token)

### Định nghĩa class PretrainedTokenizer, load tokenizer đã được huấn luyện từ trước

In [None]:
class PretrainedTokenizer(Tokenizer):
  """Tokenizer backed by an already-trained SentencePiece model file."""

  def __init__(self, pretrained_model: str, vocab_file: str,
               pad_token: str = '[PAD]',
               unk_token: str = '[UNK]',
               bos_token: str = '[BOS]',
               eos_token: str = '[EOS]'):
    # The SentencePiece model is loaded first and then handed to the base
    # class as its tokenizer.
    super().__init__(
        tokenizer=SentencePiece.load(pretrained_model),
        vocab_file=vocab_file,
        pad_token=pad_token,
        unk_token=unk_token,
        bos_token=bos_token,
        eos_token=eos_token,
    )

  def detokenize(self, tokens: List[str]) -> str:
    """Join tokens back into text using the wrapped tokenizer's detokenize."""
    return self.tokenizer.detokenize(tokens)

### Build tokenizer sử dụng thuật toán BPE trên tập dataset, output thu được lưu vào file tok.vocab và tok.model

In [None]:
# Train a BPE tokenizer on the merged corpus; the 'tok' prefix names the
# tok.model / tok.vocab artefacts loaded afterwards.
# NOTE(review): SentencePiece appears to measure max_sentence_length in bytes,
# not tokens, so longer lines may be skipped during training - confirm intent.
SentencePiece.train(
    input='./data/en-to-vi.txt',
    vocab_size=vocab_size,
    model_type='bpe',
    model_prefix='tok',
    max_sentence_length=max_seq_len + 1,
)

### Load tokenizer từ file tok.model và tok.vocab

In [None]:
# Load the trained SentencePiece model together with its vocabulary file.
tokenizer = PretrainedTokenizer(
    pretrained_model='tok.model',
    vocab_file='tok.vocab',
)

In [None]:
class InputFeatures:
  """One training example: model input ids and the ids it should predict."""

  def __init__(self, input_ids: List[int], output_ids: List[int]):
    self.input_ids, self.output_ids = input_ids, output_ids

### Biến đổi đoạn văn thô (từng dòng của file dataset) thành input có thể đưa vào mô hình

In [None]:
def convert_texts_to_features(texts, tokenizer, max_seq_len):
  """Turn raw text lines into fixed-length next-token-prediction examples.

  Each text is tokenized, wrapped in BOS/EOS ids, and split into an input
  sequence and a target sequence shifted left by one position. Both are
  truncated to `max_seq_len` and right-padded with the pad id.

  Args:
    texts: iterable of strings.
    tokenizer: object exposing tokenize, convert_tokens_to_ids and the
      pad/bos/eos token id properties.
    max_seq_len: length of every produced id sequence.

  Returns:
    A list of InputFeatures, one per text.
  """
  pad_id = tokenizer.pad_token_id
  bos_id = tokenizer.bos_token_id
  eos_id = tokenizer.eos_token_id

  def pad_to_length(sequence):
    # Right-pad up to max_seq_len (a no-op when already at full length).
    return sequence + [pad_id] * (max_seq_len - len(sequence))

  features = []
  for text in texts:
    token_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
    framed = [bos_id] + token_ids + [eos_id]

    # The target is the input shifted by one token.
    source = pad_to_length(framed[:max_seq_len])
    target = pad_to_length(framed[1:max_seq_len + 1])

    features.append(InputFeatures(input_ids=source, output_ids=target))

  return features

In [None]:
def create_examples(dataset_src, max_seq_len, tokenizer, mode='train', split_ratio=0.1, random_seed=42):
  """Build a TensorDataset of (input_ids, output_ids) from a text file.

  Args:
    dataset_src: path to a UTF-8 text file with one example per line; blank
      lines are ignored.
    max_seq_len: length every id sequence is truncated/padded to.
    tokenizer: tokenizer passed through to convert_texts_to_features.
    mode: 'train' keeps the first (1 - split_ratio) share of the shuffled
      lines, 'test' keeps the remaining share; any other value keeps all.
    split_ratio: fraction of lines reserved for the test split.
    random_seed: seed applied to the global `random` module before shuffling
      so that train and test splits are reproducible and disjoint.

  Returns:
    torch.utils.data.TensorDataset of two long tensors.
  """
  random.seed(random_seed)

  with open(dataset_src, 'r', encoding='utf-8') as reader:
    stripped_lines = (line.strip() for line in reader)
    dataset = [text for text in stripped_lines if text != '']

  random.shuffle(dataset)

  split_index = int(len(dataset) * (1 - split_ratio))
  if mode == 'train':
    dataset = dataset[:split_index]
  elif mode == 'test':
    # Slice (not index): the test split is every line after the split point.
    dataset = dataset[split_index:]

  features = convert_texts_to_features(dataset, tokenizer, max_seq_len)

  input_ids = torch.tensor([feature.input_ids for feature in features], dtype=torch.long)
  output_ids = torch.tensor([feature.output_ids for feature in features], dtype=torch.long)

  return TensorDataset(input_ids, output_ids)

In [None]:
# Tokenize the merged corpus and keep the training share of the shuffled lines.
train_dataset = create_examples(
    dataset_src='./data/en-to-vi.txt',
    max_seq_len=max_seq_len,
    mode='train',
    tokenizer=tokenizer,
)

# Định nghĩa các block trong mô hình Transformer

In [None]:
class ScaledDotProductAttention(nn.Module):
  """softmax(q @ k^T / sqrt(d_k)) @ v with masked positions suppressed."""

  def __init__(self, d_k):
    super().__init__()
    self.d_k = d_k

  def forward(self, q, k, v, attn_mask):
    """Return the attention-weighted combination of `v`.

    Positions where `attn_mask` is True receive a score of -1e9 before the
    softmax, so they get (effectively) zero weight.
    """
    scale = np.sqrt(self.d_k)
    scores = torch.matmul(q, k.transpose(-1, -2)) / scale
    scores.masked_fill_(attn_mask, -1e9)

    weights = torch.softmax(scores, dim=-1)
    return torch.matmul(weights, v)

In [None]:
class MultiHeadAttention(nn.Module):
  """Multi-head attention: project, split into heads, attend, merge, project."""

  def __init__(self, d_model, n_heads):
    super().__init__()
    self.n_heads = n_heads
    self.d_k = self.d_v = d_model // n_heads

    self.WQ = nn.Linear(d_model, d_model)
    self.WK = nn.Linear(d_model, d_model)
    self.WV = nn.Linear(d_model, d_model)
    self.scaled_dot_product_attn = ScaledDotProductAttention(self.d_k)
    self.linear = nn.Linear(n_heads * self.d_v, d_model)

  def forward(self, Q, K, V, attn_mask):
    """Attend from Q over K/V; `attn_mask` is shared by every head."""
    batch_size = Q.size(0)

    def split_heads(projected):
      # (batch, seq, n_heads * d_k) -> (batch, n_heads, seq, d_k)
      return projected.view(batch_size, -1, self.n_heads, self.d_k).transpose(1, 2)

    q_heads = split_heads(self.WQ(Q))
    k_heads = split_heads(self.WK(K))
    v_heads = split_heads(self.WV(V))

    # Replicate the mask along a new head axis.
    per_head_mask = attn_mask.unsqueeze(1).repeat(1, self.n_heads, 1, 1)
    context = self.scaled_dot_product_attn(q_heads, k_heads, v_heads, per_head_mask)

    # Undo the head split before the output projection.
    merged = context.transpose(1, 2).contiguous().view(batch_size, -1, self.n_heads * self.d_v)
    return self.linear(merged)

In [None]:
class PositionWiseFeedForwardNetwork(nn.Module):
  """Two-layer feed-forward network: Linear -> ReLU -> Linear."""

  def __init__(self, d_model, d_ff):
    super().__init__()

    self.linear1 = nn.Linear(d_model, d_ff)
    self.linear2 = nn.Linear(d_ff, d_model)
    self.relu = nn.ReLU()

  def forward(self, inputs):
    """Expand to d_ff, apply ReLU, and project back to d_model."""
    hidden = self.linear1(inputs)
    activated = self.relu(hidden)
    return self.linear2(activated)

In [None]:
class DecoderLayer(nn.Module):
  """Self-attention and feed-forward sub-layers, each followed by
  dropout, a residual connection and layer normalisation."""

  def __init__(self, d_model, n_heads, p_drop, d_ff):
    super().__init__()

    # Self-attention sub-layer.
    self.mha = MultiHeadAttention(d_model, n_heads)
    self.dropout1 = nn.Dropout(p_drop)
    self.layernorm1 = nn.LayerNorm(d_model, eps=1e-6)

    # Feed-forward sub-layer.
    self.ffn = PositionWiseFeedForwardNetwork(d_model, d_ff)
    self.dropout2 = nn.Dropout(p_drop)
    self.layernorm2 = nn.LayerNorm(d_model, eps=1e-6)

  def forward(self, inputs, attn_mask):
    """Run both sub-layers; `attn_mask` marks positions to be ignored."""
    attended = self.dropout1(self.mha(inputs, inputs, inputs, attn_mask))
    hidden = self.layernorm1(inputs + attended)

    transformed = self.dropout2(self.ffn(hidden))
    return self.layernorm2(hidden + transformed)

In [None]:
class TransformerDecoder(nn.Module):
  """Token + positional embeddings followed by a stack of DecoderLayers."""

  def __init__(self, vocab_size, d_model, n_layers, n_heads, p_drop, d_ff, pad_id, sinusoid_table):
    super().__init__()
    self.pad_id = pad_id

    self.embedding = nn.Embedding(vocab_size, d_model)
    # Frozen lookup table; row 0 is the position assigned to padding tokens.
    self.pos_embedding = nn.Embedding.from_pretrained(sinusoid_table, freeze=True)
    blocks = [DecoderLayer(d_model, n_heads, p_drop, d_ff) for _ in range(n_layers)]
    self.layers = nn.ModuleList(blocks)

  def forward(self, inputs):
    """Encode a (batch, seq) tensor of token ids into hidden states."""
    n_rows, seq_len = inputs.size(0), inputs.size(1)

    # Positions are 1-based; padding tokens are mapped to position 0.
    positions = torch.arange(1, seq_len + 1, device=inputs.device, dtype=inputs.dtype).repeat(n_rows, 1)
    positions.masked_fill_(inputs.eq(self.pad_id), 0)

    hidden = self.embedding(inputs) + self.pos_embedding(positions)

    # A position is masked when its key is padding OR lies in the future.
    pad_mask = self.get_attention_padding_mask(inputs, inputs, self.pad_id)
    future_mask = self.get_attention_subsequent_mask(inputs).to(device=pad_mask.device)
    attn_mask = (pad_mask.to(dtype=future_mask.dtype) + future_mask) > 0

    for layer in self.layers:
      hidden = layer(hidden, attn_mask)

    return hidden

  def get_attention_padding_mask(self, q, k, pad_id):
    """Boolean (batch, q_len, k_len) mask, True where the key is padding."""
    return k.eq(pad_id).unsqueeze(1).repeat(1, q.size(1), 1)

  def get_attention_subsequent_mask(self, q):
    """(batch, q_len, q_len) tensor with ones strictly above the diagonal."""
    bs, q_len = q.size()
    return torch.ones(bs, q_len, q_len).triu(diagonal=1)

In [None]:
class Transformer(nn.Module):
  """Decoder-only language model: TransformerDecoder plus a vocabulary projection."""

  def __init__(self,
               vocab_size,
               seq_len,
               d_model=512,
               n_layers=6,
               n_heads=8,
               p_drop=0.1,
               d_ff=2048,
               pad_id=0):
    super().__init__()
    # One extra row: the decoder uses 1-based positions and reserves 0 for padding.
    sinusoid_table = self.get_sinusoid_table(seq_len+1, d_model)

    self.decoder = TransformerDecoder(vocab_size, d_model, n_layers, n_heads, p_drop, d_ff, pad_id, sinusoid_table)
    self.linear = nn.Linear(d_model, vocab_size)


  def forward(self, inputs):
    """Map token ids to unnormalised scores over the vocabulary."""
    return self.linear(self.decoder(inputs))

  def get_sinusoid_table(self, seq_len, d_model):
    """Return a (seq_len, d_model) float tensor of sinusoidal encodings.

    Even columns hold the sine and odd columns the cosine of
    pos / 10000 ** (2 * (i // 2) / d_model).
    """
    def angle_at(pos, i):
      return pos / np.power(10000, (2 * (i//2) / d_model))

    table = np.zeros((seq_len, d_model))
    for pos in range(seq_len):
      for i in range(d_model):
        wave = np.sin if i % 2 == 0 else np.cos
        table[pos, i] = wave(angle_at(pos, i))

    return torch.FloatTensor(table)

In [None]:
# Build the model from the config cell. pad_id is taken from the tokenizer so
# the model's padding mask agrees with the id used to pad the training data
# and ignored by the loss, instead of silently relying on the default of 0.
model = Transformer(
    vocab_size=vocab_size,
    seq_len=max_seq_len,
    d_model=d_model,
    n_layers=n_layers,
    n_heads=n_heads,
    p_drop=dropout,
    d_ff=d_ff,
    pad_id=tokenizer.pad_token_id,
)

In [None]:
# Optionally replicate the model across GPUs, then move it to the target device.
model = nn.DataParallel(model) if multi_gpu else model
model.to(device)

Transformer(
  (decoder): TransformerDecoder(
    (embedding): Embedding(10800, 384)
    (pos_embedding): Embedding(193, 384)
    (layers): ModuleList(
      (0-11): 12 x DecoderLayer(
        (mha): MultiHeadAttention(
          (WQ): Linear(in_features=384, out_features=384, bias=True)
          (WK): Linear(in_features=384, out_features=384, bias=True)
          (WV): Linear(in_features=384, out_features=384, bias=True)
          (scaled_dot_product_attn): ScaledDotProductAttention()
          (linear): Linear(in_features=384, out_features=384, bias=True)
        )
        (dropout1): Dropout(p=0.1, inplace=False)
        (layernorm1): LayerNorm((384,), eps=1e-06, elementwise_affine=True)
        (ffn): PositionWiseFeedForwardNetwork(
          (linear1): Linear(in_features=384, out_features=1280, bias=True)
          (linear2): Linear(in_features=1280, out_features=384, bias=True)
          (relu): ReLU()
        )
        (dropout2): Dropout(p=0.1, inplace=False)
        (layernor

In [None]:
# Show the module hierarchy of the model.
print(model)

In [None]:
# Total number of trainable scalars (parameters with requires_grad=True).
sum(p.numel() for p in model.parameters() if p.requires_grad)

27236400

# Tạo class optimizer

In [None]:
class ScheduledOptim:
  """Optimizer wrapper applying a warm-up / inverse-square-root LR schedule.

  The learning rate at step n is
    init_lr * d_model**-0.5 * min(n**-0.5, n * n_warmup_steps**-1.5).

  Args:
    optimizer: wrapped optimizer exposing param_groups, state_dict,
      load_state_dict, zero_grad and step.
    init_lr: multiplier applied to the schedule.
    d_model: model width used in the schedule.
    n_warmup_steps: number of steps over which the rate ramps up.
  """

  def __init__(self, optimizer, init_lr, d_model, n_warmup_steps=2000):
    self.optimizer = optimizer
    self.init_lr = init_lr
    self.d_model = d_model
    self.n_warmup_steps = n_warmup_steps
    self.n_steps = 0
    self.current_lr = init_lr

  def _get_lr_scale(self):
    # Only valid for n_steps >= 1 (0 ** -0.5 raises ZeroDivisionError).
    return (self.d_model ** -0.5) * min(self.n_steps ** -0.5, self.n_steps * (self.n_warmup_steps ** -1.5))

  def _apply_learning_rate(self):
    """Recompute the rate for the current step and push it to every param group."""
    self.current_lr = self.init_lr * self._get_lr_scale()
    for param_group in self.optimizer.param_groups:
      param_group['lr'] = self.current_lr

  def update_learning_rate(self):
    """Advance the step counter by one and apply the new learning rate."""
    self.n_steps += 1
    self._apply_learning_rate()
    
  def state_dict(self):
    """Return the wrapped optimizer's state dict (n_steps is not included)."""
    return self.optimizer.state_dict()

  def load_state_dict(self, state_dict, n_steps: int):
    """Restore optimizer state and resume the schedule at exactly `n_steps`.

    The step counter is set, not advanced: restoring must not count as a
    training step, otherwise every resume shifts the schedule by one.
    """
    self.optimizer.load_state_dict(state_dict)
    self.n_steps = n_steps
    if self.n_steps > 0:
      self._apply_learning_rate()
    # With n_steps == 0 the schedule is undefined; keep the restored rates.

  def zero_grad(self):
    self.optimizer.zero_grad()

  def step(self):
    self.optimizer.step()
    
  def get_n_steps(self):
    return self.n_steps

  @property
  def get_current_lr(self):
    """Most recently applied learning rate (read as an attribute)."""
    return self.current_lr 

In [None]:
# Adam wrapped in the warm-up learning-rate schedule defined by ScheduledOptim.
optimizer = ScheduledOptim(
    optim.Adam(model.parameters(), betas=(0.9, 0.98), eps=1e-9),
    init_lr=2.0,
    d_model=d_model,
)

### Tạo class checkpoint dùng để lưu tiến trình huấn luyện, trong đó gồm có tham số đã được huấn luyện

In [None]:
class Checkpoint:
  """Saves and restores training progress as `<subdir>/epoch-<version>.pt`.

  Args:
    net: model whose state dict is stored/restored.
    optimizer: ScheduledOptim whose state and step count are stored/restored.
    version: number of the checkpoint file that `load` reads and that the
      next `save` writes.
    subdir: directory holding the checkpoint files (created if missing).
  """

  def __init__(self, net: nn.Module, optimizer: ScheduledOptim, version: int, subdir='checkpoint'):
    self.subdir = subdir
    self.net = net
    self.optimizer = optimizer
    self.version = version
    self.path = self.get_path()

    os.makedirs(subdir, exist_ok=True)

  def save(self):
    """Write the current state to `self.path`, then advance to the next version."""
    print(f'current_version:{self.version}, n_steps:{self.optimizer.get_n_steps()}')
    torch.save({
        'version': self.version,
        'model_state_dict': self.net.state_dict(),
        'optimizer_state_dict': self.optimizer.state_dict(),
        'n_steps': self.optimizer.get_n_steps()
    }, self.path)
    self.version = self.version + 1
    self.path = self.get_path()

  def get_path(self):
    """Path of the checkpoint file for the current version."""
    return f'{self.subdir}/epoch-{self.version}.pt'

  def load(self):
    """Restore model and optimizer from `self.path`, then advance the version."""
    # Map stored tensors onto the device the model currently lives on so a
    # checkpoint written on a GPU can also be loaded on a CPU-only machine.
    first_param = next(self.net.parameters(), None)
    map_location = first_param.device if first_param is not None else None
    checkpoint = torch.load(self.path, map_location=map_location)
    print('n_steps:', checkpoint['n_steps'])
    print('version:', checkpoint['version'])
    self.net.load_state_dict(checkpoint['model_state_dict'])
    self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'], checkpoint['n_steps'])
    # The next save must not overwrite the file that was just loaded.
    self.version = self.version + 1
    self.path = self.get_path()

In [None]:
# Version number of the checkpoint file to resume from (checkpoint/epoch-40.pt).
checkpoint_version = 40

In [None]:
# Bind the model and scheduled optimizer to the checkpoint files, starting at checkpoint_version.
checkpoint = Checkpoint(net=model, optimizer=optimizer, version=checkpoint_version)

### Load checkpoint để retrain hoặc inference

In [None]:
# Restore model weights, optimizer state and step count from the current checkpoint file.
checkpoint.load()

n_steps: 9200
version: 40


# Training

In [None]:
class Trainer:
  """Runs training epochs for a language model with a scheduled optimizer.

  Args:
    train_loader: DataLoader yielding (inputs, ground_truth_outputs) batches.
    tokenizer: provides vocab_size and pad_token_id.
    optimizer: ScheduledOptim-style wrapper (zero_grad, step,
      update_learning_rate, get_current_lr, get_n_steps).
    model: network mapping token ids to per-token vocabulary scores.

  Note: `train` reads the module-level names `device` and `checkpoint`.
  """

  def __init__(self, 
               train_loader, 
               tokenizer, 
               optimizer,
               model):  
    self.train_loader = train_loader
    self.vocab_size = tokenizer.vocab_size
    self.pad_id = tokenizer.pad_token_id

    self.model = model

    self.optimizer = optimizer
    # Padding positions contribute nothing to the loss.
    self.criterion = nn.CrossEntropyLoss(ignore_index=self.pad_id)

  def train(self, epoch):
    """Train for one pass over `train_loader`, then save a checkpoint."""
    losses = 0
    n_batches = len(self.train_loader)

    self.model.train()
    for i, batch in enumerate(self.train_loader):
      start_time = time.time()
      inputs, ground_truth_outputs = map(lambda x: x.to(device), batch)
      outputs = self.model(inputs)

      # Flatten (batch, seq, vocab) scores against (batch * seq) targets.
      loss = self.criterion(outputs.view(-1, self.vocab_size), ground_truth_outputs.view(-1))
      losses += loss.item()

      self.optimizer.zero_grad()
      loss.backward()
      # The schedule is advanced before the step so the new rate is used.
      self.optimizer.update_learning_rate()
      self.optimizer.step()
    
      end_time = time.time()

      if i % 20 == 0:
        # The reported loss is the running mean over the epoch so far.
        print('Iteration {} ({}/{})\tLoss: {:.4f}\tlr: {:.4f}\tduration: {:.4f}'.format(i, i, n_batches, losses/(i + 1), self.optimizer.get_current_lr, end_time - start_time))

    checkpoint.save()
    print('Train Epoch: {}\t\tLoss: {:.4f}\tnum_steps: {}'.format(epoch, losses/n_batches, self.optimizer.get_n_steps()))

  def save(self, epoch, model_prefix='model', root='./model'):
    """Serialise the whole model object to `<root>/<model_prefix>.ep<epoch>`."""
    # Only `pathlib` is imported by the notebook, so the qualified name is used.
    path = pathlib.Path(root) / (model_prefix + '.ep%d' % epoch)
    path.parent.mkdir(parents=True, exist_ok=True)

    torch.save(self.model, path)

In [None]:
# Batch the training set, reshuffling the examples every epoch.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

In [None]:
# Wire the data loader, tokenizer, scheduled optimizer and model together.
trainer = Trainer(tokenizer=tokenizer, train_loader=train_loader, optimizer=optimizer, model=model)

In [None]:
# Train for 40 epochs (numbered from 1); each call to train() saves a checkpoint at the end.
for epoch in range(1, 40 + 1):
  trainer.train(epoch)

Iteration 0 (0/231)	Loss: 0.7363	lr: 0.0011	duration: 1.2512
Iteration 20 (20/231)	Loss: 0.7491	lr: 0.0011	duration: 1.2621
Iteration 40 (40/231)	Loss: 0.7494	lr: 0.0011	duration: 1.2802
Iteration 60 (60/231)	Loss: 0.7481	lr: 0.0011	duration: 1.2400
Iteration 80 (80/231)	Loss: 0.7560	lr: 0.0011	duration: 1.2429
Iteration 100 (100/231)	Loss: 0.7676	lr: 0.0011	duration: 1.2632
Iteration 120 (120/231)	Loss: 0.7798	lr: 0.0011	duration: 1.2498
Iteration 140 (140/231)	Loss: 0.7862	lr: 0.0011	duration: 1.2495
Iteration 160 (160/231)	Loss: 0.7918	lr: 0.0011	duration: 1.2547
Iteration 180 (180/231)	Loss: 0.7973	lr: 0.0011	duration: 1.2531
Iteration 200 (200/231)	Loss: 0.8020	lr: 0.0011	duration: 1.2528
Iteration 220 (220/231)	Loss: 0.8076	lr: 0.0011	duration: 1.2516
current_version:41, n_steps:9432
Train Epoch: 1		Loss: 0.8101	num_steps: 9432


In [None]:
# Release cached, currently unused GPU memory held by PyTorch's allocator.
torch.cuda.empty_cache()

# Inference

In [None]:
# Switch to evaluation mode so the dropout layers are inactive during inference.
model.eval()

In [None]:
def inference(text):
  """Greedily generate the continuation of `text + ' =>'`.

  The prompt follows the training format "<english> => <vietnamese>", so the
  generated continuation is the model's translation. Generation stops at the
  EOS token or once the sequence reaches `max_seq_len` tokens.

  Reads the module-level `tokenizer`, `model`, `device` and `max_seq_len`.

  Args:
    text: source sentence.

  Returns:
    The generated text, with SentencePiece word-boundary markers replaced by
    spaces (empty when the prompt already fills `max_seq_len`).
  """
  prompt = text + ' =>'
  tokens = tokenizer.tokenize(prompt)
  ids = [tokenizer.bos_token_id] + tokenizer.convert_tokens_to_ids(tokens)
  ids = ids[:max_seq_len]

  generated = []
  # No gradients are needed for generation; disabling autograd avoids building
  # a graph for every forward pass.
  with torch.no_grad():
    for _ in range(max_seq_len - len(ids)):
      inputs = torch.tensor(ids).unsqueeze(0).to(device)

      outputs = model(inputs)
      # Greedy decoding: take the most likely token at the last position.
      next_id = outputs[:, -1, :].argmax(dim=-1).item()
      if next_id == tokenizer.eos_token_id:
        break
      ids.append(next_id)
      generated.append(next_id)

  pieces = [tokenizer.ids_to_tokens[token_id] for token_id in generated]

  return ''.join(piece.replace('▁', ' ') for piece in pieces)

# Xây dựng GUI cho mô hình

In [None]:
# gradio is installed by the setup cell but is not imported anywhere else in
# the notebook, so it is imported here to keep this cell runnable on a fresh kernel.
import gradio

# Text-in / text-out web UI around inference(); share=True publishes a public URL.
gradio.Interface(inference, "text", "text").launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://23b359f0bae0449b48.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades (NEW!), check out Spaces: https://huggingface.co/spaces


