# For using Cloud

Nhớ restart kernal

In [1]:
!pip install torch==2.1.2 torchvision==0.16.2 torchaudio==2.1.2 --index-url https://download.pytorch.org/whl/cu118

^C


# Import useful library

In [44]:
from datasets import load_dataset

In [45]:
data = load_dataset('mt_eng_vietnamese', 'iwslt2015-en-vi')
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

SRC_LANGUAGE = 'en'
TGT_LANGUAGE = 'vi'

token_transform = {}
vocab_transform = {}

token_transform[SRC_LANGUAGE] = get_tokenizer('basic_english')
token_transform[TGT_LANGUAGE] = get_tokenizer('basic_english')

UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = 0, 1, 2, 3
special_symbols = ['<unk>', '<pad>', '<bos>', '<eos>']

def yield_tokens(data_iter, lang):
    for data_sample in data_iter['translation']:
        yield token_transform[lang](data_sample[lang])
        
for lang in [SRC_LANGUAGE, TGT_LANGUAGE]:
    # Training data Iterator 
    vocab_transform[lang] = build_vocab_from_iterator(yield_tokens(data['train'], lang),
                                                    min_freq=1,
                                                    specials=special_symbols,
                                                    special_first=True)
    
    vocab_transform[lang].set_default_index(UNK_IDX)
    

In [59]:
import torch
from torch.nn.utils.rnn import pad_sequence

def sequential_transforms(*transforms):
    def func(txt_input):
        for transform in transforms:
            txt_input = transform(txt_input)
        return txt_input
    return func

def tensor_transform(token_ids: list):
    return torch.cat((torch.tensor([BOS_IDX]), 
                      torch.tensor(token_ids), 
                      torch.tensor([EOS_IDX])))
    
text_transform = {}
for lang in [SRC_LANGUAGE, TGT_LANGUAGE]:
    text_transform[lang] = sequential_transforms(token_transform[lang], #Tokenization
                                                 vocab_transform[lang], #Numericalization
                                                 tensor_transform) # Add BOS/EOS and create tensor
    
def collate_fn(batch):
    src_batch, tgt_batch = [], []
    for sample in batch:
        src_sample, tgt_sample = sample[SRC_LANGUAGE], sample[TGT_LANGUAGE]
        src_batch.append(text_transform[SRC_LANGUAGE](src_sample)).to(dtype=torch.int64)
        tgt_batch.append(text_transform[TGT_LANGUAGE](tgt_sample)).to(dtype = torch.int64)

    src_batch = pad_sequence(src_batch, padding_value=PAD_IDX, batch_first=True)
    tgt_batch = pad_sequence(tgt_batch, padding_value=PAD_IDX, batch_first=True)
    return src_batch, tgt_batch

In [60]:
from torch.utils.data import DataLoader
BATCH_SIZE = 128
train_dataloader = DataLoader(data['train']['translation'], batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
valid_dataloader  = DataLoader(data['validation']['translation'], batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)
test_dataloader = DataLoader(data['test']['translation'], batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)  

In [61]:
from torch import Tensor
import torch.nn as nn
import torch
from torch.nn import Transformer
import math

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [62]:
class PositionalEncoding(nn.Module):
    def __init__(self, emb_size: int, dropout: float, maxlen: int = 5000):
        super(PositionalEncoding, self).__init__()
        den = torch.exp(-torch.arange(0, emb_size, 2) * math.log(10000) / emb_size)
        pos = torch.arange(0, maxlen).reshape(maxlen, 1)
        pos_embedding = torch.zeros((maxlen, emb_size))
        pos_embedding[:, 0::2] = torch.sin(pos * den)
        pos_embedding[:, 1::2] = torch.cos(pos * den)
        pos_embedding = pos_embedding.unsqueeze(-2)

        self.dropout = nn.Dropout(dropout)
        self.register_buffer('pos_embedding', pos_embedding)

    def forward(self, token_embedding: Tensor):
        return self.dropout(token_embedding + self.pos_embedding[:token_embedding.size(0), :])

In [63]:
class TokenEmbedding(nn.Module):
    def __init__(self, vocab_size: int, emb_size):
        super(TokenEmbedding, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.emb_size = emb_size

    def forward(self, tokens: Tensor):
        return self.embedding(tokens.long()) * math.sqrt(self.emb_size)

In [64]:
# Translation model:
class Seq2SeqTransformer(nn.Module):
    def __init__(self, encoder_layer: int, decoder_layer: int,
                 emb_dim: int, n_head: int, 
                 src_vocab_size: int,
                 tgt_vocab_size: int,  
                 d_ffn: int, dropout: float
    ):
        super(Seq2SeqTransformer, self).__init__()
        self.positional_encoding = PositionalEncoding(emb_dim, dropout)
        self.src_token_embedding = TokenEmbedding(src_vocab_size, emb_dim)
        self.tgt_token_embedding = TokenEmbedding(tgt_vocab_size, emb_dim)
        self.Transformer = Transformer(d_model=emb_dim, nhead=n_head,
                                       num_encoder_layers=encoder_layer,
                                       num_decoder_layers=decoder_layer,
                                       dim_feedforward=d_ffn, dropout=dropout,
                                       batch_first=True)
        self.generator = nn.Linear(emb_dim, tgt_vocab_size)
        

    def count_param(self):
            return sum(p.numel() for p in self.Transformer.parameters())
    
    def encode(self, src: Tensor, src_mask: Tensor):
        return self.Transformer.encoder(self.positional_encoding
                                        (self.src_token_embedding(src)), src_mask
                                        )
    
    def decode(self, tgt: Tensor, memory: Tensor, tgt_mask: Tensor):
        return self.Transformer.decoder(self.positional_encoding
                                        (self.tgt_token_embedding(tgt)), memory, tgt_mask
                                        )
    
    def forward(self, 
                src: Tensor, 
                tgt: Tensor,
                src_mask: Tensor, tgt_mask: Tensor,
                src_padding_mask: Tensor,
                tgt_padding_mask: Tensor,
                memory_key_padding_mask: Tensor) -> Tensor:
        
        src_emb = self.positional_encoding(self.src_token_embedding(src))
        tgt_emb = self.positional_encoding(self.tgt_token_embedding(tgt))
        
        outs = self.Transformer(src_emb, tgt_emb, src_mask, tgt_mask, None,
                                src_padding_mask, tgt_padding_mask, memory_key_padding_mask)
        
        return self.generator(outs)

In [65]:
def generate_square_subsequent_mask(sz):
    mask = (torch.triu(torch.ones((sz, sz), device=DEVICE)) == 1).transpose(0, 1)
    mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
    return mask

def create_mask(src, tgt):
    src_seq_len = src.shape[1]
    tgt_seq_len = tgt.shape[1]

    tgt_mask = generate_square_subsequent_mask(tgt_seq_len)
    src_mask = torch.zeros((src_seq_len, src_seq_len), device=DEVICE).type(torch.bool)

    src_padding_mask = (src == PAD_IDX)
    tgt_padding_mask = (tgt == PAD_IDX)
    return src_mask, tgt_mask, src_padding_mask, tgt_padding_mask

In [66]:
import time
device = DEVICE

def train_epoch(model, optimizer, criterion, train_dataloader, device):
    model.train()
    losses = []
    
    for src_ids, tgt_ids in train_dataloader:
        src_ids, tgt_ids = src_ids.to(device), tgt_ids.to(device)
        tgt_input = tgt_ids[:,:-1]
        tgt_output = tgt_ids[:,1:]
        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src_ids, tgt_input)
        try:
            output = model(src_ids, tgt_input, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask)
        except:
            print(src_ids.shape, tgt_input.shape)
        
        optimizer.zero_grad()
        loss = criterion(output.reshape(-1, output.shape[-1]), tgt_output.reshape(-1))
        loss.backward()
        
        optimizer.step()
        losses.append(loss.item())
        
        return sum(losses)/len(losses) 
    
        

In [67]:
def evaluate(model, data_loader, criterion, device):
    model.eval()
    losses = []
    with torch.no_grad():
        for src_ids, tgt_ids in data_loader:
            src_ids, tgt_ids = src_ids.to(device), tgt_ids.to(device)
            tgt_input = tgt_ids[:,:-1]
            tgt_output = tgt_ids[:,1:]
            src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src_ids, tgt_input)
            output = model(src_ids, tgt_input, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask)
            loss = criterion(output.reshape(-1, output.shape[-1]), tgt_output.reshape(-1))
            losses.append(loss.item())
        return sum(losses)/len(losses)

In [68]:
def train(model, train_dataloader, valid_dataloader, optimizer, criterion, device, epochs):
    for epoch in range(1, epochs+1):
        start_time = time.time()
        train_loss = train_epoch(model, optimizer, criterion, train_dataloader, device)
        end_time = time.time()
        val_loss = evaluate(model, valid_dataloader, criterion, device)
        print(f"Epoch: {epoch}, Train loss: {train_loss:.3f}, Val loss: {val_loss:.3f}, Epoch time = {(end_time - start_time):.3f}s")

In [69]:
SRC_VOCAB_SIZE = len(vocab_transform[SRC_LANGUAGE])
TGT_VOCAB_SIZE = len(vocab_transform[TGT_LANGUAGE])
EMB_SIZE = 512
NHEAD = 8
FFN_HID_DIM = 512
BATCH_SIZE = 128
NUM_ENCODER_LAYERS = 5
NUM_DECODER_LAYERS = 5

In [70]:
transformer = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE, NHEAD,SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, FFN_HID_DIM, 0.1).to(DEVICE)

In [71]:
criterion = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)
optimizer = torch.optim.Adam(transformer.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)
epochs = 10
train(transformer, train_dataloader, valid_dataloader, optimizer, criterion, DEVICE, epochs)

AttributeError: 'NoneType' object has no attribute 'to'

# Make dataset

In [None]:
# I'm making the fundamental translation here (eng to vi, vice versa)
print(f"Loading dataset {config['DATASET_1']}...")
print(f"Loading dataset {config['DATASET_2']}...")

NameError: name 'config' is not defined

In [None]:
# Download the dataset via the Hugging Face Datasets library
data = load_dataset(config['DATASET_1'], config['DATASET_2'])

In [None]:
# Ok so let dive in and see what the data looks like
data['train']['translation'][:5]

[{'en': 'Rachel Pike : The science behind a climate headline',
  'vi': 'Khoa học đằng sau một tiêu đề về khí hậu'},
 {'en': 'In 4 minutes , atmospheric chemist Rachel Pike provides a glimpse of the massive scientific effort behind the bold headlines on climate change , with her team -- one of thousands who contributed -- taking a risky flight over the rainforest in pursuit of data on a key molecule .',
  'vi': 'Trong 4 phút , chuyên gia hoá học khí quyển Rachel Pike giới thiệu sơ lược về những nỗ lực khoa học miệt mài đằng sau những tiêu đề táo bạo về biến đổi khí hậu , cùng với đoàn nghiên cứu của mình -- hàng ngàn người đã cống hiến cho dự án này -- một chuyến bay mạo hiểm qua rừng già để tìm kiếm thông tin về một phân tử then chốt .'},
 {'en': 'I &apos;d like to talk to you today about the scale of the scientific effort that goes into making the headlines you see in the paper .',
  'vi': 'Tôi muốn cho các bạn biết về sự to lớn của những nỗ lực khoa học đã góp phần làm nên các dòng t

In [None]:
# Amazing, the format is a dictionary with the source and target language as keys, showcasing the translation and potential data crawling methods

In [None]:
# Tokenization using torchtext basic english tokenizer
token_transform = {}
vocab_transform = {}

# Tokenization function for the source and target language
token_transform[config['SRC_LANGUAGE']] = get_tokenizer(config['TOKENIZER'])
token_transform[config['TGT_LANGUAGE']] = get_tokenizer(config['TOKENIZER']) # Yes, we're using english tokenizer for vietnamese

In [None]:
# Specify important tokens (special symbols)
special_symbols = ['<unk>', '<pad>', '<bos>', '<eos>']

In [None]:
data

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 133318
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 1269
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 1269
    })
})

In [None]:
# Build vocabulary for the source and target language
def yield_tokens(data_iter, language):
    for data_sample in data_iter['translation']:
        yield token_transform[language](data_sample[language])

for language in [config['SRC_LANGUAGE'], config['TGT_LANGUAGE']]:
    # Training data iterator
    vocab_transform[language] = build_vocab_from_iterator(yield_tokens(data['train'], language),
                                                          min_freq=1,
                                                          specials=special_symbols,
                                                          special_first=True)
    vocab_transform[language].set_default_index(config['UNK_IDX'])

In [None]:
# Create dataloader

# Modelling

## Set device

In [None]:
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## Make the Positional Encoding for Transformer

In [None]:
class PostionalEncoding(nn.Module):
    def __init__(self, emb_dim: int,
                 dropout: float,
                 maxlen: int = 5000):
        super(PostionalEncoding, self).__init__()
        den = torch.exp(- torch.arange(0, emb_dim, 2) * math.log(10000) / emb_dim)
        pos = torch.arange(0, maxlen).reshape(maxlen, 1)
        pos_embedding = torch.zeros((maxlen, emb_dim))
        pos_embedding[:, 0::2] = torch.sin(pos * den)
        pos_embedding[:, 1::2] = torch.cos(pos * den)
        pos_embedding = pos_embedding.unsqueeze(-2)
        
        self.dropout = nn.Dropout(dropout)
        self.register_buffer('pos_embedding', pos_embedding)
        
    def forward(self, token_embedding: Tensor):
        return self.dropout(token_embedding + self.pos_embedding[:token_embedding.size(0), :])

In [None]:
class TokenEmbedding(nn.Module):
    def __init__(self, vocab_size: int, emb_dim: int):
        super(TokenEmbedding, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.emb_dim = emb_dim
        
    def forward(self, tokens: Tensor):
        return self.embedding(tokens.long()) * math.sqrt(self.emb_dim)