# Setup for cloud environments

Remember to restart the kernel after the installation finishes.

In [None]:
!pip install torch==2.1.2 torchvision==0.16.2 torchaudio==2.1.2 --index-url https://download.pytorch.org/whl/cu118

# Import useful libraries

In [2]:
from datasets import load_dataset

import configparser
import json
import torch
from torch import Tensor
import torch.nn as nn
from torch.nn import Transformer
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

import math

In [5]:
# Load the notebook settings. Later cells read the keys DATASET_1, DATASET_2,
# SRC_LANGUAGE, TGT_LANGUAGE, TOKENIZER and UNK_IDX from this dict.
# The encoding is given explicitly so the file is parsed the same way on every
# platform instead of depending on the locale's default encoding.
with open('../config.json', encoding='utf-8') as f:
    config = json.load(f)

# Make dataset

In [6]:
# Basic bilingual translation task (English <-> Vietnamese).
# Echo the two dataset identifiers taken from the config before downloading.
for dataset_key in ('DATASET_1', 'DATASET_2'):
    print(f"Loading dataset {config[dataset_key]}...")

Loading dataset mt_eng_vietnamese...
Loading dataset iwslt2015-en-vi...


In [8]:
# Fetch the corpus through the Hugging Face Datasets library.
# DATASET_1 is passed as the dataset path and DATASET_2 as its configuration name.
data = load_dataset(path=config['DATASET_1'], name=config['DATASET_2'])

In [9]:
# Peek at the first five training pairs to see what the data looks like.
# Slice the rows first and then pick the column: indexing the column first
# would fetch every training row before the slice is applied.
data['train'][:5]['translation']

[{'en': 'Rachel Pike : The science behind a climate headline',
  'vi': 'Khoa học đằng sau một tiêu đề về khí hậu'},
 {'en': 'In 4 minutes , atmospheric chemist Rachel Pike provides a glimpse of the massive scientific effort behind the bold headlines on climate change , with her team -- one of thousands who contributed -- taking a risky flight over the rainforest in pursuit of data on a key molecule .',
  'vi': 'Trong 4 phút , chuyên gia hoá học khí quyển Rachel Pike giới thiệu sơ lược về những nỗ lực khoa học miệt mài đằng sau những tiêu đề táo bạo về biến đổi khí hậu , cùng với đoàn nghiên cứu của mình -- hàng ngàn người đã cống hiến cho dự án này -- một chuyến bay mạo hiểm qua rừng già để tìm kiếm thông tin về một phân tử then chốt .'},
 {'en': 'I &apos;d like to talk to you today about the scale of the scientific effort that goes into making the headlines you see in the paper .',
  'vi': 'Tôi muốn cho các bạn biết về sự to lớn của những nỗ lực khoa học đã góp phần làm nên các dòng t

In [10]:
# Each record is a dictionary keyed by language code ('en', 'vi') that holds one parallel sentence pair.
# Note that the raw text still contains HTML entities such as `&apos;`, a leftover of how the corpus was collected.

In [11]:
# Per-language lookup tables:
#   token_transform: language code -> tokenizer callable
#   vocab_transform: language code -> vocabulary (starts empty here)
# Both languages get a tokenizer built from the same config['TOKENIZER']
# setting. Per the original note this is torchtext's English tokenizer, so it
# is applied to the Vietnamese side as well.
token_transform = {
    config[language_key]: get_tokenizer(config['TOKENIZER'])
    for language_key in ('SRC_LANGUAGE', 'TGT_LANGUAGE')
}
vocab_transform = {}

In [12]:
# Special symbols handed to the vocabulary builder as `specials`.
# Their order in this list is significant: it is the order they are passed in.
special_symbols = [
    '<unk>',  # unknown token
    '<pad>',  # padding
    '<bos>',  # beginning of sequence
    '<eos>',  # end of sequence
]

In [13]:
# Display the dataset object to see which splits exist and how many rows each has.
data

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 133318
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 1269
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 1269
    })
})

In [18]:
# Build vocabulary for the source and target language
def yield_tokens(data_iter, language):
    """Lazily tokenize one language side of every translation pair.

    Args:
        data_iter: Object whose ``'translation'`` entry is an iterable of
            samples, each indexable by language code.
        language: Language code; selects both the text inside each sample and
            the tokenizer looked up in the module-level ``token_transform``.

    Yields:
        Whatever the tokenizer returns for each sample's text, in order.
    """
    yield from (
        token_transform[language](pair[language])
        for pair in data_iter['translation']
    )

# Build one vocabulary per language, using the training split only.
for language in (config['SRC_LANGUAGE'], config['TGT_LANGUAGE']):
    train_tokens = yield_tokens(data['train'], language)
    vocab = build_vocab_from_iterator(
        train_tokens,
        min_freq=1,                # keep every token seen at least once
        specials=special_symbols,
        special_first=True,        # special symbols are placed before regular tokens
    )
    vocab_transform[language] = vocab
    # Tokens missing from the vocabulary resolve to config['UNK_IDX'].
    # NOTE(review): presumably this is the index of '<unk>' in special_symbols — confirm.
    vocab.set_default_index(config['UNK_IDX'])

In [19]:
# Create dataloader

# Modelling

## Set device

In [20]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device('cuda')
else:
    DEVICE = torch.device('cpu')

## Make the Positional Encoding for Transformer

In [21]:
class PostionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to token embeddings.

    A table of shape ``(maxlen, 1, emb_dim)`` is computed once in ``__init__``
    and registered as a buffer (so it is not a trainable parameter). Even
    feature columns hold ``sin(pos * den)`` and odd feature columns hold
    ``cos(pos * den)``, where ``den = exp(-2i * ln(10000) / emb_dim)``.

    Args:
        emb_dim: Size of the embedding (last) dimension. Odd values are
            supported: the cosine half then has one column fewer than the
            sine half.
        dropout: Dropout probability applied to the sum of the embedding and
            the positional table.
        maxlen: Number of positions precomputed in the table.
    """

    def __init__(self, emb_dim: int,
                 dropout: float,
                 maxlen: int = 5000):
        super().__init__()
        den = torch.exp(- torch.arange(0, emb_dim, 2) * math.log(10000) / emb_dim)
        pos = torch.arange(0, maxlen).reshape(maxlen, 1)
        # (maxlen, ceil(emb_dim / 2)) matrix of position * frequency products.
        angles = pos * den
        pos_embedding = torch.zeros((maxlen, emb_dim))
        pos_embedding[:, 0::2] = torch.sin(angles)
        # For odd emb_dim there are only emb_dim // 2 odd columns, so drop the
        # surplus frequency; for even emb_dim this slice is a no-op.
        pos_embedding[:, 1::2] = torch.cos(angles[:, :emb_dim // 2])
        # Insert a singleton axis so the table broadcasts over the middle
        # dimension of the input.
        pos_embedding = pos_embedding.unsqueeze(-2)

        self.dropout = nn.Dropout(dropout)
        self.register_buffer('pos_embedding', pos_embedding)

    def forward(self, token_embedding: Tensor):
        """Return ``dropout(token_embedding + positional table)``.

        The table is sliced to the first ``token_embedding.size(0)`` positions,
        so dimension 0 of the input is treated as the sequence position.
        NOTE(review): presumably the input is (seq_len, batch, emb_dim) — confirm
        against the caller.
        """
        return self.dropout(token_embedding + self.pos_embedding[:token_embedding.size(0), :])

In [None]:
class TokenEmbedding(nn.Module):
    """Look up token embeddings and scale them by ``sqrt(emb_dim)``.

    Args:
        vocab_size: Number of rows in the embedding table.
        emb_dim: Width of each embedding vector; also sets the scale factor.
    """

    def __init__(self, vocab_size: int, emb_dim: int):
        super().__init__()
        self.emb_dim = emb_dim
        self.embedding = nn.Embedding(vocab_size, emb_dim)

    def forward(self, tokens: Tensor):
        """Return the embeddings of ``tokens`` multiplied by ``sqrt(emb_dim)``."""
        # Indices are cast to int64 before the lookup.
        vectors = self.embedding(tokens.long())
        scale = math.sqrt(self.emb_dim)
        return vectors * scale

In [2]:
import flask

ModuleNotFoundError: No module named 'flask'