In [1]:
# Colab-only setup: mount Google Drive and make the project folder the working directory.
from google.colab import drive
drive.mount('/content/drive')
# Relative paths used by later cells (e.g. 'model.pt') resolve against this folder.
%cd /content/drive/MyDrive/somethings
%ls

Mounted at /content/drive
/content/drive/MyDrive/somethings
code.ipynb  main.ipynb  model14.pt  model15.pt  model16.pt  model17.pt  model18.pt  Untitled0.ipynb


In [2]:
# Install the Hugging Face `datasets` package that provides load_dataset below.
!pip install datasets
# Optional installs left disabled; no cell shown in this notebook imports torchdata or spacy.
# !pip install -U torchdata
# !pip install -U spacy
# !python -m spacy download en_core_web_sm
# !python -m spacy download hu_core_news_sm

Collecting datasets
  Downloading datasets-2.19.2-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.1/542.1 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
Collecting requests>=2.32.1 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K

In [3]:
from datasets import load_dataset
from torch import Tensor
import torch
import torch.nn as nn
from torch.nn import Transformer
import math
import os
# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [4]:
# Slice the 'train' split of the en-hu pairs three ways:
# first 80% for training, the next 10% for validation, the last 10% for testing.
train_ds, val_ds, test_ds = load_dataset(
    path='Helsinki-NLP/opus_books',
    name='en-hu',
    split=['train[:80%]', 'train[80%:-10%]', 'train[-10%:]'],
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/28.1k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/23.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/137151 [00:00<?, ? examples/s]

## 1. Load data and tokenizers

In [5]:
# Huggingface datasets and tokenizers
from datasets import load_dataset
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace

In [6]:
def get_all_sentences(ds, lang):
    """Lazily yield the `lang` text of every item's 'translation' entry in `ds`."""
    yield from (record['translation'][lang] for record in ds)

In [9]:
# Ids of the special tokens. They are intended to line up with the order of
# `special_tokens` passed to WordLevelTrainer in get_tokenizer
# (assumes the trainer assigns ids in list order -- TODO confirm).
UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = 0, 1, 2, 3
def get_tokenizer(ds,lang,path_tokenizer=""):
    """Load a word-level tokenizer from disk, or train one on `ds` for `lang`.

    Args:
        ds: iterable of items whose 'translation' entry holds the `lang` text
            (consumed through get_all_sentences).
        lang: key of the language to train on, e.g. 'en' or 'hu'.
        path_tokenizer: optional file path used as a cache. If the file exists
            the tokenizer is loaded from it; otherwise a new tokenizer is
            trained and written to this path. An empty string (the default)
            disables both loading and saving.

    Returns:
        A `tokenizers.Tokenizer` splitting on whitespace with a word-level vocabulary.
    """
    if path_tokenizer and os.path.isfile(path_tokenizer):
        return Tokenizer.from_file(path_tokenizer)

    tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
    tokenizer.pre_tokenizer = Whitespace()
    # min_frequency=2: words seen only once are left out of the vocabulary.
    trainer = WordLevelTrainer(special_tokens=["[UNK]", "[PAD]", "[BOS]", "[EOS]"], min_frequency=2)
    tokenizer.train_from_iterator(get_all_sentences(ds, lang), trainer=trainer)
    if path_tokenizer:
        # Persist the trained tokenizer so the next call with the same path can load it
        # instead of retraining (previously the path was only ever read, never written).
        tokenizer.save(path_tokenizer)
    return tokenizer


In [10]:
# One tokenizer per language, both trained on the training split only.
tokenizer = {lang: get_tokenizer(train_ds, lang) for lang in ('en', 'hu')}

## 2. Build Model

In [11]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to token embeddings, then apply dropout.

    A table of shape (maxlen, 1, emb_size) is precomputed and stored as the
    non-trainable buffer `pos_embedding`: even feature columns hold sines and
    odd feature columns hold cosines of position-dependent angles.

    Args:
        emb_size: feature width of the embeddings (even or odd).
        dropout: dropout probability applied after the addition.
        maxlen: number of positions precomputed.
    """

    def __init__(self,emb_size: int,dropout: float,maxlen: int = 5000):
        super(PositionalEncoding, self).__init__()
        den = torch.exp(- torch.arange(0, emb_size, 2)* math.log(10000) / emb_size)
        pos = torch.arange(0, maxlen).reshape(maxlen, 1)
        angles = pos * den
        pos_embedding = torch.zeros((maxlen, emb_size))
        pos_embedding[:, 0::2] = torch.sin(angles)
        # With an odd emb_size there is one fewer odd column than even columns, so the
        # cosine half must drop the last frequency; for even sizes this slice is a no-op.
        pos_embedding[:, 1::2] = torch.cos(angles[:, :emb_size // 2])
        # Insert a singleton axis so the table broadcasts over the middle dimension of the input.
        pos_embedding = pos_embedding.unsqueeze(-2)

        self.dropout = nn.Dropout(dropout)
        self.register_buffer('pos_embedding', pos_embedding)

    def forward(self, token_embedding: Tensor):
        """Return dropout(token_embedding + positions), using the first size(0) table rows."""
        return self.dropout(token_embedding + self.pos_embedding[:token_embedding.size(0), :])

class TokenEmbedding(nn.Module):
    """Embedding lookup whose vectors are multiplied by sqrt(emb_size)."""

    def __init__(self, vocab_size: int, emb_size):
        super().__init__()
        self.emb_size = emb_size
        self.embedding = nn.Embedding(vocab_size, emb_size)

    def forward(self, tokens: Tensor):
        """Cast `tokens` to int64, look them up, and scale the result."""
        scale = math.sqrt(self.emb_size)
        vectors = self.embedding(tokens.long())
        return vectors * scale

class Seq2SeqTransformer(nn.Module):
    """Translation model: token + positional embeddings around torch.nn.Transformer.

    Sub-modules: the Transformer core, a linear `generator` projecting
    `emb_size` features to `tgt_vocab_size` logits, one TokenEmbedding per
    vocabulary, and a single PositionalEncoding shared by source and target.
    Inputs are presumably sequence-first, i.e. (seq_len, batch), since
    `batch_first` is not passed to the Transformer -- TODO confirm against callers.
    """

    def __init__(self,
                 num_encoder_layers: int,
                 num_decoder_layers: int,
                 emb_size: int,
                 nhead: int,
                 src_vocab_size: int,
                 tgt_vocab_size: int,
                 dim_feedforward: int = 512,
                 dropout: float = 0.1):
        super().__init__()
        # Creation order is kept as-is: it determines parameter registration order.
        self.transformer = Transformer(
            d_model=emb_size,
            nhead=nhead,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
        )
        self.generator = nn.Linear(emb_size, tgt_vocab_size)
        self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size)
        self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size)
        self.positional_encoding = PositionalEncoding(emb_size, dropout=dropout)

    def _embed_source(self, src: Tensor):
        """Source token ids -> embeddings with positional encoding applied."""
        return self.positional_encoding(self.src_tok_emb(src))

    def _embed_target(self, tgt: Tensor):
        """Target token ids -> embeddings with positional encoding applied."""
        return self.positional_encoding(self.tgt_tok_emb(tgt))

    def forward(self,
                src: Tensor,
                trg: Tensor,
                src_mask: Tensor,
                tgt_mask: Tensor,
                src_padding_mask: Tensor,
                tgt_padding_mask: Tensor,
                memory_key_padding_mask: Tensor):
        """Run the full encoder-decoder pass and return target-vocabulary logits."""
        src_emb = self._embed_source(src)
        tgt_emb = self._embed_target(trg)
        hidden = self.transformer(
            src_emb,
            tgt_emb,
            src_mask=src_mask,
            tgt_mask=tgt_mask,
            memory_mask=None,
            src_key_padding_mask=src_padding_mask,
            tgt_key_padding_mask=tgt_padding_mask,
            memory_key_padding_mask=memory_key_padding_mask,
        )
        return self.generator(hidden)

    def encode(self, src: Tensor, src_mask: Tensor):
        """Run only the encoder stack on the embedded source."""
        return self.transformer.encoder(self._embed_source(src), mask=src_mask)

    def decode(self, tgt: Tensor, memory: Tensor, tgt_mask: Tensor):
        """Run only the decoder stack on the embedded target against `memory`."""
        return self.transformer.decoder(self._embed_target(tgt), memory, tgt_mask=tgt_mask)


In [12]:
def generate_square_subsequent_mask(sz):
    """Return an (sz, sz) float mask on DEVICE: 0.0 on and below the diagonal, -inf above it."""
    # Positions strictly above the diagonal are the ones to block.
    blocked = torch.triu(torch.ones((sz, sz), device=DEVICE), diagonal=1) == 1
    return torch.zeros((sz, sz), device=DEVICE).masked_fill(blocked, float('-inf'))


def create_mask(src, tgt):
    """Build the attention and padding masks for one source/target batch.

    `src` and `tgt` are indexed as (seq_len, batch): dimension 0 is read as the
    sequence length and the padding masks are transposed to (batch, seq_len).

    Returns:
        (src_mask, tgt_mask, src_padding_mask, tgt_padding_mask) where
        src_mask is an all-False boolean square matrix, tgt_mask comes from
        generate_square_subsequent_mask, and the padding masks are True
        wherever the token id equals PAD_IDX.
    """
    src_len, tgt_len = src.shape[0], tgt.shape[0]

    # All-False: no source position is hidden from any other.
    src_mask = torch.zeros((src_len, src_len), dtype=torch.bool, device=DEVICE)
    tgt_mask = generate_square_subsequent_mask(tgt_len)

    src_padding_mask, tgt_padding_mask = (
        (ids == PAD_IDX).transpose(0, 1) for ids in (src, tgt)
    )
    return src_mask, tgt_mask, src_padding_mask, tgt_padding_mask

## 3. Initialize Parameters and Model

In [13]:
# Seed torch's RNG before the model is built so parameter initialisation is repeatable.
torch.manual_seed(0)

# Vocabulary sizes come straight from the trained tokenizers.
SRC_VOCAB_SIZE = tokenizer['en'].get_vocab_size()
TGT_VOCAB_SIZE = tokenizer['hu'].get_vocab_size()
# Embedding width; NOTE(review): get_ds also uses this value as its sequence-length cap.
EMB_SIZE = 128
NHEAD = 8
FFN_HID_DIM = 128
BATCH_SIZE = 128
# Alternative batch size kept for reference:
# BATCH_SIZE = 16
NUM_ENCODER_LAYERS = 3
NUM_DECODER_LAYERS = 3

transformer = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE,
                                 NHEAD, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, FFN_HID_DIM)

# Re-initialise every parameter with more than one dimension using Xavier-uniform;
# 1-D parameters keep their default initialisation.
for p in transformer.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

transformer = transformer.to(DEVICE)

# Targets equal to PAD_IDX are excluded from the loss.
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)

optimizer = torch.optim.Adam(transformer.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)



## 4. Train and evaluate model

In [14]:
from torch.nn.utils.rnn import pad_sequence

# # function to add BOS/EOS and create tensor for input sequence indices
def tensor_transform(token_ids):
    """Wrap an encoded sentence's ids in BOS/EOS and return them as a 1-D int64 tensor.

    Args:
        token_ids: object exposing the token ids as a sequence of ints on `.ids`
            (the notebook passes the result of `tokenizer[...].encode(...)`).

    Returns:
        Tensor of dtype torch.long and length len(token_ids.ids) + 2.
    """
    # Build the tensor in one step with an explicit dtype: for an empty id list
    # torch.tensor([]) is float32, which can promote a concatenated result to float.
    return torch.tensor([BOS_IDX, *token_ids.ids, EOS_IDX], dtype=torch.long)

# def collate_fn(batch):
#     src_batch, tgt_batch = [], []
#     for sample in batch:
#         src_batch.append(tensor_transform(tokenizer['en'].encode(sample['en'].rstrip("\n"))))
#         tgt_batch.append(tensor_transform(tokenizer['hu'].encode(sample['hu'].rstrip("\n"))))

#     src_batch = pad_sequence(src_batch, padding_value=PAD_IDX)
#     tgt_batch = pad_sequence(tgt_batch, padding_value=PAD_IDX)
#     return src_batch, tgt_batch

In [15]:
def get_ds(data_raw, max_len=None):
    """Tokenize translation pairs and return them as padded (src, tgt) id tensors.

    Args:
        data_raw: iterable of dicts holding the 'en' and 'hu' sentence strings.
        max_len: maximum number of ids kept per sentence, counting BOS/EOS.
            Defaults to EMB_SIZE, the cap this function has always applied;
            pass an explicit value to decouple it from the embedding width.

    Returns:
        A list with one (src_ids, tgt_ids) pair of 1-D tensors per sample.
        All src tensors share one length and all tgt tensors share one length,
        padded with PAD_IDX. An empty `data_raw` yields an empty list.

    Note:
        Sentences longer than `max_len` are cut off, which also drops their EOS.
    """
    if max_len is None:
        max_len = EMB_SIZE

    src_seqs, tgt_seqs = [], []
    for sample in data_raw:
        src_ids = tensor_transform(tokenizer['en'].encode(sample['en'].rstrip("\n")))
        tgt_ids = tensor_transform(tokenizer['hu'].encode(sample['hu'].rstrip("\n")))
        # Truncate before padding so the padded matrix never grows beyond max_len rows.
        src_seqs.append(src_ids[:max_len])
        tgt_seqs.append(tgt_ids[:max_len])

    if not src_seqs:
        return []

    # pad_sequence stacks as (seq_len, n_samples); transpose to one row per sample.
    src_batch = pad_sequence(src_seqs, padding_value=PAD_IDX).T
    tgt_batch = pad_sequence(tgt_seqs, padding_value=PAD_IDX).T
    return list(zip(src_batch, tgt_batch))

In [16]:
from torch.utils.data import DataLoader

def train_epoch(model, optimizer):
    """Run one pass over the training split and return the mean per-batch loss.

    Args:
        model: network called as model(src, tgt_input, src_mask, tgt_mask,
            src_padding_mask, tgt_padding_mask, memory_key_padding_mask).
        optimizer: optimizer that updates the model's parameters.

    Returns:
        Sum of the batch losses divided by the number of batches.
    """
    model.train()
    losses = 0

    # NOTE: get_ds tokenizes the whole split on every call, i.e. once per epoch.
    train = get_ds(train_ds['translation'])
    train_dataloader = DataLoader(train, batch_size=BATCH_SIZE)

    for src, tgt in train_dataloader:
        # Batches arrive with one sentence per row; transpose so dimension 0 is the
        # sequence position, which is how create_mask reads the shapes.
        src = src.T.to(DEVICE)
        tgt = tgt.T.to(DEVICE)

        # Teacher forcing: feed all but the last target token, predict all but the first.
        tgt_input = tgt[:-1, :]
        tgt_out = tgt[1:, :]

        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

        optimizer.zero_grad()
        logits = model(src, tgt_input, src_mask, tgt_mask,src_padding_mask, tgt_padding_mask, src_padding_mask)

        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
        loss.backward()

        optimizer.step()
        losses += loss.item()

    # len(DataLoader) is the batch count; no need to iterate the loader again to count.
    return losses / len(train_dataloader)


def evaluate(model):
    """Return the mean per-batch loss of `model` on the validation split.

    The model is switched to eval mode and the pass runs without gradient
    tracking, since nothing is back-propagated here.

    Args:
        model: network called as model(src, tgt_input, src_mask, tgt_mask,
            src_padding_mask, tgt_padding_mask, memory_key_padding_mask).

    Returns:
        Sum of the batch losses divided by the number of batches.
    """
    model.eval()
    losses = 0

    val = get_ds(val_ds['translation'])
    val_dataloader = DataLoader(val, batch_size=BATCH_SIZE)

    with torch.no_grad():
        for src, tgt in val_dataloader:
            # One sentence per row on arrival; transpose so dimension 0 is the position.
            src = src.T.to(DEVICE)
            tgt = tgt.T.to(DEVICE)

            tgt_input = tgt[:-1, :]
            tgt_out = tgt[1:, :]

            src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

            logits = model(src, tgt_input, src_mask, tgt_mask,src_padding_mask, tgt_padding_mask, src_padding_mask)

            loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
            losses += loss.item()

    # len(DataLoader) is the batch count; no need to iterate the loader again to count.
    return losses / len(val_dataloader)

In [None]:
from timeit import default_timer as timer

NUM_EPOCHS = 18

# Train for NUM_EPOCHS epochs; after each one report the losses and overwrite the checkpoint.
for epoch in range(1, NUM_EPOCHS + 1):
    start_time = timer()
    train_loss = train_epoch(transformer, optimizer)
    end_time = timer()  # stopped before validation, so the printed time covers training only
    val_loss = evaluate(transformer)
    print(
        f"Epoch: {epoch}, Train loss: {train_loss:.3f}, Val loss: {val_loss:.3f}, "
        f"Epoch time = {(end_time - start_time):.3f}s"
    )

    # The same file is rewritten every epoch, so only the latest state is kept.
    PATH = "model.pt"
    torch.save(
        {
            'epoch': epoch,
            'model_state_dict': transformer.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': val_loss,
        },
        PATH,
    )



Epoch: 1, Train loss: 7.057, Val loss: 5.991, Epoch time = 277.689s
Epoch: 2, Train loss: 5.780, Val loss: 5.625, Epoch time = 278.805s
Epoch: 3, Train loss: 5.543, Val loss: 5.472, Epoch time = 277.346s
Epoch: 4, Train loss: 5.390, Val loss: 5.357, Epoch time = 278.992s
Epoch: 5, Train loss: 5.258, Val loss: 5.260, Epoch time = 277.604s
Epoch: 6, Train loss: 5.139, Val loss: 5.170, Epoch time = 278.485s
Epoch: 7, Train loss: 5.033, Val loss: 5.093, Epoch time = 280.428s
Epoch: 8, Train loss: 4.938, Val loss: 5.020, Epoch time = 278.119s
Epoch: 9, Train loss: 4.854, Val loss: 4.956, Epoch time = 277.898s
Epoch: 10, Train loss: 4.778, Val loss: 4.896, Epoch time = 276.853s
Epoch: 11, Train loss: 4.708, Val loss: 4.828, Epoch time = 279.725s
Epoch: 12, Train loss: 4.644, Val loss: 4.761, Epoch time = 277.566s
Epoch: 13, Train loss: 4.584, Val loss: 4.695, Epoch time = 280.325s
Epoch: 14, Train loss: 4.526, Val loss: 4.636, Epoch time = 278.452s
Epoch: 15, Train loss: 4.472, Val loss: 4.5

## 5. Test

In [17]:
# Restore model and optimizer state from the checkpoint file. Tensors are
# deserialised onto the CPU and copied into the existing objects from there.
checkpoint = torch.load('model.pt', map_location='cpu')
transformer.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
# Bookkeeping values stored next to the weights.
epoch, loss = checkpoint['epoch'], checkpoint['loss']

In [28]:
# function to generate output sequence using greedy algorithm
def greedy_decode(model, src, src_mask, max_len, start_symbol):
    """Generate target token ids one at a time, always taking the highest-scoring token.

    Args:
        model: object providing encode(src, src_mask), decode(ys, memory, tgt_mask)
            and a `generator` that maps decoder output to vocabulary scores.
        src: source token ids for a single sentence; presumably shaped
            (src_len, 1) as produced by translate -- TODO confirm for other callers.
        src_mask: attention mask handed to model.encode.
        max_len: upper bound on the length of the returned sequence, start symbol included.
        start_symbol: token id used to seed the output.

    Returns:
        Long tensor of shape (generated_len, 1) that starts with `start_symbol`
        and ends at EOS_IDX if that token was produced before `max_len` was reached.
    """
    src = src.to(DEVICE)
    src_mask = src_mask.to(DEVICE)

    # Inference only: skip autograd bookkeeping so no graph is built per step.
    with torch.no_grad():
        # The encoder output does not change between steps, so compute and move it once.
        memory = model.encode(src, src_mask).to(DEVICE)
        ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).to(DEVICE)
        for _ in range(max_len-1):
            tgt_mask = (generate_square_subsequent_mask(ys.size(0))
                        .type(torch.bool)).to(DEVICE)
            out = model.decode(ys, memory, tgt_mask)
            out = out.transpose(0, 1)
            # Score only the most recent position.
            prob = model.generator(out[:, -1])
            _, next_word = torch.max(prob, dim=1)
            next_word = next_word.item()

            ys = torch.cat([ys,
                            torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0)
            if next_word == EOS_IDX:
                break
    return ys


# actual function to translate input sentence into target language
def translate(model: torch.nn.Module, src_sentence: str):
    """Translate one English sentence to Hungarian using greedy decoding.

    Args:
        model: trained model accepted by greedy_decode.
        src_sentence: raw source-language text.

    Returns:
        The decoded target-language string produced by tokenizer['hu'].decode.
    """
    model.eval()
    src = tensor_transform(tokenizer['en'].encode(src_sentence))
    # Column vector: one sentence, sequence positions along dimension 0.
    src = src.reshape(len(src), -1)

    num_tokens = src.shape[0]
    # All-False mask: every source position may attend to every other.
    src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool)
    with torch.no_grad():
        # Allow the output to be a few tokens longer than the input.
        tgt_tokens = greedy_decode(
            model,  src, src_mask, max_len=num_tokens + 5, start_symbol=BOS_IDX).flatten()

    # tolist() hands the tokenizer plain Python ints rather than numpy scalars.
    return tokenizer['hu'].decode(tgt_tokens.cpu().tolist())

In [33]:
# Smoke test: translate the English side of the first test-split pair.
translate(transformer, test_ds['translation'][0]['en'])

'Mi lesz a ?'

In [37]:
# Mean per-batch loss on the held-out test split, computed the same way as evaluate().
transformer.eval()
losses = 0

test=get_ds(test_ds['translation'])
test_dataloader = DataLoader(test, batch_size=BATCH_SIZE)

# No parameters are updated here, so gradient tracking is switched off.
with torch.no_grad():
    for src, tgt in test_dataloader:
        # One sentence per row on arrival; transpose so dimension 0 is the position.
        src = src.T.to(DEVICE)
        tgt = tgt.T.to(DEVICE)

        tgt_input = tgt[:-1, :]
        tgt_out = tgt[1:, :]

        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

        logits = transformer(src, tgt_input, src_mask, tgt_mask,src_padding_mask, tgt_padding_mask, src_padding_mask)

        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
        losses += loss.item()

# len(DataLoader) is the batch count; iterating the loader again just to count is unnecessary.
print("Test loss: ",losses / len(test_dataloader))

Test loss:  4.547796748302601
