In [1]:
# Standard library imports
import json
import os

from copy import deepcopy
from typing import Iterable, List

# Third party imports
import pandas as pd
import torch
import tqdm

from nltk.tokenize import word_tokenize

from torch.utils.data import Dataset, DataLoader
from torch.utils.data.sampler import SubsetRandomSampler

from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.datasets import multi30k, Multi30k

# Local application imports
import src.constants as const
from src.dataset_utils import yield_tokens, sentence_to_tensor, load_files, build_vocab_transformation, tokenize_source, tokenize_target
from src.transcription_dataset import TranscriptionDataset
from src.syllable_splitter import split_word

In [2]:
import warnings
warnings.filterwarnings('ignore')

# Dataset preparation

In [3]:
# Root folder of the thesis data. It defaults to the original location, but can
# be overridden with the THESIS_DATA_DIR environment variable so the notebook
# can run on another machine without editing this cell.
DATA_DIR = os.environ.get('THESIS_DATA_DIR', '/mnt/d/Projects/masters-thesis/data')

# Find all the files
files = load_files(f'{DATA_DIR}/transcriptions')

# CSV table whose 'size' column is summed to obtain the total number of lines.
filepaths_to_size = pd.read_csv(f'{DATA_DIR}/filepath_to_size.csv')
lines_count = filepaths_to_size['size'].sum()

# Cell output (Bulgarian text): "Number of files: ..., number of lines: ..."
f'Брой файлове: {len(files)}, брой редове: {lines_count}'

'Брой файлове: 47825, брой редове: 80615534'

In [4]:
# Train / validation / test boundaries, expressed as sentence indices.
sentences_to_use = 50_000

# Fractions come from the project constants; the validation boundary is the
# cumulative train + validation fraction.
train_fraction = const.TRAIN_TEST_SPLIT
train_and_validation_fraction = train_fraction + const.TRAIN_VALIDATION_SPLIT

train_split = int(train_fraction * sentences_to_use)
validation_split = int(train_and_validation_fraction * sentences_to_use)

In [5]:
# All three datasets share the same tokenizers and differ only in the
# [start_index, end_index) slice of sentences they are given.
tokenizer_kwargs = dict(tokenization_src=tokenize_source, tokenization_tgt=tokenize_target)

train_dataset = TranscriptionDataset(
    files, start_index=0, end_index=train_split, **tokenizer_kwargs)
validation_dataset = TranscriptionDataset(
    files, start_index=train_split, end_index=validation_split, **tokenizer_kwargs)
test_dataset = TranscriptionDataset(
    files, start_index=validation_split, end_index=sentences_to_use, **tokenizer_kwargs)

In [6]:
# Build one torchtext Vocab per language from the training split only and
# store it in the shared const.vocab_transform mapping.
languages = (const.SRC_LANGUAGE, const.TGT_LANGUAGE)
for ln in languages:
    const.vocab_transform[ln] = build_vocab_transformation(train_dataset, ln)

Note: building the vocabularies in the cell above takes about 229 minutes.

In [7]:
from torch.nn.utils.rnn import pad_sequence

# helper function to club together sequential operations
def sequential_transforms(*transforms):
    """Compose callables left to right.

    Returns a function that feeds its argument through every transform in the
    order given and returns the last result. With no transforms the returned
    function hands its input back unchanged.
    """
    def pipeline(txt_input):
        result = txt_input
        for step in transforms:
            result = step(result)
        return result

    return pipeline

# function to add BOS/EOS and create tensor for input sequence indices
def tensor_transform(token_ids: List[int]):
    """Wrap a list of token ids with the BOS and EOS markers.

    Returns a 1-D tensor laid out as [const.BOS_IDX, *token_ids, const.EOS_IDX].
    """
    bos = torch.tensor([const.BOS_IDX])
    body = torch.tensor(token_ids)
    eos = torch.tensor([const.EOS_IDX])
    return torch.cat((bos, body, eos))


# Source pipeline: numericalize with the source vocabulary, then add BOS/EOS
# and build the index tensor.
text_transform_src = sequential_transforms(
    const.vocab_transform[const.SRC_LANGUAGE],
    tensor_transform,
)

vowels_transcription = ['a', 'ʌ', 'ɤ̞',  'ɐ', 'ɔ', 'o', 'u', 'ɛ', 'i']

# Target pipeline: same two steps, using the target vocabulary.
text_transform_tgt = sequential_transforms(
    const.vocab_transform[const.TGT_LANGUAGE],
    tensor_transform,
)

# function to collate data samples into batch tensors
def collate_fn(batch):
    """Collate (source, target) samples into a pair of padded batch tensors.

    Every sample is passed through text_transform_src / text_transform_tgt
    (looked up by name at call time), and the resulting tensors are padded
    with const.PAD_IDX. pad_sequence is used with its default layout, so the
    padded tensors are not batch-first.
    """
    encoded_pairs = [
        (text_transform_src(src_sample), text_transform_tgt(tgt_sample))
        for src_sample, tgt_sample in batch
    ]
    src_tensors = [pair[0] for pair in encoded_pairs]
    tgt_tensors = [pair[1] for pair in encoded_pairs]

    return (
        pad_sequence(src_tensors, padding_value=const.PAD_IDX),
        pad_sequence(tgt_tensors, padding_value=const.PAD_IDX),
    )

In [8]:
# One loader per split; each gets its own deep copy of the dataset and the
# shared collate function.
BATCH_SIZE = 128

train_dataloader = DataLoader(
    deepcopy(train_dataset), batch_size=BATCH_SIZE, collate_fn=collate_fn)
validation_dataloader = DataLoader(
    deepcopy(validation_dataset), batch_size=BATCH_SIZE, collate_fn=collate_fn)
test_dataloader = DataLoader(
    deepcopy(test_dataset), batch_size=BATCH_SIZE, collate_fn=collate_fn)

# Model definition and training

In [9]:
from torch import Tensor
import torch
import torch.nn as nn
from torch.nn import Transformer
import math

# helper Module that adds positional encoding to the token embedding to introduce a notion of word order.
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to token embeddings.

    A table of shape (maxlen, 1, emb_size) is precomputed once and registered
    as the non-trainable buffer ``pos_embedding``: even embedding columns hold
    sines and odd columns hold cosines of position * frequency.

    Args:
        emb_size: embedding width (the sin/cos interleaving expects it to be even).
        dropout: dropout probability applied after the addition.
        maxlen: number of positions to precompute.
    """

    def __init__(self,
                 emb_size: int,
                 dropout: float,
                 maxlen: int = 5000):
        super().__init__()
        # One frequency per (sin, cos) column pair.
        frequencies = torch.exp(- torch.arange(0, emb_size, 2) * math.log(10000) / emb_size)
        positions = torch.arange(0, maxlen).unsqueeze(1)
        angles = positions * frequencies

        table = torch.zeros((maxlen, emb_size))
        table[:, 0::2] = torch.sin(angles)
        table[:, 1::2] = torch.cos(angles)
        # Insert a middle axis of size 1 so the table broadcasts over it.
        table = table.unsqueeze(1)

        self.dropout = nn.Dropout(dropout)
        self.register_buffer('pos_embedding', table)

    def forward(self, token_embedding: Tensor):
        """Add the first ``token_embedding.size(0)`` table rows, then apply dropout."""
        seq_len = token_embedding.size(0)
        with_positions = token_embedding + self.pos_embedding[:seq_len]
        return self.dropout(with_positions)

# helper Module to convert tensor of input indices into corresponding tensor of token embeddings
class TokenEmbedding(nn.Module):
    """Look up token embeddings and scale them by sqrt(emb_size).

    Args:
        vocab_size: number of rows in the embedding table.
        emb_size: width of each embedding vector.
    """

    def __init__(self, vocab_size: int, emb_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.emb_size = emb_size

    def forward(self, tokens: Tensor):
        """Return the scaled embeddings for ``tokens`` (cast to long before lookup)."""
        indices = tokens.long()
        scale = math.sqrt(self.emb_size)
        return self.embedding(indices) * scale

# Seq2Seq Network
class Seq2SeqTransformer(nn.Module):
    """Encoder-decoder Transformer with its own embeddings and output projection.

    Source and target tokens are embedded with separate TokenEmbedding tables,
    given positional information through one shared PositionalEncoding, passed
    through ``torch.nn.Transformer`` and finally projected to target-vocabulary
    logits by ``generator``.
    """

    def __init__(self,
                 num_encoder_layers: int,
                 num_decoder_layers: int,
                 emb_size: int,
                 nhead: int,
                 src_vocab_size: int,
                 tgt_vocab_size: int,
                 dim_feedforward: int = 512,
                 dropout: float = 0.1):
        super().__init__()
        # Sub-modules are created in the same order as before so parameter
        # registration (and therefore seeded initialisation) is unchanged.
        self.transformer = Transformer(
            d_model=emb_size,
            nhead=nhead,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
        )
        self.generator = nn.Linear(emb_size, tgt_vocab_size)
        self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size)
        self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size)
        self.positional_encoding = PositionalEncoding(emb_size, dropout=dropout)

    def forward(self,
                src: Tensor,
                trg: Tensor,
                src_mask: Tensor,
                tgt_mask: Tensor,
                src_padding_mask: Tensor,
                tgt_padding_mask: Tensor,
                memory_key_padding_mask: Tensor):
        """Run the full encoder-decoder pass and return target-vocabulary logits.

        No memory (cross-attention) mask is supplied to the transformer.
        """
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(trg))
        hidden = self.transformer(
            src_emb,
            tgt_emb,
            src_mask=src_mask,
            tgt_mask=tgt_mask,
            memory_mask=None,
            src_key_padding_mask=src_padding_mask,
            tgt_key_padding_mask=tgt_padding_mask,
            memory_key_padding_mask=memory_key_padding_mask,
        )
        return self.generator(hidden)

    def encode(self, src: Tensor, src_mask: Tensor):
        """Embed ``src`` and run only the encoder stack."""
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        return self.transformer.encoder(src_emb, mask=src_mask)

    def decode(self, tgt: Tensor, memory: Tensor, tgt_mask: Tensor):
        """Embed ``tgt`` and run only the decoder stack against ``memory``."""
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(tgt))
        return self.transformer.decoder(tgt_emb, memory, tgt_mask=tgt_mask)

In [10]:
def generate_square_subsequent_mask(sz):
    """Build the (sz, sz) additive causal mask on const.device.

    Entry [i, j] is 0.0 when j <= i and -inf when j > i, so a position cannot
    attend to later positions.
    """
    upper = torch.triu(torch.ones((sz, sz), device=const.device))
    # Transposing the upper triangle gives the lower triangle: True where allowed.
    allowed = upper.bool().transpose(0, 1)

    mask = allowed.float()
    mask = mask.masked_fill(~allowed, float('-inf'))
    mask = mask.masked_fill(allowed, float(0.0))
    return mask


def create_mask(src, tgt):
    """Create the attention and padding masks for one batch.

    Returns (src_mask, tgt_mask, src_padding_mask, tgt_padding_mask):
      * src_mask: all-False boolean square mask over the first axis of ``src``;
      * tgt_mask: causal mask from generate_square_subsequent_mask;
      * *_padding_mask: True where the token equals const.PAD_IDX, transposed
        so the original first axis becomes the second.
    """
    src_len, tgt_len = src.shape[0], tgt.shape[0]

    src_mask = torch.zeros((src_len, src_len), device=const.device).type(torch.bool)
    tgt_mask = generate_square_subsequent_mask(tgt_len)

    src_is_pad = src == const.PAD_IDX
    tgt_is_pad = tgt == const.PAD_IDX
    return src_mask, tgt_mask, src_is_pad.transpose(0, 1), tgt_is_pad.transpose(0, 1)

In [11]:
# Fix the RNG before any parameter is created so initialisation is repeatable.
torch.manual_seed(0)

# Hyper-parameters
EMB_SIZE = 512
NHEAD = 8
FFN_HID_DIM = 512
NUM_ENCODER_LAYERS = 3
NUM_DECODER_LAYERS = 3
SRC_VOCAB_SIZE = len(const.vocab_transform[const.SRC_LANGUAGE])
TGT_VOCAB_SIZE = len(const.vocab_transform[const.TGT_LANGUAGE])

transformer = Seq2SeqTransformer(
    num_encoder_layers=NUM_ENCODER_LAYERS,
    num_decoder_layers=NUM_DECODER_LAYERS,
    emb_size=EMB_SIZE,
    nhead=NHEAD,
    src_vocab_size=SRC_VOCAB_SIZE,
    tgt_vocab_size=TGT_VOCAB_SIZE,
    dim_feedforward=FFN_HID_DIM,
)

# Xavier-initialise every parameter with more than one dimension; 1-D
# parameters keep their default initialisation.
for p in transformer.parameters():
    if p.dim() <= 1:
        continue
    nn.init.xavier_uniform_(p)

transformer = transformer.to(const.device)

# Padding positions do not contribute to the loss.
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=const.PAD_IDX)

optimizer = torch.optim.Adam(
    transformer.parameters(),
    lr=0.0001,
    betas=(0.9, 0.98),
    eps=1e-9,
)

In [12]:
from torch.utils.data import DataLoader

def train_epoch(model, optimizer, train_dataloader):
    """Train ``model`` for one pass over ``train_dataloader``.

    Args:
        model: called as ``model(src, tgt_input, src_mask, tgt_mask,
            src_padding_mask, tgt_padding_mask, memory_key_padding_mask)``.
        optimizer: optimizer stepped once per batch.
        train_dataloader: iterable yielding ``(src, tgt)`` tensor pairs.

    Returns:
        The mean of the per-batch losses (float).

    Raises:
        ZeroDivisionError: if the dataloader yields no batches.
    """
    model.train()
    total_loss = 0.0
    # Batches are counted while training. The previous
    # ``len(list(train_dataloader))`` iterated (and collated) the whole
    # dataset a second time every epoch just to obtain this number.
    num_batches = 0

    for src, tgt in train_dataloader:
        src = src.to(const.device)
        tgt = tgt.to(const.device)

        # Teacher forcing: the decoder sees the target without its last step
        # and is scored against the target without its first step.
        tgt_input = tgt[:-1, :]
        tgt_out = tgt[1:, :]

        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

        optimizer.zero_grad()

        # The source padding mask is also used as the memory key padding mask.
        logits = model(src, tgt_input, src_mask, tgt_mask,
                       src_padding_mask, tgt_padding_mask, src_padding_mask)

        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1

    return total_loss / num_batches


def evaluate(model, dataloader=None):
    """Compute the mean per-batch loss of ``model`` without updating it.

    Args:
        model: called with the same arguments as in ``train_epoch``.
        dataloader: iterable yielding ``(src, tgt)`` tensor pairs. Defaults to
            the notebook-level ``validation_dataloader`` (looked up at call
            time), which keeps ``evaluate(model)`` working as before and also
            lets the same function score e.g. ``test_dataloader``.

    Returns:
        The mean of the per-batch losses (float).

    Raises:
        ZeroDivisionError: if the dataloader yields no batches.
    """
    if dataloader is None:
        dataloader = validation_dataloader

    model.eval()
    total_loss = 0.0
    # Counted in the loop instead of ``len(list(dataloader))``, which iterated
    # the whole loader a second time.
    num_batches = 0

    # No gradients are needed for evaluation; disabling autograd avoids
    # building a graph for every batch.
    with torch.no_grad():
        for src, tgt in dataloader:
            src = src.to(const.device)
            tgt = tgt.to(const.device)

            tgt_input = tgt[:-1, :]
            tgt_out = tgt[1:, :]

            src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

            logits = model(src, tgt_input, src_mask, tgt_mask,
                           src_padding_mask, tgt_padding_mask, src_padding_mask)

            loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
            total_loss += loss.item()
            num_batches += 1

    return total_loss / num_batches

In [26]:
from timeit import default_timer as timer

NUM_EPOCHS = 25

# Train for NUM_EPOCHS epochs and report train/validation loss after each one.
# Only the training pass is timed; evaluation runs after the timer stops.
for epoch in range(1, NUM_EPOCHS + 1):
    start_time = timer()
    train_loss = train_epoch(transformer, optimizer, train_dataloader)
    end_time = timer()

    val_loss = evaluate(transformer)

    print(f"Epoch: {epoch}, Train loss: {train_loss:.3f}, Val loss: {val_loss:.3f}, "
          f"Epoch time = {(end_time - start_time):.3f}s")

Epoch: 1, Train loss: 4.714, Val loss: 2.977, Epoch time = 531.922s
Epoch: 2, Train loss: 2.111, Val loss: 1.263, Epoch time = 522.123s
Epoch: 3, Train loss: 0.973, Val loss: 0.654, Epoch time = 519.233s
Epoch: 4, Train loss: 0.531, Val loss: 0.399, Epoch time = 519.653s
Epoch: 5, Train loss: 0.343, Val loss: 0.290, Epoch time = 513.225s
Epoch: 6, Train loss: 0.240, Val loss: 0.207, Epoch time = 513.179s
Epoch: 7, Train loss: 0.184, Val loss: 0.161, Epoch time = 511.824s
Epoch: 8, Train loss: 0.137, Val loss: 0.159, Epoch time = 511.727s
Epoch: 9, Train loss: 0.110, Val loss: 0.103, Epoch time = 511.984s
Epoch: 10, Train loss: 0.088, Val loss: 0.091, Epoch time = 511.831s
Epoch: 11, Train loss: 0.074, Val loss: 0.115, Epoch time = 511.824s
Epoch: 12, Train loss: 0.060, Val loss: 0.115, Epoch time = 511.754s
Epoch: 13, Train loss: 0.050, Val loss: 0.071, Epoch time = 511.510s
Epoch: 14, Train loss: 0.041, Val loss: 0.060, Epoch time = 511.865s
Epoch: 15, Train loss: 0.034, Val loss: 0.0

In [13]:
from datetime import datetime

# The file name encodes the save date, the number of sentences used and the
# number of epochs. NUM_EPOCHS is defined in the training-loop cell, so that
# cell must have run in the current kernel session before this one.
today_date = datetime.today().strftime('%Y-%m-%d')

model_state = transformer.state_dict()
checkpoint_path = f'models/transformer-{today_date}-{sentences_to_use}-{NUM_EPOCHS}.pth'
torch.save(model_state, checkpoint_path)

NameError: name 'NUM_EPOCHS' is not defined

In [14]:
# Restore previously trained weights. map_location remaps the stored tensors
# onto the device this session uses, so a checkpoint saved on a GPU can also
# be loaded on a machine without that GPU.
transformer.load_state_dict(
    torch.load('models/transformer-2023-10-08-50000-25.pth', map_location=const.device))

<All keys matched successfully>

In [15]:
# Inference-time source pipeline: raw sentence -> tokenize_source ->
# numericalize with the source vocabulary -> add BOS/EOS and build the tensor.
# NOTE(review): collate_fn looks up text_transform_src by name when it is
# called, so after this cell the data loaders also pass their samples through
# tokenize_source. The datasets were already built with
# tokenization_src=tokenize_source; confirm the loaders are not iterated after
# this point, or bind this pipeline to a different name.
text_transform_src = sequential_transforms(
    tokenize_source,
    const.vocab_transform[const.SRC_LANGUAGE],
    tensor_transform,
)

vowels_transcription = ['a', 'ʌ', 'ɤ̞',  'ɐ', 'ɔ', 'o', 'u', 'ɛ', 'i']

# Target pipeline: numericalize with the target vocabulary, then add BOS/EOS
# and build the tensor.
text_transform_tgt = sequential_transforms(
    const.vocab_transform[const.TGT_LANGUAGE],
    tensor_transform,
)

In [16]:
# function to generate output sequence using greedy algorithm
def greedy_decode(model, src, src_mask, max_len, start_symbol):
    """Generate an output sequence by always picking the highest-scoring token.

    Args:
        model: object providing ``encode``, ``decode`` and ``generator``.
        src: source index tensor; moved to const.device.
        src_mask: source attention mask; moved to const.device.
        max_len: upper bound on the length of the returned sequence, start
            symbol included.
        start_symbol: index used as the first output token.

    Returns:
        A long tensor of shape (length, 1) on const.device that starts with
        ``start_symbol`` and stops after const.EOS_IDX is produced or
        ``max_len`` tokens are reached.
    """
    src = src.to(const.device)
    src_mask = src_mask.to(const.device)

    # Decoding needs no gradients; without no_grad every step would extend an
    # autograd graph that is never used.
    with torch.no_grad():
        # The encoder output does not change between steps, so it is computed
        # and placed on the device once, outside the loop.
        memory = model.encode(src, src_mask).to(const.device)
        ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).to(const.device)

        for _ in range(max_len - 1):
            # Boolean causal mask: True marks positions that must not be attended.
            tgt_mask = (generate_square_subsequent_mask(ys.size(0))
                        .type(torch.bool)).to(const.device)
            out = model.decode(ys, memory, tgt_mask)
            out = out.transpose(0, 1)
            # Score only the most recent position.
            scores = model.generator(out[:, -1])
            _, next_word = torch.max(scores, dim=1)
            next_word = next_word.item()

            ys = torch.cat([ys,
                            torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0)
            if next_word == const.EOS_IDX:
                break
    return ys


# actual function to translate input sentence into target language
def translate(model: torch.nn.Module, src_sentence: str):
    """Transcribe ``src_sentence`` with ``model`` using greedy decoding.

    The sentence is lower-cased, converted to a column tensor with
    text_transform_src and decoded for at most ``num_tokens + 5`` steps. The
    produced ids are mapped back to target-vocabulary tokens and joined with
    spaces; the literal "<bos>" and "<eos>" markers are removed from the
    joined string (the spaces around them are kept).
    """
    src_sentence = src_sentence.lower()
    model.eval()

    src = text_transform_src(src_sentence).view(-1, 1)
    num_tokens = src.shape[0]
    # All-False mask: no source position is hidden from any other.
    src_mask = torch.zeros(num_tokens, num_tokens).type(torch.bool)

    decoded = greedy_decode(
        model, src, src_mask, max_len=num_tokens + 5, start_symbol=const.BOS_IDX)
    token_ids = list(decoded.flatten().cpu().numpy())

    target_vocab = const.vocab_transform[const.TGT_LANGUAGE]
    text = " ".join(target_vocab.lookup_tokens(token_ids))
    for marker in ("<bos>", "<eos>"):
        text = text.replace(marker, "")
    return text

In [31]:
def get_embedding(model: Seq2SeqTransformer, src_sentence: str):
    """Return one fixed-size vector for ``src_sentence``.

    The sentence is lower-cased, converted to a column tensor with
    text_transform_src and passed through ``model.encode``. The encoder
    outputs are averaged over the first (token) axis and the single batch
    entry is returned as a NumPy array.
    """
    src_sentence = src_sentence.lower()
    model.eval()
    src = text_transform_src(src_sentence).view(-1, 1)

    num_tokens = src.shape[0]
    # All-False mask: no source position is hidden from any other.
    src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool)

    # Only the values are needed, so skip autograd bookkeeping.
    with torch.no_grad():
        encoded = model.encode(src.to(const.device), src_mask.to(const.device))

    # Vectorised mean over the token axis; replaces the Python-level
    # ``sum(embedding) / len(embedding)`` loop over tensor rows.
    return encoded.mean(dim=0)[0].cpu().detach().numpy()

In [32]:
# Embed a single sample word, print the vector's shape, then display the
# vector itself as the cell result.
sample_word = "здравей"
embedding = get_embedding(transformer, sample_word)

print(embedding.shape)

embedding

(512,)


array([-1.86730608e-01, -2.35656172e-01,  7.73406446e-01, -7.14438498e-01,
        5.65402925e-01, -4.64507461e-01,  1.44616234e+00, -4.89935637e-01,
        1.72372341e-01,  2.52599269e-01, -9.74091142e-03, -8.65800440e-01,
        1.16328061e+00,  4.54129070e-01, -5.86373731e-02, -1.45054221e-01,
        1.09022903e+00,  1.32008195e+00,  9.76748645e-01,  1.06411362e+00,
       -4.79722694e-02, -6.20427608e-01, -6.62277937e-01, -3.72619331e-01,
       -1.25096798e-01,  2.52738625e-01, -1.01383552e-01,  5.44131875e-01,
        4.70921546e-01,  8.83736730e-01,  6.61719263e-01,  2.00718999e-01,
        3.51873428e-01,  6.74048126e-01,  1.30980122e+00,  2.00965953e+00,
        9.68718946e-01,  1.01932120e+00, -7.86230922e-01,  1.60880029e+00,
        8.97166133e-01,  2.54523730e+00, -7.75524676e-01,  1.23724604e+00,
       -2.24674940e-02,  4.63824689e-01,  5.52128255e-03,  1.89576530e+00,
       -1.31445563e+00,  8.89696121e-01, -1.07214665e+00,  1.32594407e+00,
       -4.94223237e-01,  

In [19]:
# Sample Bulgarian inputs, printed one transcription per line in this order.
sample_sentences = [
    "здравей, как сте?",
    "Добре съм, благодаря.",
    "Айрян",
    "Български език",
    "език Български език",
    "Лятото е моето любимо време на годината.",
    "В парка разцъфтяха невероятни цветя.",
    "Музиката успокоява душата ми след дълъг работен ден.",
    "Вчера се срещнах със стар приятел, когото не бях виждал години.",
    "Четенето на книги разширява хоризонтите и обогатява речника.",
]

for sentence in sample_sentences:
    print(translate(transformer, sentence))

 zdrʌ vɛj , kʌk stɛ ? 
 dob rɛ sɐm , blʌ go dʌr jɐ . 
 xvɐr ljɐ 
 bɐl gʌr ski ɛ zik 
 ɛ zik bɐl gʌr ski ɛ zik 
 ljɐ to to ɛ mo ɛ to ljo bi mo vrɛ mɛ nʌ go di nʌ tʌ . 
 v pʌr kʌ rʌz lit tjɐ xʌ nɛ vɛ ro jɐt ni tsvɛt jɐ . 
 mo zi kʌ tʌ os po ko jɐ vʌ do ʃʌ tʌ mi slɛd dɐ lɐg rʌ bo tɛn dɛn . 
 vtʃɛ rʌ sɛ srɛʃ tnʌx sɐs stʌr pri jɐ tɛl , ko go to nɛ bjɐx viʒ dʌl go di ni . 
 tʃɛ tɛ nɛ to nʌ kni gi rʌz ʃir jɐ vʌ xo ri zon ti tɛ i o bo gʌt jɐ vʌ rɛt ʃni kʌ . 
