In [1]:
# Standard library imports
import json
import os

from copy import deepcopy
from typing import Iterable, List

# Third party imports
import pandas as pd
import torch
import torch.nn as nn
import tqdm

from nltk.tokenize import word_tokenize

from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.sampler import SubsetRandomSampler

from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.datasets import multi30k, Multi30k

# Local application imports
import src.constants as const
from src.dataset_utils import yield_tokens, sentence_to_tensor, load_files, build_vocab_transformation, tokenize_source, tokenize_target
from src.training_utils import train_epoch, evaluate, sequential_transforms, tensor_transform
from src.transcription_dataset_single_word import TranscriptionDataset
from src.transformer_model import Seq2SeqTransformer, generate_square_subsequent_mask, create_mask
from src.syllable_splitter import split_word

In [2]:
import warnings
warnings.filterwarnings('ignore')

# Dataset stuff

In [3]:
words_filepath = '/mnt/d/Projects/masters-thesis/data/single_words.txt'

with open(words_filepath, 'r') as f:
    words = f.readlines()
amount_of_words = len(words)

f'{amount_of_words} words loaded'

'606102 words loaded'

In [4]:
vowels_transcription = ['a', 'ʌ', 'ɤ̞',  'ɐ', 'ɔ', 'o', 'u', 'ɛ', 'i']

In [5]:
# Train & test split
sentences_to_use = amount_of_words
train_split = int(const.TRAIN_TEST_SPLIT * sentences_to_use)
validation_split = int((const.TRAIN_TEST_SPLIT + const.TRAIN_VALIDATION_SPLIT) * sentences_to_use)

In [6]:
train_dataset = TranscriptionDataset(words_filepath, tokenization_src=split_word, 
                                     tokenization_tgt=lambda x: split_word(x, vowels_transcription),
                                     start_index=0, end_index=train_split)
validation_dataset = TranscriptionDataset(words_filepath, tokenization_src=split_word,
                                          tokenization_tgt=lambda x: split_word(x, vowels_transcription),
                                          start_index=train_split, end_index=validation_split)
test_dataset = TranscriptionDataset(words_filepath, tokenization_src=split_word,
                                    tokenization_tgt=lambda x: split_word(x, vowels_transcription),
                                    start_index=validation_split, end_index=amount_of_words)

In [7]:
for ln in [const.SRC_LANGUAGE, const.TGT_LANGUAGE]:
    # Create torchtext's Vocab object
    const.vocab_transform[ln] = build_vocab_transformation(train_dataset, ln)

In [8]:
text_transform_src = sequential_transforms(const.vocab_transform[const.SRC_LANGUAGE], #Numericalization
                                                tensor_transform) # Add BOS/EOS and create tensor

vowels_transcription = ['a', 'ʌ', 'ɤ̞',  'ɐ', 'ɔ', 'o', 'u', 'ɛ', 'i']
text_transform_tgt = sequential_transforms(const.vocab_transform[const.TGT_LANGUAGE], #Numericalization
                                                tensor_transform) # Add BOS/EOS and create tensor


In [9]:
# function to collate data samples into batch tensors
def collate_fn(batch):
    src_batch, tgt_batch = [], []
    for src_sample, tgt_sample in batch:
        src_batch.append(text_transform_src(src_sample))
        tgt_batch.append(text_transform_tgt(tgt_sample))

    src_batch = pad_sequence(src_batch, padding_value=const.PAD_IDX)
    tgt_batch = pad_sequence(tgt_batch, padding_value=const.PAD_IDX)
    return src_batch, tgt_batch

In [10]:
train_dataloader = DataLoader(deepcopy(train_dataset), batch_size=128, collate_fn=collate_fn)
validation_dataloader = DataLoader(deepcopy(validation_dataset), batch_size=128, collate_fn=collate_fn)
test_dataloader = DataLoader(deepcopy(test_dataset), batch_size=128, collate_fn=collate_fn)

In [11]:
for i in range(128):
    print(i)
    print(train_dataset[3788 * 128 + i])

# print(3788 * 128 + 17)
# print(train_dataset[3788 * 128 + 17])


0
(['тя'], ['tjɐ'])
1
(['го'], ['go'])
2
(['раз', 'би', 'ра', 'ше'], ['rʌz', 'bi', 'rʌ', 'ʃɛ'])
3
(['на', 'пъл', 'но'], ['nʌ', 'pɐl', 'no'])
4
(['а'], ['ʌ'])
5
(['ар', 'ган', 'те'], ['ʌr', 'gʌn', 'tɛ'])
6
(['раз', 'би', 'ра', 'ше'], ['rʌz', 'bi', 'rʌ', 'ʃɛ'])
7
(['ли'], ['li'])
8
(['го'], ['go'])
9
(['о'], ['o'])
10
(['не'], ['nɛ'])
11
(['тя'], ['tjɐ'])
12
(['мо', 'же', 'ше'], ['mo', 'ʒɛ', 'ʃɛ'])
13
(['ли'], ['li'])
14
(['да'], ['dʌ'])
15
(['че', 'те'], ['tʃɛ', 'tɛ'])
16
(['не'], ['nɛ'])
17


StopIteration: 

# Model stuff

In [12]:

torch.manual_seed(0)

SRC_VOCAB_SIZE = len(const.vocab_transform[const.SRC_LANGUAGE])
TGT_VOCAB_SIZE = len(const.vocab_transform[const.TGT_LANGUAGE])

print(SRC_VOCAB_SIZE, TGT_VOCAB_SIZE)

transformer = Seq2SeqTransformer(const.NUM_ENCODER_LAYERS, const.NUM_DECODER_LAYERS, const.EMB_SIZE,
                                 const.NHEAD, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, const.FFN_HID_DIM)

for p in transformer.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

transformer = transformer.to(const.device)

loss_fn = torch.nn.CrossEntropyLoss(ignore_index=const.PAD_IDX)

optimizer = torch.optim.Adam(transformer.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)


5187 3387


In [13]:
from timeit import default_timer as timer
NUM_EPOCHS = 25

for epoch in range(1, NUM_EPOCHS+1):
    start_time = timer()
    train_loss = train_epoch(transformer, optimizer, train_dataloader, loss_fn)
    end_time = timer()
    val_loss = evaluate(transformer, validation_dataloader, loss_fn)
    print(f"Epoch: {epoch}, Train loss: {train_loss:.3f}, Val loss: {val_loss:.3f}, "f"Epoch time = {(end_time - start_time):.3f}s")

Epoch: 1, Train loss: 0.525, Val loss: 0.102, Epoch time = 148.193s
Epoch: 2, Train loss: 0.082, Val loss: 0.055, Epoch time = 148.255s
Epoch: 3, Train loss: 0.048, Val loss: 0.043, Epoch time = 150.409s
Epoch: 4, Train loss: 0.036, Val loss: 0.037, Epoch time = 151.214s
Epoch: 5, Train loss: 0.029, Val loss: 0.034, Epoch time = 147.975s
Epoch: 6, Train loss: 0.025, Val loss: 0.032, Epoch time = 149.912s
Epoch: 7, Train loss: 0.022, Val loss: 0.030, Epoch time = 149.436s
Epoch: 8, Train loss: 0.020, Val loss: 0.030, Epoch time = 157.480s
Epoch: 9, Train loss: 0.018, Val loss: 0.028, Epoch time = 157.681s
Epoch: 10, Train loss: 0.017, Val loss: 0.027, Epoch time = 150.344s
Epoch: 11, Train loss: 0.016, Val loss: 0.026, Epoch time = 148.412s
Epoch: 12, Train loss: 0.015, Val loss: 0.027, Epoch time = 151.200s
Epoch: 13, Train loss: 0.015, Val loss: 0.026, Epoch time = 150.477s
Epoch: 14, Train loss: 0.014, Val loss: 0.026, Epoch time = 148.404s
Epoch: 15, Train loss: 0.013, Val loss: 0.0

In [15]:
from datetime import datetime
today_date = datetime.today().strftime('%Y-%m-%d')

torch.save(transformer.state_dict(), f'models/transformer-single-word-{today_date}-{sentences_to_use}-{NUM_EPOCHS}.pth')

In [13]:
transformer.load_state_dict(torch.load('models/transformer-single-word-2023-11-10-606102-25.pth'))

<All keys matched successfully>

In [14]:
text_transform_src = sequential_transforms(tokenize_source,
    const.vocab_transform[const.SRC_LANGUAGE], #Numericalization
                                                tensor_transform) # Add BOS/EOS and create tensor

vowels_transcription = ['a', 'ʌ', 'ɤ̞',  'ɐ', 'ɔ', 'o', 'u', 'ɛ', 'i']
text_transform_tgt = sequential_transforms(const.vocab_transform[const.TGT_LANGUAGE], #Numericalization
                                                tensor_transform) # Add BOS/EOS and create tensor

In [15]:
# function to generate output sequence using greedy algorithm
def greedy_decode(model, src, src_mask, max_len, start_symbol):
    src = src.to(const.device)
    src_mask = src_mask.to(const.device)

    memory = model.encode(src, src_mask)
    ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).to(const.device)
    for i in range(max_len-1):
        memory = memory.to(const.device)
        tgt_mask = (generate_square_subsequent_mask(ys.size(0))
                    .type(torch.bool)).to(const.device)
        out = model.decode(ys, memory, tgt_mask)
        out = out.transpose(0, 1)
        prob = model.generator(out[:, -1])
        _, next_word = torch.max(prob, dim=1)
        next_word = next_word.item()

        ys = torch.cat([ys,
                        torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0)
        if next_word == const.EOS_IDX:
            break
    return ys


# actual function to translate input sentence into target language
def translate(model: torch.nn.Module, src_sentence: str):
    src_sentence = src_sentence.lower()
    model.eval()
    src = text_transform_src(src_sentence).view(-1, 1)

    num_tokens = src.shape[0]
    src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool)
    tgt_tokens = greedy_decode(
        model,  src, src_mask, max_len=num_tokens + 5, start_symbol=const.BOS_IDX).flatten()
    # print(list(tgt_tokens.cpu().numpy()))
    return " ".join(const.vocab_transform[const.TGT_LANGUAGE].lookup_tokens(list(tgt_tokens.cpu().numpy()))).replace("<bos>", "").replace("<eos>", "")

In [18]:
bulgarian_words = [
    'здравей',
    'благодаря',
    'лято',
    'зима',
    'книга',
    'кафе',
    'синьо',
    'часовник',
    'градина',
    'слънце',
    'месец',
    'живот',
    'река',
    'музика',
    'храна'
]

for word in bulgarian_words:
    print(translate(transformer, word))

 zdrʌ vɛj 
 blʌ go dʌr jɐ 
 ljɐ to 
 zi mʌ 
 kni gʌ 
 kʌ fɛ 
 sin jo 
 tʃʌ jɐ jɐs nik 
 grʌ di nʌ 
 slɐn tsɛ 
 mɛ sɛts 
 ʒi vot 
 rɛ kʌ 
 mo zi kʌ 
 xrʌ nʌ 


In [18]:
translate(transformer, 'попо')

' po po '

In [19]:
translate(transformer, 'по')

' po '