In [1]:
# Third party imports
import numpy as np
import pandas as pd
import torch
import seaborn as sns

from sklearn.cluster import KMeans

import src.constants as const
from src.dataset_utils import load_files, build_vocab_transformation, tokenize_source, tokenize_target
from src.transcription_dataset_single_word import TranscriptionDataset
from src.transformer_model import Seq2SeqTransformer
from src.syllable_splitter import split_word

## Model

In [2]:
torch.manual_seed(0)

SRC_VOCAB_SIZE = 5187  # Hardcoded for now
TGT_VOCAB_SIZE = 3387  # Hardcoded for now
EMB_SIZE = 512
NHEAD = 8
FFN_HID_DIM = 512
NUM_ENCODER_LAYERS = 3
NUM_DECODER_LAYERS = 3

transformer = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE,
                                 NHEAD, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, FFN_HID_DIM)
transformer = transformer.to(const.device)

In [3]:
transformer.load_state_dict(torch.load('models/transformer-single-word-2023-11-10-606102-25.pth'))

<All keys matched successfully>

In [4]:
vowels_transcription = ['a', 'ʌ', 'ɤ̞',  'ɐ', 'ɔ', 'o', 'u', 'ɛ', 'i']

In [5]:
words_filepath = '/mnt/d/Projects/masters-thesis/data/single_words.txt'

with open(words_filepath, 'r') as f:
    words = f.readlines()

amount_of_words = len(words)

In [6]:
# Train & test split
sentences_to_use = amount_of_words
train_split = int(const.TRAIN_TEST_SPLIT * sentences_to_use)
validation_split = int((const.TRAIN_TEST_SPLIT + const.TRAIN_VALIDATION_SPLIT) * sentences_to_use)

In [7]:
train_dataset = TranscriptionDataset(words_filepath, tokenization_src=split_word, 
                                     tokenization_tgt=lambda x: split_word(x, vowels_transcription),
                                     start_index=0, end_index=train_split)

In [8]:
# helper function to club together sequential operations
def sequential_transforms(*transforms):
    def func(txt_input):
        for transform in transforms:
            txt_input = transform(txt_input)
        return txt_input
    return func

# function to add BOS/EOS and create tensor for input sequence indices
def tensor_transform(token_ids: list[int]):
    return torch.cat((torch.tensor([const.BOS_IDX]),
                      torch.tensor(token_ids),
                      torch.tensor([const.EOS_IDX])))

const.vocab_transform[const.SRC_LANGUAGE] = build_vocab_transformation(train_dataset, const.SRC_LANGUAGE)

text_transform_src = sequential_transforms(tokenize_source,
    const.vocab_transform[const.SRC_LANGUAGE], #Numericalization
                                                tensor_transform) # Add BOS/EOS and create tensor


In [9]:
def get_embedding(model: Seq2SeqTransformer, word: str):
    word = word.lower()
    model.eval()
    src = text_transform_src(word).view(-1, 1).to(const.device)

    num_tokens = src.shape[0]
    src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool)
    
    embedding = model.encode(src.to(const.device), src_mask.to(const.device))[1:-1]

    return embedding[:, 0, :].cpu().detach().numpy()
    # return (sum(embedding) / len(embedding))[0].cpu().detach().numpy()

In [10]:
words = pd.read_csv('/mnt/d/Projects/masters-thesis/data/words.csv', header=None, names=['word', 'transcription'])
words['word'].head()

0     аванпост
1       авиоас
2      авиобос
3    авиобранш
4     авиоград
Name: word, dtype: object

# Levenshtein distance

In [11]:
from itertools import accumulate

In [12]:
def levenshtein(word_1, word_2):
    m = len(word_1)
    n = len(word_2)

    dist = np.zeros((m + 1, n + 1))

    dist[1:, 0] = list(accumulate(word_1, func=lambda x, y: x + np.linalg.norm(y), initial=0))[1:]
    dist[0, 1:] = list(accumulate(word_2, func=lambda x, y: x + np.linalg.norm(y), initial=0))[1:]


    for j in range(1, n+1):
        for i in range(1, m+1):
            deletion_cost = np.linalg.norm(word_2[j-1])  # Needs check
            insertion_cost = np.linalg.norm(word_1[i-1])  # Needs check
            substitution_cost = 0

            distance = np.linalg.norm(word_1[i-1] - word_2[j-1])
            if distance > 10:
                substitution_cost =distance

            dist[i, j] = min(dist[i - 1, j] + deletion_cost, 
                             dist[i, j - 1] + insertion_cost, 
                             dist[i - 1, j - 1] + substitution_cost)

    # print(dist)
    return dist[len(word_1), len(word_2)]

In [13]:
def distance(word_1, word_2):
    word_1_embedding = get_embedding(transformer, word_1)
    word_2_embedding = get_embedding(transformer, word_2)

    return levenshtein(word_1_embedding, word_2_embedding)

In [19]:
words_only = words.drop('transcription', axis=1).sample(10000)

# words_only.merge(words_only, how='cross')
cross = words_only.merge(words_only, how='cross', suffixes=['_1', '_2'])

In [20]:
cross['distance'] = cross.apply(lambda x: distance(x['word_1'], x['word_2']), axis=1)

KeyboardInterrupt: 

In [None]:
cross = cross[cross['distance'] != 0.0]

In [None]:
from collections import defaultdict

In [None]:
result = defaultdict(float)

for item in cross.iloc:
    result[item['word_1']] += item['distance']

result

defaultdict(float, {'мимоза': 425.3814010620117})