In [22]:
# Third party imports
import numpy as np
import pandas as pd
import torch
from sklearn.cluster import KMeans

import src.constants as const
from src.dataset_utils import yield_tokens, sentence_to_tensor, load_files, build_vocab_transformation, tokenize_source, tokenize_target
from src.transcription_dataset import TranscriptionDataset
from src.transformer_model import Seq2SeqTransformer

## Model

In [2]:
torch.manual_seed(0)

SRC_VOCAB_SIZE = 5201 # Hardcoded for now
TGT_VOCAB_SIZE = 3398 # Hardcoded for now
EMB_SIZE = 512
NHEAD = 8
FFN_HID_DIM = 512
NUM_ENCODER_LAYERS = 3
NUM_DECODER_LAYERS = 3

transformer = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE,
                                 NHEAD, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, FFN_HID_DIM)
transformer = transformer.to(const.device)

In [3]:
transformer.load_state_dict(torch.load('models/transformer-2023-10-08-50000-25.pth'))

<All keys matched successfully>

In [4]:
# Train & test split
sentences_to_use = 50000
train_split = int(const.TRAIN_TEST_SPLIT * sentences_to_use)
validation_split = int((const.TRAIN_TEST_SPLIT + const.TRAIN_VALIDATION_SPLIT) * sentences_to_use)

In [5]:
files = load_files('/mnt/d/Projects/masters-thesis/data/transcriptions')

In [6]:
train_dataset = TranscriptionDataset(files, tokenization_src=tokenize_source, tokenization_tgt=tokenize_target,
                                     start_index=0, end_index=train_split)

In [7]:
# helper function to club together sequential operations
def sequential_transforms(*transforms):
    def func(txt_input):
        for transform in transforms:
            txt_input = transform(txt_input)
        return txt_input
    return func

# function to add BOS/EOS and create tensor for input sequence indices
def tensor_transform(token_ids: list[int]):
    return torch.cat((torch.tensor([const.BOS_IDX]),
                      torch.tensor(token_ids),
                      torch.tensor([const.EOS_IDX])))

const.vocab_transform[const.SRC_LANGUAGE] = build_vocab_transformation(train_dataset, const.SRC_LANGUAGE)

text_transform_src = sequential_transforms(tokenize_source,
    const.vocab_transform[const.SRC_LANGUAGE], #Numericalization
                                                tensor_transform) # Add BOS/EOS and create tensor


In [8]:
def get_embedding(model: Seq2SeqTransformer, word: str):
    word = word.lower()
    model.eval()
    src = text_transform_src(word).view(-1, 1).to(const.device)

    num_tokens = src.shape[0]
    src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool)
    
    embedding = model.encode(src.to(const.device), src_mask.to(const.device))

    return (sum(embedding) / len(embedding))[0].cpu().detach().numpy()

## Clustering

In [9]:
# print(get_embedding(transformer, "здравей"))

In [10]:
words = pd.read_csv('/mnt/d/Projects/masters-thesis/data/words.csv', header=None, names=['word', 'transcription'])
words['word'].head()

0     аванпост
1       авиоас
2      авиобос
3    авиобранш
4     авиоград
Name: word, dtype: object

In [11]:
X = words['word'].apply(lambda x: get_embedding(transformer, x))
X = X.to_numpy()
X = np.vstack(X)
X.shape

In [77]:
kmeans = KMeans(n_clusters=4, random_state=0, n_init="auto").fit(X)

In [78]:
distances = kmeans.transform(X)

In [79]:
words['distance'] = distances

In [80]:
words.head()

Unnamed: 0,word,transcription,distance
0,аванпост,ава`нпост,6.986641
1,авиоас,а`виоа`с,7.593191
2,авиобос,а`виобо`с,7.208892
3,авиобранш,а`виобра`нш,6.797083
4,авиоград,а`виогра`д,6.9884


In [81]:
words.sort_values(by=['distance'], ascending=False).head(10)

Unnamed: 0,word,transcription,distance
105924,я,я,14.381339
105758,я,я,14.381339
43569,сексекспериментаторче,сексекспериментаторче,14.258535
116608,аутогонистофилия,аутогонистофилия,14.018292
81423,геологопроучвател,геологопроучвател,14.000455
11911,хиперкомуникативен,хиперкомуникативен,13.99406
8733,предприсъединителен,предприсъединителен,13.980311
61622,видеообаждане,видеообаждане,13.946628
58035,сексекспериментатор,сексекспериментатор,13.932568
78718,минисъоръжение,минисъоръжение,13.899301


In [82]:
words.sort_values(by=['distance'], ascending=False).tail(10)

Unnamed: 0,word,transcription,distance
23049,шетачка,шета`чка,5.035385
18662,омбрелка,омбрелка,5.029744
15938,извадка,изва`дка,5.012739
17263,лежачка,лежа`чка,4.996249
17311,лещарка,леща`рка,4.996249
108193,Лещарка,Леща`рка,4.996249
20258,ракийка,ракийка,4.992129
21172,спагетка,спагетка,4.942128
13285,армийка,армийка,4.9268
72728,устничка,устничка,4.908775


## Distance calculations

In [70]:
def calcualte_distance(word: str) -> float:
    embedding = get_embedding(transformer, word)
    distance = kmeans.transform([embedding])

    return distance[0][0]

In [83]:
calcualte_distance("здравей")

8.852234863572615

In [84]:
calcualte_distance("атака")

7.022389322232049

In [85]:
calcualte_distance("мерси")

8.354975015268934