In [88]:
# Third party imports
import numpy as np
import pandas as pd
import torch
import seaborn as sns

from sklearn.cluster import KMeans

import src.constants as const
from src.dataset_utils import yield_tokens, sentence_to_tensor, load_files, build_vocab_transformation, tokenize_source, tokenize_target
from src.transcription_dataset import TranscriptionDataset
from src.transformer_model import Seq2SeqTransformer

## Model

In [3]:
# Fix the PyTorch RNG seed before the model is constructed.
torch.manual_seed(0)

# Model hyperparameters.
# NOTE(review): presumably these must match the values used to train the
# checkpoint loaded in the next cell — confirm before changing any of them.
SRC_VOCAB_SIZE = 5201 # Hardcoded for now
TGT_VOCAB_SIZE = 3398 # Hardcoded for now
EMB_SIZE = 512
NHEAD = 8
FFN_HID_DIM = 512
NUM_ENCODER_LAYERS = 3
NUM_DECODER_LAYERS = 3

# Arguments are passed positionally, so their order has to follow the
# Seq2SeqTransformer constructor signature (defined in src.transformer_model).
transformer = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE,
                                 NHEAD, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, FFN_HID_DIM)
# Move the model to the device configured in src.constants.
transformer = transformer.to(const.device)

In [4]:
# map_location remaps the saved tensors onto const.device, so a checkpoint that
# was saved on a GPU can still be loaded when the notebook runs on a CPU-only machine.
# NOTE: torch.load unpickles the file — only load checkpoints from a trusted source.
transformer.load_state_dict(
    torch.load('models/transformer-2023-10-08-50000-25.pth', map_location=const.device)
)

<All keys matched successfully>

In [5]:
# Train & test split
# Only the first `sentences_to_use` sentences are considered; the two values
# below are index boundaries into that range, derived from fractions in src.constants.
sentences_to_use = 50000
# End index of the training slice (passed as end_index to the training dataset below).
train_split = int(const.TRAIN_TEST_SPLIT * sentences_to_use)
# Presumably the end index of the validation slice; not used in the cells visible here.
validation_split = int((const.TRAIN_TEST_SPLIT + const.TRAIN_VALIDATION_SPLIT) * sentences_to_use)

In [6]:
# NOTE(review): machine-specific absolute path; the notebook only runs where this
# directory exists. Consider reading the location from a configurable data directory.
files = load_files('/mnt/d/Projects/masters-thesis/data/transcriptions')

In [7]:
# Training portion of the data: the slice from index 0 to train_split.
# NOTE(review): whether end_index is inclusive is decided inside TranscriptionDataset — confirm there.
train_dataset = TranscriptionDataset(files, tokenization_src=tokenize_source, tokenization_tgt=tokenize_target,
                                     start_index=0, end_index=train_split)

In [8]:
def sequential_transforms(*transforms):
    """Chain several single-argument callables into one.

    The returned callable feeds its input to the first transform, passes each
    result on to the next transform in the order given, and returns the final
    value. With no transforms it returns its input unchanged.
    """
    def func(txt_input):
        value = txt_input
        for step in transforms:
            value = step(value)
        return value
    return func

# function to add BOS/EOS and create tensor for input sequence indices
def tensor_transform(token_ids: list[int]):
    """Frame a list of token indices with BOS/EOS and return it as a 1-D tensor.

    Args:
        token_ids: Vocabulary indices of one sequence (may be empty).

    Returns:
        A 1-D ``torch.long`` tensor ``[BOS_IDX, *token_ids, EOS_IDX]``.
    """
    # Build the tensor in one call with an explicit integer dtype:
    # torch.tensor([]) defaults to a floating-point dtype, so concatenating a
    # separately built empty token tensor could leak a float dtype into the indices.
    return torch.tensor([const.BOS_IDX, *token_ids, const.EOS_IDX], dtype=torch.long)

# Build the source-language vocabulary from the training split and register it
# in the shared lookup that the pipeline below reads from.
const.vocab_transform[const.SRC_LANGUAGE] = build_vocab_transformation(
    train_dataset, const.SRC_LANGUAGE
)

# Source-side text pipeline; the steps are applied in the order listed.
text_transform_src = sequential_transforms(
    tokenize_source,
    const.vocab_transform[const.SRC_LANGUAGE],  # Numericalization
    tensor_transform,                           # Add BOS/EOS and create tensor
)


In [35]:
def average_vectors(embedding):
    """Average an embedding over its first axis and return element 0 of the next axis.

    Args:
        embedding: Tensor (or sequence of equally shaped tensors) whose first
            axis enumerates the vectors to average. Presumably shaped
            (tokens, batch, dim) as produced by ``get_embedding`` — confirm
            against the model.

    Returns:
        ``numpy.ndarray`` with the mean over the first axis, taken at index 0
        of the following axis.

    Raises:
        ValueError: If ``embedding`` contains no vectors.
    """
    if len(embedding) == 0:
        # Previously surfaced as an opaque ZeroDivisionError from `sum(...) / 0`.
        raise ValueError("cannot average an empty embedding")

    stacked = embedding if torch.is_tensor(embedding) else torch.stack(tuple(embedding))
    # Single tensor reduction instead of a Python-level sum() over slices.
    mean = stacked.sum(dim=0) / stacked.shape[0]
    return mean[0].detach().cpu().numpy()

In [146]:
def get_embedding(model: Seq2SeqTransformer, word: str, sum_function=average_vectors):
    """Encode a single word and reduce its per-position encoder outputs.

    Args:
        model: Model whose ``encode`` method is called; it is switched to eval
            mode as a side effect.
        word: Word to embed; it is lower-cased before being transformed.
        sum_function: Callable that receives the encoder output with the first
            and last positions removed (the BOS/EOS positions added by
            ``text_transform_src``) and returns the final embedding.

    Returns:
        Whatever ``sum_function`` returns.
    """
    model.eval()
    # view(-1, 1): a single sequence laid out along the first axis.
    src = text_transform_src(word.lower()).view(-1, 1).to(const.device)

    num_tokens = src.shape[0]
    # All-False square mask, created directly on the target device
    # (presumably meaning that no position is masked — confirm against the model).
    src_mask = torch.zeros((num_tokens, num_tokens), dtype=torch.bool, device=const.device)

    # Inference only: skip autograd bookkeeping so no graph is kept per call.
    with torch.no_grad():
        embedding = model.encode(src, src_mask)[1:-1]

    return sum_function(embedding)

## Clustering

In [10]:
# print(get_embedding(transformer, "здравей"))

In [11]:
# The CSV is read without a header row, so the column names are supplied explicitly.
# NOTE(review): machine-specific absolute path; consider a configurable data directory.
words = pd.read_csv('/mnt/d/Projects/masters-thesis/data/words.csv', header=None, names=['word', 'transcription'])
words['word'].head()

0     аванпост
1       авиоас
2      авиобос
3    авиобранш
4     авиоград
Name: word, dtype: object

In [12]:
# Embed every word and stack the per-word vectors into one 2-D matrix
# (one row per word).
X = np.vstack(
    words['word']
    .apply(lambda word: get_embedding(transformer, word))
    .to_numpy()
)
X.shape

(119678, 512)

In [13]:
# A single cluster: the fitted model has exactly one centre, so kmeans.transform()
# reports each sample's distance to that one centre.
kmeans = KMeans(n_clusters=1, random_state=0, n_init="auto").fit(X)

In [14]:
# Distance of every word embedding to the cluster centre; the result has one
# column per cluster, i.e. a single column here because n_clusters=1.
distances = kmeans.transform(X)

In [15]:
# Store the distances as a new column.
# NOTE: this mutates `words` in place, so earlier cells that display `words`
# will show the extra column when re-run.
words['distance'] = distances

In [16]:
# Preview the frame, which now includes the `distance` column.
words.head()

Unnamed: 0,word,transcription,distance
0,аванпост,ава`нпост,7.611586
1,авиоас,а`виоа`с,6.65547
2,авиобос,а`виобо`с,6.287144
3,авиобранш,а`виобра`нш,5.829149
4,авиоград,а`виогра`д,5.926437


In [17]:
# Ten words farthest from the cluster centre (largest distance first).
words.sort_values(by=['distance'], ascending=False).head(10)

Unnamed: 0,word,transcription,distance
105758,я,я,14.999984
105924,я,я,14.999984
96051,яздя,я`здя,14.545139
87,бич,бич,14.403899
84627,тяло,тя`ло,14.283451
96683,кънтя,кънтя`,14.204163
165,вид,вид,14.19148
272,дек,дек,14.166687
96039,целя,це`ля,14.131298
96792,целя,це`ля,14.131298


In [18]:
# Ten words closest to the cluster centre: this is the tail of the descending
# sort, so the smallest distance is the last row.
words.sort_values(by=['distance'], ascending=False).tail(10)

Unnamed: 0,word,transcription,distance
61413,бибипкане,бибипкане,5.262395
45915,ахилесов,ахиле`сов,5.255245
18557,обаятелка,обаятелка,5.240114
16608,кинофраза,кинофраза,5.226163
36040,файлообмен,файлообмен,5.178284
16054,индустрийка,индустрийка,5.170302
63945,каталясване,каталясване,5.125273
61388,баталясване,баталясване,5.12201
88551,изкалъпвам,изкалъ`пвам,5.112473
73357,двусмисленост,двусми`сленост,5.071452


## Distance calculations

In [19]:
def calcualte_distance(word: str) -> float:
    """Return the distance between a word's embedding and the fitted cluster centre.

    Relies on the notebook globals ``transformer`` (the model) and ``kmeans``
    (the fitted single-cluster KMeans).

    Args:
        word: Word to embed with ``get_embedding`` (default reduction).

    Returns:
        The distance as a plain Python float.
    """
    embedding = get_embedding(transformer, word)
    # transform() expects a 2-D batch and returns one row per sample and one
    # column per cluster; with one sample and one cluster the value is at [0][0].
    distance = kmeans.transform([embedding])

    # Convert the NumPy scalar so the value matches the annotated return type.
    return float(distance[0][0])


# Correctly spelled alias; the original (misspelled) name is kept for existing callers.
calculate_distance = calcualte_distance

In [20]:
# Spot check: distance of a single word from the cluster centre.
calcualte_distance("здравей")

10.205337111822425

In [21]:
# Spot check with a second word.
calcualte_distance("атака")

7.44488404470796

In [22]:
# Spot check with a third word.
calcualte_distance("мерси")

9.650738880690614

## Playground

In [169]:
# Scratch check of np.pad: right-pad a 1-D array to length 5
# (the default mode pads with zeros, as the output shows).
a = np.array([1, 2, 3])

np.pad(a, (0, 5 - len(a)))

array([1, 2, 3, 0, 0])

In [170]:
def concat_embedding(embedding, max_tokens: int = 7, emb_size: int = 512):
    """Flatten per-position vectors into one fixed-length, zero-padded vector.

    Element 0 along the second axis is taken for every position along the first
    axis, the resulting vectors are concatenated in order, and the result is
    right-padded with zeros to ``max_tokens * emb_size`` entries.

    Args:
        embedding: Tensor indexed as ``embedding[position][0]``; presumably
            shaped (tokens, batch, dim) — confirm against the model output.
        max_tokens: Maximum number of positions the output has room for.
        emb_size: Length of one per-position vector.

    Returns:
        1-D ``numpy.ndarray`` of length ``max_tokens * emb_size``. An input
        with zero positions yields an all-zero vector.

    Raises:
        ValueError: If the flattened embedding is longer than the target length.
    """
    vectors = embedding.detach().cpu().numpy()
    # Row-major flatten of vectors[:, 0] equals concatenating vectors[i][0] in order.
    flat = vectors[:, 0].reshape(-1)

    target_len = max_tokens * emb_size
    if flat.size > target_len:
        # np.pad would otherwise fail with an opaque negative-width error.
        raise ValueError(
            f"embedding has {flat.size} values, which exceeds the padded length "
            f"{target_len} (max_tokens={max_tokens}, emb_size={emb_size})"
        )

    return np.pad(flat, (0, target_len - flat.size), mode='constant', constant_values=0)

In [183]:
# Three probe words, each embedded with the concatenating reduction instead of
# the default averaging one.
word1, word2, word3 = 'покана', 'кана', 'допира'

word1_embedding, word2_embedding, word3_embedding = (
    get_embedding(transformer, probe, concat_embedding)
    for probe in (word1, word2, word3)
)

In [172]:
# Sanity check of the vector length; the recorded output (3584,) equals 7 * 512,
# the padded length used by concat_embedding.
word1_embedding.shape

(3584,)

In [184]:
# Euclidean (L2) distance between the padded, concatenated embeddings of word1 and word2.
np.linalg.norm(word1_embedding - word2_embedding)

34.188503

In [185]:
# Euclidean (L2) distance between the padded, concatenated embeddings of word2 and word3.
np.linalg.norm(word2_embedding - word3_embedding)

36.33621

In [186]:
# Euclidean (L2) distance between the padded, concatenated embeddings of word1 and word3.
np.linalg.norm(word1_embedding - word3_embedding)

30.643633