In [1]:
import pandas as pd
import tqdm

from src.distance import distance, levenshtein, levenshtein_optimized
from src.word_to_embedding import WordToEmbedding

In [13]:
words = pd.read_csv('/mnt/d/Projects/masters-thesis/data/words_with_embeddings.csv')
words.columns

Index(['Unnamed: 0.1', 'Unnamed: 0', 'word', 'embedding'], dtype='object')

In [15]:
words_10k = words.sample(25_000)
del words_10k['Unnamed: 0.1']
del words_10k['Unnamed: 0']
del words_10k['embedding']
words_10k.head()

Unnamed: 0,word
96804,боклуча
28883,овчарство
102485,проскубя
54289,синхронизиран
89233,лискам


In [16]:
words_10k.to_csv('/mnt/d/Projects/masters-thesis/data/words_25k.csv', index=False)

# Levenshtein distance

In [8]:
w2e = WordToEmbedding()

In [17]:
words_10k['embedding'] = words_10k['word'].apply(lambda x: w2e.get_embedding(x))

In [18]:
import numpy as np
results = {}

for word_1 in tqdm.tqdm(words_10k.iloc()):
    current_sum = 0
    current_count = 0
    word_1_distances = [np.linalg.norm(x) for x in word_1['embedding']]

    for word_2 in tqdm.tqdm(words_10k.iloc()):
        word_2_distances = [np.linalg.norm(x) for x in word_2['embedding']]

        current_sum += levenshtein_optimized(word_1['embedding'], word_2['embedding'], word_1_distances, word_2_distances)
        current_count += 1

    results[word_1['word']] = current_sum / current_count

25000it [00:02, 8623.32it/s]
25000it [00:02, 8681.04it/s]
25000it [00:02, 10449.67it/s]
25000it [00:03, 6574.58it/s]
25000it [00:02, 10391.99it/s]
25000it [00:03, 7537.06it/s]
25000it [00:03, 7542.61it/s]
25000it [00:02, 8622.31it/s]
25000it [00:03, 7422.96it/s]
25000it [00:03, 7424.51it/s]
25000it [00:04, 5887.73it/s]
25000it [00:03, 6617.44it/s]
25000it [00:03, 7444.20it/s]
25000it [00:03, 7429.51it/s]
25000it [00:03, 7536.35it/s]
25000it [00:03, 7530.05it/s]
25000it [00:02, 8714.11it/s]
25000it [00:03, 6634.68it/s]
25000it [00:02, 8717.24it/s]
25000it [00:02, 10399.96it/s]
25000it [00:01, 12969.24it/s]
25000it [00:03, 6645.98it/s]
25000it [00:03, 7577.49it/s]
25000it [00:02, 10407.32it/s]
25000it [00:04, 5303.45it/s]
25000it [00:02, 10397.89it/s]
25000it [00:02, 10401.57it/s]
25000it [00:04, 5310.99it/s]
25000it [00:04, 5933.71it/s]
25000it [00:02, 8689.66it/s]
25000it [00:02, 8544.54it/s]
25000it [00:05, 4540.34it/s]
25000it [00:03, 7430.11it/s]
25000it [00:02, 10378.71it/s]
25000i

1it [00:20, 20.87s/it]
1it [00:17, 17.64s/it]

In [19]:
distances = pd.DataFrame.from_dict(results, orient='index', columns=['distance'])

In [20]:
distances.to_csv('/mnt/d/Projects/masters-thesis/data/distances_25k.csv')

In [48]:
distances = pd.read_csv('/mnt/d/Projects/masters-thesis/data/distances_25k.csv')
distances = distances.sort_values(by=['distance'])
distances.to_csv('/mnt/d/Projects/masters-thesis/data/distances_25k.csv')

In [25]:
distances.head()

Unnamed: 0.1,Unnamed: 0,distance
12403,по,77.505408
19363,ли,79.014382
93,е,79.321803
3966,на,79.67022
7029,по-,79.939854


In [33]:
core_words = ['по', 'ли', 'е', 'на', 'ни']
core_words_embeddings = [w2e.get_embedding(word) for word in core_words]

central_embedding = sum(core_words_embeddings) / 5
np.save('/mnt/d/Projects/masters-thesis/data/core_word_embedding_25k', central_embedding)

In [40]:
words_10k['distance'] = words_10k['embedding'].apply(lambda x: np.linalg.norm(x - central_embedding))

In [47]:
words_10k.to_csv('/mnt/d/Projects/masters-thesis/data/words_25k.csv', index=False)