In [None]:
import numpy as np
import matplotlib.pyplot as plt
from utils import preprocess_text, build_vocabulary, generate_skip_grams, generate_cbow_pairs, data_import
from models.skip_gram import SkipGram
from models.cbow import CBOW
import random
from pathlib import Path
from db import load_data_to_db, import_texts

In [None]:
# File name of the SQLite database that the corpus-loading cell checks for and reads from / writes to.
DB_NAME: str = "db.sqlite3"

In [None]:
# Load the text corpus. When the database file is missing, fetch the texts with
# data_import and store them via load_data_to_db; otherwise read them back with
# import_texts. Both paths are capped by the same limit.
CORPUS_LIMIT = 5000

if not Path(DB_NAME).exists():
    text_corpus = data_import(page_limit=CORPUS_LIMIT)
    load_data_to_db(text_corpus, DB_NAME)
else:
    text_corpus = import_texts(DB_NAME, limit=CORPUS_LIMIT)

In [None]:
# Preprocess the raw texts once, then derive the vocabulary and both kinds of
# training data from the same processed corpus.
processed_corpus = preprocess_text(text_corpus)
# `vocab` is later indexed by word (vocab[word]) and handed to both model constructors;
# presumably a word -> index mapping, with index_to_word as the reverse — TODO confirm in utils.
vocab, index_to_word = build_vocabulary(processed_corpus)
skip_grams = generate_skip_grams(processed_corpus)  # passed to SkipGram.train
cbow_pairs = generate_cbow_pairs(processed_corpus)  # passed to CBOW.train

In [None]:
# Take a few texts so we can check how the words relate to each other after training.
# A dedicated generator with a fixed seed picks the same texts on every
# "Restart & Run All" and leaves the global `random` state untouched.
SAMPLE_SEED = 42
SAMPLE_SIZE = 3
# Clamp the sample size: random.sample raises ValueError when asked for more
# items than the population contains.
random_text_subset = random.Random(SAMPLE_SEED).sample(
    text_corpus, min(SAMPLE_SIZE, len(text_corpus))
)

In [None]:
# Run the sampled texts through the same preprocessing as the full corpus and
# build a vocabulary restricted to them; it is iterated later to pick the words
# whose embeddings are inspected.
processed_test_texts = preprocess_text(random_text_subset)
# Only the first returned element is needed. The second is bound to a named
# throwaway instead of `_`: at notebook top level `_` is IPython's "last output"
# variable, and assigning to it shadows that feature for the rest of the session.
test_vocab, _test_index_to_word = build_vocabulary(processed_test_texts)

In [None]:
# Shared setting passed as the second constructor argument to both SkipGram and CBOW.
EMBEDDING_DIM: int = 100

In [None]:
# Build the skip-gram model over the full-corpus vocabulary and train it on the
# skip-gram data; the value returned by `train` is kept for the loss plot.
vocab_size = len(vocab)
skip_gram_model = SkipGram(vocab_size, EMBEDDING_DIM, vocab)
skip_gram_losses = skip_gram_model.train(skip_grams, epochs=100)

In [None]:
# Loss curve for the skip-gram model: one point per recorded loss value,
# plotted against its position in the loss history.
skip_gram_fig, skip_gram_ax = plt.subplots()
skip_gram_steps = np.arange(len(skip_gram_losses))
skip_gram_ax.plot(skip_gram_steps, skip_gram_losses)
skip_gram_ax.set_xlabel('Training step')
skip_gram_ax.set_ylabel('Loss')
skip_gram_ax.set_title('Loss for skip-gram')
plt.show()

In [None]:
# Build the CBOW model over the full-corpus vocabulary and train it on the
# CBOW pairs; the value returned by `train` is kept for the loss plot.
vocab_size = len(vocab)
cbow_models = CBOW(vocab_size, EMBEDDING_DIM, vocab)
cbow_losses = cbow_models.train(cbow_pairs, epochs=100)

In [None]:
# Loss curve for the CBOW model: one point per recorded loss value,
# plotted against its position in the loss history.
cbow_fig, cbow_ax = plt.subplots()
cbow_steps = np.arange(len(cbow_losses))
cbow_ax.plot(cbow_steps, cbow_losses)
cbow_ax.set_xlabel('Training step')
cbow_ax.set_ylabel('Loss')
cbow_ax.set_title('Loss for cbow')
plt.show()

In [None]:
# Look up each sampled word in the full-corpus vocabulary to get the list of indices.
word_idxs = [vocab[token] for token in test_vocab]

# Collect the embedding of every sampled word from each trained model:
# skip-gram first, then CBOW.
skip_gram_embeddings = [skip_gram_model.embed(token) for token in test_vocab]

cbow_embeddings = [cbow_models.embed(token) for token in test_vocab]

In [None]:
# Convert the collected per-word embedding lists into NumPy arrays (one per model),
# rebinding the same names that the later cells read.
skip_gram_embeddings, cbow_embeddings = (
    np.array(skip_gram_embeddings),
    np.array(cbow_embeddings),
)

In [None]:
from utils import reduce_to_k_dim, plot_embeddings

In [None]:
# Reduce both embedding arrays with reduce_to_k_dim, called with its default
# arguments, so the results can be handed to plot_embeddings.
# NOTE(review): the target dimensionality is whatever reduce_to_k_dim defaults to —
# presumably 2 for plotting; confirm in utils.
reduced_skip_gram_embeddings = reduce_to_k_dim(skip_gram_embeddings)
reduced_cbow_embeddings = reduce_to_k_dim(cbow_embeddings)

In [None]:
# Plot the reduced skip-gram embeddings for the sampled words.
# test_vocab is passed alongside — presumably used as point labels; confirm in utils.
plot_embeddings(reduced_skip_gram_embeddings, test_vocab)

In [None]:
# Plot the reduced CBOW embeddings for the same sampled words.
# test_vocab is passed alongside — presumably used as point labels; confirm in utils.
plot_embeddings(reduced_cbow_embeddings, test_vocab)