## Обучаем нашу модель Word2Vec

In [75]:
import numpy as np

class Word2Vec:
    """Minimal Word2Vec-style embedding trainer backed by NumPy.

    Every word shares a single embedding matrix.  For each (target, context)
    pair found inside a sliding window, the dot product of the two word
    vectors is pushed towards 1 with a plain gradient step on the squared
    error.  There is no negative sampling and no separate context matrix.

    Args:
        sentences: Iterable of strings; each string is one whitespace-tokenised
            sentence.
        vector_size: Dimensionality of every word vector.
        window: Maximum number of neighbours taken on each side of the target.
        learning_rate: Step size of each gradient update.
        epochs: Number of full passes over ``sentences``.
    """

    def __init__(self, sentences, vector_size=100, window=5, learning_rate=0.01, epochs=10):
        self.sentences = sentences
        self.vector_size = vector_size
        self.window = window
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.word_to_index = {}
        self.index_to_word = {}
        self.vocabulary_size = 0
        self.embeddings = None

    def build_vocabulary(self):
        """Index every distinct token of ``self.sentences`` in sorted order."""
        # Tokenise exactly as ``train`` does; indexing whole sentences instead
        # would make every multi-word sentence fail with KeyError in training.
        unique_words = sorted(
            {word for sentence in self.sentences for word in sentence.split()}
        )
        self.vocabulary_size = len(unique_words)
        self.word_to_index = {word: i for i, word in enumerate(unique_words)}
        self.index_to_word = {i: word for word, i in self.word_to_index.items()}

    def initialize_embeddings(self):
        """Fill the embedding matrix with small uniform random values."""
        bound = 0.5 / self.vector_size
        self.embeddings = np.random.uniform(
            low=-bound,
            high=bound,
            size=(self.vocabulary_size, self.vector_size),
        )

    def train(self):
        """Run ``self.epochs`` passes of pairwise updates over the sentences.

        The vocabulary and the embedding matrix are created on demand when the
        caller has not prepared them explicitly.

        Raises:
            KeyError: If a sentence contains a token that is missing from an
                already-built vocabulary.
        """
        if not self.word_to_index:
            self.build_vocabulary()
        if self.embeddings is None:
            self.initialize_embeddings()

        for _ in range(self.epochs):
            for sentence in self.sentences:
                indices = [self.word_to_index[word] for word in sentence.split()]
                for i, target_index in enumerate(indices):
                    left = indices[max(0, i - self.window):i]
                    right = indices[i + 1:i + self.window + 1]
                    for context_index in left + right:
                        # Snapshot both rows first: indexing returns views, so
                        # without copies the second update would be computed
                        # from the already-modified target vector.
                        target_vec = self.embeddings[target_index].copy()
                        context_vec = self.embeddings[context_index].copy()
                        error = np.dot(target_vec, context_vec) - 1
                        step = self.learning_rate * error
                        self.embeddings[target_index] -= step * context_vec
                        self.embeddings[context_index] -= step * target_vec

    def get_word_vector(self, word):
        """Return the embedding row of ``word``.

        Raises:
            KeyError: If ``word`` is not in the vocabulary.
        """
        return self.embeddings[self.word_to_index[word]]


def train(data: str) -> dict:
    """Train a ``Word2Vec`` model on ``data`` and return its word vectors.

    Each line of ``data`` is treated as one sentence, so words on the same
    line can act as context for each other.  Splitting the text into single
    words up front would leave every context window empty and skip all
    updates.

    Args:
        data: Raw text; lines are sentences, tokens are whitespace-separated.

    Returns:
        Dict mapping every distinct token to its NumPy embedding vector.
    """
    sentences = data.splitlines()

    word2vec = Word2Vec(sentences=sentences)
    word2vec.build_vocabulary()
    word2vec.initialize_embeddings()
    word2vec.train()

    w2v_dict = {word: word2vec.get_word_vector(word) for word in word2vec.word_to_index}

    return w2v_dict

In [76]:
# Example call on a three-word string; the notebook displays the returned
# word -> vector dict as the cell output below.
train('horrible you son')

{'horrible': array([ 4.27320777e-03, -1.76177463e-03, -4.53445931e-03,  2.06743376e-03,
         3.29533845e-03, -1.17434070e-03,  1.74383917e-03,  1.80342949e-03,
        -3.11334506e-04,  8.30875941e-04,  4.44365802e-03,  3.21240648e-03,
         4.37942633e-03,  4.29619824e-03, -4.00015778e-03, -4.67390285e-03,
        -2.07541700e-03,  2.89535197e-03,  4.37561297e-03,  4.96777599e-03,
        -7.29420869e-04, -4.39413160e-03,  1.27794916e-03,  1.35834234e-03,
         1.28322164e-03, -1.36600476e-03, -2.69698488e-03, -5.75160950e-04,
         2.85520156e-03, -2.01622886e-03,  3.36874977e-03,  4.58491312e-04,
         1.70330488e-03, -3.99909517e-03, -1.13086991e-03, -9.56274278e-04,
         4.87545815e-03,  2.99313369e-03, -4.35469885e-03, -4.94853918e-03,
         1.62970881e-03,  3.05595812e-03, -7.87612292e-07, -3.37777198e-03,
         4.91055132e-04, -1.01703649e-03,  4.22208447e-04, -1.72387117e-03,
         2.84900889e-03, -3.45850154e-03,  3.25235680e-03,  5.33149972e-04,
