In [1]:
class HuffmanNode:
    """One vertex of the Huffman tree: either a word leaf or a merged inner node.

    Attributes:
        word_id: id of the word (leaf) or of the merged node (inner).
        frequency: weight used when the tree is built.
        left_child, right_child, father: tree links, all initially None.
        Huffman_code: list of 0/1 decisions from the root, initially empty.
        path: ids of the inner nodes above this node, initially empty.
    """

    def __init__(self, word_id, frequency):
        # Identity and weight.
        self.word_id, self.frequency = word_id, frequency
        # Links to neighbouring nodes start out unset.
        self.father = self.left_child = self.right_child = None
        # Every instance owns its own two lists (never shared between nodes).
        self.path = list()
        self.Huffman_code = list()


class HuffmanTree:
    """Huffman tree over word ids that yields a binary code and a node path per word.

    Leaves are the entries of ``wordid_frequency_dict``; inner nodes are numbered
    from ``len(wordid_frequency_dict)`` upwards in creation order. ``self.huffman``
    is indexed by word id, so the word ids are expected to be exactly ``0 .. n-1``.

    Attributes:
        word_count: number of leaf words.
        wordid_code: word id -> list of 0/1 decisions from the root (1 = left child).
        wordid_path: word id -> ids of the inner nodes from the root down to the
            leaf's parent.
        root: top node of the tree, or None when the vocabulary is empty.
        huffman: leaf nodes in dict order, followed by every inner node.
    """

    def __init__(self, wordid_frequency_dict):
        """Build the tree and fill ``wordid_code`` / ``wordid_path``.

        Args:
            wordid_frequency_dict: mapping word id -> frequency. May be empty, in
                which case ``root`` stays None and both result dicts stay empty.
        """
        self.word_count = len(wordid_frequency_dict)
        self.wordid_code = dict()
        self.wordid_path = dict()
        self.root = None
        # Working list consumed by build_tree (nodes not merged yet).
        unmerge_node_list = [HuffmanNode(wordid, frequency) for wordid, frequency in wordid_frequency_dict.items()]
        # Separate leaf objects kept for lookup by index; merge_node appends the inner nodes.
        self.huffman = [HuffmanNode(wordid, frequency) for wordid, frequency in wordid_frequency_dict.items()]
        self.build_tree(unmerge_node_list)
        self.generate_huffman_code_and_path()

    def merge_node(self, node1, node2):
        """Create the parent of ``node1`` and ``node2`` and register it in ``self.huffman``.

        The child with the larger (or equal, then ``node1``) frequency becomes the
        left child. The parent's id is the current length of ``self.huffman``.

        Returns:
            The newly created parent node.
        """
        sum_frequency = node1.frequency + node2.frequency
        mid_node_id = len(self.huffman)
        father_node = HuffmanNode(mid_node_id, sum_frequency)
        if node1.frequency >= node2.frequency:
            father_node.left_child = node1
            father_node.right_child = node2
        else:
            father_node.left_child = node2
            father_node.right_child = node1
        # Keep the upward link in sync with the downward ones.
        node1.father = father_node
        node2.father = father_node
        self.huffman.append(father_node)
        return father_node

    def build_tree(self, node_list):
        """Repeatedly merge the two lowest-frequency nodes until one root is left.

        ``node_list`` is consumed in place. For an empty list ``self.root`` is set
        to None instead of raising.
        """
        while len(node_list) > 1:
            # lowest = index of the smallest frequency, second = index of the runner-up.
            lowest, second = 0, 1
            if node_list[second].frequency < node_list[lowest].frequency:
                lowest, second = second, lowest
            for i in range(2, len(node_list)):
                if node_list[i].frequency < node_list[second].frequency:
                    second = i
                    if node_list[second].frequency < node_list[lowest].frequency:
                        lowest, second = second, lowest
            father_node = self.merge_node(node_list[lowest], node_list[second])
            # The two indices are always distinct; remove the larger one first so
            # the smaller index is still valid for the second pop.
            for index in sorted((lowest, second), reverse=True):
                node_list.pop(index)
            # The merged node goes to the front, which influences later tie-breaking.
            node_list.insert(0, father_node)
        self.root = node_list[0] if node_list else None

    def generate_huffman_code_and_path(self):
        """Walk the tree from the root and record code and path for every leaf.

        A left edge contributes code bit 1, a right edge code bit 0; both children
        receive the parent's path extended by the parent's id. Results are copied
        to ``self.huffman[word_id]`` and to the two ``wordid_*`` dictionaries.
        """
        if self.root is None:
            # Empty vocabulary: nothing to label.
            return
        stack = [self.root]
        while len(stack) > 0:
            node = stack.pop()
            # Descend along left children, deferring each right child to the stack.
            while node.left_child or node.right_child:
                code = node.Huffman_code
                path = node.path
                node.left_child.Huffman_code = code + [1]
                node.right_child.Huffman_code = code + [0]
                node.left_child.path = path + [node.word_id]
                node.right_child.path = path + [node.word_id]
                stack.append(node.right_child)
                node = node.left_child
            # `node` is a leaf here.
            word_id = node.word_id
            word_code = node.Huffman_code
            word_path = node.path
            # The leaves inside the tree are not the objects stored in self.huffman,
            # so mirror the results onto the indexed copy.
            self.huffman[word_id].Huffman_code = word_code
            self.huffman[word_id].path = word_path
            self.huffman[word_id].father = node.father

            self.wordid_code[word_id] = word_code
            self.wordid_path[word_id] = word_path

    def get_all_pos_and_neg_path(self):
        """Split every word's path by its code bits.

        Returns:
            ``(positive, negative)``: two lists indexed by word id. ``positive[w]``
            holds the path node ids where the code bit of ``w`` is 1,
            ``negative[w]`` those where it is 0.
        """
        positive = []
        negative = []
        for word_id in range(self.word_count):
            leaf = self.huffman[word_id]
            pos_id = []
            neg_id = []
            for i, code in enumerate(leaf.Huffman_code):
                if code == 1:
                    pos_id.append(leaf.path[i])
                else:
                    neg_id.append(leaf.path[i])
            positive.append(pos_id)
            negative.append(neg_id)
        return positive, negative


def test():
    """Smoke test: build a tree for five words and print its codes and paths."""
    word_frequency = {0: 4, 1: 6, 2: 3, 3: 2, 4: 2}
    print(word_frequency)
    tree = HuffmanTree(word_frequency)
    # Code table first, then path table.
    for table in (tree.wordid_code, tree.wordid_path):
        print(table)
    # The first len(word_frequency) entries of tree.huffman are the leaves.
    for leaf in tree.huffman[:len(word_frequency)]:
        print(leaf.path)
    print(tree.get_all_pos_and_neg_path())


if __name__ == '__main__':
    test()

{0: 4, 1: 6, 2: 3, 3: 2, 4: 2}
{1: [1, 1], 0: [1, 0], 3: [0, 1, 1], 4: [0, 1, 0], 2: [0, 0]}
{1: [8, 7], 0: [8, 7], 3: [8, 6, 5], 4: [8, 6, 5], 2: [8, 6]}
[8, 7]
[8, 7]
[8, 6]
[8, 6, 5]
[8, 6, 5]
([[8], [8, 7], [], [6, 5], [6]], [[7], [], [8, 6], [8], [8, 5]])


In [2]:
from collections import deque
import nltk
import re
from nltk.corpus import brown
from nltk.corpus import gutenberg
# Fetch the NLTK resources when this cell runs.
nltk.download('gutenberg')  # NOTE(review): gutenberg is imported above but no visible cell reads it
nltk.download('stopwords')  # read by InputData.normalize
nltk.download('brown')  # corpus handed to InputData / Word2Vec in later cells
nltk.download('punkt')  # NOTE(review): no visible code uses the punkt tokenizer - confirm it is needed


class InputData:
    """Corpus wrapper that builds the vocabulary, Huffman paths and CBOW training pairs.

    Attributes:
        sentences: normalized sentences (lists of lower-case words).
        word2id_dict / id2word_dict: vocabulary mappings; ids follow first-seen order.
        wordId_frequency_dict: word id -> number of occurrences.
        word_count: number of distinct words; also used as the padding id.
        word_count_sum: total number of words, which equals the number of
            (context, target) pairs one pass over the corpus produces.
        sentence_count: number of sentences kept after normalization.
        huffman_pos_path / huffman_neg_path: per word id, the Huffman path nodes
            with code bit 1 / 0.
        word_pairs_queue: FIFO of pending ``(context_ids, target_id)`` pairs.
    """

    # Number of sentences turned into pairs by one generate_context_word_pairs call.
    SENTENCES_PER_CHUNK = 20

    def __init__(self, sentences):
        """Normalize ``sentences`` (an iterable of token lists) and build all tables."""
        self.sentences = sentences
        self.normalize()
        self.counter = 0
        self.wordId_frequency_dict = dict()
        self.word_count = 0  # Number of words (repeated words only count as 1)
        self.word_count_sum = 0  # Total number of words (the number of repeated words also accumulates)
        self.sentence_count = 0  # Number of sentences
        self.id2word_dict = dict()
        self.word2id_dict = dict()
        self._init_dict()  # Initialize the dictionary
        self.huffman_tree = HuffmanTree(self.wordId_frequency_dict)  # Huffman tree
        self.huffman_pos_path, self.huffman_neg_path = self.huffman_tree.get_all_pos_and_neg_path()
        self.word_pairs_queue = deque()

        print('Word Count is:', self.word_count)
        print('Word Count Sum is', self.word_count_sum)
        print('Sentence Count is:', self.sentence_count)
        print('Tree Node is:', len(self.huffman_tree.huffman))

    def normalize(self):
        """Clean every sentence and replace ``self.sentences`` with the result.

        Each sentence is joined, stripped of every character that is not an ASCII
        letter or whitespace, lower-cased and split on single spaces; English stop
        words are dropped. Sentences left with fewer than two words are discarded.
        """
        # A set gives constant-time membership tests in the filter below.
        stop_words = set(nltk.corpus.stopwords.words('english'))
        norm_sentences_word_list = []
        for word_list in self.sentences:
            sentence = " ".join(word_list)
            sentence = re.sub(r'[^a-zA-Z\s]', '', sentence)
            sentence = sentence.lower()
            sentence = re.sub(' +', ' ', sentence)
            sentence = sentence.strip()
            norm_word_list = [word for word in sentence.split(' ') if word not in stop_words]
            if len(norm_word_list) > 1:
                norm_sentences_word_list.append(norm_word_list)

        self.sentences = norm_sentences_word_list

    def _init_dict(self):
        """Count words and fill the id/word/frequency dictionaries and the counters."""
        word_freq = dict()
        for word_list in self.sentences:
            self.word_count_sum += len(word_list)
            self.sentence_count += 1
            for word in word_list:
                word_freq[word] = word_freq.get(word, 0) + 1
        # Ids are handed out in first-seen order (dict insertion order).
        for word_id, (per_word, per_count) in enumerate(word_freq.items()):
            self.id2word_dict[word_id] = per_word
            self.word2id_dict[per_word] = word_id
            self.wordId_frequency_dict[word_id] = per_count
        self.word_count = len(self.word2id_dict)

    def generate_context_word_pairs(self, window_size):
        """Append the pairs of the next chunk of sentences to ``word_pairs_queue``.

        Every word yields one ``(context_ids, target_id)`` pair whose context has
        exactly ``2 * window_size`` entries; positions outside the sentence are
        filled with the padding id ``self.word_count``. When the corpus is
        exhausted the queue is cleared and reading restarts at the first chunk.
        """
        chunk_size = self.SENTENCES_PER_CHUNK
        self.counter += 1
        chunk = self.sentences[chunk_size * (self.counter - 1):chunk_size * self.counter]
        if not chunk:
            # Past the last sentence: drop leftovers and start over.
            self.counter = 1
            self.word_pairs_queue.clear()
            chunk = self.sentences[:chunk_size]
        sub_wids = [[self.word2id_dict[word] for word in word_list] for word_list in chunk]
        padding_id = self.word_count
        for words in sub_wids:
            sentence_length = len(words)
            for index, word in enumerate(words):
                context_words = []
                for i in range(index - window_size, index + window_size + 1):
                    if i == index:
                        continue  # the target itself is not part of its context
                    if 0 <= i < sentence_length:
                        context_words.append(words[i])
                    else:
                        context_words.append(padding_id)

                self.word_pairs_queue.append((context_words, word))

    def get_batch_pairs(self, batch_size, window_size):
        """Return the next ``batch_size`` pairs, generating more when the queue runs short.

        Raises:
            ValueError: if the queue is short and ``batch_size`` is larger than the
                number of pairs one pass over the corpus yields. The queue is
                cleared on every wrap-around, so such a batch could never be
                filled and the refill loop would not terminate.
        """
        if len(self.word_pairs_queue) < batch_size and batch_size > self.word_count_sum:
            raise ValueError(
                'batch_size (%d) exceeds the number of pairs in the corpus (%d)'
                % (batch_size, self.word_count_sum)
            )

        while len(self.word_pairs_queue) < batch_size:
            self.generate_context_word_pairs(window_size)

        return [self.word_pairs_queue.popleft() for _ in range(batch_size)]

    def get_pairs(self, pos_pairs):
        """Expand ``(context, target)`` pairs into ``(context, path_node)`` pairs.

        Returns:
            ``(pos_word_pair, neg_word_pair)``: for each input pair, the context is
            combined with every node of the target's positive path and negative
            path respectively.
        """
        neg_word_pair = []
        pos_word_pair = []
        for context, target in pos_pairs:
            pos_word_pair += [(context, node) for node in self.huffman_pos_path[target]]
            neg_word_pair += [(context, node) for node in self.huffman_neg_path[target]]
        return pos_word_pair, neg_word_pair

    def evaluate_pairs_count(self):
        """Return the number of pairs per corpus pass (one per word occurrence)."""
        return self.word_count_sum


# Test all methods
def test():
    """Smoke test for InputData on part of the Brown corpus; prints intermediate results."""
    sentences = brown.sents(categories=['news','reviews','humor','hobbies','fiction'])
    test_data = InputData(sentences)

    def to_words(pairs):
        # Translate ids back to words; the padding id (== word_count) is left out.
        lookup = test_data.id2word_dict
        padding_id = test_data.word_count
        return [([lookup[i] for i in pair[0] if i != padding_id], lookup[pair[1]]) for pair in pairs]

    # First batch: raw sentences, id pairs, decoded pairs.
    pos_pairs = test_data.get_batch_pairs(10, 2)
    for position in (0, 1):
        print(sentences[position])
    print(pos_pairs)
    print(to_words(pos_pairs))
    print('')

    # Huffman paths of word 0 and the expanded (context, node) pairs.
    print(test_data.huffman_pos_path[0])
    print(test_data.huffman_neg_path[0])
    pos, neg = test_data.get_pairs(pos_pairs)
    print(pos)
    print(neg)

    # Second batch continues where the first one stopped.
    pos_pairs = test_data.get_batch_pairs(10, 2)
    pos_word_pairs = to_words(pos_pairs)
    print(pos_pairs)
    print(pos_word_pairs)

    # Spot check of one vocabulary entry.
    print(test_data.id2word_dict[4846])


if __name__ == '__main__':
    test()

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Unzipping corpora/gutenberg.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
Word Count is: 24794
Word Count Sum is 147000
Sentence Count is: 15146
Tree Node is: 49587
['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.']
['The', 'jury', 'further', 'said', 'in', 'term-end', 'presentments', 'that', 'the', 'City', 'Executive', 'Committee', ',', 'which', 'had', 'over-all', 'charge', 'of', 'the', 'election', ',', '``', 'deserves', 'the', 'praise', 'and', 'thanks', 

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances


class CBOWModel(nn.Module):
    """CBOW model scored against Huffman-tree nodes (hierarchical softmax).

    Two embedding tables with ``2 * emb_size - 1`` rows each are kept:
    ``u_embeddings`` is looked up with context word ids, ``w_embeddings`` with
    the node ids passed as ``pos_w`` / ``neg_w``.
    """

    def __init__(self, emb_size, emb_dimension):
        """
        Args:
            emb_size: vocabulary size; both tables get ``2 * emb_size - 1`` rows.
            emb_dimension: length of every embedding vector.
        """
        super(CBOWModel, self).__init__()
        self.emb_size = emb_size
        self.emb_dimension = emb_dimension
        self.u_embeddings = nn.Embedding(2*self.emb_size-1, self.emb_dimension, sparse=True)
        self.w_embeddings = nn.Embedding(2*self.emb_size-1, self.emb_dimension, sparse=True)
        self._init_embedding()

    def _init_embedding(self):
        """Initialise ``u`` uniformly in ``[-0.5/dim, 0.5/dim]`` and ``w`` with zeros."""
        int_range = 0.5 / self.emb_dimension
        self.u_embeddings.weight.data.uniform_(-int_range, int_range)
        # Both bounds are 0, so every weight of w_embeddings starts at exactly 0.
        self.w_embeddings.weight.data.uniform_(-0, 0)

    def compute_context_matrix(self, u):
        """Average the context embeddings of every sample.

        Args:
            u: nested list (or tensor) of context ids, one row per sample.

        Returns:
            Tensor of shape ``(len(u), emb_dimension)``. The batch dimension is
            kept even for a single sample, and an empty ``u`` gives a
            ``(0, emb_dimension)`` tensor.
        """
        indices = torch.as_tensor(u, dtype=torch.long)
        if indices.numel() == 0:
            # Nothing to look up; return an empty batch that still broadcasts.
            return self.u_embeddings.weight.new_zeros((0, self.emb_dimension))
        # Reduce only the context axis. A bare squeeze() would also remove a
        # batch (or embedding) axis of size 1 and break the dim=1 sum in forward.
        return torch.mean(self.u_embeddings(indices), dim=1)

    def _pair_scores(self, u, w):
        """Dot product between each averaged context and its node embedding."""
        context_emb = self.compute_context_matrix(u)
        node_emb = self.w_embeddings(torch.as_tensor(w, dtype=torch.long))
        return torch.sum(torch.mul(context_emb, node_emb), dim=1)

    def forward(self, pos_u, pos_w, neg_u, neg_w):
        """Negative log-likelihood of the given (context, node) pairs.

        Args:
            pos_u, pos_w: contexts and node ids whose score should be pushed up.
            neg_u, neg_w: contexts and node ids whose score should be pushed down.

        Returns:
            Scalar tensor ``-(sum(logsigmoid(s_pos)) + sum(logsigmoid(-s_neg)))``.
        """
        pos_log_prob = F.logsigmoid(self._pair_scores(pos_u, pos_w))
        neg_log_prob = F.logsigmoid(-1 * self._pair_scores(neg_u, neg_w))

        loss = torch.sum(pos_log_prob) + torch.sum(neg_log_prob)
        return -1 * loss

    def distance_matrix(self, word_count):
        """Pairwise Euclidean distances between the first ``word_count`` rows of ``u_embeddings``.

        Returns:
            ``(word_count, word_count)`` NumPy array.
        """
        # detach() leaves the autograd graph, cpu() makes the NumPy conversion valid.
        embedding = self.u_embeddings.weight.detach().cpu().numpy()[:word_count]
        distance_matrix = euclidean_distances(embedding)
        return distance_matrix


def test():
    """Smoke test: one forward pass on toy indices, then print the distance-matrix shape."""
    model = CBOWModel(100, 10)

    # The same two 4-word contexts are used on the positive and the negative side.
    contexts = [[9, 1, 2, 3], [0, 1, 2, 3]]
    positive_nodes, negative_nodes = [50, 70], [30, 42]
    model.forward(contexts, positive_nodes, contexts, negative_nodes)

    print(model.distance_matrix(5).shape)


if __name__ == '__main__':
    test()

(5, 5)


![image.png](attachment:image.png)

![image.png](attachment:image.png)

In [189]:
import torch.optim as optim
from tqdm import tqdm
from torch.optim.lr_scheduler import LambdaLR

# Hyperparameters read by the Word2Vec class below.
WINDOW_SIZE = 2  # context words taken on each side of the target word
BATCH_SIZE = 1000  # (context, target) pairs requested per mini-batch
EMB_DIMENSION = 100  # embedding dimension
LR = 0.01  # initial SGD learning rate; Word2Vec's scheduler multiplies it by 0.99 per step


class Word2Vec:
    """Ties together the corpus (InputData), the CBOW model and its SGD training loop."""

    def __init__(self,sentences):
        """Prepare the data, the model, the optimizer and the 0.99-per-epoch LR decay.

        Args:
            sentences: iterable of token lists, passed straight to InputData.
        """
        self.data = InputData(sentences)
        self.model = CBOWModel(self.data.word_count, EMB_DIMENSION)
        self.lr = LR
        self.optimizer = optim.SGD(self.model.parameters(), lr=self.lr)
        # Multiplicative factor applied to the base LR after `epoch` scheduler steps.
        lambda1 = lambda epoch: 0.99 ** epoch
        self.scheduler = LambdaLR(self.optimizer, lr_lambda=lambda1)

    def train(self, epochs=50):
        """Run ``epochs`` passes of mini-batch SGD and print the mean batch loss per epoch.

        Model weights, the pair queue and the scheduler are not reset, so calling
        this method again continues the previous run.

        Args:
            epochs: number of epochs to run (default 50).

        Raises:
            ValueError: if the corpus yields fewer pairs than one BATCH_SIZE batch,
                because no training step could be taken.
        """
        print("CBOW Training......")
        pairs_count = self.data.evaluate_pairs_count()
        print("pairs_count", pairs_count)
        batch_count = pairs_count / BATCH_SIZE
        print("batch_count", batch_count)
        # Only full batches are processed.
        num_batches = int(batch_count)
        if num_batches == 0:
            raise ValueError(
                'corpus yields %d pairs, fewer than one batch of %d' % (pairs_count, BATCH_SIZE)
            )
        for epoch in range(1, epochs + 1):
            loss_total = 0.0
            process_bar = tqdm(range(num_batches))
            for _ in process_bar:
                pos_pairs = self.data.get_batch_pairs(BATCH_SIZE, WINDOW_SIZE)
                pos_pairs, neg_pairs = self.data.get_pairs(pos_pairs)

                pos_u = [pair[0] for pair in pos_pairs]
                pos_v = [int(pair[1]) for pair in pos_pairs]
                neg_u = [pair[0] for pair in neg_pairs]
                neg_v = [int(pair[1]) for pair in neg_pairs]

                self.optimizer.zero_grad()
                loss = self.model(pos_u, pos_v, neg_u, neg_v)
                loss.backward()
                self.optimizer.step()
                # item() gives a plain float; adding the tensor itself would keep
                # every batch's autograd history alive for the whole epoch.
                loss_total += loss.item()

            print("epoch:", epoch, "loss:", loss_total / num_batches)
            self.scheduler.step()

    def get_distance_matrix(self):
        """Return the pairwise distance matrix of the word embeddings (vocabulary rows only)."""
        distance_matrix = self.model.distance_matrix(self.data.word_count)
        return distance_matrix


In [190]:
# Build vocabulary, Huffman tree, model and optimizer from five Brown corpus categories.
# Note: this category list differs from the one used in the InputData smoke test above.
sentences = brown.sents(categories=['news','reviews','government','hobbies','romance'])
w2v = Word2Vec(sentences)

Word Count is: 24616
Word Count Sum is 170964
Sentence Count is: 17106
Tree Node is: 49231


In [191]:
# First training run on the freshly constructed `w2v` object.
w2v.train()

CBOW Training......
pairs_count 170964
batch_count 170.964


100%|██████████| 170/170 [00:17<00:00,  9.90it/s]


epoch: 1 loss: tensor(8841.2080, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.65it/s]


epoch: 2 loss: tensor(8810.0869, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.72it/s]


epoch: 3 loss: tensor(8780.3594, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.66it/s]


epoch: 4 loss: tensor(8755.3984, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.63it/s]


epoch: 5 loss: tensor(8732.7959, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:16<00:00, 10.52it/s]


epoch: 6 loss: tensor(8711.9150, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.79it/s]


epoch: 7 loss: tensor(8686.6914, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:16<00:00, 10.19it/s]


epoch: 8 loss: tensor(8657.7236, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:16<00:00, 10.04it/s]


epoch: 9 loss: tensor(8625.7793, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:16<00:00, 10.36it/s]


epoch: 10 loss: tensor(8595.2041, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.77it/s]


epoch: 11 loss: tensor(8560.6621, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:16<00:00, 10.62it/s]


epoch: 12 loss: tensor(8523.9795, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:16<00:00, 10.44it/s]


epoch: 13 loss: tensor(8492.6328, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:16<00:00, 10.62it/s]


epoch: 14 loss: tensor(8469.0488, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.91it/s]


epoch: 15 loss: tensor(8426.6807, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.69it/s]


epoch: 16 loss: tensor(8391.4424, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.70it/s]


epoch: 17 loss: tensor(8351.0381, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.90it/s]


epoch: 18 loss: tensor(8314.6299, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.64it/s]


epoch: 19 loss: tensor(8281.6641, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.65it/s]


epoch: 20 loss: tensor(8235.2422, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:16<00:00, 10.60it/s]


epoch: 21 loss: tensor(8199.5020, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.87it/s]


epoch: 22 loss: tensor(8159.7983, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:16<00:00, 10.61it/s]


epoch: 23 loss: tensor(8112.0059, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.67it/s]


epoch: 24 loss: tensor(8076.7905, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.67it/s]


epoch: 25 loss: tensor(8036.3833, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.91it/s]


epoch: 26 loss: tensor(7977.1748, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.67it/s]


epoch: 27 loss: tensor(7939.7554, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.66it/s]


epoch: 28 loss: tensor(7896.9023, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.71it/s]


epoch: 29 loss: tensor(7866.6274, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.94it/s]


epoch: 30 loss: tensor(7811.5044, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.72it/s]


epoch: 31 loss: tensor(7763.9780, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.70it/s]


epoch: 32 loss: tensor(7721.4404, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.73it/s]


epoch: 33 loss: tensor(7660.0801, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.95it/s]


epoch: 34 loss: tensor(7640.9316, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.72it/s]


epoch: 35 loss: tensor(7579.2168, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.68it/s]


epoch: 36 loss: tensor(7527.4028, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.95it/s]


epoch: 37 loss: tensor(7474.0015, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.71it/s]


epoch: 38 loss: tensor(7423.4414, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.72it/s]


epoch: 39 loss: tensor(7378.1914, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.66it/s]


epoch: 40 loss: tensor(7353.5796, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.69it/s]


epoch: 41 loss: tensor(7276.7256, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.96it/s]


epoch: 42 loss: tensor(7238.9609, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.67it/s]


epoch: 43 loss: tensor(7206.8027, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.77it/s]


epoch: 44 loss: tensor(7144.5244, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.68it/s]


epoch: 45 loss: tensor(7103.2964, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.89it/s]


epoch: 46 loss: tensor(7048.5264, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.66it/s]


epoch: 47 loss: tensor(6994.1045, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.70it/s]


epoch: 48 loss: tensor(6950.0889, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.68it/s]


epoch: 49 loss: tensor(6903.6265, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.96it/s]

epoch: 50 loss: tensor(6862.5098, grad_fn=<DivBackward0>)





In [234]:
# Repeated training call on the same `w2v` object: Word2Vec.train() does not reset
# the model or the LR scheduler, so this continues from the state left by earlier calls.
# NOTE(review): reproducing the numbers below requires running the earlier train() cell first.
w2v.train()

CBOW Training......
pairs_count 170964
batch_count 170.964


100%|██████████| 170/170 [00:16<00:00, 10.45it/s]


epoch: 1 loss: tensor(6822.8618, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.70it/s]


epoch: 2 loss: tensor(6776.9663, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.86it/s]


epoch: 3 loss: tensor(6728.5425, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.81it/s]


epoch: 4 loss: tensor(6681.2227, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.75it/s]


epoch: 5 loss: tensor(6636.5571, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.75it/s]


epoch: 6 loss: tensor(6586.4272, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.72it/s]


epoch: 7 loss: tensor(6541.8926, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.80it/s]


epoch: 8 loss: tensor(6487.2227, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.74it/s]


epoch: 9 loss: tensor(6456.5132, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.96it/s]


epoch: 10 loss: tensor(6420.5566, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.91it/s]


epoch: 11 loss: tensor(6363.1567, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.70it/s]


epoch: 12 loss: tensor(6338.5376, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.90it/s]


epoch: 13 loss: tensor(6295.3848, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.95it/s]


epoch: 14 loss: tensor(6238.7412, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.91it/s]


epoch: 15 loss: tensor(6212.8950, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.91it/s]


epoch: 16 loss: tensor(6175.8101, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.86it/s]


epoch: 17 loss: tensor(6145.2563, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.76it/s]


epoch: 18 loss: tensor(6084.4404, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.94it/s]


epoch: 19 loss: tensor(6058.4414, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.95it/s]


epoch: 20 loss: tensor(6011.6333, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.83it/s]


epoch: 21 loss: tensor(5996.9009, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.90it/s]


epoch: 22 loss: tensor(5952.8589, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.97it/s]


epoch: 23 loss: tensor(5923.2637, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.97it/s]


epoch: 24 loss: tensor(5877.9985, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.87it/s]


epoch: 25 loss: tensor(5835.3218, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 11.00it/s]


epoch: 26 loss: tensor(5799.3154, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.97it/s]


epoch: 27 loss: tensor(5762.2046, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.95it/s]


epoch: 28 loss: tensor(5725.9902, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 11.01it/s]


epoch: 29 loss: tensor(5718.3262, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.96it/s]


epoch: 30 loss: tensor(5682.9409, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.86it/s]


epoch: 31 loss: tensor(5637.1162, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.91it/s]


epoch: 32 loss: tensor(5606.1875, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.96it/s]


epoch: 33 loss: tensor(5570.1470, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.93it/s]


epoch: 34 loss: tensor(5541.4102, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.98it/s]


epoch: 35 loss: tensor(5519.3867, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.78it/s]


epoch: 36 loss: tensor(5491.2104, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.96it/s]


epoch: 37 loss: tensor(5461.8906, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.95it/s]


epoch: 38 loss: tensor(5446.5991, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 11.00it/s]


epoch: 39 loss: tensor(5408.8521, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.91it/s]


epoch: 40 loss: tensor(5372.3643, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.96it/s]


epoch: 41 loss: tensor(5357.7842, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.93it/s]


epoch: 42 loss: tensor(5321.8516, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 11.00it/s]


epoch: 43 loss: tensor(5291.3784, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.73it/s]


epoch: 44 loss: tensor(5267.9795, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.89it/s]


epoch: 45 loss: tensor(5251.5186, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.88it/s]


epoch: 46 loss: tensor(5216.6216, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.87it/s]


epoch: 47 loss: tensor(5203.5552, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.78it/s]


epoch: 48 loss: tensor(5171.8374, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.68it/s]


epoch: 49 loss: tensor(5152.9751, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.85it/s]

epoch: 50 loss: tensor(5114.2725, grad_fn=<DivBackward0>)





In [249]:
# Another training call on the same `w2v` object; weights and LR schedule carry over
# from the previous train() cells rather than starting fresh.
w2v.train()

CBOW Training......
pairs_count 170964
batch_count 170.964


100%|██████████| 170/170 [00:16<00:00, 10.53it/s]


epoch: 1 loss: tensor(5102.5322, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:16<00:00, 10.56it/s]


epoch: 2 loss: tensor(5064.4951, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:16<00:00, 10.50it/s]


epoch: 3 loss: tensor(5050.1001, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:16<00:00, 10.34it/s]


epoch: 4 loss: tensor(5036.2583, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.63it/s]


epoch: 5 loss: tensor(5010.4775, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:16<00:00, 10.59it/s]


epoch: 6 loss: tensor(4977.7085, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:16<00:00, 10.59it/s]


epoch: 7 loss: tensor(4974.0146, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:16<00:00, 10.56it/s]


epoch: 8 loss: tensor(4949.4673, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:16<00:00, 10.51it/s]


epoch: 9 loss: tensor(4918.1392, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:16<00:00, 10.45it/s]


epoch: 10 loss: tensor(4905.5825, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:16<00:00, 10.56it/s]


epoch: 11 loss: tensor(4876.7261, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:16<00:00, 10.59it/s]


epoch: 12 loss: tensor(4872.6035, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:16<00:00, 10.55it/s]


epoch: 13 loss: tensor(4841.7837, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:16<00:00, 10.44it/s]


epoch: 14 loss: tensor(4829.6616, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:16<00:00, 10.49it/s]


epoch: 15 loss: tensor(4811.1875, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:16<00:00, 10.58it/s]


epoch: 16 loss: tensor(4791.8203, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:16<00:00, 10.59it/s]


epoch: 17 loss: tensor(4772.4551, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:16<00:00, 10.61it/s]


epoch: 18 loss: tensor(4761.5708, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:16<00:00, 10.52it/s]


epoch: 19 loss: tensor(4740.6978, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:16<00:00, 10.60it/s]


epoch: 20 loss: tensor(4723.0171, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:16<00:00, 10.53it/s]


epoch: 21 loss: tensor(4707.3638, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:16<00:00, 10.57it/s]


epoch: 22 loss: tensor(4691.0894, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:16<00:00, 10.53it/s]


epoch: 23 loss: tensor(4669.0996, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:16<00:00, 10.58it/s]


epoch: 24 loss: tensor(4654.5996, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:16<00:00, 10.39it/s]


epoch: 25 loss: tensor(4632.5337, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:16<00:00, 10.52it/s]


epoch: 26 loss: tensor(4626.3394, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:16<00:00, 10.58it/s]


epoch: 27 loss: tensor(4615.6431, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.69it/s]


epoch: 28 loss: tensor(4602.2163, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.70it/s]


epoch: 29 loss: tensor(4578.2715, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:16<00:00, 10.56it/s]


epoch: 30 loss: tensor(4567.1416, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:16<00:00, 10.61it/s]


epoch: 31 loss: tensor(4549.1123, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:16<00:00, 10.39it/s]


epoch: 32 loss: tensor(4535.4277, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.65it/s]


epoch: 33 loss: tensor(4522.2109, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:16<00:00, 10.50it/s]


epoch: 34 loss: tensor(4509.8789, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:16<00:00, 10.54it/s]


epoch: 35 loss: tensor(4496.3628, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:16<00:00, 10.47it/s]


epoch: 36 loss: tensor(4480.2578, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:16<00:00, 10.27it/s]


epoch: 37 loss: tensor(4465.7432, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:16<00:00, 10.27it/s]


epoch: 38 loss: tensor(4456.6841, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:16<00:00, 10.51it/s]


epoch: 39 loss: tensor(4439.9180, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:16<00:00, 10.36it/s]


epoch: 40 loss: tensor(4423.1167, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:17<00:00,  9.56it/s]


epoch: 41 loss: tensor(4418.0474, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:16<00:00, 10.32it/s]


epoch: 42 loss: tensor(4403.8345, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.64it/s]


epoch: 43 loss: tensor(4387.2822, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:16<00:00, 10.62it/s]


epoch: 44 loss: tensor(4384.0186, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:15<00:00, 10.67it/s]


epoch: 45 loss: tensor(4366.2412, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:16<00:00, 10.53it/s]


epoch: 46 loss: tensor(4356.9517, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:16<00:00, 10.61it/s]


epoch: 47 loss: tensor(4346.6733, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:16<00:00, 10.55it/s]


epoch: 48 loss: tensor(4330.6133, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:16<00:00, 10.56it/s]


epoch: 49 loss: tensor(4327.4160, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:16<00:00, 10.55it/s]

epoch: 50 loss: tensor(4310.2173, grad_fn=<DivBackward0>)





In [250]:
# Pairwise Euclidean distances between the embeddings of all vocabulary words;
# row/column index is the word id from w2v.data.word2id_dict.
distance_matrix = w2v.get_distance_matrix()

In [261]:
# For every query word, list the words at positions 1..9 of its distance ranking.
# Position 0 is skipped; presumably it is the query word itself (distance 0) - verify
# if several words can share an identical embedding.
similar_words = dict(
    (
        term,
        [
            w2v.data.id2word_dict[word_id]
            for word_id in distance_matrix[w2v.data.word2id_dict[term]].argsort()[1:10]
        ],
    )
    for term in ['tablespoon','election','sauce', 'democratic','player','game','children','mettwurst']
)
similar_words

{'children': ['restudy',
  'nursery',
  'chassis',
  'equines',
  'scripts',
  'remedial',
  'kroger',
  'mingle',
  'uttermost'],
 'democratic': ['republican',
  'voted',
  'swell',
  'georgia',
  'bronx',
  'unworkable',
  'alamein',
  'defendants',
  'morocco'],
 'election': ['forwarded',
  'curiously',
  'boissoneault',
  'oregon',
  'lowliest',
  'independents',
  'conventions',
  'unanimous',
  'byrds'],
 'game': ['scored',
  'gaines',
  'popped',
  'baylors',
  'sometimesnecessary',
  'phonies',
  'sprained',
  'triplecrown',
  'upstanding'],
 'mettwurst': ['bratwurst',
  'photofloodlights',
  'lobster',
  'europeans',
  'jerusalem',
  'teaspoonful',
  'loneliness',
  'solitude',
  'toasted'],
 'player': ['burnings',
  'nagle',
  'impudent',
  'overloud',
  'outplayed',
  'winking',
  'incomparably',
  'dyer',
  'sermons'],
 'sauce': ['pineapple',
  'bread',
  'frankfurter',
  'minced',
  'tiers',
  'walnuts',
  'chicken',
  'franks',
  'needle'],
 'tablespoon': ['horseradish',
