In [1]:
class HuffmanNode:
    def __init__(self, word_id, frequency):
        self.word_id = word_id
        self.frequency = frequency
        self.left_child = None
        self.right_child = None
        self.father = None
        self.Huffman_code = []
        self.path = []


class HuffmanTree:
    def __init__(self, wordid_frequency_dict):
        self.word_count = len(wordid_frequency_dict)
        self.wordid_code = dict()
        self.wordid_path = dict()
        self.root = None
        unmerge_node_list = [HuffmanNode(wordid, frequency) for wordid, frequency in wordid_frequency_dict.items()] # Unmerged node list
        self.huffman = [HuffmanNode(wordid, frequency) for wordid, frequency in wordid_frequency_dict.items()] # Store all leaf nodes and intermediate nodes
        # Build huffman tree
        self.build_tree(unmerge_node_list)
        # Generate huffman code
        self.generate_huffman_code_and_path()

    def merge_node(self, node1, node2):
        sum_frequency = node1.frequency + node2.frequency
        mid_node_id = len(self.huffman)
        father_node = HuffmanNode(mid_node_id, sum_frequency)
        if node1.frequency >= node2.frequency:
            father_node.left_child = node1
            father_node.right_child = node2
        else:
            father_node.left_child = node2
            father_node.right_child = node1
        self.huffman.append(father_node)
        return father_node

    def build_tree(self, node_list):
        while len(node_list) > 1:
            i1 = 0  # Node with least frequency
            i2 = 1  # Node with the second smallest frequency
            if node_list[i2].frequency < node_list[i1].frequency:
                [i1, i2] = [i2, i1]
            for i in range(2, len(node_list)):
                if node_list[i].frequency < node_list[i2].frequency:
                    i2 = i
                    if node_list[i2].frequency < node_list[i1].frequency:
                        [i1, i2] = [i2, i1]
            father_node = self.merge_node(node_list[i1], node_list[i2])  # Combine the least frequent two nodes
            if i1 < i2:
                node_list.pop(i2)
                node_list.pop(i1)
            elif i1 > i2:
                node_list.pop(i1)
                node_list.pop(i2)
            else:
                raise RuntimeError('i1 should not be equal to i2')
            node_list.insert(0, father_node)  # Insert new node
        self.root = node_list[0]

    def generate_huffman_code_and_path(self):
        stack = [self.root]
        while len(stack) > 0:
            node = stack.pop()
            while node.left_child or node.right_child:
                code = node.Huffman_code
                path = node.path
                node.left_child.Huffman_code = code + [1]
                node.right_child.Huffman_code = code + [0]
                node.left_child.path = path + [node.word_id]
                node.right_child.path = path + [node.word_id]
                stack.append(node.right_child)
                node = node.left_child
            word_id = node.word_id
            word_code = node.Huffman_code
            word_path = node.path
            self.huffman[word_id].Huffman_code = word_code
            self.huffman[word_id].path = word_path
            # Write the Huffman code and path calculated by the node into the value of the dictionary
            self.wordid_code[word_id] = word_code
            self.wordid_path[word_id] = word_path

    # Get the id of all positive nodes and the id of all negative nodes
    def get_all_pos_and_neg_path(self):
        positive = []  # Array of positive paths of all words
        negative = []  # Array of negative paths for all words
        for word_id in range(self.word_count):
            pos_id = []
            neg_id = []
            for i, code in enumerate(self.huffman[word_id].Huffman_code):
                if code == 1:
                    pos_id.append(self.huffman[word_id].path[i])
                else:
                    neg_id.append(self.huffman[word_id].path[i])
            positive.append(pos_id)
            negative.append(neg_id)
        return positive, negative


def test():
    word_frequency = {0: 4, 1: 6, 2: 3, 3: 2, 4: 2}
    print(word_frequency)
    tree = HuffmanTree(word_frequency)
    print(tree.wordid_code)
    print(tree.wordid_path)
    for i in range(len(word_frequency)):
        print(tree.huffman[i].path)
    print(tree.get_all_pos_and_neg_path())


if __name__ == '__main__':
    test()

{0: 4, 1: 6, 2: 3, 3: 2, 4: 2}
{1: [1, 1], 0: [1, 0], 3: [0, 1, 1], 4: [0, 1, 0], 2: [0, 0]}
{1: [8, 7], 0: [8, 7], 3: [8, 6, 5], 4: [8, 6, 5], 2: [8, 6]}
[8, 7]
[8, 7]
[8, 6]
[8, 6, 5]
[8, 6, 5]
([[8], [8, 7], [], [6, 5], [6]], [[7], [], [8, 6], [8], [8, 5]])


In [4]:
import numpy as np
from collections import deque
import nltk
import re
import random
from nltk.corpus import brown
from nltk.corpus import gutenberg
nltk.download('gutenberg')
nltk.download('stopwords')
nltk.download('brown')
nltk.download('punkt')


class InputData:
    def __init__(self, sentences, sample):
        self.norm_sentences = []
        self.sample = sample
        self.counter = 0
        self.wordId_frequency_dict = dict()
        self.word_count = 0  # Number of words (repeated words only count as 1)
        self.word_count_sum = 0  # Total number of words (the number of repeated words also accumulates)
        self.sentence_count = 0  # Number of sentences
        self.id2word_dict = dict()
        self.word2id_dict = dict()
        self._init_dict(sentences)  # Initialize the dictionary
        self.subsampling()
        self.huffman_tree = HuffmanTree(self.wordId_frequency_dict)  # Hoffman Tree
        self.huffman_pos_path, self.huffman_neg_path = self.huffman_tree.get_all_pos_and_neg_path()
        self.word_pairs_queue = deque()

        print('Word Count is:', self.word_count)
        print('Word Count Sum is', self.word_count_sum)
        print('Sentence Count is:', self.sentence_count)
        print('Tree Node is:', len(self.huffman_tree.huffman))


    def subsampling(self):
        
        if self.sample > 0:
            self.word_count_sum = 0
            self.sentence_count = 0

            frequency = np.array(list(self.wordId_frequency_dict.values()))
            z = frequency / np.sum(frequency)
            p = (np.sqrt(z / self.sample) + 1) * (self.sample / z)

            new_norm_sentences = []
            for word_list in self.norm_sentences:
              word_list = [word for word in word_list if p[self.word2id_dict[word]] > random.random()]
              if len(word_list) >= 2:
                self.sentence_count += 1
                self.word_count_sum += len(word_list)
                new_norm_sentences.append(word_list)

            self.norm_sentences = new_norm_sentences

    def normalize(self, word_list):
      sentence = " ".join(word for word in word_list)
      sentence = re.sub(r'[^a-zA-Z\s]', '', sentence)
      sentence = sentence.lower()
      sentence = re.sub(' +', ' ', sentence)
      sentence = sentence.strip()
      norm_word_list = sentence.split(' ')
      if self.sample <= 0:
          stop_words = nltk.corpus.stopwords.words('english')
          norm_word_list_with_out_stop_words = [word for word in norm_word_list if word not in stop_words]
          norm_word_list = norm_word_list_with_out_stop_words

      return norm_word_list

    def _init_dict(self,sentences):
        word_freq = dict()
        for word_list in sentences:
            word_list = self.normalize(word_list)
            if(len(word_list) < 2):
                continue
            self.word_count_sum += len(word_list)
            self.sentence_count += 1
            for word in word_list:
                try:
                    word_freq[word] += 1
                except:
                    word_freq[word] = 1
            self.norm_sentences.append(word_list)
        word_id = 0
        # Initialize word2id_dict, id2word_dict, wordId_frequency_dict dictionary
        for per_word, per_count in word_freq.items():
            self.id2word_dict[word_id] = per_word
            self.word2id_dict[per_word] = word_id
            self.wordId_frequency_dict[word_id] = per_count
            word_id += 1
        self.word_count = len(self.word2id_dict)

    def generate_context_word_pairs(self, window_size):
      self.counter += 1
      if not self.norm_sentences[20*(self.counter-1):20*self.counter]:
        self.counter = 1
        self.word_pairs_queue.clear()
      sub_wids = [[self.word2id_dict[word] for word in word_list] for word_list in self.norm_sentences[20*(self.counter-1):20*self.counter]]
      context_length = window_size*2
      for words in sub_wids:
          sentence_length = len(words)
          for index, word in enumerate(words):         
              start = index - window_size
              end = index + window_size + 1

              context_words = []
              for i in range(start, end):
                  if 0 <= i < sentence_length and i != index:
                    context_words.append(words[i])
                  elif i < 0 or i >= sentence_length:
                    context_words.append(self.word_count)

              self.word_pairs_queue.append((context_words,word))

    # Get the positive sample pair (Xw,w) of the mini-batch size. Xw is the context id array, and w is the center word id. The context step size is window_size, ie 2c = 2*window_size
    def get_batch_pairs(self, batch_size, window_size):

        while len(self.word_pairs_queue) < batch_size:
          self.generate_context_word_pairs(window_size)

        result_pairs = []  # Returns the positive sample pair of mini-batch size
        for _ in range(batch_size):
            result_pairs.append(self.word_pairs_queue.popleft())
        return result_pairs

    def get_pairs(self, pos_pairs):
        neg_word_pair = []
        pos_word_pair = []
        for pair in pos_pairs:
            pos_word_pair += zip([pair[0]] * len(self.huffman_pos_path[pair[1]]), self.huffman_pos_path[pair[1]])
            neg_word_pair += zip([pair[0]] * len(self.huffman_neg_path[pair[1]]), self.huffman_neg_path[pair[1]])
        return pos_word_pair, neg_word_pair


    def evaluate_pairs_count(self):
        return self.word_count_sum


# Test all methods
def test():
    sentences = brown.sents(categories=['news'])
    test_data = InputData(sentences,0.0002)
    print(" ".join(word for word in sentences[0]))
    print(" ".join(word for word in test_data.norm_sentences[0]))
    print(" ".join(word for word in sentences[1]))
    print(" ".join(word for word in test_data.norm_sentences[1]))
    pos_pairs = test_data.get_batch_pairs(10, 2)
    print(sentences[0])
    print(sentences[1])
    print(pos_pairs)
    pos_word_pairs = []
    for pair in pos_pairs:
        pos_word_pairs.append(([test_data.id2word_dict[i] for i in pair[0] if i != test_data.word_count], test_data.id2word_dict[pair[1]]))
    print(pos_word_pairs)
    print('')
    print(test_data.huffman_pos_path[0])
    print(test_data.huffman_neg_path[0])
    pos, neg = test_data.get_pairs(pos_pairs)
    print(pos)
    print(neg)

    pos_word_pairs = []
    pos_pairs = test_data.get_batch_pairs(10, 2)
    for pair in pos_pairs:
        pos_word_pairs.append(([test_data.id2word_dict[i] for i in pair[0] if i != test_data.word_count], test_data.id2word_dict[pair[1]]))
    print(pos_pairs)
    print(pos_word_pairs)

    print(test_data.id2word_dict[4846])


if __name__ == '__main__':
    test()

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Word Count is: 12125
Word Count Sum is 55642
Sentence Count is: 4538
Tree Node is: 24249
The Fulton County Grand Jury said Friday an investigation of Atlanta's recent primary election produced `` no evidence '' that any irregularities took place .
the fulton grand jury friday an investigation atlantas recent primary election produced evidence irregularities took place
The jury further said in term-end presentments that the City Executive Committee , which had over-all charge of the election , `` deserves the praise and thanks of the C

In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances


class CBOWModel(nn.Module):
    def __init__(self, emb_size, emb_dimension):
        super(CBOWModel, self).__init__()
        self.emb_size = emb_size
        self.emb_dimension = emb_dimension
        self.u_embeddings = nn.Embedding(2*self.emb_size-1, self.emb_dimension, sparse=True)  # Define the embedded dictionary for the input word
        self.w_embeddings = nn.Embedding(2*self.emb_size-1, self.emb_dimension, sparse=True)  # Define the embedded dictionary for the output word
        self._init_embedding()  # initialization

    def _init_embedding(self):
        int_range = 0.5 / self.emb_dimension
        self.u_embeddings.weight.data.uniform_(-int_range, int_range) #work
        self.w_embeddings.weight.data.uniform_(-0, 0) #work

    def compute_context_matrix(self, u):
        pos_u_emb = self.u_embeddings(torch.LongTensor(u))
        pos_u_emb = torch.mean(pos_u_emb, 1, True)
        pos_u_emb = pos_u_emb.squeeze()

        return pos_u_emb

    def forward(self, pos_u, pos_w, neg_u, neg_w):
        pos_u_emb = self.compute_context_matrix(pos_u)
        pos_w_emb = self.w_embeddings(torch.LongTensor(pos_w))
        neg_u_emb = self.compute_context_matrix(neg_u)
        neg_w_emb = self.w_embeddings(torch.LongTensor(neg_w))

        # Gradient descent（ result *（-1） Can become a loss function -> Gradient descent using torch）
        score_1 = torch.mul(pos_u_emb, pos_w_emb).squeeze()  # Xw.T * θu
        score_2 = torch.sum(score_1, dim=1)
        score_3 = F.logsigmoid(score_2)
        neg_score_1 = torch.mul(neg_u_emb, neg_w_emb).squeeze()  # Xw.T * θw
        neg_score_2 = torch.sum(neg_score_1, dim=1)
        neg_score_3 = F.logsigmoid(-1 * neg_score_2)
        # L = log sigmoid (Xw.T * θw) + logsigmoid (-Xw.T * θw)
        loss = torch.sum(score_3) + torch.sum(neg_score_3)
        return -1 * loss

    def distance_matrix(self, word_count):
        embedding = self.u_embeddings.weight.data.numpy()[:word_count]
        distance_matrix = euclidean_distances(embedding)
        return distance_matrix


def test():
    model = CBOWModel(100, 10)

    pos_u = [[9, 1, 2, 3],[0, 1, 2, 3]]
    pos_w = [50, 70]
    neg_u = [[9, 1, 2, 3],[0, 1, 2, 3]]
    neg_w = [30, 42]
    model.forward(pos_u, pos_w, neg_u, neg_w)
    distance_matrix = model.distance_matrix(5)
    print(distance_matrix.shape)


if __name__ == '__main__':
    test()

(5, 5)


In [7]:
import torch.optim as optim
from tqdm import tqdm
from torch.optim.lr_scheduler import LambdaLR

# hyper parameters
WINDOW_SIZE = 2 
BATCH_SIZE = 1000  # mini-batch
EMB_DIMENSION = 100  # embedding dimension
LR = 0.01  # Learning rate


class Word2Vec:
    def __init__(self,sentences,sample):
        self.data = InputData(sentences,sample)
        self.model = CBOWModel(self.data.word_count, EMB_DIMENSION)
        self.lr = LR
        self.optimizer = optim.SGD(self.model.parameters(), lr=self.lr)
        lambda1 = lambda epoch: 0.99 ** epoch
        self.scheduler = LambdaLR(self.optimizer, lr_lambda=lambda1)

    def train(self):
        print("CBOW Training......")
        pairs_count = self.data.evaluate_pairs_count()
        print("pairs_count", pairs_count)
        batch_count = pairs_count / BATCH_SIZE
        print("batch_count", batch_count)
        for epoch in range(1,21):
            mean_loss = 0
            process_bar = tqdm(range(int(batch_count)))
            for i in process_bar:
                pos_pairs = self.data.get_batch_pairs(BATCH_SIZE, WINDOW_SIZE)
                pos_pairs, neg_pairs = self.data.get_pairs(pos_pairs)

                pos_u = [pair[0] for pair in pos_pairs]
                pos_v = [int(pair[1]) for pair in pos_pairs]
                neg_u = [pair[0] for pair in neg_pairs]
                neg_v = [int(pair[1]) for pair in neg_pairs]

                self.optimizer.zero_grad()
                loss = self.model.forward(pos_u,pos_v,neg_u,neg_v)
                loss.backward()
                self.optimizer.step()
                mean_loss += loss

            print("epoch:",epoch,"loss:",mean_loss/int(batch_count))
            self.scheduler.step()

    def get_distance_matrix(self):
        distance_matrix = self.model.distance_matrix(self.data.word_count)
        return distance_matrix

#def run():
    #sentences = brown.sents(categories=['news','reviews','humor','hobbies'])
    #w2v = Word2Vec(sentences)
    #w2v.train()
    
    #result = w2v.similar_words(['government','church','children','car','tax','food','election'])

    #print(result)            


#if __name__ == '__main__':
#    run()

In [8]:
sentences = brown.sents(categories=['news','reviews','government','hobbies','romance'])
SAMPLE = 0.0002 # use subsampling
w2v = Word2Vec(sentences,SAMPLE)

Word Count is: 24758
Word Count Sum is 196583
Sentence Count is: 17270
Tree Node is: 49515


In [37]:
w2v.train()

CBOW Training......
pairs_count 196583
batch_count 196.583


100%|██████████| 196/196 [00:24<00:00,  8.12it/s]


epoch: 1 loss: tensor(4780.1309, grad_fn=<DivBackward0>)


100%|██████████| 196/196 [00:23<00:00,  8.26it/s]


epoch: 2 loss: tensor(4764.9468, grad_fn=<DivBackward0>)


100%|██████████| 196/196 [00:23<00:00,  8.25it/s]


epoch: 3 loss: tensor(4747.4893, grad_fn=<DivBackward0>)


100%|██████████| 196/196 [00:23<00:00,  8.22it/s]


epoch: 4 loss: tensor(4746.4795, grad_fn=<DivBackward0>)


100%|██████████| 196/196 [00:23<00:00,  8.25it/s]


epoch: 5 loss: tensor(4726.6245, grad_fn=<DivBackward0>)


100%|██████████| 196/196 [00:23<00:00,  8.24it/s]


epoch: 6 loss: tensor(4706.8091, grad_fn=<DivBackward0>)


100%|██████████| 196/196 [00:23<00:00,  8.21it/s]


epoch: 7 loss: tensor(4696.3066, grad_fn=<DivBackward0>)


100%|██████████| 196/196 [00:23<00:00,  8.39it/s]


epoch: 8 loss: tensor(4692.0308, grad_fn=<DivBackward0>)


100%|██████████| 196/196 [00:23<00:00,  8.23it/s]


epoch: 9 loss: tensor(4673.9199, grad_fn=<DivBackward0>)


100%|██████████| 196/196 [00:23<00:00,  8.28it/s]


epoch: 10 loss: tensor(4662.0986, grad_fn=<DivBackward0>)


100%|██████████| 196/196 [00:25<00:00,  7.75it/s]


epoch: 11 loss: tensor(4644.7568, grad_fn=<DivBackward0>)


100%|██████████| 196/196 [00:23<00:00,  8.19it/s]


epoch: 12 loss: tensor(4636.8027, grad_fn=<DivBackward0>)


100%|██████████| 196/196 [00:23<00:00,  8.19it/s]


epoch: 13 loss: tensor(4622.9019, grad_fn=<DivBackward0>)


100%|██████████| 196/196 [00:23<00:00,  8.28it/s]


epoch: 14 loss: tensor(4607.2778, grad_fn=<DivBackward0>)


100%|██████████| 196/196 [00:23<00:00,  8.21it/s]


epoch: 15 loss: tensor(4596.1562, grad_fn=<DivBackward0>)


100%|██████████| 196/196 [00:23<00:00,  8.25it/s]


epoch: 16 loss: tensor(4584.8252, grad_fn=<DivBackward0>)


100%|██████████| 196/196 [00:23<00:00,  8.37it/s]


epoch: 17 loss: tensor(4574.9326, grad_fn=<DivBackward0>)


100%|██████████| 196/196 [00:23<00:00,  8.20it/s]


epoch: 18 loss: tensor(4560.4326, grad_fn=<DivBackward0>)


100%|██████████| 196/196 [00:23<00:00,  8.24it/s]


epoch: 19 loss: tensor(4550.4282, grad_fn=<DivBackward0>)


100%|██████████| 196/196 [00:23<00:00,  8.25it/s]

epoch: 20 loss: tensor(4540.1343, grad_fn=<DivBackward0>)





In [38]:
distance_matrix = w2v.get_distance_matrix()

In [69]:
similar_words = {search_term: [w2v.data.id2word_dict[idx] for idx in distance_matrix[w2v.data.word2id_dict[search_term]].argsort()[1:10]] 
                   for search_term in ['tax','sauce', 'democratic','mettwurst']}
similar_words

{'democratic': ['republican',
  'welton',
  'sukarno',
  'philharmonique',
  'couve',
  'nato',
  'dictators',
  'macdonald',
  'idol'],
 'mettwurst': ['bratwurst',
  'foam',
  'geddes',
  'macgregors',
  'cigar',
  'loews',
  'markers',
  'awnings',
  'simpleminded'],
 'sauce': ['basics',
  'cervelat',
  'celery',
  'tomato',
  'listenersupported',
  'lids',
  'flaky',
  'crimson',
  'sheathing'],
 'tax': ['expenditures',
  'grants',
  'january',
  'sum',
  'granted',
  'rights',
  'directed',
  'contract',
  'surveyed']}