In [65]:
import numpy as np
from collections import deque
import nltk
import re
from nltk.corpus import brown
from nltk.corpus import gutenberg
nltk.download('gutenberg')
nltk.download('stopwords')
nltk.download('brown')
nltk.download('punkt')


class InputData:
    def __init__(self, sentences):
        self.sentences = sentences
        self.normalize()
        self.counter = 0
        self.wordId_frequency_dict = dict()
        self.word_count = 0  #  Number of words (repeated words only count as 1)
        self.word_count_sum = 0  # Total number of words (the number of repeated words also accumulates)
        self.sentence_count = 0  # Number of sentences
        self.id2word_dict = dict()
        self.word2id_dict = dict()
        self._init_dict()  # Initialize the dictionary
        self.sample_table = []
        self._init_sample_table()
        self.word_pairs_queue = deque()

        print('Word Count is:', self.word_count)
        print('Word Count Sum is', self.word_count_sum)
        print('Sentence Count is:', self.sentence_count)

    def normalize(self):
      stop_words = nltk.corpus.stopwords.words('english')
      norm_sentences_word_list = []
      for word_list in self.sentences:
        sentence = " ".join(word for word in word_list)
        sentence = re.sub(r'[^a-zA-Z\s]', '', sentence)
        sentence = sentence.lower()
        sentence = re.sub(' +', ' ', sentence)
        sentence = sentence.strip()
        norm_word_list = sentence.split(' ')
        norm_word_list = [word for word in norm_word_list if word not in stop_words]
        if(len(norm_word_list) > 1):
          norm_sentences_word_list.append(norm_word_list)
       
      self.sentences = norm_sentences_word_list

    def _init_dict(self):
        word_freq = dict()
        for word_list in self.sentences:
            self.word_count_sum += len(word_list)
            self.sentence_count += 1
            for word in word_list:
                try:
                    word_freq[word] += 1
                except:
                    word_freq[word] = 1
        word_id = 0
        # Initialize word2id_dict, id2word_dict, wordId_frequency_dict dictionary
        for per_word, per_count in word_freq.items():
            self.id2word_dict[word_id] = per_word
            self.word2id_dict[per_word] = word_id
            self.wordId_frequency_dict[word_id] = per_count
            word_id += 1
        self.word_count = len(self.word2id_dict)

    def _init_sample_table(self):
        sample_table_size = 1e8
        frequency = np.array(list(self.wordId_frequency_dict.values())) ** 0.75
        frequency_sum = sum(frequency)
        ratio_array = frequency / frequency_sum 
        word_count_list = np.round(ratio_array * sample_table_size)
        for word_index, word_freq in enumerate(word_count_list):
            self.sample_table += [word_index] * int(word_freq)  # it generates a list, the content is the id of each word, each id in the list is repeated multiple times, the number of repetitions is the word frequency
        self.sample_table = np.array(self.sample_table)
        print(self.sample_table.shape)

    def generate_positive_pairs(self, window_size, neg_count):
        self.counter += 1
        if not self.sentences[20*(self.counter-1):20*self.counter]:
            self.counter = 1
            self.word_pairs_queue.clear()
        sub_wids = [[self.word2id_dict[word] for word in word_list] for word_list in self.sentences[20*(self.counter-1):20*self.counter]]


        for words in sub_wids:
          sentence_length = len(words)
          for index, center_word in enumerate(words):
            start = index - window_size
            end = index + window_size + 1

            context_words = []
            for index_2 in range(start,end):
              if 0 <= index_2 < sentence_length and index_2 != index:
                context_words.append(words[index_2])
              elif index_2 < 0 or index_2 >= sentence_length:
                context_words.append(self.word_count)
            
            negative_words = np.random.choice(self.sample_table, size=neg_count).tolist()

            self.word_pairs_queue.append((context_words, center_word, negative_words))
           


    def get_batch_pairs(self, batch_size, window_size, neg_count):

        while len(self.word_pairs_queue) < batch_size:
          self.generate_positive_pairs(window_size, neg_count)              
              
        result_pairs = []
        for _ in range(batch_size):
            result_pairs.append(self.word_pairs_queue.popleft())
        return result_pairs


    def evaluate_pairs_count(self):
        return self.word_count_sum


def test():
    sentences = brown.sents(categories=['news'])[:2]
    test_data = InputData(sentences)
    print(" ".join(word for word in sentences[0]))
    print(" ".join(word for word in sentences[1]))
    pos_pairs = test_data.get_batch_pairs(10, 2, 8)
    print('positive:')
    print(pos_pairs)
    pos_word_pairs = []
    for pair in pos_pairs:
        pos_word_pairs.append(([test_data.id2word_dict[i] for i in pair[0] if i != test_data.word_count], test_data.id2word_dict[pair[1]], [test_data.id2word_dict[i] for i in pair[2]]))
    print(pos_word_pairs)
    print(len(pos_pairs))

    pos_pairs = test_data.get_batch_pairs(10, 2, 8)
    print('positive:')
    print(pos_pairs)
    pos_word_pairs = []
    for pair in pos_pairs:
        pos_word_pairs.append(([test_data.id2word_dict[i] for i in pair[0] if i != test_data.word_count], test_data.id2word_dict[pair[1]], [test_data.id2word_dict[i] for i in pair[2]]))
    print(pos_word_pairs)
    print(len(pos_pairs))


if __name__ == '__main__':
    test()

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
(100000004,)
Word Count is: 29
Word Count Sum is 34
Sentence Count is: 2
The Fulton County Grand Jury said Friday an investigation of Atlanta's recent primary election produced `` no evidence '' that any irregularities took place .
The jury further said in term-end presentments that the City Executive Committee , which had over-all charge of the election , `` deserves the praise and thanks of the City of Atlanta '' for the manner in which the election was conducted .
positive:
[([29, 29, 1, 2], 0, [0, 16, 23, 23, 17, 3, 1, 16]), ([29,

In [66]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

class CBOWModel(nn.Module):
    def __init__(self, emb_size, emb_dimension):
        super(CBOWModel, self).__init__()
        self.emb_size = emb_size
        self.emb_dimension = emb_dimension
        self.u_embeddings = nn.Embedding(self.emb_size + 1, self.emb_dimension, sparse=True)
        self.w_embeddings = nn.Embedding(self.emb_size, self.emb_dimension, sparse=True)
        self._init_embedding()

    def _init_embedding(self):
        int_range = 0.5 / self.emb_dimension
        self.u_embeddings.weight.data.uniform_(-int_range, int_range)
        self.w_embeddings.weight.data.uniform_(-0, 0) 

    def compute_context_matrix(self, u):
        pos_u_emb = self.u_embeddings(torch.LongTensor(u))
        pos_u_emb = torch.mean(pos_u_emb, 1, True)
        pos_u_emb = pos_u_emb.squeeze()

        return pos_u_emb

    def forward(self, pos_u, pos_w, neg_w):
        pos_u_emb = self.compute_context_matrix(pos_u)
        pos_w_emb = self.w_embeddings(torch.LongTensor(pos_w))
        neg_w_emb = self.w_embeddings(torch.LongTensor(neg_w))

        score = torch.mul(pos_u_emb, pos_w_emb)
        score = torch.sum(score, dim=1).squeeze()
        score = F.logsigmoid(score)

        neg_score = torch.mul(neg_w_emb, pos_u_emb.unsqueeze(1))
        neg_score = torch.sum(neg_score, dim=2).squeeze()
        neg_score = F.logsigmoid(-1 * neg_score)
        neg_score = torch.sum(neg_score, dim=1).squeeze()

        final_score = score + neg_score
        loss = -1 * torch.sum(final_score)
        return loss

    def distance_matrix(self, word_count):
        embedding = self.u_embeddings.weight.data.numpy()[:word_count]
        distance_matrix = euclidean_distances(embedding)
        return distance_matrix


def test():
    model = CBOWModel(100, 2)

    pos_u = [[9, 1],[0, 1]]
    pos_w = [2, 4]
    neg_w = [[9, 1, 7, 3],[1, 7, 6, 8]]
    model.forward(pos_u, pos_w, neg_w)
    distance_matrix = model.distance_matrix(5)
    print(distance_matrix.shape)


if __name__ == '__main__':
    test()

(5, 5)


In [67]:
import torch.optim as optim
from tqdm import tqdm
from torch.optim.lr_scheduler import LambdaLR

# hyper parameters
WINDOW_SIZE = 2 
BATCH_SIZE = 1000  # mini-batch
EMB_DIMENSION = 100  # embedding dimension
LR = 0.05 # Learning rate
NEG_COUNT = 8


class Word2Vec:
    def __init__(self, sentences):
        self.data = InputData(sentences)
        self.model = CBOWModel(self.data.word_count, EMB_DIMENSION)
        self.lr = LR
        self.optimizer = optim.SGD(self.model.parameters(), lr=self.lr)
        lambda1 = lambda epoch: 0.99 ** epoch
        self.scheduler = LambdaLR(self.optimizer, lr_lambda=lambda1)

    def train(self):
        print("SkipGram Training......")
        pairs_count = self.data.evaluate_pairs_count()
        print("pairs_count", pairs_count)
        batch_count = pairs_count / BATCH_SIZE
        print("batch_count", batch_count)
        for epoch in range(1,51):
            mean_loss = 0
            process_bar = tqdm(range(int(batch_count)))
            for i in process_bar:
                pairs = self.data.get_batch_pairs(BATCH_SIZE, WINDOW_SIZE, NEG_COUNT)
                pos_u = [pair[0] for pair in pairs]
                pos_w = [int(pair[1]) for pair in pairs]
                neg_w = [pair[2] for pair in pairs]

                self.optimizer.zero_grad()
                loss = self.model.forward(pos_u, pos_w, neg_w)
                loss.backward()
                self.optimizer.step()
                mean_loss += loss

            print("epoch:",epoch,"loss:",mean_loss/int(batch_count))
            self.scheduler.step()


    def get_distance_matrix(self):
        distance_matrix = self.model.distance_matrix(self.data.word_count)
        return distance_matrix


In [68]:
sentences = brown.sents(categories=['news','reviews','government','hobbies','romance'])
w2v = Word2Vec(sentences)

(99999555,)
Word Count is: 24616
Word Count Sum is 170964
Sentence Count is: 17106


In [69]:
w2v.train()

SkipGram Training......
pairs_count 170964
batch_count 170.964


100%|██████████| 170/170 [00:11<00:00, 14.98it/s]


epoch: 1 loss: tensor(4943.3320, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:11<00:00, 14.95it/s]


epoch: 2 loss: tensor(3797.7327, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:11<00:00, 14.82it/s]


epoch: 3 loss: tensor(3656.9871, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:11<00:00, 14.58it/s]


epoch: 4 loss: tensor(3663.9128, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:11<00:00, 14.86it/s]


epoch: 5 loss: tensor(3495.8367, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:11<00:00, 14.77it/s]


epoch: 6 loss: tensor(3482.2776, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:12<00:00, 14.00it/s]


epoch: 7 loss: tensor(3429.0452, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:12<00:00, 14.10it/s]


epoch: 8 loss: tensor(3260.6387, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:11<00:00, 14.64it/s]


epoch: 9 loss: tensor(3324.9661, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:11<00:00, 14.70it/s]


epoch: 10 loss: tensor(3288.6743, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:11<00:00, 14.38it/s]


epoch: 11 loss: tensor(3176.7129, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:11<00:00, 14.72it/s]


epoch: 12 loss: tensor(3108.5989, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:11<00:00, 14.24it/s]


epoch: 13 loss: tensor(3109.5427, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:11<00:00, 14.19it/s]


epoch: 14 loss: tensor(2912.3059, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:11<00:00, 14.49it/s]


epoch: 15 loss: tensor(2889.6929, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:12<00:00, 14.12it/s]


epoch: 16 loss: tensor(2801.9639, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:11<00:00, 14.47it/s]


epoch: 17 loss: tensor(2822.6619, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:11<00:00, 14.89it/s]


epoch: 18 loss: tensor(2767.0518, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:11<00:00, 14.65it/s]


epoch: 19 loss: tensor(2629.7400, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:11<00:00, 14.30it/s]


epoch: 20 loss: tensor(2562.6938, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:11<00:00, 15.04it/s]


epoch: 21 loss: tensor(2504.3025, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:12<00:00, 14.13it/s]


epoch: 22 loss: tensor(2377.1951, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:12<00:00, 14.15it/s]


epoch: 23 loss: tensor(2343.3774, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:11<00:00, 14.40it/s]


epoch: 24 loss: tensor(2283.5918, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:11<00:00, 15.05it/s]


epoch: 25 loss: tensor(2167.3240, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:11<00:00, 14.73it/s]


epoch: 26 loss: tensor(2073.1387, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:11<00:00, 14.65it/s]


epoch: 27 loss: tensor(2140.4102, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:11<00:00, 14.60it/s]


epoch: 28 loss: tensor(1941.3420, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:11<00:00, 14.23it/s]


epoch: 29 loss: tensor(1889.7446, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:11<00:00, 14.31it/s]


epoch: 30 loss: tensor(1815.5033, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:12<00:00, 14.15it/s]


epoch: 31 loss: tensor(1756.8368, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:11<00:00, 14.38it/s]


epoch: 32 loss: tensor(1733.6410, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:11<00:00, 14.47it/s]


epoch: 33 loss: tensor(1616.5236, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:11<00:00, 14.93it/s]


epoch: 34 loss: tensor(1558.0110, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:11<00:00, 14.35it/s]


epoch: 35 loss: tensor(1575.6053, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:11<00:00, 14.36it/s]


epoch: 36 loss: tensor(1460.5182, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:11<00:00, 14.34it/s]


epoch: 37 loss: tensor(1400.6909, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:11<00:00, 14.67it/s]


epoch: 38 loss: tensor(1381.3640, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:12<00:00, 14.11it/s]


epoch: 39 loss: tensor(1319.0615, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:11<00:00, 14.45it/s]


epoch: 40 loss: tensor(1250.5778, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:11<00:00, 14.43it/s]


epoch: 41 loss: tensor(1250.2631, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:12<00:00, 14.03it/s]


epoch: 42 loss: tensor(1185.9552, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:11<00:00, 14.41it/s]


epoch: 43 loss: tensor(1097.8555, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:11<00:00, 14.48it/s]


epoch: 44 loss: tensor(1092.4512, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:11<00:00, 14.57it/s]


epoch: 45 loss: tensor(1023.6960, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:11<00:00, 14.78it/s]


epoch: 46 loss: tensor(1047.8884, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:11<00:00, 14.57it/s]


epoch: 47 loss: tensor(979.2759, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:11<00:00, 14.79it/s]


epoch: 48 loss: tensor(993.0318, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:11<00:00, 14.22it/s]


epoch: 49 loss: tensor(931.0263, grad_fn=<DivBackward0>)


100%|██████████| 170/170 [00:11<00:00, 14.51it/s]


epoch: 50 loss: tensor(886.9894, grad_fn=<DivBackward0>)


In [70]:
distance_matrix = w2v.get_distance_matrix()

In [86]:
similar_words = {search_term: [w2v.data.id2word_dict[idx] for idx in distance_matrix[w2v.data.word2id_dict[search_term]].argsort()[1:10]] 
                   for search_term in ['tablespoon','sauce', 'democratic','football','bockwurst','university','smoke']}
similar_words

{'bockwurst': ['knackwurst',
  'bacillus',
  'finn',
  'neuralgia',
  'neuritis',
  'convair',
  'bologna',
  'zhitkov',
  'bespeak'],
 'democratic': ['gubernatorial',
  'republican',
  'candidate',
  'governor',
  'nomination',
  'behalf',
  'delta',
  'prince',
  'elected'],
 'football': ['rookie',
  'ny',
  'cincinnati',
  'basketball',
  'champions',
  'viet',
  'airlines',
  'nam',
  'connecticut'],
 'sauce': ['tablespoon',
  'tablespoons',
  'minced',
  'mustard',
  'pineapple',
  'teaspoon',
  'tile',
  'chili',
  'bowl'],
 'smoke': ['smoked',
  'mix',
  'smoothness',
  'boiling',
  'hurtling',
  'breeze',
  'smell',
  'pencil',
  'flicked'],
 'tablespoon': ['chili',
  'teaspoon',
  'worcestershire',
  'tablespoons',
  'tomato',
  'teaspoons',
  'honey',
  'toner',
  'minced'],
 'university': ['published',
  'graduate',
  'institute',
  'physics',
  'candidate',
  'awarded',
  'emory',
  'texas',
  'harris']}