In [1]:
import numpy as np
from collections import deque
import nltk
import re
import random
from nltk.corpus import brown
from nltk.corpus import gutenberg
nltk.download('gutenberg')
nltk.download('stopwords')
nltk.download('brown')
nltk.download('punkt')


class InputData:
    def __init__(self, sentences, sample):
        self.norm_sentences = []
        self.counter = 0
        self.sample = sample
        self.wordId_frequency_dict = dict()
        self.word_count = 0  #  Number of words
        self.word_count_sum = 0  # Total number of words
        self.sentence_count = 0  # Number of sentences
        self.id2word_dict = dict()
        self.word2id_dict = dict()
        self._init_dict(sentences)  # Initialize the dictionary
        self.subsampling()
        self.sample_table = []
        self._init_sample_table()
        self.word_pairs_queue = deque()

        print('Word Count is:', self.word_count)
        print('Word Count Sum is', self.word_count_sum)
        print('Sentence Count is:', self.sentence_count)


    def subsampling(self):
        
        if self.sample > 0:
            self.word_count_sum = 0
            self.sentence_count = 0

            frequency = np.array(list(self.wordId_frequency_dict.values()))
            z = frequency / np.sum(frequency)
            p = (np.sqrt(z / self.sample) + 1) * (self.sample / z)

            new_norm_sentences = []
            for word_list in self.norm_sentences:
              word_list = [word for word in word_list if p[self.word2id_dict[word]] > random.random()]
              if len(word_list) >= 2:
                self.sentence_count += 1
                self.word_count_sum += len(word_list)
                new_norm_sentences.append(word_list)

            self.norm_sentences = new_norm_sentences

    def normalize(self, word_list):
      sentence = " ".join(word for word in word_list)
      sentence = re.sub(r'[^a-zA-Z\s]', '', sentence)
      sentence = sentence.lower()
      sentence = re.sub(' +', ' ', sentence)
      sentence = sentence.strip()
      norm_word_list = sentence.split(' ')
      if self.sample <= 0:
          stop_words = nltk.corpus.stopwords.words('english')
          norm_word_list_with_out_stop_words = [word for word in norm_word_list if word not in stop_words]
          norm_word_list = norm_word_list_with_out_stop_words

      return norm_word_list

    def _init_dict(self,sentences):
        word_freq = dict()
        for word_list in sentences:
            word_list = self.normalize(word_list)
            if(len(word_list) < 2):
                continue
            self.word_count_sum += len(word_list)
            self.sentence_count += 1
            for word in word_list:
                try:
                    word_freq[word] += 1
                except:
                    word_freq[word] = 1
            self.norm_sentences.append(word_list)
        word_id = 0
        for per_word, per_count in word_freq.items():
            self.id2word_dict[word_id] = per_word
            self.word2id_dict[per_word] = word_id
            self.wordId_frequency_dict[word_id] = per_count
            word_id += 1
        self.word_count = len(self.word2id_dict)

    def _init_sample_table(self):
        sample_table_size = 1e8
        frequency = np.array(list(self.wordId_frequency_dict.values())) ** 0.75
        frequency_sum = sum(frequency)
        ratio_array = frequency / frequency_sum 
        word_count_list = np.round(ratio_array * sample_table_size)
        for word_index, word_freq in enumerate(word_count_list):
            self.sample_table += [word_index] * int(word_freq)  # it generates a list, the content is the id of each word, each id in the list is repeated multiple times, the number of repetitions is the word frequency
        self.sample_table = np.array(self.sample_table)
        print(self.sample_table.shape)

    def generate_positive_pairs(self, window_size, neg_count):
        self.counter += 1
        if not self.norm_sentences[20*(self.counter-1):20*self.counter]:
            self.counter = 1
            self.word_pairs_queue.clear()
        sub_wids = [[self.word2id_dict[word] for word in word_list] for word_list in self.norm_sentences[20*(self.counter-1):20*self.counter]]

        for words in sub_wids:
          sentence_length = len(words)
          for index, center_word in enumerate(words):
            start = index - window_size
            end = index + window_size + 1

            context_words = []
            for index_2 in range(start,end):
              if 0 <= index_2 < sentence_length and index_2 != index:
                context_words.append(words[index_2])
              elif index_2 < 0 or index_2 >= sentence_length:
                context_words.append(self.word_count)
            
            negative_words = np.random.choice(self.sample_table, size=neg_count).tolist()

            self.word_pairs_queue.append((context_words, center_word, negative_words))
           


    def get_batch_pairs(self, batch_size, window_size, neg_count):

        while len(self.word_pairs_queue) < batch_size:
          self.generate_positive_pairs(window_size, neg_count)              
              
        result_pairs = []
        for _ in range(batch_size):
            result_pairs.append(self.word_pairs_queue.popleft())
        return result_pairs


    def evaluate_pairs_count(self):
        return self.word_count_sum


def test():
    sentences = brown.sents(categories=['news'])
    test_data = InputData(sentences,0.0002)
    print(" ".join(word for word in sentences[0]))
    print(" ".join(word for word in test_data.norm_sentences[0]))
    print(" ".join(word for word in sentences[1]))
    print(" ".join(word for word in test_data.norm_sentences[1]))
    pos_pairs = test_data.get_batch_pairs(10, 2, 8)
    print('positive:')
    print(pos_pairs)
    pos_word_pairs = []
    for pair in pos_pairs:
        pos_word_pairs.append(([test_data.id2word_dict[i] for i in pair[0] if i != test_data.word_count], test_data.id2word_dict[pair[1]], [test_data.id2word_dict[i] for i in pair[2]]))
    print(pos_word_pairs)
    print(len(pos_pairs))

    pos_pairs = test_data.get_batch_pairs(10, 2, 8)
    print('positive:')
    print(pos_pairs)
    pos_word_pairs = []
    for pair in pos_pairs:
        pos_word_pairs.append(([test_data.id2word_dict[i] for i in pair[0] if i != test_data.word_count], test_data.id2word_dict[pair[1]], [test_data.id2word_dict[i] for i in pair[2]]))
    print(pos_word_pairs)
    print(len(pos_pairs))


if __name__ == '__main__':
    test()

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Unzipping corpora/gutenberg.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
(100002346,)
Word Count is: 12125
Word Count Sum is 55596
Sentence Count is: 4537
The Fulton County Grand Jury said Friday an investigation of Atlanta's recent primary election produced `` no evidence '' that any irregularities took place .
fulton county grand jury friday investigation of atlantas recent primary election produced no evidence that any irregularities took place
The jury further said in term-end presentments that the City Executive Committee , which had over-all charge of the election , `` deserves the praise and thanks of the City of Atlanta '' for

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

class CBOWModel(nn.Module):
    def __init__(self, emb_size, emb_dimension):
        super(CBOWModel, self).__init__()
        self.emb_size = emb_size
        self.emb_dimension = emb_dimension
        self.u_embeddings = nn.Embedding(self.emb_size + 1, self.emb_dimension, sparse=True)  # Define the embedded dictionary for the input word
        self.w_embeddings = nn.Embedding(self.emb_size, self.emb_dimension, sparse=True)  # Define the embedded dictionary for the output word
        self._init_embedding()  # initialization

    def _init_embedding(self):
        int_range = 0.5 / self.emb_dimension
        self.u_embeddings.weight.data.uniform_(-int_range, int_range)
        self.w_embeddings.weight.data.uniform_(-0, 0) 

    def compute_context_matrix(self, u):
        pos_u_emb = self.u_embeddings(torch.LongTensor(u))
        pos_u_emb = torch.mean(pos_u_emb, 1, True)
        pos_u_emb = pos_u_emb.squeeze()

        return pos_u_emb

    def forward(self, pos_u, pos_w, neg_w):
        pos_u_emb = self.compute_context_matrix(pos_u)
        pos_w_emb = self.w_embeddings(torch.LongTensor(pos_w))
        neg_w_emb = self.w_embeddings(torch.LongTensor(neg_w))

        score = torch.mul(pos_u_emb, pos_w_emb)
        score = torch.sum(score, dim=1).squeeze()
        score = F.logsigmoid(score)

        neg_score = torch.mul(neg_w_emb, pos_u_emb.unsqueeze(1))
        neg_score = torch.sum(neg_score, dim=2).squeeze()
        neg_score = F.logsigmoid(-1 * neg_score)
        neg_score = torch.sum(neg_score, dim=1).squeeze()

        final_score = score + neg_score
        loss = -1 * torch.sum(final_score)
        return loss

    def distance_matrix(self, word_count):
        embedding = self.u_embeddings.weight.data.numpy()[:word_count]
        distance_matrix = euclidean_distances(embedding)
        return distance_matrix


def test():
    model = CBOWModel(100, 2)

    pos_u = [[9, 1],[0, 1]]
    pos_w = [2, 4]
    neg_w = [[9, 1, 7, 3],[1, 7, 6, 8]]
    model.forward(pos_u, pos_w, neg_w)
    distance_matrix = model.distance_matrix(5)
    print(distance_matrix.shape)


if __name__ == '__main__':
    test()

(5, 5)


In [3]:
import torch.optim as optim
from tqdm import tqdm
from torch.optim.lr_scheduler import LambdaLR

# hyper parameters
WINDOW_SIZE = 2 
BATCH_SIZE = 1000  # mini-batch
EMB_DIMENSION = 100  # embedding dimension
LR = 0.05 # Learning rate
NEG_COUNT = 8


class Word2Vec:
    def __init__(self, sentences, sample):
        self.data = InputData(sentences, sample)
        self.model = CBOWModel(self.data.word_count, EMB_DIMENSION)
        self.lr = LR
        self.optimizer = optim.SGD(self.model.parameters(), lr=self.lr)
        lambda1 = lambda epoch: 0.99 ** epoch
        self.scheduler = LambdaLR(self.optimizer, lr_lambda=lambda1)

    def train(self):
        print("SkipGram Training......")
        pairs_count = self.data.evaluate_pairs_count()
        print("pairs_count", pairs_count)
        batch_count = pairs_count / BATCH_SIZE
        print("batch_count", batch_count)
        for epoch in range(1,51):
            mean_loss = 0
            process_bar = tqdm(range(int(batch_count)))
            for i in process_bar:
                pairs = self.data.get_batch_pairs(BATCH_SIZE, WINDOW_SIZE, NEG_COUNT)
                pos_u = [pair[0] for pair in pairs]
                pos_w = [int(pair[1]) for pair in pairs]
                neg_w = [pair[2] for pair in pairs]

                self.optimizer.zero_grad()
                loss = self.model.forward(pos_u, pos_w, neg_w)
                loss.backward()
                self.optimizer.step()
                mean_loss += loss

            print("epoch:",epoch,"loss:",mean_loss/int(batch_count))
            self.scheduler.step()


    def get_distance_matrix(self):
        distance_matrix = self.model.distance_matrix(self.data.word_count)
        return distance_matrix


In [4]:
sentences = brown.sents(categories=['news','reviews','government','hobbies','romance'])
SAMPLE = 0.0002 # use subsampling
w2v = Word2Vec(sentences, SAMPLE)

(100002426,)
Word Count is: 24758
Word Count Sum is 196848
Sentence Count is: 17277


In [6]:
w2v.train()

SkipGram Training......
pairs_count 196848
batch_count 196.848


100%|██████████| 196/196 [00:12<00:00, 15.35it/s]


epoch: 1 loss: tensor(4478.0581, grad_fn=<DivBackward0>)


100%|██████████| 196/196 [00:13<00:00, 14.72it/s]


epoch: 2 loss: tensor(3574.5586, grad_fn=<DivBackward0>)


100%|██████████| 196/196 [00:13<00:00, 14.90it/s]


epoch: 3 loss: tensor(3423.7976, grad_fn=<DivBackward0>)


100%|██████████| 196/196 [00:13<00:00, 14.91it/s]


epoch: 4 loss: tensor(3421.0486, grad_fn=<DivBackward0>)


100%|██████████| 196/196 [00:12<00:00, 15.32it/s]


epoch: 5 loss: tensor(3394.6873, grad_fn=<DivBackward0>)


100%|██████████| 196/196 [00:13<00:00, 14.76it/s]


epoch: 6 loss: tensor(3310.4585, grad_fn=<DivBackward0>)


100%|██████████| 196/196 [00:13<00:00, 14.82it/s]


epoch: 7 loss: tensor(3312.1855, grad_fn=<DivBackward0>)


100%|██████████| 196/196 [00:13<00:00, 14.71it/s]


epoch: 8 loss: tensor(3237.0649, grad_fn=<DivBackward0>)


100%|██████████| 196/196 [00:12<00:00, 15.48it/s]


epoch: 9 loss: tensor(3174.7947, grad_fn=<DivBackward0>)


100%|██████████| 196/196 [00:13<00:00, 14.76it/s]


epoch: 10 loss: tensor(3157.9053, grad_fn=<DivBackward0>)


100%|██████████| 196/196 [00:12<00:00, 15.13it/s]


epoch: 11 loss: tensor(3007.0750, grad_fn=<DivBackward0>)


100%|██████████| 196/196 [00:12<00:00, 15.34it/s]


epoch: 12 loss: tensor(2982.8210, grad_fn=<DivBackward0>)


100%|██████████| 196/196 [00:13<00:00, 14.96it/s]


epoch: 13 loss: tensor(2912.1064, grad_fn=<DivBackward0>)


100%|██████████| 196/196 [00:12<00:00, 16.01it/s]


epoch: 14 loss: tensor(2839.3059, grad_fn=<DivBackward0>)


100%|██████████| 196/196 [00:13<00:00, 15.02it/s]


epoch: 15 loss: tensor(2759.7563, grad_fn=<DivBackward0>)


100%|██████████| 196/196 [00:12<00:00, 15.57it/s]


epoch: 16 loss: tensor(2634.8887, grad_fn=<DivBackward0>)


100%|██████████| 196/196 [00:12<00:00, 15.37it/s]


epoch: 17 loss: tensor(2619.8528, grad_fn=<DivBackward0>)


100%|██████████| 196/196 [00:12<00:00, 15.48it/s]


epoch: 18 loss: tensor(2465.1301, grad_fn=<DivBackward0>)


100%|██████████| 196/196 [00:12<00:00, 15.56it/s]


epoch: 19 loss: tensor(2438.0911, grad_fn=<DivBackward0>)


100%|██████████| 196/196 [00:13<00:00, 14.92it/s]


epoch: 20 loss: tensor(2340.3450, grad_fn=<DivBackward0>)


100%|██████████| 196/196 [00:13<00:00, 15.05it/s]


epoch: 21 loss: tensor(2208.0608, grad_fn=<DivBackward0>)


100%|██████████| 196/196 [00:13<00:00, 14.87it/s]


epoch: 22 loss: tensor(2103.8403, grad_fn=<DivBackward0>)


100%|██████████| 196/196 [00:13<00:00, 14.77it/s]


epoch: 23 loss: tensor(2045.8989, grad_fn=<DivBackward0>)


100%|██████████| 196/196 [00:12<00:00, 15.61it/s]


epoch: 24 loss: tensor(1979.8619, grad_fn=<DivBackward0>)


100%|██████████| 196/196 [00:13<00:00, 14.64it/s]


epoch: 25 loss: tensor(1876.6068, grad_fn=<DivBackward0>)


100%|██████████| 196/196 [00:13<00:00, 14.70it/s]


epoch: 26 loss: tensor(1838.4877, grad_fn=<DivBackward0>)


100%|██████████| 196/196 [00:12<00:00, 15.10it/s]


epoch: 27 loss: tensor(1796.0032, grad_fn=<DivBackward0>)


100%|██████████| 196/196 [00:13<00:00, 14.77it/s]


epoch: 28 loss: tensor(1651.6262, grad_fn=<DivBackward0>)


100%|██████████| 196/196 [00:12<00:00, 15.36it/s]


epoch: 29 loss: tensor(1626.6632, grad_fn=<DivBackward0>)


100%|██████████| 196/196 [00:12<00:00, 15.20it/s]


epoch: 30 loss: tensor(1600.1187, grad_fn=<DivBackward0>)


100%|██████████| 196/196 [00:12<00:00, 15.21it/s]


epoch: 31 loss: tensor(1517.2380, grad_fn=<DivBackward0>)


100%|██████████| 196/196 [00:12<00:00, 15.28it/s]


epoch: 32 loss: tensor(1480.1741, grad_fn=<DivBackward0>)


100%|██████████| 196/196 [00:12<00:00, 15.59it/s]


epoch: 33 loss: tensor(1412.3494, grad_fn=<DivBackward0>)


100%|██████████| 196/196 [00:12<00:00, 15.40it/s]


epoch: 34 loss: tensor(1368.3174, grad_fn=<DivBackward0>)


100%|██████████| 196/196 [00:13<00:00, 14.81it/s]


epoch: 35 loss: tensor(1292.1074, grad_fn=<DivBackward0>)


100%|██████████| 196/196 [00:13<00:00, 14.80it/s]


epoch: 36 loss: tensor(1251.9280, grad_fn=<DivBackward0>)


100%|██████████| 196/196 [00:12<00:00, 15.42it/s]


epoch: 37 loss: tensor(1249.6040, grad_fn=<DivBackward0>)


100%|██████████| 196/196 [00:12<00:00, 15.14it/s]


epoch: 38 loss: tensor(1187.7280, grad_fn=<DivBackward0>)


100%|██████████| 196/196 [00:12<00:00, 15.19it/s]


epoch: 39 loss: tensor(1144.9365, grad_fn=<DivBackward0>)


100%|██████████| 196/196 [00:13<00:00, 14.97it/s]


epoch: 40 loss: tensor(1119.3293, grad_fn=<DivBackward0>)


100%|██████████| 196/196 [00:14<00:00, 13.50it/s]


epoch: 41 loss: tensor(1073.5837, grad_fn=<DivBackward0>)


100%|██████████| 196/196 [00:12<00:00, 15.20it/s]


epoch: 42 loss: tensor(1057.5358, grad_fn=<DivBackward0>)


100%|██████████| 196/196 [00:13<00:00, 15.06it/s]


epoch: 43 loss: tensor(1010.0784, grad_fn=<DivBackward0>)


100%|██████████| 196/196 [00:13<00:00, 14.96it/s]


epoch: 44 loss: tensor(983.0143, grad_fn=<DivBackward0>)


100%|██████████| 196/196 [00:12<00:00, 15.66it/s]


epoch: 45 loss: tensor(903.2000, grad_fn=<DivBackward0>)


100%|██████████| 196/196 [00:13<00:00, 14.98it/s]


epoch: 46 loss: tensor(988.5723, grad_fn=<DivBackward0>)


100%|██████████| 196/196 [00:12<00:00, 15.32it/s]


epoch: 47 loss: tensor(904.3854, grad_fn=<DivBackward0>)


100%|██████████| 196/196 [00:12<00:00, 15.64it/s]


epoch: 48 loss: tensor(883.0977, grad_fn=<DivBackward0>)


100%|██████████| 196/196 [00:13<00:00, 14.47it/s]


epoch: 49 loss: tensor(861.6632, grad_fn=<DivBackward0>)


100%|██████████| 196/196 [00:13<00:00, 14.89it/s]

epoch: 50 loss: tensor(814.1398, grad_fn=<DivBackward0>)





In [8]:
distance_matrix = w2v.get_distance_matrix()

In [14]:
similar_words = {search_term: [w2v.data.id2word_dict[idx] for idx in distance_matrix[w2v.data.word2id_dict[search_term]].argsort()[1:10]] 
                   for search_term in ['tablespoon','sauce', 'republican','football','bockwurst','university']}
similar_words

{'bockwurst': ['knackwurst',
  'benzedrine',
  'bacillus',
  'zhitkov',
  'apergillus',
  'bologna',
  'selfdeceptions',
  'orzae',
  'kob'],
 'football': ['basketball',
  'cincinnati',
  'dakota',
  'ron',
  'milwaukee',
  'ave',
  'homer',
  'prince',
  'arkansas'],
 'republican': ['nomination',
  'fbi',
  'gubernatorial',
  'casey',
  'van',
  'allen',
  'rep',
  'howard',
  'barnard'],
 'sauce': ['tablespoons',
  'tablespoon',
  'chili',
  'creekturn',
  'minced',
  'teaspoon',
  'mustard',
  'cloth',
  'bread'],
 'tablespoon': ['teaspoon',
  'worcestershire',
  'chili',
  'pickled',
  'dia',
  'teaspoons',
  'toner',
  'vinegar',
  'canned'],
 'university': ['emory',
  'commissioner',
  'queen',
  'mathematics',
  'universitys',
  'showmanship',
  'houston',
  'republicans',
  'formerly']}