In [53]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from keras.utils import to_categorical
from keras.preprocessing import text
from keras.preprocessing import sequence
from keras.preprocessing.sequence import pad_sequences

In [54]:
data = pd.read_csv("CBOW.txt", delimiter="\t")
data = [str(w) for w in data]

In [55]:
tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(data)
word2id = tokenizer.word_index

word2id['PAD']=0
id2word = {v:k for k, v in word2id.items()}
wids = [[word2id[w] for w in text.text_to_word_sequence(doc)] for doc in data]

vocab_size = len(word2id)
embed_size=100
window_size=2

print('Vocabulary size:', vocab_size)
print('Sample:', list(word2id.items())[:10])

Vocabulary size: 103
Sample: [('the', 1), ('of', 2), ('influenza', 3), ('covid', 4), ('19', 5), ('virus', 6), ('for', 7), ('transmission', 8), ('is', 9), ('to', 10)]


In [56]:
def generate_context_word_pairs(corpus, window_size, vocab_size):
  context_length = window_size*2
  for words in corpus:
    sentence_length=len(words)
    for index, word in enumerate(words):
      context_words=[]
      label_word=[]
      start=index-window_size
      end=index+window_size

      context_words.append([words[i]
                            for i in range(start,end)
                            if 0 <= i < sentence_length
                            and i != index])
      label_word.append(word)

      x=pad_sequences(context_words, maxlen=context_length)
      y=to_categorical(label_word, vocab_size)
      yield(x,y)

In [57]:
import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense, Embedding, Lambda

cbow=Sequential()
cbow.add(Embedding(input_dim=vocab_size, output_dim=embed_size, input_length=window_size*2))
cbow.add(Lambda(lambda x: K.mean(x, axis=1), output_shape=(embed_size,)))
cbow.add(Dense(vocab_size, activation='softmax'))
cbow.compile(loss='categorical_crossentropy', optimizer='rmsprop')
cbow.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 4, 100)            10300     
                                                                 
 lambda_2 (Lambda)           (None, 100)               0         
                                                                 
 dense_2 (Dense)             (None, 103)               10403     
                                                                 
Total params: 20703 (80.87 KB)
Trainable params: 20703 (80.87 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [58]:
for epoch in range(1,6):
  loss=0.
  i=0
  for x,y in generate_context_word_pairs(corpus=wids,window_size=window_size, vocab_size=vocab_size):
    i+=1
    loss+=cbow.train_on_batch(x,y)

  print('Epoch:', epoch, '\tLoss:', loss)
  print()

Epoch: 1 	Loss: 916.4925127029419

Epoch: 2 	Loss: 901.4746007919312

Epoch: 3 	Loss: 883.7567415237427

Epoch: 4 	Loss: 863.2917590141296

Epoch: 5 	Loss: 843.6588354110718



In [59]:
weights = cbow.get_weights()[0]
weights = weights[1:]
weights.shape

(102, 100)

In [60]:
pd.DataFrame(weights, index=list(id2word.values())[1:]).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
of,0.055368,-0.066042,-0.044725,0.054242,-0.053776,0.130094,-0.012616,0.019587,0.059351,-0.016737,...,-0.02982,0.019066,0.079286,0.008673,0.053391,0.07127,-0.010983,0.003317,-0.080051,0.051619
influenza,0.089316,0.011564,-0.026145,-0.083011,0.103323,0.027468,-0.021936,0.087958,0.060457,-0.004837,...,0.072365,0.071654,0.025744,-0.084533,-0.113732,0.040641,-0.081577,0.029842,0.048326,0.008672
covid,-0.00887,-0.001028,-0.020652,-0.076763,0.005938,-0.113722,0.012325,0.053639,-0.060395,0.050624,...,-0.090581,0.084209,0.01646,-0.00893,-0.021826,0.094017,-0.03496,-0.005352,-0.064784,-0.10587
19,0.064,-0.196847,-0.094278,0.031964,0.054366,0.056543,-0.081508,0.142556,-0.074459,0.012444,...,-0.20093,0.059207,-0.014423,-0.142176,-0.080488,0.17307,-0.138428,0.060715,0.080645,-0.119129
virus,0.107789,-0.091353,-0.014439,-0.05707,0.12425,-0.054727,-0.024952,0.045594,0.114926,-0.172243,...,-0.135016,0.12304,-0.099052,-0.100822,-0.122136,0.094408,-0.116616,-0.048506,0.130984,-0.0922


In [61]:
from sklearn.metrics import euclidean_distances
distance_matrix = euclidean_distances(weights)
print(distance_matrix.shape)

(102, 102)


In [62]:
similar_words = {search_term: [id2word[idx] for idx in distance_matrix[word2id[search_term]-1].argsort()[1:6]+1]
                 for search_term in ['covid']}

similar_words

{'covid': ['virus', 'both', 'for', 'higher', 'further']}