In [175]:
%tensorflow_version 2.x

In [176]:
import numpy as np
import keras.backend as K
import tensorflow as tf
import operator
from tensorflow import keras
from keras.utils import np_utils

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Reshape, Lambda
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import plot_model
from tensorflow.keras.preprocessing import sequence
from sklearn.metrics.pairwise import cosine_distances

from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors as nn
import matplotlib.pyplot as plt

import pandas as pd

### Import file

In [177]:
file_name = 'alice.txt'
# Read the corpus as a list with one entry per line of the file. The context
# manager closes the file handle as soon as the lines are loaded instead of
# leaving it open for the lifetime of the kernel.
with open(file_name) as corpus_file:
    corpus = corpus_file.readlines()

In [178]:
#corpus

### Data preprocessing


In [179]:
# Keep only the lines that contain at least two space characters, used here as
# a cheap proxy for "three or more words" (each corpus entry is one line of the
# input file).
# NOTE: `corpus` is rebound to lists of integers further down in this cell, so
# re-running the cell without re-reading the file filters every entry out
# (list.count(" ") is 0 for a list of ints).
corpus = [sentence for sentence in corpus if sentence.count(" ") >= 2]

# Strip the listed punctuation, tab and newline characters (plus the apostrophe)
# and fit the tokenizer on the entire corpus
tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'+"'")
tokenizer.fit_on_texts(corpus)
#print(corpus)
# Convert every line of text to a list of integer word indices
corpus = tokenizer.texts_to_sequences(corpus)
n_samples = sum(len(s) for s in corpus) # Total number of word tokens in the corpus
V = len(tokenizer.word_index) + 1 # Vocabulary size: unique words plus one slot for index 0, which generate_data_cbow uses as padding

In [180]:
n_samples, V

(27165, 2557)

In [181]:
#corpus

In [182]:
# Example of how word to integer mapping looks like in the tokenizer
#print(list((tokenizer.word_index.items())))

In [183]:
# Parameters
window_size = 2  # number of context words taken on each side of the target word
window_size_corpus = 4  # not referenced by any other cell visible in this notebook

# Seed both NumPy and TensorFlow for reproducible results. np.random.seed alone
# is not enough: the Keras weight initialisers and the batch shuffling done by
# fit() draw from TensorFlow's own random generator.
np.random.seed(42)
tf.random.set_seed(42)

## CBOW


In [184]:
from keras.preprocessing import sequence

# Prepare the data for the CBOW model
def generate_data_cbow(corpus, window_size, V):
    """Build CBOW training pairs: context window -> one-hot target word.

    Parameters
    ----------
    corpus : iterable of sequences of int
        Tokenised sentences; every element is a word index in ``[0, V)``.
    window_size : int
        Number of context positions taken on each side of the target word.
    V : int
        Width of the one-hot target vectors (vocabulary size including the
        padding index 0).

    Returns
    -------
    tuple of np.ndarray
        ``(contexts, targets)``. ``contexts`` has one row per word in the
        corpus holding its ``2 * window_size`` neighbours, with 0 wherever the
        window runs past the start or end of the sentence. ``targets`` is a
        float32 one-hot matrix with one row per word and ``V`` columns.

    Raises
    ------
    IndexError
        If a word index does not fit into ``V`` columns.
    """
    all_in = []
    target_ids = []

    # Iterate over all sentences
    for sentence in corpus:
        L = len(sentence)
        for index, word in enumerate(sentence):
            # Neighbours of the current word, skipping the word itself and
            # padding with zero where the window leaves the sentence
            context_words = [
                sentence[i] if 0 <= i < L else 0
                for i in range(index - window_size, index + window_size + 1)
                if i != index
            ]
            all_in.append(context_words)
            target_ids.append(word)

    n_words = len(target_ids)
    if n_words == 0:
        # Keep the documented 2-D shapes even when there is nothing to encode
        return (np.zeros((0, max(2 * window_size, 0)), dtype=int),
                np.zeros((0, V), dtype=np.float32))

    # One-hot encode every target in a single vectorised step rather than
    # calling a helper once per word
    all_out = np.zeros((n_words, V), dtype=np.float32)
    all_out[np.arange(n_words), np.array(target_ids, dtype=int)] = 1.0

    return (np.array(all_in), all_out)

In [186]:
# Create the training data: X_cbow holds the context word indices for every
# word in the corpus, y_cbow the matching one-hot encoded target words
X_cbow, y_cbow = generate_data_cbow(corpus, window_size, V)
#print(list((tokenizer.word_index.items())))
# NumPy abbreviates the large arrays with "..." when they are printed
print('X_cbow = ',X_cbow,'\ny_cbow = ',y_cbow)
#print('V = ',V)

X_cbow =  [[   0    0    7   38]
 [   0  305   38    1]
 [ 305    7    1   92]
 ...
 [2556 1426    0    0]
 [   0    0  215    0]
 [   0    1    0    0]] 
y_cbow =  [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [187]:
# Create the CBOW architecture
dim = 2  # embedding size; two dimensions so the vectors can be plotted directly

cbow = Sequential()

# Embedding layer: maps each of the 2 * window_size context indices to a
# dim-sized vector. Index 0 (padding) gets its own trainable row and is
# included in the average below, because no masking is configured.
cbow.add(Embedding(input_dim=V,
                   output_dim=dim,
                   input_length=window_size*2, # Note that we now have 2L words for each input entry
                   embeddings_initializer='glorot_uniform'))

# Average the context embeddings into a single vector. tf.reduce_mean is used
# so the layer stays inside the same TensorFlow/Keras stack as the rest of the
# model rather than calling into the separately installed `keras` backend.
cbow.add(Lambda(lambda x: tf.reduce_mean(x, axis=1), output_shape=(dim, )))

# Predict a probability distribution over the vocabulary for the target word
cbow.add(Dense(V, activation='softmax', kernel_initializer='glorot_uniform'))

cbow.compile(optimizer=keras.optimizers.Adam(),
             loss='categorical_crossentropy',
             metrics=['accuracy'])

cbow.summary()
print("")

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 4, 2)              5114      
_________________________________________________________________
lambda_5 (Lambda)            (None, 2)                 0         
_________________________________________________________________
dense_5 (Dense)              (None, 2557)              7671      
Total params: 12,785
Trainable params: 12,785
Non-trainable params: 0
_________________________________________________________________



In [188]:
# Train CBOW model: context indices in, one-hot target words out
# (verbose=1 prints one progress line per epoch)
cbow.fit(X_cbow, y_cbow, batch_size=64, epochs=50, verbose=1)
print("")

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50



In [190]:
# Pull the trained parameters out of the CBOW model. Nothing is written to
# disk here, and the vector length is whatever `dim` was when the model was built.
weights = cbow.get_weights()

# Get the embedding matrix: the first weight array belongs to the Embedding
# layer, which is the first layer added to the model (one row per word index)
embedding = weights[0]

#weights



In [191]:
# List of (word, index) tuples taken from the tokenizer's vocabulary
words = list((tokenizer.word_index.items()))
# Put a placeholder in front for index 0 (the padding slot) so that words[i]
# can be looked up by word index i.
# NOTE(review): this relies on word_index being ordered by index starting at 1 - confirm.
# The label spelling is left untouched because it is shown in plots and lookups.
words.insert(0,('unkown',0))
#words

In [192]:
#embedding

In [None]:
# Scatter every 2-D word vector and label it with its word
fig, ax = plt.subplots(1, 1, figsize=(10, 10))
for idx, point in enumerate(embedding):
    x_coord, y_coord = point[0], point[1]
    label = words[idx][0]
    # One scatter call per point, so each point takes the next colour in the cycle
    ax.scatter(x_coord, y_coord)
    # Nudge the label slightly so it does not sit on top of the marker
    ax.annotate(label, (x_coord + 0.01, y_coord + 0.01))
plt.show()

In [193]:
def embed(word, embedding, vocab_size=V, tokenizer=tokenizer):
    """Look up the embedding vector(s) for the given text.

    Parameters
    ----------
    word : str
        Text to embed. It is run through ``tokenizer``, so a string containing
        several known words yields several rows.
    embedding : np.ndarray
        Embedding matrix with one row per word index.
    vocab_size : int, optional
        Width of the one-hot encoding; must match the number of rows in
        ``embedding``. Defaults to the value of ``V`` at definition time.
    tokenizer : optional
        Object providing ``texts_to_sequences``. Defaults to the notebook's
        tokenizer as it was when this cell was run.

    Returns
    -------
    np.ndarray
        One row per token found in ``word``.
    """
    int_word = tokenizer.texts_to_sequences([word])[0]
    # Honour the vocab_size argument instead of silently reading the global V
    bin_word = to_categorical(int_word, vocab_size)
    # Multiplying the one-hot rows by the matrix selects the matching embedding rows
    return np.dot(bin_word, embedding)

In [194]:
embed('alice',embedding)

array([[-1.1403193, -2.5454345]], dtype=float32)

In [195]:
def compute_distance(word_a, word_b):
    """Return the Euclidean distance between the embeddings of two words.

    Both words are embedded with ``embed`` using the notebook-level
    ``embedding`` matrix.

    Parameters
    ----------
    word_a, word_b : str
        The words to compare; each must tokenize to exactly one token.

    Returns
    -------
    float
        Norm of the difference between the two embedding vectors.

    Raises
    ------
    ValueError
        If either argument does not yield exactly one embedding row.
    """
    point1 = embed(word_a, embedding)
    point2 = embed(word_b, embedding)

    # embed() returns one row per token. Zero rows would make the subtraction
    # below empty and the norm a misleading 0.0; several rows would be
    # broadcast into a matrix norm, which is not a word-to-word distance.
    for word, point in ((word_a, point1), (word_b, point2)):
        if point.shape[0] != 1:
            raise ValueError(
                "{!r} maps to {} vocabulary tokens; expected exactly 1".format(
                    word, point.shape[0]))

    return np.linalg.norm(point1 - point2)

In [196]:
compute_distance('king','queen')

1.0741676

In [197]:
def know_word(y_cbow, vocab=None):
    """Decode a one-hot vector back into its word.

    Parameters
    ----------
    y_cbow : array-like
        1-D one-hot vector, e.g. one row of the training targets.
    vocab : list of (word, index) tuples, optional
        Lookup table ordered by word index. Defaults to the notebook-level
        ``words`` list.

    Returns
    -------
    str
        The word stored at the position of the first entry equal to 1.

    Raises
    ------
    ValueError
        If the vector contains no entry equal to 1.
    """
    if vocab is None:
        vocab = words

    hot_positions = np.flatnonzero(np.asarray(y_cbow) == 1)
    if hot_positions.size == 0:
        # Fail loudly rather than returning whichever word a scan ended on
        raise ValueError("y_cbow contains no entry equal to 1")
    return vocab[int(hot_positions[0])][0]

In [198]:
know_word(y_cbow[0])

'chapter'

In [199]:
def output(input):
    """Predict the target word for the given context words.

    Parameters
    ----------
    input : list of str
        One string per sample, each holding the space-separated context words.
        Only the prediction for the first sample is used. (The name shadows
        the ``input`` builtin inside this function; it is kept so existing
        calls keep working.)

    Returns
    -------
    str
        The word with the highest predicted probability (the first one on ties).
    """
    input_tokens = np.asarray(tokenizer.texts_to_sequences(input))
    # Run the model once and reuse the result; calling predict() inside a loop
    # over the vocabulary repeats the same forward pass thousands of times.
    probabilities = cbow.predict(input_tokens)[0]
    return words[int(np.argmax(probabilities))][0]

In [200]:
# Ask for the context words interactively and wrap the answer in a list so it
# can be passed to output() as a one-sample batch. The model was built with
# input_length = window_size * 2, so that many context words are expected.
input_words = [input("Enter the neighbouring words (separated by spaces) :")]

Enter the neighbouring words (separated by spaces) :chapter i the rabbit


In [201]:
input_words

['chapter i the rabbit']

In [202]:
output(input_words)

'of'