In [99]:
%tensorflow_version 2.x

In [100]:
import numpy as np
import keras.backend as K
import tensorflow as tf
import operator
from tensorflow import keras
from keras.utils import np_utils

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Reshape, Lambda
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import plot_model
from tensorflow.keras.preprocessing import sequence
from sklearn.metrics.pairwise import cosine_distances

from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors as nn
import matplotlib.pyplot as plt

import pandas as pd

### Import file

In [148]:
file_name = 'alice.txt'
corpus = open(file_name).readlines()

In [149]:
corpus

['\n',
 '\n',
 'CHAPTER I. Down the Rabbit-Hole\n',
 '\n',
 'Alice was beginning to get very tired of sitting by her sister on the\n',
 'bank, and of having nothing to do: once or twice she had peeped into the\n',
 'book her sister was reading, but it had no pictures or conversations in\n',
 "it, 'and what is the use of a book,' thought Alice 'without pictures or\n",
 "conversations?'\n",
 '\n',
 'So she was considering in her own mind (as well as she could, for the\n',
 'hot day made her feel very sleepy and stupid), whether the pleasure\n',
 'of making a daisy-chain would be worth the trouble of getting up and\n',
 'picking the daisies, when suddenly a White Rabbit with pink eyes ran\n',
 'close by her.\n',
 '\n',
 'There was nothing so VERY remarkable in that; nor did Alice think it so\n',
 "VERY much out of the way to hear the Rabbit say to itself, 'Oh dear!\n",
 "Oh dear! I shall be late!' (when she thought it over afterwards, it\n",
 'occurred to her that she ought to have wonder

### Data preprocessing


In [150]:
# Remove sentences with fewer than 3 words
corpus = [sentence for sentence in corpus if sentence.count(" ") >= 2]

# Remove punctuation in text and fit tokenizer on entire corpus
tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'+"'")
tokenizer.fit_on_texts(corpus)
print(corpus)
# Convert text to sequence of integer values
corpus = tokenizer.texts_to_sequences(corpus)
n_samples = sum(len(s) for s in corpus) # Total number of words in the corpus
V = len(tokenizer.word_index) + 1 # Total number of unique words in the corpus



In [151]:
n_samples, V

(27165, 2557)

In [152]:
corpus

[[305, 7, 38, 1, 92, 595],
 [11, 13, 253, 3, 106, 30, 470, 8, 342, 76, 16, 379, 20, 1],
 [828, 2, 8, 343, 136, 3, 54, 134, 57, 596, 6, 23, 829, 65, 1],
 [323, 16, 379, 13, 830, 24, 5, 23, 45, 683, 57, 1447, 12],
 [5, 2, 31, 36, 1, 212, 8, 4, 323, 59, 11, 170, 683, 57],
 [27, 6, 13, 831, 12, 16, 344, 324, 15, 70, 15, 6, 58, 25, 1],
 [471, 160, 154, 16, 415, 30, 597, 2, 529, 325, 1, 1049],
 [8, 416, 4, 1448, 1449, 49, 28, 684, 1, 530, 8, 188, 39, 2],
 [1050, 1, 1450, 56, 279, 4, 148, 92, 22, 1451, 155, 228],
 [280, 76, 16],
 [40, 13, 136, 27, 30, 1051, 12, 14, 832, 67, 11, 89, 5, 27],
 [30, 93, 35, 8, 1, 83, 3, 254, 1, 92, 96, 3, 255, 108, 156],
 [108, 156, 7, 173, 28, 531, 56, 6, 59, 5, 124, 1052, 5],
 [1053, 3, 16, 14, 6, 256, 3, 55, 1452, 18, 32, 24, 18, 1, 62],
 [5, 21, 164, 86, 685, 24, 56, 1, 92, 1453, 180, 4, 417],
 [35, 8, 78, 1054, 472, 2, 109, 18, 5, 2, 43, 345, 20],
 [11, 1055, 3, 16, 204, 25, 5, 1454, 598, 16, 324, 14, 6, 23],
 [103, 128, 238, 4, 92, 22, 346, 4, 1054, 472, 57

In [153]:
# Example of how word to integer mapping looks like in the tokenizer
print(list((tokenizer.word_index.items())))



In [154]:
# Parameters
window_size = 2 
window_size_corpus = 4

# Set numpy seed for reproducible results
np.random.seed(42)

## CBOW


In [155]:
from keras.preprocessing import sequence

# Prepare the data for the CBOW model
def generate_data_cbow(corpus, window_size, V):
    all_in = []
    all_out = []

    # Iterate over all sentences
    for sentence in corpus:
        L = len(sentence)
        for index, word in enumerate(sentence):
            start = index - window_size
            end = index + window_size + 1

            # Empty list which will store the context words
            context_words = []
            for i in range(start, end):
                # Skip the 'same' word
                if i != index:
                    # Add a word as a context word if it is within the window size
                    if 0 <= i < L:
                        context_words.append(sentence[i])
                    else:
                        # Pad with zero if there are no words 
                        context_words.append(0)
            # Append the list with context words
            all_in.append(context_words)

            # Add one-hot encoding of the target word
            all_out.append(to_categorical(word, V))
                 
    return (np.array(all_in), np.array(all_out))

In [156]:
# Create the training data
X_cbow, y_cbow = generate_data_cbow(corpus, window_size, V)
print(list((tokenizer.word_index.items())))
print('X_cbow = ',X_cbow,'\ny_cbow = ',y_cbow)
print('V = ',V)

X_cbow =  [[   0    0    7   38]
 [   0  305   38    1]
 [ 305    7    1   92]
 ...
 [2556 1426    0    0]
 [   0    0  215    0]
 [   0    1    0    0]] 
y_cbow =  [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
V =  2557


In [157]:
# Create the CBOW architecture
dim = 2

#for dim in dims:
cbow = Sequential()

    # Add an Embedding layer
cbow.add(Embedding(input_dim=V, 
                   output_dim=dim,
                   input_length=window_size*2, # Note that we now have 2L words for each input entry
                   embeddings_initializer='glorot_uniform'))

cbow.add(Lambda(lambda x: K.mean(x, axis=1), output_shape=(dim, )))

cbow.add(Dense(V, activation='softmax', kernel_initializer='glorot_uniform'))

cbow.compile(optimizer=keras.optimizers.Adam(),
             loss='categorical_crossentropy',
             metrics=['accuracy'])
    
cbow.summary()
print("")
#cbow_models.append(cbow)
#cbow.get_weights()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 4, 2)              5114      
_________________________________________________________________
lambda_4 (Lambda)            (None, 2)                 0         
_________________________________________________________________
dense_4 (Dense)              (None, 2557)              7671      
Total params: 12,785
Trainable params: 12,785
Non-trainable params: 0
_________________________________________________________________



In [158]:
# Train CBOW model
cbow.fit(X_cbow, y_cbow, batch_size=64, epochs=50, verbose=1)
print("")

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50



In [159]:
# Save embeddings for vectors of length 50, 150 and 300 using cbow model
weights = cbow.get_weights()

# Get the embedding matrix
embedding = weights[0]

weights



[array([[-0.7773403 , -0.54967093],
        [-2.3244038 ,  3.7435513 ],
        [ 1.2009221 , -1.07518   ],
        ...,
        [ 0.81974286,  0.28047928],
        [ 0.27302358, -0.37252426],
        [-0.32015204, -0.5915691 ]], dtype=float32),
 array([[ 3.212376 , -1.6388285, -1.0049804, ...,  1.6985162,  1.4610418,
          2.3176613],
        [ 2.614501 , -4.2542486, -1.1778097, ...,  1.4696827,  1.1388681,
          2.8579006]], dtype=float32),
 array([-5.685914  ,  0.01002352,  2.6682637 , ..., -0.85780597,
        -1.0412377 , -0.4775079 ], dtype=float32)]

In [160]:
words = list((tokenizer.word_index.items()))
words.insert(0,('unkown',0))
words

[('unkown', 0),
 ('the', 1),
 ('and', 2),
 ('to', 3),
 ('a', 4),
 ('it', 5),
 ('she', 6),
 ('i', 7),
 ('of', 8),
 ('said', 9),
 ('you', 10),
 ('alice', 11),
 ('in', 12),
 ('was', 13),
 ('that', 14),
 ('as', 15),
 ('her', 16),
 ('t', 17),
 ('at', 18),
 ('s', 19),
 ('on', 20),
 ('all', 21),
 ('with', 22),
 ('had', 23),
 ('but', 24),
 ('for', 25),
 ('they', 26),
 ('so', 27),
 ('be', 28),
 ('not', 29),
 ('very', 30),
 ('what', 31),
 ('this', 32),
 ('little', 33),
 ('he', 34),
 ('out', 35),
 ('is', 36),
 ('one', 37),
 ('down', 38),
 ('up', 39),
 ('there', 40),
 ('if', 41),
 ('his', 42),
 ('then', 43),
 ('about', 44),
 ('no', 45),
 ('them', 46),
 ('like', 47),
 ('were', 48),
 ('would', 49),
 ('herself', 50),
 ('know', 51),
 ('went', 52),
 ('again', 53),
 ('do', 54),
 ('have', 55),
 ('when', 56),
 ('or', 57),
 ('could', 58),
 ('thought', 59),
 ('off', 60),
 ('queen', 61),
 ('time', 62),
 ('how', 63),
 ('me', 64),
 ('into', 65),
 ('see', 66),
 ('did', 67),
 ('who', 68),
 ('can', 69),
 ('well',

In [161]:
embedding

array([[-0.7773403 , -0.54967093],
       [-2.3244038 ,  3.7435513 ],
       [ 1.2009221 , -1.07518   ],
       ...,
       [ 0.81974286,  0.28047928],
       [ 0.27302358, -0.37252426],
       [-0.32015204, -0.5915691 ]], dtype=float32)

In [None]:
fig , ax = plt.subplots(1,1,figsize = (10,10))
for i in range(len(embedding)):
  plt.scatter(embedding[i][0],embedding[i][1]) 
  ax.annotate(words[i][0],(embedding[i][0] + 0.01,embedding[i][1] + 0.01))
plt.show()

In [163]:
def embed(word, embedding, vocab_size=V, tokenizer=tokenizer):
    int_word = tokenizer.texts_to_sequences([word])[0]
    bin_word = to_categorical(int_word, V)
    return np.dot(bin_word, embedding)

In [165]:
embed('alice',embedding)

array([[-1.6311936 ,  0.44723874]], dtype=float32)

In [166]:
def compute_distance(word_a, word_b):
    
    point1 = embed(word_a,embedding)
    point2 = embed(word_b,embedding)
    
    return np.linalg.norm(point1 - point2)

In [167]:
compute_distance('king','queen')

0.89911604

In [169]:
def know_word(y_cbow):
  for i in range(len(y_cbow)):
    if y_cbow[i] == 1 :
      break
  return words[i][0]  

In [170]:
know_word(y_cbow[0])

'chapter'

In [171]:
def output(input):
  input_tokens = tokenizer.texts_to_sequences(input)
  for i in range(len(cbow.predict(input_tokens)[0])):
    if cbow.predict(input_tokens)[0][i] == cbow.predict(input_tokens)[0].max() :
      break
  return words[i][0]

In [172]:
input_words = [input("Enter the neighbouring words (separated by spaces) :")]

Enter the neighbouring words (separated by spaces) :chapter i the rabbit


In [173]:
input_words

['chapter i the rabbit']

In [174]:
output(input_words)

'of'