In [1]:
# Import the Tokenizer and pad_sequences directly from tensorflow.keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing import text, sequence
# Import to_categorical for handling categorical data
from tensorflow.keras.utils import to_categorical

# Keras backend and layers
from tensorflow.keras import backend as K
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Lambda

# For distance calculation and data handling
from sklearn.metrics.pairwise import euclidean_distances
import numpy as np
import pandas as pd


In [2]:
data = """Deep learning (also known as deep structured learning) is part of a
broader family of machine learning methods based on artificial neural networks
with representation learning. Learning can be supervised, semi-supervised or unsupervised.
Deep-learning architectures such as deep neural networks, deep belief networks,
deep reinforcement learning, recurrent neural networks, convolutional neural networks and
Transformers have been applied to fields including computer vision, speech recognition,
natural language processing, machine translation, bioinformatics, drug design,
medical image analysis, climate science, material inspection and board game programs,
where they have produced results comparable to and in some cases surpassing human expert performance.
"""
dl_data = data.split()

In [3]:
tokenizer=Tokenizer()
tokenizer.fit_on_texts(dl_data)
word2id=tokenizer.word_index
word2id['PAD']=0
id2word={i:w for w,i in word2id.items()}


In [4]:
wids = [[word2id[w] for w in text.text_to_word_sequence(doc)] for doc in dl_data]

In [5]:
vocab_size=len(word2id)
embed_size=100
window_size=2

print("Vocabulary size: ", vocab_size)
print("Vocabulary Sample: ", list(word2id.items())[:10])

Vocabulary size:  75
Vocabulary Sample:  [('learning', 1), ('deep', 2), ('networks', 3), ('neural', 4), ('and', 5), ('as', 6), ('of', 7), ('machine', 8), ('supervised', 9), ('have', 10)]


In [6]:
def generate_context_word_pairs(corpus, window_size, vocab_size):
    context_length = window_size*2
    for words in corpus:
        sentence_length = len(words)
        for index, word in enumerate(words):
            context_words = []
            label_word   = []
            start = index - window_size
            end = index + window_size + 1

            context_words.append([words[i] for i in range(start, end) if 0 <= i < sentence_length and i != index])
            label_word.append(word)
            x=sequence.pad_sequences(context_words, maxlen=context_length)
            y=to_categorical(label_word, vocab_size)
            yield (x,y)


In [7]:
cbow=Sequential()
cbow.add(Embedding(input_dim=vocab_size, output_dim=embed_size, input_length=window_size*2))
cbow.add(Lambda(lambda x:K.mean(x, axis=1), output_shape=(embed_size,)))
cbow.add(Dense(vocab_size, activation="softmax"))


cbow.compile(loss="categorical_crossentropy", optimizer="rmsprop")
print(cbow.summary())



None


In [None]:
for epoch in range(1,6):
    loss=0
    i=0

    for x,y in generate_context_word_pairs(corpus=wids, window_size=window_size, vocab_size=vocab_size):
        i+=1
        loss+=cbow.train_on_batch(x,y)
        if i % 100000 == 0:
            print('Processed {} (context, word) pairs'.format(i))

    print('Epoch:',epoch, '\tLoss:', loss)
    print()



In [None]:
weights=cbow.get_weights()[0]
weights=weights[1:]
print(weights.shape)

pd.DataFrame(weights, index=list(id2word.values())[1:]).head()
#

(74, 100)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
deep,0.000179,0.024392,-0.03716,-0.007323,0.052009,0.022948,0.046257,-0.007056,-0.056206,0.029087,...,-0.000142,0.054809,0.022008,-0.017402,0.04908,0.013851,-0.030178,-0.005728,-0.002658,-0.03476
networks,0.016809,0.03924,-0.031243,-0.033892,-0.000368,0.037413,-0.002513,-0.001464,-0.001121,-0.056755,...,-0.017674,-0.003681,0.019388,0.004313,0.017115,-0.030052,-0.04515,0.020255,0.03388,-0.024783
neural,-0.015213,-0.002998,-0.039364,0.015016,0.023299,-0.005014,-0.019608,-0.011861,0.003096,-0.027352,...,0.001805,0.042904,0.013278,-0.033879,-0.010246,0.031227,0.022609,0.047982,0.001177,-0.006467
and,0.01677,0.000866,0.008822,-0.042911,-0.039633,0.043032,0.039989,0.027244,0.002577,0.015698,...,0.000102,-0.032017,-0.04106,0.041263,0.019994,0.031377,0.046556,-0.018282,-0.03818,0.007129
as,-0.003158,-0.043536,0.036947,0.009929,0.041942,0.02984,0.031058,-0.034985,0.040572,0.042722,...,-0.036833,-0.017574,0.041022,-0.013334,-0.012512,0.002698,-0.041016,-0.029164,-0.015737,0.044603


In [None]:
distance_matrix=euclidean_distances(weights)
print(distance_matrix.shape)
inwords =input()
similar_words = {search_term: [id2word[idx] for idx in distance_matrix[word2id[search_term]-1].argsort()[0:6]]
                   for search_term in {inwords}}

similar_words


(74, 74)
learning


{'learning': ['PAD', 'based', 'neural', 'translation', 'in', 'language']}