In [1]:
import numpy as np
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding, Reshape, Activation, Input, Dot
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import skipgrams
import gensim

In [2]:
from string import punctuation
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer

In [3]:
# Characters treated as noise: all ASCII punctuation marks followed by the ten digits.
remove_terms = f'{punctuation}0123456789'

In [4]:
def preprocessing(text):
    """Normalise a piece of raw text into a space-separated string of lemmas.

    The text is tokenised with ``word_tokenize`` and each token is lowercased.
    A token is kept only if it does not occur inside ``remove_terms``, is not
    an English stopword, has at least three characters and is purely
    alphabetic. Surviving tokens are lemmatised with ``WordNetLemmatizer``.

    Args:
        text: Raw text to clean.

    Returns:
        The surviving lemmas joined by single spaces; an empty string when no
        token survives the filters.
    """
    # A set gives constant-time membership tests; the list returned by
    # stopwords.words() would otherwise be scanned linearly for every token.
    stopw = set(stopwords.words('english'))
    lemma = WordNetLemmatizer()

    tokens = []
    for w in word_tokenize(text):
        word = w.lower()
        # remove_terms is a plain string, so this is a substring test: it
        # rejects punctuation marks and digit runs that appear inside it.
        if word in remove_terms:
            continue
        if word in stopw or len(word) < 3 or not word.isalpha():
            continue
        tokens.append(lemma.lemmatize(word))
    return ' '.join(tokens)

In [5]:
# Read the source text as a list of lines. The context manager closes the file
# handle as soon as the lines are loaded instead of leaving it open.
with open('History_of_Astronomy.txt', encoding='utf8') as corpus_file:
    corpus = corpus_file.readlines()

In [6]:
# Clean every line that is not blank; whitespace-only lines are skipped.
# NOTE: this rebinds `corpus`, so re-running this cell on its own feeds
# already-cleaned text back through preprocessing().
corpus = [preprocessing(line) for line in corpus if line.strip()]

In [7]:
# Build the word -> integer index vocabulary from the cleaned corpus.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)

In [8]:
# Encode each cleaned line as a sequence of vocabulary indices.
X_train_tokens = tokenizer.texts_to_sequences(corpus)

In [9]:
# One more than the number of indexed words, so the largest word index is still
# a valid row of the embedding tables sized with vocab_size below
# (presumably because Keras numbers words from 1 and leaves index 0 unused —
# confirm against the Tokenizer documentation).
vocab_size = len(tokenizer.word_index) + 1
vocab_size

4212

In [10]:
# (word, index) pairs; used further down to write one vector per word to disk.
items = tokenizer.word_index.items()

In [11]:
# Size of each learned word vector.
dim_embeddings = 300

# Target-word branch: one word index in, one dim_embeddings-sized vector out.
inputs = Input(shape=(1, ), dtype='int32')
w = Embedding(input_dim=vocab_size, output_dim=dim_embeddings)(inputs)

# Context-word branch: same shapes, but with its own separate embedding table.
c_inputs = Input(shape=(1, ), dtype='int32')
c = Embedding(input_dim=vocab_size, output_dim=dim_embeddings)(c_inputs)

# Dot product along the embedding axis gives one score per (target, context) pair.
d = Dot(axes=2)([w, c])
# Collapse the score to a single value per pair. No `input_shape` is passed:
# in a functional model the layer takes its input shape from the tensor it is
# called on, and newer Keras releases may warn about that argument.
d = Reshape((1,))(d)
# Squash the score into (0, 1) so it can be trained against binary labels.
d = Activation('sigmoid')(d)

model = Model(inputs=[inputs, c_inputs], outputs=d)
model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 1, 300)       1263600     input_1[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 1, 300)       1263600     input_2[0][0]                    
_______________________________________________________________________________________

In [12]:
# Binary cross-entropy pairs with the single sigmoid output of the model.
model.compile(loss='binary_crossentropy', optimizer='adam')

In [14]:
n_epochs = 15
for epoch in range(n_epochs):
    # Running total of the per-document batch losses for this epoch.
    loss = 0.
    for doc in X_train_tokens:
        # Generate word pairs and their labels for this document.
        data, labels = skipgrams(sequence=doc, vocabulary_size=vocab_size, window_size=4)
        # Transpose the list of pairs into one array per model input.
        x = [np.array(column) for column in zip(*data)]
        y = np.array(labels, dtype=np.int32)
        # No pairs were produced for this document: nothing to train on.
        if not x:
            continue
        loss += model.train_on_batch(x, y)

    print('Epoch:', epoch, '\tLoss:', loss)

Epoch: 0 	Loss: 2338.1833555549383
Epoch: 1 	Loss: 1900.0107424929738
Epoch: 2 	Loss: 1609.4854783304036
Epoch: 3 	Loss: 1321.2070938460529
Epoch: 4 	Loss: 1064.508568521589
Epoch: 5 	Loss: 858.0785472672433
Epoch: 6 	Loss: 700.4538713130169
Epoch: 7 	Loss: 574.8111935441848
Epoch: 8 	Loss: 502.353528737789
Epoch: 9 	Loss: 438.83482115293737
Epoch: 10 	Loss: 402.5674714980123
Epoch: 11 	Loss: 368.22985522680756
Epoch: 12 	Loss: 348.0301035082739
Epoch: 13 	Loss: 322.16826397892146
Epoch: 14 	Loss: 310.69731902621425


In [18]:
# Export the learned vectors as text: a "<word count> <dimension>" header line
# followed by one "<word> <v1> ... <vN>" line per vocabulary word.

# First weight array of the model; rows are addressed by word index below.
weights = model.get_weights()[0]

# The context manager closes the file even if a write fails part-way through.
with open('word2vec-skipgrams1.txt', 'w', encoding="utf8") as f:
    # vocab_size is the number of indexed words plus one, hence the -1.
    f.write('{} {}\n'.format(vocab_size-1, dim_embeddings))
    for word, i in items:
        f.write('{} {}\n'.format(word, ' '.join(map(str, list(weights[i, :])))))

In [20]:
# Load the exported text file back through gensim for similarity queries.
w2v = gensim.models.KeyedVectors.load_word2vec_format('word2vec-skipgrams1.txt', binary=False)

In [24]:
# Words whose learned vectors are most similar to the vector for "system".
w2v.most_similar(positive=['system'])

[('substitute', 0.5043259263038635),
 ('advocated', 0.4944961667060852),
 ('physically', 0.486220121383667),
 ('recommend', 0.47913214564323425),
 ('pythagorean', 0.47744858264923096),
 ('relationship', 0.46811461448669434),
 ('unfit', 0.46585965156555176),
 ('simplification', 0.4655781686306),
 ('prospect', 0.46493586897850037),
 ('supposes', 0.4534285068511963)]