In [3]:
import tensorflow as tf

In [8]:
import numpy as np
from tensorflow.python.keras.models import Sequential, Model
from tensorflow.python.keras.layers import Embedding, Reshape, Activation, Input
from tensorflow.python.keras.layers.merge import Dot
from tensorflow.python.keras.utils import np_utils
from tensorflow.python.keras.utils.data_utils import get_file
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import skipgrams
import gensim

In [10]:
from string import punctuation
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.stem import WordNetLemmatizer

In [11]:
# Noise characters for token filtering: ASCII punctuation followed by the ten decimal digits.
remove_terms = ''.join((punctuation, '0123456789'))

In [28]:
def preprocessing(text):
    """Tokenize ``text`` and return its alphabetic tokens, lemmatized and space-joined.

    Steps, in order:
      1. split the text with ``word_tokenize``;
      2. discard tokens whose lowercase form occurs inside ``remove_terms``;
      3. discard tokens that are not purely alphabetic;
      4. lemmatize the survivors with ``WordNetLemmatizer`` (default settings).

    Returns a single string with the remaining tokens separated by one space
    (an empty string when nothing survives).
    """
    lemmatizer = WordNetLemmatizer()
    kept = []
    for candidate in word_tokenize(text):
        # NOTE: ``remove_terms`` is a plain string in this notebook, so ``in``
        # performs a substring test rather than a per-character lookup.
        if candidate.lower() in remove_terms:
            continue
        # Drop anything that still contains a non-letter character.
        if not candidate.isalpha():
            continue
        kept.append(lemmatizer.lemmatize(candidate))
    return ' '.join(kept)

In [26]:
import nltk
# Download the NLTK 'punkt' package (tokenizer data; presumably what word_tokenize relies on).
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vorme\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [30]:
# Download the NLTK 'wordnet' corpus (presumably the data behind WordNetLemmatizer).
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\vorme\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

## Downloading Book

In [14]:
# Fetch the Project Gutenberg text through Keras' get_file helper under the name 'cosmos.txt'.
path = get_file(
    fname='cosmos.txt',
    origin='http://gutenberg.org/files/8172/8172-0.txt',
)
# Alternative: read a local copy of the book instead, e.g.
# corpus = open('historyOfAstronomy.txt',encoding='utf8').readlines()

Downloading data from http://gutenberg.org/files/8172/8172-0.txt


In [19]:
# Read the book as a list of raw lines (newline characters included).
# The context manager closes the file handle as soon as the lines are loaded,
# instead of leaving it open until garbage collection.
with open('historyOfAstronomy.txt', encoding='utf8') as corpus_file:
    corpus = corpus_file.readlines()

In [33]:
# Run every non-blank line through preprocessing(); whitespace-only lines are skipped.
# NOTE: this rebinds `corpus`, so re-running the cell feeds already-processed text back in.
corpus = [preprocessing(line) for line in corpus if line.strip()]

In [34]:
# Build the word -> integer index mapping from the preprocessed lines.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts=corpus)

In [38]:
# Encode each preprocessed line as a list of integer word indices.
X_train_tokens = tokenizer.texts_to_sequences(texts=corpus)

In [39]:
# One more than the number of indexed words (word indices start at 1, see the word_index dump below).
vocab_size = 1 + len(tokenizer.word_index)

In [41]:
# Display the vocabulary size (same expression that was assigned to vocab_size).
len(tokenizer.word_index)+1

4723

In [42]:
# (word, index) pairs of the fitted vocabulary; iterated again when the vectors are written to disk.
items = tokenizer.word_index.items()

In [48]:
# Inspect the vocabulary mapping (prints every pair, so the output is long).
items

dict_items([('the', 1), ('of', 2), ('and', 3), ('a', 4), ('to', 5), ('in', 6), ('that', 7), ('by', 8), ('it', 9), ('wa', 10), ('is', 11), ('he', 12), ('s', 13), ('be', 14), ('with', 15), ('this', 16), ('for', 17), ('his', 18), ('at', 19), ('on', 20), ('which', 21), ('from', 22), ('star', 23), ('been', 24), ('or', 25), ('have', 26), ('not', 27), ('but', 28), ('sun', 29), ('these', 30), ('ha', 31), ('all', 32), ('an', 33), ('motion', 34), ('are', 35), ('work', 36), ('observation', 37), ('were', 38), ('earth', 39), ('planet', 40), ('one', 41), ('had', 42), ('found', 43), ('year', 44), ('great', 45), ('time', 46), ('moon', 47), ('their', 48), ('so', 49), ('line', 50), ('project', 51), ('orbit', 52), ('law', 53), ('when', 54), ('made', 55), ('we', 56), ('no', 57), ('about', 58), ('other', 59), ('theory', 60), ('solar', 61), ('same', 62), ('any', 63), ('also', 64), ('first', 65), ('astronomy', 66), ('you', 67), ('comet', 68), ('new', 69), ('discovery', 70), ('there', 71), ('if', 72), ('more'

## Creating Word2Vec Model

In [54]:
# Dimensionality of each learned word vector.
dim_embeddings = 300

# Target-word branch: one integer word index per sample, looked up in its own embedding table.
inputs = Input(shape=(1,),dtype='int32')
w = Embedding(vocab_size,dim_embeddings)(inputs)

# Context-word branch: same structure, but a separate Embedding layer (weights are not shared).
c_inputs = Input(shape=(1,),dtype='int32')
c = Embedding(vocab_size,dim_embeddings)(c_inputs)

# Similarity score: dot product of the two embeddings along axis 2 (the embedding axis).
d = Dot(axes=2)([w,c])

# Collapse the (1, 1) score to a single value per sample, then squash it with a sigmoid.
d = Reshape((1,), input_shape=(1,1))(d)
d = Activation('sigmoid')(d)

# Two-input model: [target index, context index] -> score in (0, 1).
# NOTE(review): the saving cell exports model.get_weights()[0], presumably the first
# Embedding created above — re-check that if the layer creation order ever changes.
model = Model(inputs=[inputs,c_inputs],outputs = d)
model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_7 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_8 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
embedding_6 (Embedding)         (None, 1, 300)       1416900     input_7[0][0]                    
__________________________________________________________________________________________________
embedding_7 (Embedding)         (None, 1, 300)       1416900     input_8[0][0]                    
_______________________________________________________________________________________

In [55]:
# Binary cross-entropy on the sigmoid output; Adam with its default settings.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
)

In [58]:
n_epochs = 15
for epoch in range(n_epochs):
    # Running sum of the per-document batch losses for this epoch.
    loss = 0
    for doc in X_train_tokens:
        # Build (target, context) index pairs and their 0/1 labels for this document.
        data, labels = skipgrams(sequence=doc, vocabulary_size=vocab_size, window_size=4)
        if not data:
            # No pairs were generated for this document, so there is nothing to train on.
            continue
        # Transpose the pair list into two parallel arrays, one per model input.
        x = [np.array(column) for column in zip(*data)]
        y = np.array(labels, dtype=np.int32)
        loss += model.train_on_batch(x, y)

    print('Epoch', epoch, '\tLoss', loss)

Epoch 0 	Loss 1781.4936638064682
Epoch 1 	Loss 1588.9447486773133
Epoch 2 	Loss 1439.4026970062405
Epoch 3 	Loss 1311.4078500103205
Epoch 4 	Loss 1191.7090303339064
Epoch 5 	Loss 1091.5435202135704
Epoch 6 	Loss 1013.4644258145709
Epoch 7 	Loss 937.1618440927705
Epoch 8 	Loss 880.9855245880317
Epoch 9 	Loss 844.418281206541
Epoch 10 	Loss 800.4492275445955
Epoch 11 	Loss 778.2931408962686
Epoch 12 	Loss 750.221392060339
Epoch 13 	Loss 732.5436652666904
Epoch 14 	Loss 723.0847705991328


In [59]:
# Inspect the skip-gram pairs left over from the last iteration of the training loop.
data

[[4722, 316],
 [69, 854],
 [4721, 2726],
 [5, 1338],
 [2693, 4720],
 [4720, 2023],
 [854, 1511],
 [4722, 5],
 [5, 1420],
 [58, 962],
 [119, 5],
 [4722, 872],
 [119, 4088],
 [5, 561],
 [69, 4721],
 [69, 58],
 [69, 5],
 [4722, 4721],
 [58, 854],
 [5, 2693],
 [4722, 119],
 [4721, 4720],
 [4722, 854],
 [2693, 119],
 [2693, 422],
 [854, 1856],
 [119, 5],
 [5, 119],
 [119, 2382],
 [2693, 4722],
 [4721, 4249],
 [2693, 2269],
 [4721, 2240],
 [2693, 4721],
 [4722, 58],
 [58, 5],
 [4722, 411],
 [119, 4720],
 [5, 4134],
 [2693, 3032],
 [58, 4722],
 [854, 69],
 [5, 4721],
 [58, 2596],
 [119, 455],
 [2693, 4609],
 [4722, 69],
 [4720, 2971],
 [854, 4627],
 [5, 4720],
 [119, 4722],
 [5, 4721],
 [5, 4217],
 [4721, 119],
 [5, 3406],
 [5, 4722],
 [119, 4722],
 [2693, 58],
 [69, 3360],
 [5, 408],
 [4720, 4721],
 [4721, 5],
 [58, 4231],
 [69, 2802],
 [4720, 5],
 [5, 69],
 [2693, 763],
 [5, 58],
 [4721, 2910],
 [119, 2693],
 [58, 4399],
 [854, 4722],
 [58, 3356],
 [4721, 2693],
 [4722, 952],
 [5, 119],
 [5

In [60]:
# Inspect the 0/1 labels that belong to the pairs left over from the last training-loop iteration.
labels

[0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0]

## Saving Model

In [74]:
# Export the learned vectors as plain text: a "<word count> <dimensions>" header line,
# then one "<word> <v1> ... <vN>" line per vocabulary word.

# First weight array of the model — presumably the target-word Embedding matrix;
# confirm against the layer order if the model definition changes.
weights = model.get_weights()[0]

# The context manager guarantees the file is flushed and closed even if a write raises,
# which the previous bare open()/close() pair did not.
with open('word2vec-skipgrams.txt', 'w', encoding='utf-8') as f:
    # vocab_size counts one slot more than there are words, hence the "- 1" in the header.
    f.write('{} {}\n'.format(vocab_size-1, dim_embeddings))
    for word, i in items:
        # Row i of the weight matrix is the vector for the word with index i.
        f.write('{} {}\n'.format(word, ' '.join(map(str, list(weights[i,:])))))

## Loading Model

In [78]:
# Load the text-format vectors from 'word2vec-skipgrams.txt' as gensim KeyedVectors.
w2v = gensim.models.KeyedVectors.load_word2vec_format(
    fname='word2vec-skipgrams.txt',
    binary=False,
)

In [80]:
# Ten nearest neighbours of 'solar' in the learned embedding space.
w2v.most_similar(positive=['solar'], topn=10)

[('eclipses', 0.4622095823287964),
 ('dissolving', 0.4327881336212158),
 ('physics', 0.43065494298934937),
 ('researches', 0.4024451673030853),
 ('obverse', 0.37765634059906006),
 ('system', 0.363186240196228),
 ('hide', 0.3564595580101013),
 ('total', 0.3548147678375244),
 ('grazed', 0.347744345664978),
 ('intensified', 0.34372782707214355)]

In [81]:
# Ten nearest neighbours of 'system' in the learned embedding space.
w2v.most_similar(positive=['system'], topn=10)

[('advocated', 0.4125906825065613),
 ('copernican', 0.3974606990814209),
 ('ptolemean', 0.380581796169281),
 ('complicated', 0.37141451239585876),
 ('binary', 0.3678063154220581),
 ('solar', 0.36318621039390564),
 ('propounded', 0.3622959852218628),
 ('observational', 0.3563804030418396),
 ('radical', 0.34301120042800903),
 ('turmoil', 0.3428192138671875)]

In [82]:
# Ten nearest neighbours of 'kepler' in the learned embedding space.
w2v.most_similar(positive=['kepler'], topn=10)

[('johannes', 0.5228484869003296),
 ('wanderer', 0.5217564105987549),
 ('f', 0.5156950354576111),
 ('contemplated', 0.4745159149169922),
 ('reitlinger', 0.4489484429359436),
 ('surer', 0.4238871932029724),
 ('joy', 0.38644975423812866),
 ('leave', 0.38324227929115295),
 ('portrait', 0.37249815464019775),
 ('masculine', 0.36554208397865295)]