In [1]:
import numpy as np
np.random.seed(13)

from keras.models import Sequential, Model
from keras.layers import Embedding, Reshape, Activation, Input
from keras.layers.merge import Dot
from keras.utils import np_utils
from keras.utils.data_utils import get_file
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import skipgrams

import gensim

Using Theano backend.


In [2]:
# Fetch the Project Gutenberg text of "Alice's Adventures in Wonderland"
# (get_file returns the local path of the downloaded copy).
path = get_file('alice.txt', origin='http://www.gutenberg.org/files/11/11-0.txt')

# Read the file inside a context manager so the handle is always closed, and
# decode explicitly as UTF-8 instead of depending on the platform's default
# encoding (the text contains typographic quotes such as ‘ and ’).
with open(path, encoding='utf-8') as corpus_file:
    corpus = corpus_file.readlines()

# Keep only lines that contain at least two spaces; shorter lines are dropped.
corpus = [sentence for sentence in corpus if sentence.count(' ') >= 2]

tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)

# Vocabulary size used for the embedding tables: number of distinct words plus
# one extra slot (presumably because Tokenizer ids start at 1 and 0 is unused).
V = len(tokenizer.word_index) + 1
V

3388

In [3]:
# Size of each embedding vector.
dim_embedddings = 128

# Target-word branch: one integer word id -> one embedding vector, (None, 1, 128).
w_inputs = Input(shape=(1, ), dtype='int32')
word_embedding = Embedding(V, dim_embedddings)
w = word_embedding(w_inputs)

# Context-word branch: a separate embedding table with the same shape.
c_inputs = Input(shape=(1, ), dtype='int32')
context_embedding = Embedding(V, dim_embedddings)
c = context_embedding(c_inputs)

# Score each (word, context) pair: dot product over the embedding axis,
# flattened to a single value per sample and squashed into (0, 1).
pair_score = Dot(axes=2)([w, c])
pair_score = Reshape((1,), input_shape=(1, 1))(pair_score)
o = Activation('sigmoid')(pair_score)

# Two-input model that outputs the sigmoid score; the binary cross-entropy
# loss is minimised with Adam.
SkipGram = Model(inputs=[w_inputs, c_inputs], outputs=o)
SkipGram.summary()
SkipGram.compile(loss='binary_crossentropy', optimizer='adam')

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_1 (InputLayer)             (None, 1)             0                                            
____________________________________________________________________________________________________
input_2 (InputLayer)             (None, 1)             0                                            
____________________________________________________________________________________________________
embedding_1 (Embedding)          (None, 1, 128)        433664      input_1[0][0]                    
____________________________________________________________________________________________________
embedding_2 (Embedding)          (None, 1, 128)        433664      input_2[0][0]                    
___________________________________________________________________________________________

In [4]:
# The tokenised corpus is identical for every epoch, so convert the text to
# integer sequences once instead of re-tokenising on each pass.
sequences = tokenizer.texts_to_sequences(corpus)

# NOTE(review): skipgrams samples negative examples randomly; confirm whether
# np.random.seed alone is enough to make these runs reproducible.
for _ in range(5):
    # Sum of the per-batch losses over one full pass (one batch per line).
    loss = 0.
    for doc in sequences:
        data, labels = skipgrams(sequence=doc, vocabulary_size=V, window_size=5, negative_samples=5.)
        # Transpose the list of (word, context) couples into two parallel
        # arrays, matching the model's two inputs.
        x = [np.array(column) for column in zip(*data)]
        if not x:
            # No couples were produced for this line; nothing to train on.
            continue
        y = np.array(labels, dtype=np.int32)
        loss += SkipGram.train_on_batch(x, y)

    print(loss)


1078.05439098
741.469174184
689.020133168
661.459642701
638.09191246


In [5]:
# Export the learned embeddings as text: a "<count> <dim>" header line followed
# by one "<word> <v1> <v2> ..." line per vocabulary word.
# get_weights()[0] is the model's first weight array (presumably the embedding
# table of the word input — verify against SkipGram.summary()).
vectors = SkipGram.get_weights()[0]

# Use a context manager so the file is closed even if a write fails, and write
# UTF-8 explicitly: the vocabulary contains non-ASCII tokens (e.g. ‘i’m) that
# may not be encodable with the platform's default encoding.
with open('vectors.txt', 'w', encoding='utf-8') as f:
    # V - 1 == len(tokenizer.word_index): the number of word lines that follow.
    f.write('{} {}\n'.format(V-1, dim_embedddings))
    for word, i in tokenizer.word_index.items():
        f.write('{} {}\n'.format(word, ' '.join(map(str, list(vectors[i, :])))))

In [6]:
# Load the text-format vectors written to vectors.txt into a gensim
# KeyedVectors object so the similarity queries below can be run on them.
w2v = gensim.models.KeyedVectors.load_word2vec_format('./vectors.txt', binary=False)

In [7]:
# Words whose learned vectors are most similar to "queen"; the cell output
# lists (word, similarity score) pairs.
w2v.most_similar(positive=['queen'])

[('hearts', 0.7544310092926025),
 ('king', 0.733845055103302),
 ('wildly', 0.6959143280982971),
 ('verse', 0.6711217164993286),
 ('turning', 0.6667185425758362),
 ('taken', 0.6665735244750977),
 ('cook', 0.660643458366394),
 ('trumpet', 0.62113356590271),
 ('repeated', 0.6186539530754089),
 ('pointing', 0.6166180968284607)]

In [8]:
# Words whose learned vectors are most similar to "alice" (the output contains
# lower-cased tokens, some with typographic quotes still attached).
w2v.most_similar(positive=['alice'])

[('thought', 0.6293938755989075),
 ('poor', 0.5651217103004456),
 ('glad', 0.5522480607032776),
 ('‘i’m', 0.5497287511825562),
 ('rather', 0.5482580065727234),
 ('feeling', 0.5272138118743896),
 ('‘yes', 0.5177067518234253),
 ('indeed', 0.516873836517334),
 ('pleaded', 0.515944242477417),
 ('hasn’t', 0.5104849338531494)]

In [9]:
# Same query for the word "the", to show the neighbours of a very common word.
w2v.most_similar(positive=['the'])

[('queen', 0.5757458806037903),
 ('king', 0.5742576718330383),
 ('hearts', 0.571501612663269),
 ('outside', 0.5713008642196655),
 ('verse', 0.5598728060722351),
 ('status', 0.5433791279792786),
 ('check', 0.5398269891738892),
 ('‘call', 0.5262266993522644),
 ('u', 0.5262154340744019),
 ('mock', 0.5253645181655884)]