In [1]:
# !pip install keras
# !pip install tensorflow

In [2]:
import numpy as np
np.random.seed(13)

import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense, Embedding, Lambda
from keras.utils.data_utils import get_file
from keras.utils import np_utils
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer

import gensim

Using TensorFlow backend.


In [3]:
path=get_file('alice.txt', origin='http://www.gutenberg.org/files/11/11-0.txt')
corpus = open(path).readlines()[:300]
corpus = [sentence for sentence in corpus if sentence.count(' ') >= 2]

tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
corpus = tokenizer.texts_to_sequences(corpus)
nb_samples = sum(len(s) for s in corpus)
V = len(tokenizer.word_index) + 1
dim = 100
window_size = 2


In [4]:
def generate_data(corpus, window_size, V):
    maxlen = window_size*2
    for words in corpus:
        L = len(words)
        for index, word in enumerate(words):
            contexts = []
            labels   = []            
            s = index - window_size
            e = index + window_size + 1
            
            contexts.append([words[i] for i in range(s, e) if 0 <= i < L and i != index])
            labels.append(word)

            x = sequence.pad_sequences(contexts, maxlen=maxlen)
            y = np_utils.to_categorical(labels, V)
            yield (x, y)

In [5]:
cbow = Sequential()
cbow.add(Embedding(input_dim=V, output_dim=dim, input_length=window_size*2))
cbow.add(Lambda(lambda x: K.mean(x, axis=1), output_shape=(dim,)))
cbow.add(Dense(V, activation='softmax'))

In [6]:
cbow.layers

[<keras.layers.embeddings.Embedding at 0x7fed6f1accf8>,
 <keras.layers.core.Lambda at 0x7fed6f1acb38>,
 <keras.layers.core.Dense at 0x7fed6f1acf60>]

In [7]:
cbow.compile(loss='categorical_crossentropy', optimizer='adadelta')

In [8]:
corpus[:5]

[[348, 349, 65, 115, 10, 116, 57, 196, 197],
 [22, 117, 66, 17, 1, 67, 8, 350, 351, 29, 38, 352, 4, 27],
 [146, 38, 353, 354, 16, 355, 356, 5, 198, 5, 147, 32],
 [357, 67, 5, 118, 1, 358, 8, 1, 199, 148, 359, 360],
 [27, 22, 117, 32, 361, 29, 362, 148, 363]]

In [9]:
for ite in range(10):
    loss = 0.
    for i, (x, y) in enumerate(generate_data(corpus, window_size, V)):
        if i<4:
            print('', i, x, y.shape)
        loss += cbow.train_on_batch(x, y)
    print(i)

    print(ite, loss)

 0 [[  0   0 349  65]] (1, 765)
 1 [[  0 348  65 115]] (1, 765)
 2 [[348 349 115  10]] (1, 765)
 3 [[349  65  10 116]] (1, 765)
2786
0 17412.5700237751
 0 [[  0   0 349  65]] (1, 765)
 1 [[  0 348  65 115]] (1, 765)
 2 [[348 349 115  10]] (1, 765)
 3 [[349  65  10 116]] (1, 765)
2786
1 16152.255005836487
 0 [[  0   0 349  65]] (1, 765)
 1 [[  0 348  65 115]] (1, 765)
 2 [[348 349 115  10]] (1, 765)
 3 [[349  65  10 116]] (1, 765)
2786
2 16024.153420567513
 0 [[  0   0 349  65]] (1, 765)
 1 [[  0 348  65 115]] (1, 765)
 2 [[348 349 115  10]] (1, 765)
 3 [[349  65  10 116]] (1, 765)
2786
3 15933.20086157322
 0 [[  0   0 349  65]] (1, 765)
 1 [[  0 348  65 115]] (1, 765)
 2 [[348 349 115  10]] (1, 765)
 3 [[349  65  10 116]] (1, 765)
2786
4 15825.323416352272
 0 [[  0   0 349  65]] (1, 765)
 1 [[  0 348  65 115]] (1, 765)
 2 [[348 349 115  10]] (1, 765)
 3 [[349  65  10 116]] (1, 765)
2786
5 15722.679160535336
 0 [[  0   0 349  65]] (1, 765)
 1 [[  0 348  65 115]] (1, 765)
 2 [[348 349 11

In [15]:
f = open('vectors-cbow.txt' ,'w')
f.write('{} {}\n'.format(V-1, dim))

8

In [20]:
vectors = cbow.get_weights()[0]
for word, i in tokenizer.word_index.items():
    str_vec = ' '.join(map(str, list(vectors[i, :])))
    f.write('{} {}\n'.format(word, str_vec))
f.close()

ValueError: I/O operation on closed file.

In [17]:
w2v = gensim.models.KeyedVectors.load_word2vec_format('./vectors-cbow.txt', binary=False)

In [18]:
w2v.most_similar(positive=['the'])

[('a', 0.6486876010894775),
 ('one', 0.6459566354751587),
 ('this', 0.5973973274230957),
 ('adventures', 0.5596767663955688),
 ('alice’s', 0.5595484972000122),
 ('any', 0.5506154298782349),
 ('those', 0.5317624807357788),
 ('miles', 0.5270207524299622),
 ('help', 0.5243602991104126),
 ('no', 0.5234998464584351)]

In [19]:
w2v.most_similar(positive=['alice'])

[('she', 0.6102245450019836),
 ('you', 0.6003879904747009),
 ('poor', 0.5682523250579834),
 ('them', 0.5612143874168396),
 ('that', 0.5536033511161804),
 ('eat', 0.5530521869659424),
 ('now', 0.5518888235092163),
 ('dark', 0.548552393913269),
 ('marked', 0.548235297203064),
 ('thought', 0.5456331968307495)]