In [1]:
import numpy as np
np.random.seed(13)

import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense, Embedding, Lambda
from keras.utils.data_utils import get_file
from keras.utils import np_utils
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer

import gensim

Using TensorFlow backend.


In [2]:
path = get_file('alice.txt', origin='http://www.gutenberg.org/cache/epub/11/pg11.txt')
corpus = open(path).readlines()[0:200]
corpus = [sentence for sentence in corpus if sentence.count(' ') >= 2]

tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
corpus = tokenizer.texts_to_sequences(corpus)
nb_samples = sum(len(s) for s in corpus)
V = len(tokenizer.word_index) + 1
dim = 100
window_size = 2

In [3]:
def generate_data(corpus, window_size, V):
    maxlen = window_size*2
    for words in corpus:
        contexts = []
        labels   = []
        L = len(words)
        for index, word in enumerate(words):
            s = index-window_size
            e = index+window_size+1
            
            contexts.append([words[i] for i in range(s, e) if 0 <= i < L and i != index])
            labels.append(word)

            x = sequence.pad_sequences(contexts, maxlen=maxlen)
            y = np_utils.to_categorical(labels, V)

            yield (x, y)

In [4]:
cbow = Sequential()
cbow.add(Embedding(input_dim=V, output_dim=dim, input_length=window_size*2))
cbow.add(Lambda(lambda x: K.mean(x, axis=1), output_shape=(dim,)))
cbow.add(Dense(V, activation='softmax'))

In [5]:
cbow.compile(loss='categorical_crossentropy', optimizer='adadelta')

In [6]:
for ite in range(10):
    loss = 0.
    for x, y in generate_data(corpus, window_size, V):
        loss += cbow.train_on_batch(x, y)
    print(ite, loss)

0 11435.5716109
1 10761.0359893
2 10330.3479202
3 10107.584079
4 9967.16930652
5 9866.72839475
6 9766.41511559
7 9654.16313839
8 9534.91830802
9 9416.34877527


In [7]:
f = open('vectors.txt' ,'w')
f.write(' '.join([str(V-1), str(dim)]))
f.write('\n')

1

In [8]:
vectors = cbow.get_weights()[0]
for word, i in tokenizer.word_index.items():
    f.write(word)
    f.write(' ')
    f.write(' '.join(map(str, list(vectors[i, :]))))
    f.write('\n')
f.close()

In [9]:
w2v = gensim.models.KeyedVectors.load_word2vec_format('./vectors.txt', binary=False)

In [10]:
w2v.most_similar(positive=['alice'])

[('said', 0.5085229873657227),
 ("'poison", 0.5060866475105286),
 ('she', 0.4989706873893738),
 ('begun', 0.4741324782371521),
 ('taught', 0.45926445722579956),
 ('poor', 0.40908658504486084),
 ('happened', 0.3952333629131317),
 ('thought', 0.39319753646850586),
 ('several', 0.38720130920410156),
 ("'well", 0.36799049377441406)]