## CBOW in Keras

### Continuous Bag of Words

In [1]:
import numpy as np

import keras.backend as K
from keras.models import Sequential
from keras.layers import *
from keras.utils import np_utils
from keras.utils.vis_utils import model_to_dot
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer

import gensim
import nltk

from IPython.display import SVG

Using TensorFlow backend.


In [2]:
nltk.download('gutenberg')
nltk.download('punkt')

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\EZEN\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\EZEN\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
nltk.corpus.gutenberg.sents("carroll-alice.txt")[10]

['The',
 'rabbit',
 '-',
 'hole',
 'went',
 'straight',
 'on',
 'like',
 'a',
 'tunnel',
 'for',
 'some',
 'way',
 ',',
 'and',
 'then',
 'dipped',
 'suddenly',
 'down',
 ',',
 'so',
 'suddenly',
 'that',
 'Alice',
 'had',
 'not',
 'a',
 'moment',
 'to',
 'think',
 'about',
 'stopping',
 'herself',
 'before',
 'she',
 'found',
 'herself',
 'falling',
 'down',
 'a',
 'very',
 'deep',
 'well',
 '.']

In [4]:
sentents = [" ".join(s) for s in nltk.corpus.gutenberg.sents("carroll-alice.txt") if len(s) > 2]

In [5]:
sentents[10]

'The rabbit - hole went straight on like a tunnel for some way , and then dipped suddenly down , so suddenly that Alice had not a moment to think about stopping herself before she found herself falling down a very deep well .'

In [6]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentents)
corpus = tokenizer.texts_to_sequences(sentents)
nb_samples = sum(len(s) for s in corpus)
V = len(tokenizer.word_index) + 1
dim = 100
window_size = 2

In [7]:
def generate_data(corpus, window_size, V):
    maxlen = window_size*2
    for words in corpus:
        L = len(words)
        for index, word in enumerate(words):
            contexts = []
            labels = []
            s = index - window_size
            e = index + window_size + 1

            contexts.append([words[i] for i in range(s, e) if 0 <= i < L and i != index])
            labels.append(word)

            x = sequence.pad_sequences(contexts, maxlen=maxlen)
            y = np_utils.to_categorical(labels, V)
            yield (x, y)

In [8]:
X = []
Y = []
for x, y in generate_data(corpus, window_size, V):
    X.append(x)
    Y.append(y)

X = np.concatenate(X)
Y = np.concatenate(Y)

In [10]:
X.shape, Y.shape

((30179, 4), (30179, 2572))

In [11]:
X[2], np.nonzero(Y[2])

(array([ 12,   1, 475,  13]), (array([20], dtype=int64),))

In [12]:
X[3], np.nonzero(Y[3])

(array([  1,  20,  13, 831]), (array([475], dtype=int64),))

## Model

In [13]:
cbow = Sequential()
cbow.add(Embedding(input_dim=V, output_dim=dim, input_length=window_size*2))
cbow.add(Lambda(lambda x: K.mean(x, axis=1), output_shape=(dim,)))
cbow.add(Dense(V, activation='softmax'))
cbow.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 4, 100)            257200    
_________________________________________________________________
lambda_1 (Lambda)            (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 2572)              259772    
Total params: 516,972
Trainable params: 516,972
Non-trainable params: 0
_________________________________________________________________


In [14]:
cbow.compile(loss='categorical_crossentropy', optimizer='adadelta', metrics=["accuracy"])

In [15]:
%time hist = cbow.fit(X, Y, epochs=1000, batch_size=100, verbose=2)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/1000
 - 3s - loss: 7.4485 - accuracy: 0.0951
Epoch 2/1000
 - 3s - loss: 6.5731 - accuracy: 0.1032
Epoch 3/1000
 - 3s - loss: 6.1440 - accuracy: 0.1199
Epoch 4/1000
 - 3s - loss: 5.9253 - accuracy: 0.1357
Epoch 5/1000
 - 3s - loss: 5.7835 - accuracy: 0.1434
Epoch 6/1000
 - 3s - loss: 5.6789 - accuracy: 0.1472
Epoch 7/1000
 - 3s - loss: 5.5950 - accuracy: 0.1507
Epoch 8/1000
 - 3s - loss: 5.5226 - accuracy: 0.1535
Epoch 9/1000
 - 3s - loss: 5.4573 - accuracy: 0.1575
Epoch 10/1000
 - 3s - loss: 5.3982 - accuracy: 0.1625
Epoch 11/1000
 - 3s - loss: 5.3430 - accuracy: 0.1683
Epoch 12/1000
 - 3s - loss: 5.2913 - accuracy: 0.1739
Epoch 13/1000
 - 3s - loss: 5.2428 - accuracy: 0.1796
Epoch 14/1000
 - 3s - loss: 5.1959 - accuracy: 0.1848
Epoch 15/1000
 - 3s - loss: 5.1519 - accuracy: 0.1874
Epoch 16/1000
 - 3s - loss: 5.1100 - accuracy: 0.1922
Epoch 17/1000
 - 3s - loss: 5.0702 - accuracy: 0.1963
Epoch 18/1000
 - 3s - loss: 5.0319 - accuracy: 0.1996
Epoch 19/1000
 - 3s - loss: 4.9958 - 

Epoch 152/1000
 - 3s - loss: 3.1982 - accuracy: 0.3961
Epoch 153/1000
 - 3s - loss: 3.1902 - accuracy: 0.3968
Epoch 154/1000
 - 3s - loss: 3.1827 - accuracy: 0.3977
Epoch 155/1000
 - 3s - loss: 3.1756 - accuracy: 0.3994
Epoch 156/1000
 - 3s - loss: 3.1679 - accuracy: 0.4000
Epoch 157/1000
 - 3s - loss: 3.1605 - accuracy: 0.4011
Epoch 158/1000
 - 3s - loss: 3.1532 - accuracy: 0.4029
Epoch 159/1000
 - 3s - loss: 3.1461 - accuracy: 0.4032
Epoch 160/1000
 - 3s - loss: 3.1384 - accuracy: 0.4036
Epoch 161/1000
 - 3s - loss: 3.1305 - accuracy: 0.4058
Epoch 162/1000
 - 3s - loss: 3.1234 - accuracy: 0.4056
Epoch 163/1000
 - 3s - loss: 3.1156 - accuracy: 0.4058
Epoch 164/1000
 - 3s - loss: 3.1086 - accuracy: 0.4067
Epoch 165/1000
 - 3s - loss: 3.1020 - accuracy: 0.4085
Epoch 166/1000
 - 3s - loss: 3.0946 - accuracy: 0.4098
Epoch 167/1000
 - 3s - loss: 3.0872 - accuracy: 0.4090
Epoch 168/1000
 - 3s - loss: 3.0802 - accuracy: 0.4116
Epoch 169/1000
 - 3s - loss: 3.0730 - accuracy: 0.4114
Epoch 170/

Epoch 301/1000
 - 3s - loss: 2.3624 - accuracy: 0.5114
Epoch 302/1000
 - 3s - loss: 2.3582 - accuracy: 0.5129
Epoch 303/1000
 - 3s - loss: 2.3547 - accuracy: 0.5139
Epoch 304/1000
 - 3s - loss: 2.3501 - accuracy: 0.5153
Epoch 305/1000
 - 3s - loss: 2.3469 - accuracy: 0.5150
Epoch 306/1000
 - 3s - loss: 2.3424 - accuracy: 0.5166
Epoch 307/1000
 - 3s - loss: 2.3384 - accuracy: 0.5157
Epoch 308/1000
 - 3s - loss: 2.3347 - accuracy: 0.5169
Epoch 309/1000
 - 3s - loss: 2.3308 - accuracy: 0.5180
Epoch 310/1000
 - 3s - loss: 2.3275 - accuracy: 0.5186
Epoch 311/1000
 - 3s - loss: 2.3227 - accuracy: 0.5197
Epoch 312/1000
 - 3s - loss: 2.3192 - accuracy: 0.5202
Epoch 313/1000
 - 3s - loss: 2.3159 - accuracy: 0.5192
Epoch 314/1000
 - 3s - loss: 2.3114 - accuracy: 0.5213
Epoch 315/1000
 - 3s - loss: 2.3075 - accuracy: 0.5204
Epoch 316/1000
 - 3s - loss: 2.3043 - accuracy: 0.5211
Epoch 317/1000
 - 3s - loss: 2.3006 - accuracy: 0.5212
Epoch 318/1000
 - 3s - loss: 2.2966 - accuracy: 0.5219
Epoch 319/

Epoch 450/1000
 - 3s - loss: 1.9175 - accuracy: 0.5837
Epoch 451/1000
 - 3s - loss: 1.9154 - accuracy: 0.5854
Epoch 452/1000
 - 3s - loss: 1.9129 - accuracy: 0.5865
Epoch 453/1000
 - 3s - loss: 1.9113 - accuracy: 0.5870
Epoch 454/1000
 - 3s - loss: 1.9092 - accuracy: 0.5872
Epoch 455/1000
 - 3s - loss: 1.9075 - accuracy: 0.5867
Epoch 456/1000
 - 3s - loss: 1.9049 - accuracy: 0.5857
Epoch 457/1000
 - 3s - loss: 1.9025 - accuracy: 0.5887
Epoch 458/1000
 - 3s - loss: 1.9012 - accuracy: 0.5872
Epoch 459/1000
 - 3s - loss: 1.8983 - accuracy: 0.5880
Epoch 460/1000
 - 3s - loss: 1.8961 - accuracy: 0.5898
Epoch 461/1000
 - 3s - loss: 1.8945 - accuracy: 0.5878
Epoch 462/1000
 - 3s - loss: 1.8921 - accuracy: 0.5909
Epoch 463/1000
 - 3s - loss: 1.8904 - accuracy: 0.5900
Epoch 464/1000
 - 3s - loss: 1.8876 - accuracy: 0.5909
Epoch 465/1000
 - 3s - loss: 1.8859 - accuracy: 0.5914
Epoch 466/1000
 - 3s - loss: 1.8836 - accuracy: 0.5913
Epoch 467/1000
 - 3s - loss: 1.8826 - accuracy: 0.5915
Epoch 468/

Epoch 599/1000
 - 3s - loss: 1.6691 - accuracy: 0.6302
Epoch 600/1000
 - 3s - loss: 1.6672 - accuracy: 0.6318
Epoch 601/1000
 - 3s - loss: 1.6662 - accuracy: 0.6312
Epoch 602/1000
 - 3s - loss: 1.6640 - accuracy: 0.6321
Epoch 603/1000
 - 3s - loss: 1.6626 - accuracy: 0.6322
Epoch 604/1000
 - 3s - loss: 1.6622 - accuracy: 0.6320
Epoch 605/1000
 - 3s - loss: 1.6604 - accuracy: 0.6322
Epoch 606/1000
 - 3s - loss: 1.6597 - accuracy: 0.6324
Epoch 607/1000
 - 3s - loss: 1.6581 - accuracy: 0.6329
Epoch 608/1000
 - 3s - loss: 1.6571 - accuracy: 0.6332
Epoch 609/1000
 - 3s - loss: 1.6563 - accuracy: 0.6326
Epoch 610/1000
 - 3s - loss: 1.6540 - accuracy: 0.6328
Epoch 611/1000
 - 3s - loss: 1.6528 - accuracy: 0.6341
Epoch 612/1000
 - 3s - loss: 1.6520 - accuracy: 0.6350
Epoch 613/1000
 - 3s - loss: 1.6507 - accuracy: 0.6352
Epoch 614/1000
 - 3s - loss: 1.6487 - accuracy: 0.6351
Epoch 615/1000
 - 3s - loss: 1.6479 - accuracy: 0.6347
Epoch 616/1000
 - 3s - loss: 1.6467 - accuracy: 0.6353
Epoch 617/

Epoch 748/1000
 - 3s - loss: 1.5110 - accuracy: 0.6621
Epoch 749/1000
 - 3s - loss: 1.5099 - accuracy: 0.6630
Epoch 750/1000
 - 3s - loss: 1.5088 - accuracy: 0.6630
Epoch 751/1000
 - 3s - loss: 1.5083 - accuracy: 0.6625
Epoch 752/1000
 - 3s - loss: 1.5065 - accuracy: 0.6640
Epoch 753/1000
 - 3s - loss: 1.5068 - accuracy: 0.6643
Epoch 754/1000
 - 3s - loss: 1.5052 - accuracy: 0.6640
Epoch 755/1000
 - 3s - loss: 1.5048 - accuracy: 0.6634
Epoch 756/1000
 - 3s - loss: 1.5035 - accuracy: 0.6634
Epoch 757/1000
 - 3s - loss: 1.5034 - accuracy: 0.6645
Epoch 758/1000
 - 3s - loss: 1.5015 - accuracy: 0.6638
Epoch 759/1000
 - 3s - loss: 1.5014 - accuracy: 0.6649
Epoch 760/1000
 - 3s - loss: 1.4998 - accuracy: 0.6641
Epoch 761/1000
 - 3s - loss: 1.5001 - accuracy: 0.6659
Epoch 762/1000
 - 3s - loss: 1.4986 - accuracy: 0.6642
Epoch 763/1000
 - 3s - loss: 1.4978 - accuracy: 0.6656
Epoch 764/1000
 - 3s - loss: 1.4971 - accuracy: 0.6655
Epoch 765/1000
 - 3s - loss: 1.4967 - accuracy: 0.6661
Epoch 766/

Epoch 897/1000
 - 3s - loss: 1.4002 - accuracy: 0.6855
Epoch 898/1000
 - 3s - loss: 1.3998 - accuracy: 0.6862
Epoch 899/1000
 - 3s - loss: 1.3986 - accuracy: 0.6856
Epoch 900/1000
 - 3s - loss: 1.3981 - accuracy: 0.6857
Epoch 901/1000
 - 3s - loss: 1.3975 - accuracy: 0.6854
Epoch 902/1000
 - 3s - loss: 1.3973 - accuracy: 0.6846
Epoch 903/1000
 - 3s - loss: 1.3967 - accuracy: 0.6849
Epoch 904/1000
 - 3s - loss: 1.3960 - accuracy: 0.6847
Epoch 905/1000
 - 3s - loss: 1.3958 - accuracy: 0.6846
Epoch 906/1000
 - 3s - loss: 1.3950 - accuracy: 0.6852
Epoch 907/1000
 - 3s - loss: 1.3940 - accuracy: 0.6855
Epoch 908/1000
 - 3s - loss: 1.3934 - accuracy: 0.6853
Epoch 909/1000
 - 3s - loss: 1.3928 - accuracy: 0.6867
Epoch 910/1000
 - 3s - loss: 1.3921 - accuracy: 0.6845
Epoch 911/1000
 - 3s - loss: 1.3917 - accuracy: 0.6850
Epoch 912/1000
 - 3s - loss: 1.3909 - accuracy: 0.6870
Epoch 913/1000
 - 3s - loss: 1.3903 - accuracy: 0.6865
Epoch 914/1000
 - 3s - loss: 1.3893 - accuracy: 0.6870
Epoch 915/

In [16]:
import matplotlib.pyplot as plt
%matplotlib inline

In [17]:
plt.figure(figsize=(10, 6))
plt.plot(hist.history['acc'])
plt.show()

KeyError: 'acc'

<Figure size 720x432 with 0 Axes>