In [93]:
#https://www.kdnuggets.com/2018/04/implementing-deep-learning-methods-feature-engineering-text-data-skip-gram.html

In [46]:
from keras.preprocessing import text
import numpy as np

In [2]:
# Read the whole book as raw byte lines (the file is opened in binary mode).
# The context manager closes the file handle once the lines are in memory,
# instead of leaving it open for the lifetime of the kernel.
with open("botchan.txt", "rb") as txt:
    lines = txt.readlines()

In [9]:
# Display the byte lines read from botchan.txt.
lines

[b"Project Gutenberg's Botchan (Master Darling), by Kin-nosuke Natsume\n",
 b'This eBook is for the use of anyone anywhere at no cost and with\n',
 b'almost no restrictions whatsoever.  You may copy it, give it away or\n',
 b're-use it under the terms of the Project Gutenberg License included\n',
 b'with this eBook or online at www.gutenberg.org\n',
 b'Title: Botchan (Master Darling)\n',
 b'Author: Kin-nosuke Natsume\n',
 b'Translator: Yasotaro Morri\n',
 b'Posting Date: October 14, 2012 [EBook #8868]\n',
 b'Release Date: September, 2005\n',
 b'First Posted: August 17, 2003\n',
 b'Language: English\n',
 b'*** START OF THIS PROJECT GUTENBERG EBOOK BOTCHAN (MASTER DARLING) ***\n',
 b'Produced by David Starner and the Online Distributed Proofreading Team\n',
 b'BOTCHAN (MASTER DARLING)\n',
 b'By The Late Mr. Kin-nosuke Natsume\n',
 b'TRANSLATED By Yasotaro Morri\n',
 b'Revised by J. R. KENNEDY\n',
 b'1919\n',
 b'A NOTE BY THE TRANSLATOR\n',
 b'No translation can expect to equal, much less

In [10]:
# Small hand-picked corpus: one sentence per document.
docs = [
    "The earth is an awesome place live",
    "No translation can expect to equal, much less to excel, the original.",
    "The task of the translator becomes doubly hazardous.",
    "These remarks are made not in way of excuse for any faulty dictions",
]

In [11]:
# Fit a Keras tokenizer on the toy corpus; this populates
# `tokenizer.word_index`, the word -> integer id mapping used below.
tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(docs)

In [12]:
# Show the learned word -> id mapping.
print("word_index : ",tokenizer.word_index)

word_index :  {'the': 1, 'to': 2, 'of': 3, 'earth': 4, 'is': 5, 'an': 6, 'awesome': 7, 'place': 8, 'live': 9, 'no': 10, 'translation': 11, 'can': 12, 'expect': 13, 'equal': 14, 'much': 15, 'less': 16, 'excel': 17, 'original': 18, 'task': 19, 'translator': 20, 'becomes': 21, 'doubly': 22, 'hazardous': 23, 'these': 24, 'remarks': 25, 'are': 26, 'made': 27, 'not': 28, 'in': 29, 'way': 30, 'excuse': 31, 'for': 32, 'any': 33, 'faulty': 34, 'dictions': 35}


In [40]:
# Forward (word -> id) and reverse (id -> word) vocabulary lookups.
word2id = tokenizer.word_index
id2word = dict((idx, word) for word, idx in word2id.items())

# The ids in word_index start at 1, so one extra slot covers id 0.
vocab_size = 1 + len(word2id)

# Encode every document as the list of ids of its tokens.
wids = [
    [word2id[token] for token in text.text_to_word_sequence(doc)]
    for doc in docs
]

print('Vocabulary Size:', vocab_size)
print('Vocabulary Sample:', list(word2id.items())[:10])

Vocabulary Size: 36
Vocabulary Sample: [('the', 1), ('to', 2), ('of', 3), ('earth', 4), ('is', 5), ('an', 6), ('awesome', 7), ('place', 8), ('live', 9), ('no', 10)]


In [15]:
# Display the id-encoded documents (one list of word ids per sentence).
wids

[[1, 4, 5, 6, 7, 8, 9],
 [10, 11, 12, 13, 2, 14, 15, 16, 2, 17, 1, 18],
 [1, 19, 3, 1, 20, 21, 22, 23],
 [24, 25, 26, 27, 28, 29, 30, 3, 31, 32, 33, 34, 35]]

In [14]:
from keras.preprocessing.sequence import skipgrams

In [23]:
# Positive and negative sampling: for each id-encoded sentence, build one
# skipgrams() result, indexed below as [0] = word pairs and [1] = labels.
skip_grams = [skipgrams(wid, vocabulary_size=vocab_size, window_size=3) for wid in wids]

In [24]:
# Preview up to ten (target, context) -> label samples from the first sentence.
pairs, labels = skip_grams[0][0], skip_grams[0][1]
# Iterating over slices (rather than indexing range(10)) avoids an IndexError
# when the sentence produced fewer than ten pairs.
for (target_id, context_id), label in zip(pairs[:10], labels[:10]):
    print("({:s} ({:d}), {:s} ({:d})) -> {:d}".format(
          id2word[target_id], target_id,
          id2word[context_id], context_id,
          label))

(an (6), the (1)) -> 1
(awesome (7), an (6)) -> 1
(awesome (7), translation (11)) -> 0
(awesome (7), live (9)) -> 1
(awesome (7), place (8)) -> 1
(an (6), is (5)) -> 1
(is (5), excuse (31)) -> 0
(is (5), remarks (25)) -> 0
(the (1), less (16)) -> 0
(place (8), awesome (7)) -> 1


In [41]:
from keras.layers import dot
from keras.layers.core import Dense, Reshape
from keras.layers.embeddings import Embedding
from keras.models import Sequential

# build skip-gram architecture
# word_model = Sequential()
# word_model.add(Embedding(vocab_size, embed_size,
#                          embeddings_initializer="glorot_uniform",
#                          input_length=1))
# word_model.add(Reshape((embed_size, )))

# context_model = Sequential()
# context_model.add(Embedding(vocab_size, embed_size,
#                   embeddings_initializer="glorot_uniform",
#                   input_length=1))
# context_model.add(Reshape((embed_size,)))

#https://stackoverflow.com/questions/52542275/merging-layers-on-keras-dot-product/52542847

# now perform the dot product operation  
# dot_product = dot([word_model, context_model], axes=1)
# dot_product = Reshape((1,))(dot_product)

# # add the sigmoid output layer
# output = Dense(1, activation='sigmoid')(dot_product)

# model = Model(input=[input_target, input_context], output=output)
# model.compile(loss='mean_squared_error', optimizer='rmsprop')

# # view model summary
# print(model.summary())


# model = Sequential()
# model.add(dot([word_model, context_model], axes=1, normalize=False))
# #model.add(Merge([word_model, context_model], mode="dot"))
# model.add(Dense(1, kernel_initializer="glorot_uniform", activation="sigmoid"))
# model.compile(loss="mean_squared_error", optimizer="rmsprop")

#view model summary
#print(model.summary())


In [42]:
from keras.models import Model

In [43]:
from keras.layers import Input
# Skip-gram scorer: two single-id inputs (target word, context word) are
# looked up in one shared Embedding layer, combined with a dot product and
# passed through a sigmoid unit.
input_target = Input((1,))
input_context = Input((1,))
embed_size = 10
embedding = Embedding(vocab_size, embed_size, input_length=1, name='embedding')

# Each lookup is reshaped to a column vector of shape (embed_size, 1).
word_embedding = embedding(input_target)
word_embedding = Reshape((embed_size, 1))(word_embedding)
context_embedding = embedding(input_context)
context_embedding = Reshape((embed_size, 1))(context_embedding)

# now perform the dot product operation along the embedding axis and
# flatten the result to a single score per pair
dot_product = dot([word_embedding, context_embedding], axes=1)
dot_product = Reshape((1,))(dot_product)

# add the sigmoid output layer
output = Dense(1, activation='sigmoid')(dot_product)

# Keras 2 names these keyword arguments `inputs`/`outputs`; the singular
# `input`/`output` spellings are the legacy Keras 1 names.
model = Model(inputs=[input_target, input_context], outputs=output)
model.compile(loss='mean_squared_error', optimizer='rmsprop')



In [44]:
# view model summary
# summary() prints the table itself and returns None, so wrapping it in
# print() would only append a stray "None" line to the output.
model.summary()


Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_7 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_8 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 1, 10)        360         input_7[0][0]                    
                                                                 input_8[0][0]                    
__________________________________________________________________________________________________
reshape_22 (Reshape)            (None, 10, 1)        0           embedding[0][0]            

In [49]:
# Train the skip-gram model for three epochs, one batch per sentence.
for epoch in range(1, 4):
    # Sum of the per-batch losses over this epoch.
    loss = 0
    for i, (pairs, pair_labels) in enumerate(skip_grams):
        if i % 10000 == 0:
            print('Processed {} (skip_first, skip_second, relevance) pairs'.format(i))
        # A sentence that yielded no pairs has nothing to train on; skipping
        # it avoids an IndexError when splitting the empty pair list.
        if not pairs:
            continue
        # Convert once to an (n_pairs, 2) array and split it by column,
        # instead of unzipping the pair list twice.
        pair_array = np.array(pairs, dtype='int32')
        X = [pair_array[:, 0], pair_array[:, 1]]
        Y = np.array(pair_labels, dtype='int32')
        loss += model.train_on_batch(X, Y)

    print('Epoch:', epoch, 'Loss:', loss)

Processed 0 (skip_first, skip_second, relevance) pairs
Epoch: 1 Loss: 0.9985352903604507
Processed 0 (skip_first, skip_second, relevance) pairs
Epoch: 2 Loss: 0.9981524497270584
Processed 0 (skip_first, skip_second, relevance) pairs
Epoch: 3 Loss: 0.9977532476186752


# CBOW

In [56]:
from keras.preprocessing import sequence
from keras.utils import np_utils
def generate_context_word_pairs(corpus, window_size, vocab_size):
    """Yield one CBOW training sample per word in the corpus.

    Args:
        corpus: iterable of sentences, each a sequence of integer word ids.
        window_size: number of neighbours taken on each side of the target.
        vocab_size: number of classes for the one-hot encoded target.

    Yields:
        Tuples ``(x, y)``. ``x`` is the list of neighbouring word ids passed
        through ``sequence.pad_sequences`` with ``maxlen=2 * window_size``;
        ``y`` is the target word id passed through
        ``np_utils.to_categorical`` with ``vocab_size`` classes.
    """
    padded_width = 2 * window_size
    for sentence in corpus:
        n_tokens = len(sentence)
        for centre, target in enumerate(sentence):
            first = centre - window_size
            stop = centre + window_size + 1

            # Neighbours inside the sentence bounds, excluding the target.
            neighbours = [
                sentence[pos]
                for pos in range(first, stop)
                if pos != centre and 0 <= pos < n_tokens
            ]

            # Both helpers receive a batch holding a single sample.
            x = sequence.pad_sequences([neighbours], maxlen=padded_width)
            y = np_utils.to_categorical([target], vocab_size)
            yield (x, y)
            

In [57]:
# Preview a handful of (context, target) samples from the generator
i = 0
for x, y in generate_context_word_pairs(corpus=wids, window_size=2, vocab_size=vocab_size):
    # Skip windows that contain id 0 (it has no entry in id2word)
    if 0 in x[0]:
        continue

    context_tokens = [id2word[w] for w in x[0]]
    target_token = id2word[np.argwhere(y[0])[0][0]]
    print('Context (X):', context_tokens, '-> Target (Y):', target_token)

    # The counter is checked before it is incremented, so the loop stops
    # after the sample printed while i == 10
    if i == 10:
        break
    i += 1

Context (X): ['the', 'earth', 'an', 'awesome'] -> Target (Y): is
Context (X): ['earth', 'is', 'awesome', 'place'] -> Target (Y): an
Context (X): ['is', 'an', 'place', 'live'] -> Target (Y): awesome
Context (X): ['no', 'translation', 'expect', 'to'] -> Target (Y): can
Context (X): ['translation', 'can', 'to', 'equal'] -> Target (Y): expect
Context (X): ['can', 'expect', 'equal', 'much'] -> Target (Y): to
Context (X): ['expect', 'to', 'much', 'less'] -> Target (Y): equal
Context (X): ['to', 'equal', 'less', 'to'] -> Target (Y): much
Context (X): ['equal', 'much', 'to', 'excel'] -> Target (Y): less
Context (X): ['much', 'less', 'excel', 'the'] -> Target (Y): to
Context (X): ['less', 'to', 'the', 'original'] -> Target (Y): excel


In [65]:
# Index of the first non-zero entry of `y[0]`, where `y` is the last target
# vector left over from the preview loop above (i.e. the target word id).
np.argwhere(y[0])[0][0]


17

In [68]:

import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense, Embedding, Lambda

window_size = 2

# build CBOW architecture: embed the 2 * window_size context ids, average
# the embeddings over the context axis, then predict the target word with a
# softmax over the vocabulary
cbow = Sequential()
cbow.add(Embedding(input_dim=vocab_size, output_dim=embed_size, input_length=window_size*2))
cbow.add(Lambda(lambda x: K.mean(x, axis=1), output_shape=(embed_size,)))
cbow.add(Dense(vocab_size, activation='softmax'))
cbow.compile(loss='categorical_crossentropy', optimizer='rmsprop')

# view model summary
# summary() prints the table itself and returns None, so wrapping it in
# print() only appends a stray "None" line to the output.
cbow.summary()

Model: "sequential_16"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_13 (Embedding)     (None, 4, 10)             360       
_________________________________________________________________
lambda_1 (Lambda)            (None, 10)                0         
_________________________________________________________________
dense_5 (Dense)              (None, 36)                396       
Total params: 756
Trainable params: 756
Non-trainable params: 0
_________________________________________________________________
None


In [69]:
# Train the CBOW model for five epochs, one (context, word) sample per batch.
for epoch in range(1, 6):
    # Sum of the per-sample losses over this epoch.
    loss = 0.
    samples = generate_context_word_pairs(corpus=wids, window_size=window_size, vocab_size=vocab_size)
    # Count samples from 1 so the progress message reports how many were seen.
    for i, (x, y) in enumerate(samples, start=1):
        loss += cbow.train_on_batch(x, y)
        if i % 100000 == 0:
            print('Processed {} (context, word) pairs'.format(i))

    print('Epoch:', epoch, '\tLoss:', loss)
    print()

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch: 1 	Loss: 143.71650791168213

Epoch: 2 	Loss: 143.24026441574097

Epoch: 3 	Loss: 142.96960997581482

Epoch: 4 	Loss: 142.6958372592926

Epoch: 5 	Loss: 142.4157645702362

