### importing require packages

In [270]:
from __future__ import print_function

import json
import os
import numpy as np
import sys

from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from keras.engine import Input
from keras.layers import Embedding, merge
from keras.models import Model
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM

from nltk.tokenize import word_tokenize

### tokenizer function

In [271]:
# tokenizer: can change this as needed
# takes input one sentence at a time and returns individual words list
tokenize = lambda x: word_tokenize(x)

### create embeding and store weights for whole vocab

In [272]:
def create_embeddings(data_dir,
                      embeddings_path='lstm2-w2vec/embeddings.npz',
                      vocab_path='lstm2-w2vec/vocab.json',
                      **params):
    """
    Generate embeddings from a batch of text
    :param embeddings_path: where to save the embeddings
    :param vocab_path: where to save the word-index map
    """

    class SentenceGenerator(object):
        """
        Say we want to further preprocess the words from the files — convert to unicode, lowercase, 
        remove numbers, extract named entities… All of this can be done inside the MySentences iterator 
        and word2vec doesn’t need to know. All that is required is that the input yields one 
        sentence (list of utf8 words) after another
        
        """
        
        def __init__(self, dirname):
            self.dirname = os.path.join(os.getcwd(), dirname)
        
        ## iterator to yield tokenized format of one sentence at  a time
        def __iter__(self):
            for fname in os.listdir(self.dirname):
                for line in open(os.path.join(self.dirname, fname)):
                    yield tokenize(line.lower())

    """
    Gensim only requires that the input must provide sentences sequentially, when iterated over. 
    No need to keep everything in RAM: we can provide one sentence, process it, 
    forget it, load another sentence    
    """
    ## provide data directory and it will take sentences one at a time from all the files in directory
        
    sentences = SentenceGenerator(data_dir)
    model = Word2Vec(sentences, **params)
    
    weights = model.wv.syn0
    
    ## storing weights to be later used in keras embedding layer
    np.save(open(embeddings_path, 'wb'), weights)
    
    ## generating word2index
    vocab = dict([(k, v.index) for k, v in model.wv.vocab.items()])
    with open(vocab_path, 'w') as f:
        f.write(json.dumps(vocab))
    return model,weights

In [273]:
def load_vocab(vocab_path='lstm2-w2vec/vocab.json'):
    """
    Load word -> index and index -> word mappings
    :param vocab_path: where the word-index map is saved
    :return: word2idx, idx2word
    """

    with open(vocab_path, 'r') as f:
        data = json.loads(f.read())
    word2idx = data
    idx2word = dict([(v, k) for k, v in data.items()])
    return word2idx, idx2word

In [274]:
def word2vec_embedding_layer(embeddings_path='lstm2-w2vec/embeddings.npz'):
    """
    Generate an embedding layer word2vec embeddings
    :param embeddings_path: where the embeddings are saved (as a numpy file)
    :return: the generated embedding layer
    """
    
    weights = np.load(open(embeddings_path, 'rb'))
    layer = Embedding(input_dim=weights.shape[0],
                      output_dim=weights.shape[1],
                      #input_length=100,
                      weights=[weights])
    return layer

In [275]:
# specify embeddings in this environment variable
data_path = 'corpus'

# variable arguments are passed to gensim's word2vec model
w2v_model, weigths_ = create_embeddings(data_path, size=100, min_count=1,
                  window=5, sg=1, iter=25)

In [276]:
word2idx, idx2word = load_vocab()

In [277]:
vocab_size = len(word2idx.keys())
epoch = 1
batch_size = 32

In [278]:
filename = "corpus/dracula2.txt"
input_txt = open(filename).read()
input_txt = input_txt.lower()

In [279]:
tt = word_tokenize(input_txt)

In [280]:
x=[]
y=[]
window = 10
n_words = vocab_size
# generating dataset
for i in range(0, n_words - window, 1):
    seq_in = tt[i:i + window]
    seq_out = tt[i + window]
    x.append([word2idx[word] for word in seq_in])
    y.append(weigths_[word2idx[seq_out]])
n_patterns = len(x)
print ("Dataset size : ", n_patterns)

Dataset size :  77


In [281]:
y[0]

array([ -6.06145896e-03,  -2.04539206e-03,  -5.47052408e-03,
        -1.91847357e-05,   3.34095699e-03,  -5.90201758e-04,
         4.70602978e-03,   1.87155732e-03,   1.12713445e-02,
        -1.17304467e-03,  -1.87587249e-03,  -9.59220063e-03,
        -4.89463145e-03,   4.35876893e-03,   5.81940822e-03,
         4.84026223e-03,  -5.85034816e-03,  -1.33772357e-03,
         1.83229670e-02,  -4.62397560e-03,   4.27276082e-03,
        -8.61322787e-03,   4.59064217e-03,   3.86356772e-03,
        -6.02713699e-05,  -1.65506348e-03,  -1.63820281e-04,
        -6.69088447e-03,   7.56821409e-03,  -7.53825344e-03,
        -6.50216965e-03,  -1.12099078e-05,   3.06755910e-03,
         9.93078807e-04,  -1.43825011e-02,  -6.98222639e-03,
         2.07250263e-03,  -9.45661915e-04,  -2.89390818e-03,
         9.62779624e-04,   1.33635092e-03,   6.12916937e-03,
        -5.05341589e-03,   9.31442436e-03,   3.99526581e-03,
        -9.93528031e-03,  -1.31426398e-02,   2.72287987e-03,
         1.99323846e-03,

In [282]:
y = np.array(y)

In [283]:
x = np.array(x)

In [284]:
##layer = Embedding(input_dim=weights.shape[0],output_dim=weights.shape[1],weights=[weights])
model = Sequential()
model.add(word2vec_embedding_layer())
model.add(LSTM(512, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(64))
model.add(Dropout(0.2))
model.add(Dense(100, activation='sigmoid'))
model.compile(loss='mse', optimizer='adam',metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_20 (Embedding)     (None, None, 100)         8700      
_________________________________________________________________
lstm_33 (LSTM)               (None, None, 512)         1255424   
_________________________________________________________________
dropout_25 (Dropout)         (None, None, 512)         0         
_________________________________________________________________
lstm_34 (LSTM)               (None, 64)                147712    
_________________________________________________________________
dropout_26 (Dropout)         (None, 64)                0         
_________________________________________________________________
dense_19 (Dense)             (None, 100)               6500      
Total params: 1,418,336
Trainable params: 1,418,336
Non-trainable params: 0
_________________________________________________________________


In [285]:
model.fit(x,y,batch_size=32,epochs=50,verbose=1)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f13025fedd8>

In [286]:
np.array([x[22]]).shape

(1, 10)

In [287]:
t1 = model.predict(np.array([x[22]]))

In [288]:
t1.shape

(1, 100)

In [289]:
w2v_model.similar_by_vector(t1[0])

[('would', 0.20618267357349396),
 ('which', 0.17116126418113708),
 ('morning', 0.13392218947410583),
 ('us', 0.1264861822128296),
 ('an', 0.12239344418048859),
 ('very', 0.108067087829113),
 ('width', 0.101803719997406),
 ('western', 0.10017617791891098),
 ('wonderful', 0.09842932969331741),
 ('rule', 0.0934724509716034)]

In [290]:
start = 0
pattern = list(x[start])
print("\"",' '.join(idx2word[index] for index in pattern))
for i in range(10):
    prediction = model.predict(np.array([pattern]))
    index = 0 #np.argmax(prediction)
    pred_word = w2v_model.similar_by_vector(prediction[index])[0][0]
    sys.stdout.write(pred_word+" ")
    pattern.append(index)
    pattern = pattern[1:len(pattern)]

" 3 may . bistritz left munich at 8:35 p. m.
would would would would would would would would would would 