### importing require packages

In [19]:
from __future__ import print_function

import json
import os
import numpy as np
import sys
import h5py

from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from keras.engine import Input
from keras.layers import Embedding, merge
from keras.models import Model
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.preprocessing import sequence
from embeddings import Embeddings
from keras.callbacks import ModelCheckpoint

from nltk.tokenize import word_tokenize
import random

In [20]:
np.mean([1, 2, 3])

2.0

## Instantiate Embeddings 

In [21]:
embeddings = Embeddings(100, 4, 1, 4)

Loading the embeddings from the cache


### getting data from preprocessing

In [22]:
word2vec_weights = embeddings.get_weights()
word2index, index2word = embeddings.get_vocabulary()
word2vec_model = embeddings.get_model()
tokenized_indexed_sentences = embeddings.get_tokenized_indexed_sentences()

### generating training data

In [23]:
window_size = 5
vocab_size = len(word2index)
print(vocab_size)
#sorted(window_size,reverse=True)
#sentence_max_length = max([len(sentence) for sentence in tokenized_indexed_sentence ])

132184


## Defining model

In [24]:
model_weights_path = "../weights/LSTM-2-512-Window-5-Batch-128-Epoch-10-Stateful"
if not os.path.exists(model_weights_path):
    os.makedirs(model_weights_path)

In [25]:
seq_in = []
seq_out = []

# generating dataset
for sentence in tokenized_indexed_sentences:
    sentence_seq_in = []
    sentence_seq_out = []
    for i in range(len(sentence)-window_size-1):
        x = sentence[i:i + window_size]
        y = sentence[i + window_size]
        sentence_seq_in.append(x)#[]
        sentence_seq_out.append(word2vec_weights[y])
    seq_in.append(sentence_seq_in)
    seq_out.append(sentence_seq_out)

# converting seq_in and seq_out into numpy array
seq_in = np.array(seq_in)
seq_out = np.array(seq_out)
n_samples = len(seq_in)
print ("Number of samples : ", n_samples)

Number of samples :  18473


In [26]:
subsamples = np.array([len(seq) for seq in seq_in])
print(np.sum(subsamples))

252670


In [27]:
subsamples_in = np.array([s for seq in seq_in for s in seq])
subsamples_out = np.array([s for seq in seq_out for s in seq])

## Train Model

In [28]:
np.expand_dims(seq_in[0][0], axis=1)

array([[535],
       [592],
       [736],
       [  8],
       [ 25]])

In [29]:
total_batches = int(subsamples_in.shape[0] / 256)

In [30]:
batch_len = []
for i in range(total_batches):
    batch_len.append(len(subsamples_in[i::total_batches]))
min_batch_len = min(batch_len)

In [31]:
# Changes to the model to be done here
model = Sequential()
model.add(Embedding(input_dim=word2vec_weights.shape[0], output_dim=word2vec_weights.shape[1], weights=[word2vec_weights], batch_input_shape=(min_batch_len, 5)))
model.add(LSTM(512, return_sequences=True, stateful=True))
model.add(Dropout(0.2))
model.add(LSTM(512, stateful=True))
model.add(Dropout(0.1))
model.add(Dense(word2vec_weights.shape[1], activation='sigmoid'))
model.compile(loss='mse', optimizer='adam',metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (256, 5, 100)             13218400  
_________________________________________________________________
lstm_3 (LSTM)                (256, 5, 512)             1255424   
_________________________________________________________________
dropout_3 (Dropout)          (256, 5, 512)             0         
_________________________________________________________________
lstm_4 (LSTM)                (256, 512)                2099200   
_________________________________________________________________
dropout_4 (Dropout)          (256, 512)                0         
_________________________________________________________________
dense_2 (Dense)              (256, 100)                51300     
Total params: 16,624,324
Trainable params: 16,624,324
Non-trainable params: 0
________________________________________________________________

In [33]:
print("Train")
for epoch in range(15):
    print("Epoch {0}/{1}".format(epoch+1, 15))
    mean_tr_accuracy = []
    mean_tr_loss = []
    for i in range(total_batches):
        # print("Done with {0}/{1} batches".format(i, total_batches))
        train_accuracy, train_loss = model.train_on_batch(subsamples_in[i::total_batches][:min_batch_len], subsamples_out[i::total_batches][:min_batch_len])
        mean_tr_accuracy.append(train_accuracy)
        mean_tr_loss.append(train_loss)
        model.reset_states()
    mean_accuracy = np.mean(mean_tr_accuracy)
    mean_loss = np.mean(mean_tr_loss)
    print("Mean Accuracy", mean_tr_accuracy)
    print("Mean Loss", mean_tr_loss)
    filepath = "../weights/LSTM-2-512-Window-5-Batch-128-Epoch-10-Stateful/weights-{0}-{1}".format(epoch+1, mean_accuracy, mean_loss)
    model.save_weights(filepath)

Train
Epoch 1/15
Done with 0/986 batches
Done with 1/986 batches
Done with 2/986 batches
Done with 3/986 batches
Done with 4/986 batches
Done with 5/986 batches
Done with 6/986 batches
Done with 7/986 batches
Done with 8/986 batches
Done with 9/986 batches
Done with 10/986 batches
Done with 11/986 batches
Done with 12/986 batches
Done with 13/986 batches
Done with 14/986 batches
Done with 15/986 batches
Done with 16/986 batches
Done with 17/986 batches
Done with 18/986 batches
Done with 19/986 batches
Done with 20/986 batches
Done with 21/986 batches
Done with 22/986 batches
Done with 23/986 batches
Done with 24/986 batches
Done with 25/986 batches
Done with 26/986 batches
Done with 27/986 batches
Done with 28/986 batches
Done with 29/986 batches
Done with 30/986 batches
Done with 31/986 batches
Done with 32/986 batches
Done with 33/986 batches
Done with 34/986 batches
Done with 35/986 batches
Done with 36/986 batches
Done with 37/986 batches
Done with 38/986 batches
Done with 39/986 b

Done with 319/986 batches
Done with 320/986 batches
Done with 321/986 batches
Done with 322/986 batches
Done with 323/986 batches
Done with 324/986 batches
Done with 325/986 batches
Done with 326/986 batches
Done with 327/986 batches
Done with 328/986 batches
Done with 329/986 batches
Done with 330/986 batches
Done with 331/986 batches
Done with 332/986 batches
Done with 333/986 batches
Done with 334/986 batches
Done with 335/986 batches
Done with 336/986 batches
Done with 337/986 batches
Done with 338/986 batches
Done with 339/986 batches
Done with 340/986 batches
Done with 341/986 batches
Done with 342/986 batches
Done with 343/986 batches
Done with 344/986 batches
Done with 345/986 batches
Done with 346/986 batches
Done with 347/986 batches
Done with 348/986 batches
Done with 349/986 batches
Done with 350/986 batches
Done with 351/986 batches
Done with 352/986 batches
Done with 353/986 batches
Done with 354/986 batches
Done with 355/986 batches
Done with 356/986 batches
Done with 35

Done with 635/986 batches
Done with 636/986 batches
Done with 637/986 batches
Done with 638/986 batches
Done with 639/986 batches
Done with 640/986 batches
Done with 641/986 batches
Done with 642/986 batches
Done with 643/986 batches
Done with 644/986 batches
Done with 645/986 batches
Done with 646/986 batches
Done with 647/986 batches
Done with 648/986 batches
Done with 649/986 batches
Done with 650/986 batches
Done with 651/986 batches
Done with 652/986 batches
Done with 653/986 batches
Done with 654/986 batches
Done with 655/986 batches
Done with 656/986 batches
Done with 657/986 batches
Done with 658/986 batches
Done with 659/986 batches
Done with 660/986 batches
Done with 661/986 batches
Done with 662/986 batches
Done with 663/986 batches
Done with 664/986 batches
Done with 665/986 batches
Done with 666/986 batches
Done with 667/986 batches
Done with 668/986 batches
Done with 669/986 batches
Done with 670/986 batches
Done with 671/986 batches
Done with 672/986 batches
Done with 67

Done with 951/986 batches
Done with 952/986 batches
Done with 953/986 batches
Done with 954/986 batches
Done with 955/986 batches
Done with 956/986 batches
Done with 957/986 batches
Done with 958/986 batches
Done with 959/986 batches
Done with 960/986 batches
Done with 961/986 batches
Done with 962/986 batches
Done with 963/986 batches
Done with 964/986 batches
Done with 965/986 batches
Done with 966/986 batches
Done with 967/986 batches
Done with 968/986 batches
Done with 969/986 batches
Done with 970/986 batches
Done with 971/986 batches
Done with 972/986 batches
Done with 973/986 batches
Done with 974/986 batches
Done with 975/986 batches
Done with 976/986 batches
Done with 977/986 batches
Done with 978/986 batches
Done with 979/986 batches
Done with 980/986 batches
Done with 981/986 batches
Done with 982/986 batches
Done with 983/986 batches
Done with 984/986 batches
Done with 985/986 batches
Mean Accuracy [1.1103017, 1.0904403, 1.1443708, 1.0977936, 1.0995524, 1.0935338, 1.0097511

### model predict

In [73]:
start = 20
samples = subsamples_in[start::total_batches][:min_batch_len]
predictions = model.predict_on_batch(samples)
for index, prediction in enumerate(predictions):
    print(' '.join(index2word[index] for index in samples[index]))
    pred_word = word2vec_model.similar_by_vector(prediction)[0][0]
    sys.stdout.write("*"+pred_word+" \n")

afc champion denver broncos defeated
*redemption 
in the pro bowl thomas
*pcf-94 
of the turf collapsed under
*redemption 
and post-game coverage while martin
*pcf-94 
super bowl record 61-yard return
*complete 
freely and royal carps in
*redemption 
host a permanent exhibition of
*revenue-sharing 
the prince of płock bolesław
*ode 
the plain vistula terraces flooded
*enclosure 
a registration number depends on
*redemption 
the rivers of france evolved
*mind 
force of franks into the
*pcf-94 
of arguments as to whether
*redemption 
did not have the rich
*deity 
his earlier illnessin 1875 tesla
*promise 
tesla 's patent would probably
*redemption 
of the tesla coil the
*deity 
the letter s dotdotdot in
*redemption 
their former star inventor was
*cinemascore 
said that he had been
*delayed 
invented or envisioned by tesla
*redemption 
not intended as a practical
*redemption 
which are defined using quantum
*database 
would be a major breakthrough
*receiving 
be responsible for some or
*

*complete 
the feynman diagram represents any
*mind 
the sources of the fields
*receiving 
way that the direction and
*matter 
the spring meetings of the
*kickback 
oldest quarterback to play in
*circulation 
the 50 given to the
*memorialmost 
denver score at the end
*given 
the fighters of the warsaw
*memorialmost 
offer tesla to redesign a
*pcf-94 
kind of memory was tesla
*complete 
used to convey the continuum
*redemption 
type of accountant other than
*matter 
english translation of the bible
*job 
hymn from depths of woe
*redemption 
region of california is palm
*enclosure 
uk limited is formerly known
*job 
what was required of huguenot
*ode 
engines became popular for power
*receiving 
in what year did lavoisier
*redemption 
from the us became 5
*delayed 
crew members were required to
*mind 
defendant in the case of
*redemption 
member state nationals by the
*nco 
live plants were found to
*mind 
where is the neighborhood of
*15476 
who did internet2 partner with
*redemption 
t

## Accuracy

In [74]:
def accuracy():
    start = 27
    count = 0
    correct = 0
    predictions = model.predict_on_batch(subsamples_in[start::total_batches][:min_batch_len])
    ytrue = subsamples_out[start::total_batches][:min_batch_len]
    for index, prediction in enumerate(predictions):
        pred_word = word2vec_model.similar_by_vector(prediction)[0][0]
        true_word = word2vec_model.similar_by_vector(ytrue[index])[0][0]
        sim = word2vec_model.similarity(pred_word, true_word)
        if (sim >= 0.85):
            correct +=1
        count += 1
    accur = float(correct/(count))
    print('accuracy = ', float(accur))
    

In [75]:
# n = no. of predictions
print(accuracy())

accuracy =  0.0
None
