### Importing required packages

In [1]:
from __future__ import print_function

import json
import os
import numpy as np
import sys
import h5py

from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from keras.engine import Input
from keras.layers import Embedding, merge
from keras.models import Model
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.preprocessing import sequence
from embeddings import Embeddings
from keras.callbacks import ModelCheckpoint

from nltk.tokenize import word_tokenize
import random

Using TensorFlow backend.


EMBEDDING(100,4,1,4) STARTED .....
Loading the embeddings from the cache
EMBEDDING(100,4,1,4) COMPLETED .....


In [2]:
# Scratch check: evaluates the mean of a small literal list (displays 2.0).
np.mean([1, 2, 3])

2.0

## Instantiate Embeddings 

In [3]:
# Build the project-local Embeddings helper that every later cell reads from.
# NOTE(review): the meaning of the four positional arguments (100, 4, 1, 4) is
# not visible in this notebook -- confirm against the `embeddings` module.
embeddings = Embeddings(100, 4, 1, 4)

Loading the embeddings from the cache


### getting data from preprocessing

In [4]:
# Collect everything the rest of the notebook needs from the Embeddings helper.

# Matrix used both as the Embedding layer's initial weights and, row by row,
# as the regression targets.
word2vec_weights = embeddings.get_weights()

# Pair of lookups: word -> index and index -> word.
vocabulary_maps = embeddings.get_vocabulary()
word2index, index2word = vocabulary_maps

# Model object queried later through similar_by_vector / similarity.
word2vec_model = embeddings.get_model()

# Sentences already converted to sequences of vocabulary indices.
tokenized_indexed_sentences = embeddings.get_tokenized_indexed_sentences()

### generating training data

In [5]:
# Number of consecutive tokens used as the context for predicting the next one.
window_size = 5

# Vocabulary size is the number of entries in the word -> index lookup.
vocab_size = len(word2index)
print(vocab_size)

42047


## Defining model

In [6]:
# Directory that receives the per-epoch weight files; create it on first use.
model_weights_path = "../weights/LSTM-2-512-Window-5-Batch-128-Epoch-10-Stateful"
weights_dir_missing = not os.path.exists(model_weights_path)
if weights_dir_missing:
    os.makedirs(model_weights_path)

In [8]:
def _to_object_array(items):
    """Pack ``items`` into a 1-D object array holding one Python object per slot.

    Calling ``np.array`` directly on a ragged list of lists is an error on
    recent NumPy releases, so each element is assigned individually instead.
    """
    packed = np.empty(len(items), dtype=object)
    for position, item in enumerate(items):
        packed[position] = item
    return packed


seq_in = []
seq_out = []

# Generate the dataset: for every sentence, slide a window of `window_size`
# token indices over it and pair each window with the embedding vector of the
# token that follows the window.
for sentence in tokenized_indexed_sentences:
    sentence_seq_in = []
    sentence_seq_out = []
    # NOTE(review): with this bound the final token of a sentence is never used
    # as a target -- confirm whether that is intentional (e.g. a terminator).
    for i in range(len(sentence) - window_size - 1):
        x = sentence[i:i + window_size]
        y = sentence[i + window_size]
        sentence_seq_in.append(x)
        sentence_seq_out.append(word2vec_weights[y])
    seq_in.append(sentence_seq_in)
    seq_out.append(sentence_seq_out)

# Sentences yield different numbers of windows, so the per-sentence lists are
# stored in object arrays (one list per sentence) rather than dense arrays.
seq_in = _to_object_array(seq_in)
seq_out = _to_object_array(seq_out)

# One entry per sentence (not per window).
n_samples = len(seq_in)
print("Number of samples : ", n_samples)

Number of samples :  10410


In [9]:
# Number of context windows contributed by each sentence, and their total.
subsamples = np.array(list(map(len, seq_in)))
print(subsamples.sum())

47295


In [10]:
# Flatten the per-sentence grouping: one row per context window / target vector.
flat_windows = []
for sentence_windows in seq_in:
    flat_windows.extend(sentence_windows)

flat_targets = []
for sentence_targets in seq_out:
    flat_targets.extend(sentence_targets)

subsamples_in = np.array(flat_windows)
subsamples_out = np.array(flat_targets)

## Train Model

In [11]:
# Peek at the first context window of the first sentence as a column vector.
np.reshape(seq_in[0][0], (-1, 1))

array([[   0],
       [  31],
       [   2],
       [   0],
       [1664]])

In [12]:
# Number of interleaved batches: one per 256 windows, rounded down.
total_batches = subsamples_in.shape[0] // 256

In [13]:
# Batch `k` is every `total_batches`-th window starting at offset `k`; the
# strided slices can differ in length, so record the shortest one.
batch_len = [
    len(subsamples_in[offset::total_batches])
    for offset in range(total_batches)
]
min_batch_len = min(batch_len)

In [47]:
# Changes to the model to be done here
model = Sequential()
model.add(Embedding(input_dim=word2vec_weights.shape[0], output_dim=word2vec_weights.shape[1], weights=[word2vec_weights], batch_input_shape=(min_batch_len, 5)))
model.add(LSTM(512, return_sequences=True, stateful=True))
model.add(Dropout(0.2))
model.add(LSTM(512, stateful=True))
model.add(Dropout(0.1))
model.add(Dense(word2vec_weights.shape[1], activation='relu'))
#model.load_weights("../weights/LSTM-2-512-Window-5-Batch-128-Epoch-10-Stateful/weights-10-0.9673129916191101")
model.compile(loss='mse', optimizer='adam',metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (257, 5, 100)             4204700   
_________________________________________________________________
lstm_5 (LSTM)                (257, 5, 512)             1255424   
_________________________________________________________________
dropout_5 (Dropout)          (257, 5, 512)             0         
_________________________________________________________________
lstm_6 (LSTM)                (257, 512)                2099200   
_________________________________________________________________
dropout_6 (Dropout)          (257, 512)                0         
_________________________________________________________________
dense_3 (Dense)              (257, 100)                51300     
Total params: 7,610,624
Trainable params: 7,610,624
Non-trainable params: 0
_________________________________________________________________


In [48]:
# Manual training loop over the interleaved batches. After every epoch the mean
# metrics are printed and the weights are saved into `model_weights_path`.
print("Train")
n_epochs = 1
for epoch in range(n_epochs):
    print("Epoch {0}/{1}".format(epoch + 1, n_epochs))
    # The LSTMs are stateful: clear the carried-over state so each pass over
    # the data starts from a clean state.
    model.reset_states()
    mean_tr_accuracy = []
    mean_tr_loss = []
    for i in range(total_batches):
        # Trim every strided batch to the fixed size the stateful model expects.
        batch_x = subsamples_in[i::total_batches][:min_batch_len]
        batch_y = subsamples_out[i::total_batches][:min_batch_len]
        # train_on_batch returns values in `model.metrics_names` order, i.e.
        # the loss first and then the compiled metrics.
        train_loss, train_accuracy = model.train_on_batch(batch_x, batch_y)
        mean_tr_accuracy.append(train_accuracy)
        mean_tr_loss.append(train_loss)
    mean_accuracy = np.mean(mean_tr_accuracy)
    mean_loss = np.mean(mean_tr_loss)
    print("Mean Accuracy", mean_accuracy)
    print("Mean Loss", mean_loss)
    # File name pattern: weights-<epoch>-<mean accuracy>
    filepath = "{0}/weights-{1}-{2}".format(model_weights_path, epoch + 1, mean_accuracy)
    model.save_weights(filepath)

Train
Epoch 1/1


KeyboardInterrupt: 

### Model prediction

In [22]:
# Run one fixed-size batch through the model and print, for every context
# window, its words followed by the vocabulary word nearest to the prediction.
start = 20
samples = subsamples_in[start::total_batches][:min_batch_len]
predictions = model.predict_on_batch(samples)
for row, predicted_vector in enumerate(predictions):
    context_words = [index2word[token] for token in samples[row]]
    print(' '.join(context_words))
    nearest = word2vec_model.similar_by_vector(predicted_vector)
    pred_word = nearest[0][0]
    sys.stdout.write("*" + pred_word + " \n")

won super bowl 50 what
*minority 
have during the super bowl
*minority 
bowl 50 'an important game
*financial 
prios has the sun life
*financial 
50 what was the last
*minority 
15 regular season games since
*minority 
how many interceptions are the
*financial 
what was ronnie hillman 's
*minority 
left in the game when
*minority 
manning took how many different
*financial 
player did the field problem
*minority 
city 's marriott did the
*minority 
former students went on to
*minority 
money was spent on other
*financial 
that gives local companies business
*financial 
station could an american viewer
*financial 
rate for a 30-second ad
*financial 
what movie company paid to
*minority 
british commentators include darren fletcher
*financial 
was the pass on the
*minority 
thought he called for a
*financial 
who bumped the ball away
*financial 
though he broke his arm
*financial 
population of warsaw was jewish
*financial 
national gallery of art organize
*financial 
sigismund iii vasa 

*minority 
what served as a justification
*financial 
thought what was needed for
*minority 
why was the student group
*minority 
those tainted by what to
*minority 
the umc believes that jesus
*financial 
is the founder of the
*minority 
if they are appointed as
*minority 
the 2008 general conference what
*minority 
where was marin 's second
*minority 
pitt 's plan called for
*minority 
laws of physics of galileo
*financial 
force called when two forces
*financial 
of a ship land according
*financial 
earth in a formula about
*minority 
what is the repulsive force
*financial 


## Accuracy

In [43]:
def accuracy(start=27, threshold=0.85):
    """Score one batch by word-level similarity between prediction and target.

    The batch is the strided slice of ``subsamples_in`` / ``subsamples_out``
    beginning at ``start``, trimmed to ``min_batch_len`` rows.  Each predicted
    vector and each target vector is mapped to its nearest vocabulary word via
    ``word2vec_model.similar_by_vector``; a prediction counts as correct when
    the similarity between those two words is at least ``threshold``.

    Args:
        start: offset of the strided batch to evaluate.
        threshold: minimum word similarity for a prediction to count as correct.

    Returns:
        The string ``'accuracy = <fraction>'`` where the fraction is
        correct / total, or ``0.0`` when the batch is empty.
    """
    batch_inputs = subsamples_in[start::total_batches][:min_batch_len]
    batch_targets = subsamples_out[start::total_batches][:min_batch_len]
    predictions = model.predict_on_batch(batch_inputs)

    count = 0
    correct = 0
    for prediction, target in zip(predictions, batch_targets):
        pred_word = word2vec_model.similar_by_vector(prediction)[0][0]
        true_word = word2vec_model.similar_by_vector(target)[0][0]
        sim = word2vec_model.similarity(pred_word, true_word)
        if sim >= threshold:
            correct += 1
        count += 1

    # Convert before dividing so the ratio is not truncated under Python 2,
    # and guard against an empty batch.
    accur = float(correct) / count if count else 0.0
    report = 'accuracy = ' + str(accur)
    return report
    

In [44]:
# Run the evaluation and display the resulting summary string.
# NOTE(review): this rebinds the name `accuracy` from the function to its string
# result, so this cell cannot be re-run without first re-running the `def` cell.
accuracy = accuracy()
print(accuracy)

accuracy = 0.22957198443579765


In [45]:
# Report file lives next to the saved weights; derive it from the weights
# directory so the two locations cannot drift apart.
text_file_path = "/".join((model_weights_path, "model_data.txt"))

In [46]:
# Persist the accuracy summary string to the report file (overwrites any
# previous content).
with open(text_file_path, 'w') as report_file:
    report_file.write(accuracy)
    