### importing require packages

In [14]:
from __future__ import print_function

import json
import os
import numpy as np
import sys
import h5py

from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from keras.engine import Input
from keras.layers import Embedding, merge
from keras.models import Model
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.preprocessing import sequence
from embeddings import Embeddings
from keras.callbacks import ModelCheckpoint

from nltk.tokenize import word_tokenize
import random

## Instantiate Embeddings 

In [15]:
embeddings = Embeddings(100, 4, 1, 4)

Loading the embeddings from the cache


### getting data from preprocessing

In [16]:
word2vec_weights = embeddings.get_weights()
word2index, index2word = embeddings.get_vocabulary()
word2vec_model = embeddings.get_model()
tokenized_indexed_sentences = embeddings.get_tokenized_indexed_sentences()

### generating training data

In [17]:
window_size = 5
vocab_size = len(word2index)
print(vocab_size)

42047


In [18]:
seq_in = []
seq_out = []
# generating dataset
for sentence in tokenized_indexed_sentences:
    for i in range(len(sentence)-window_size-1):
        x = sentence[i:i + window_size]
        y = sentence[i + window_size]
        seq_in.append(x)#[]
        seq_out.append(word2vec_weights[y])

# converting seq_in and seq_out into numpy array
seq_in = np.array(seq_in)
seq_out = np.array(seq_out)
n_samples = len(seq_in)
print ("Number of samples : ", n_samples)

Number of samples :  47295


In [19]:
seq_in.shape


(47295, 5)

## Defining model

In [20]:
# Changes to the model to be done here
model = Sequential()
model.add(Embedding(input_dim=word2vec_weights.shape[0], output_dim=word2vec_weights.shape[1], weights=[word2vec_weights]))
model.add(LSTM(1024,return_sequences =True))
model.add(Dropout(0.2))
model.add(LSTM(512))
#model.add(Dropout(0.2))
model.add(Dense(word2vec_weights.shape[1], activation='relu'))
model.load_weights("../weights/lstm-2-1024-512-batchsize-128-epochs-25/weights.24-0.22.hdf5")
model.compile(loss='mse', optimizer='adam',metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, None, 100)         4204700   
_________________________________________________________________
lstm_3 (LSTM)                (None, None, 1024)        4608000   
_________________________________________________________________
dropout_2 (Dropout)          (None, None, 1024)        0         
_________________________________________________________________
lstm_4 (LSTM)                (None, 512)               3147776   
_________________________________________________________________
dense_2 (Dense)              (None, 100)               51300     
Total params: 12,011,776
Trainable params: 12,011,776
Non-trainable params: 0
_________________________________________________________________


In [21]:
model_weights_path = "../weights/lstm-2-1024-512-batchsize-128-epochs-25"
if not os.path.exists(model_weights_path):
    os.makedirs(model_weights_path)
checkpoint_path = model_weights_path + '/weights.{epoch:02d}-{val_acc:.2f}.hdf5'
checkpoint = ModelCheckpoint(filepath=checkpoint_path, monitor='val_acc', verbose=1, save_best_only=False, mode='max')

## Train Model

In [22]:
#model_fit_summary = model.fit(seq_in, seq_out, epochs=25, verbose=1, validation_split=0.2, batch_size=128, callbacks=[checkpoint])

### model predict

In [36]:
np.array(pattern[0])

array([  33,    1, 1612,    6,  198, 6646,    3,    0, 6035, 2113])

In [43]:
list(sent[0])

[33, 1, 1612, 6, 198, 6646, 3, 0, 6035, 2113]

In [42]:
list(seq_in)

[array([   0,   31,    2,    0, 1664]),
 array([  31,    2,    0, 1664, 1591]),
 array([  15, 1664,  150,  326,  209]),
 array([ 134,  206, 6987,   21,   36]),
 array([ 206, 6987,   21,   36,    1]),
 array([6987,   21,   36,    1,   75]),
 array([ 21,  36,   1,  75, 209]),
 array([ 36,   1,  75, 209, 219]),
 array([  1,  75, 209, 219, 511]),
 array([ 75, 209, 219, 511,  29]),
 array([ 25,   7, 209, 219, 511]),
 array([  15, 1664,  150, 1232,    0]),
 array([1664,  150, 1232,    0, 4340]),
 array([ 150, 1232,    0, 4340,   33]),
 array([1232,    0, 4340,   33,  209]),
 array([  1,  41,   7, 209, 219]),
 array([ 41,   7, 209, 219, 511]),
 array([  7, 209, 219, 511,  98]),
 array([ 15, 150, 326, 209, 219]),
 array([150, 326, 209, 219, 511]),
 array([326, 209, 219, 511,   1]),
 array([209, 219, 511,   1, 164]),
 array([219, 511,   1, 164,   5]),
 array([511,   1, 164,   5,   0]),
 array([  1, 164,   5,   0, 209]),
 array([164,   5,   0, 209, 219]),
 array([  1, 759,   7, 209, 219]),
 arra

In [64]:
start = 0
sentence_test = "In which regions in particular did"
indexed_sentences = embeddings.tokenize_index_sentence(sentence_test)
print("indexed_sentences ",indexed_sentences)
sent = np.array(indexed_sentences)
#pattern = list(seq_in[start])
pattern = list(sent[start])
print("\"",' '.join(index2word[index] for index in pattern))
for i in range(5):
    prediction = model.predict(np.array([pattern]))
    pred_word = word2vec_model.similar_by_vector(prediction[0])[0][0]
    sys.stdout.write(pred_word+" ")
    pattern.append(word2index[pred_word])
    pattern = pattern[:len(pattern)]

indexed_sentences  [[3, 15, 949, 3, 1878, 7]]
" in which regions in particular did
sonderbundskrieg corrientes corrientes profitabl profitabl 

In [54]:
e_model = embeddings.get_model()

In [63]:
e_model.similar_by_word("profitabl")

[('coca-cola', 0.7328959703445435),
 ('khubilai', 0.7250775098800659),
 ('zimbabwe', 0.7238492965698242),
 ('pitatus', 0.717526912689209),
 ('underpays', 0.7077121734619141),
 ('somer', 0.6942623257637024),
 ('letterman', 0.6938465237617493),
 ('psychiatrist', 0.6875547170639038),
 ('vetting', 0.6872419118881226),
 ('chomsky', 0.6800742745399475)]

## Accuracy

In [74]:
def accuracy():
    count = 0
    correct = 0
    for sub_sample_in, sub_sample_out in zip(seq_in, seq_out):
        ypred = model.predict_on_batch(np.expand_dims(sub_sample_in, axis=0))[0]
        ytrue = sub_sample_out
        pred_word = word2vec_model.similar_by_vector(ypred)[0][0]
        true_word = word2vec_model.similar_by_vector(ytrue)[0][0]
        similarity = word2vec_model.similarity(pred_word, true_word)
        if similarity == 1:
            correct += 1
        count += 1
    print("Accuracy {0}".format(correct/count))

In [75]:
#seq_out[0]

In [None]:
accuracy()

KeyboardInterrupt: 

In [68]:
model_results = model_fit_summary.history

NameError: name 'model_fit_summary' is not defined

In [23]:
model_results.update(model_fit_summary.params)

In [67]:
model_results["train_accuracy"] = accuracy()

Accuracy 0.0


NameError: name 'model_results' is not defined

In [28]:
# n = no. of predictions
# accuracy = accuracy(400)
#print(model_results)

{'val_acc': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], 'do_validation': True, 'metrics': ['loss', 'acc', 'val_loss', 'val_acc'], 'samples': 1, 'acc': [0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], 'epochs': 15, 'val_loss': [4.5900764465332031, 4.5754680633544922, 4.5907573699951172, 4.6593356132507324, 4.8363356590270996, 5.0636167526245117, 5.1062922477722168, 5.0171608924865723, 4.8945426940917969, 4.7707195281982422, 4.683952808380127, 4.6261215209960938, 4.5961475372314453, 4.588953971862793, 4.5975632667541504], 'batch_size': 128, 'train_accuracy': None, 'verbose': 1, 'loss': [2.5437352657318115, 2.4388267993927002, 2.2880129814147949, 2.1385490894317627, 1.9580711126327515, 1.8592877388000488, 1.8720529079437256, 1.863358736038208, 1.8261202573776245, 1.831728458404541, 1.8086607456207275, 1.788567066192627, 1.796411395072937, 1.7958264350891113, 1.7889491319656372]}


In [26]:
text_file_path = "../weights/lstm-2-1024-512-batchsize-128-epochs-25/model_results.json"

In [27]:
with open(text_file_path, "w") as f:
        json.dump(model_results, f)
        