### importing require packages

In [1]:
from __future__ import print_function

import json
import os
import numpy as np
import sys
import h5py

from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from keras.layers import Embedding
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM, Bidirectional
from keras.preprocessing import sequence
from intersect_embeddings import Embeddings
from keras.callbacks import ModelCheckpoint

from nltk.tokenize import word_tokenize
import random
from itertools import groupby

Using TensorFlow backend.


## Instantiate Embeddings 

In [2]:
embeddings = Embeddings(300, 4, 1, 4)

### Getting data from preprocessing

In [3]:
word2vec_model = embeddings.get_intersected_model()
word2index, index2word = embeddings.get_vocabulary()
word2vec_weights = word2vec_model.wv.syn0
tokenized_indexed_sentences = embeddings.get_indexed_sentences()

Loading Indexed Sentences...


In [4]:
word2index = {word:index+1 for word, index in word2index.items()}
index2word = {index:word for word, index in word2index.items()}

In [5]:
word2index

{'cardiologist': 16130,
 '2006_to_2013': 44097,
 'extratropical': 18745,
 'buffer': 38176,
 'meisporangia': 32247,
 'tukugawa': 23628,
 'worker': 4427,
 'mechanize': 44411,
 'drain-source': 48469,
 'forcing': 29530,
 'spotted': 14339,
 'lefevre': 45598,
 'oxide': 3369,
 'jungle': 19978,
 'byelorussian_soviet_socialist_republic': 25980,
 'accreditation': 9737,
 'saturn': 4979,
 'dinghai': 32179,
 'secondarily': 43588,
 'repayment': 17785,
 'abound': 47760,
 'the_geneva_conference': 36099,
 'premiere': 3511,
 'deep-fried': 38238,
 'the_red_book_cd-da': 22530,
 'kindergartens': 25779,
 'oranges': 48300,
 'notebooks': 9927,
 '1534': 20855,
 '3-stage': 48158,
 'polonia_warsaw': 30824,
 'dealkalization': 25914,
 'duke_of_kent': 49502,
 'psychologists': 10074,
 'chlamydomonas': 52308,
 'as-502': 30581,
 '1000': 4277,
 'greenman': 34291,
 'salamis': 49523,
 'thieves': 48599,
 'crank': 45971,
 'wupv': 29905,
 'spring': 3023,
 'dangerous': 4370,
 'the_west_end': 24236,
 'the_institute_of_russian

In [6]:
tokenized_indexed_sentences[0]

[1, 3, 2206, 9, 388, 498, 93, 108, 5, 0]

In [7]:
tokenized_indexed_sentences = [np.array(sentence) + 1 for sentence in tokenized_indexed_sentences if len(sentence) > 0]

In [8]:
tokenized_indexed_sentences[0]

array([   2,    4, 2207,   10,  389,  499,   94,  109,    6,    1])

In [9]:
new_weights = np.zeros((1, word2vec_weights.shape[1]))

In [10]:
new_weights = np.append(new_weights, word2vec_weights, axis=0)

In [11]:
new_weights.shape

(52731, 300)

In [12]:
new_weights[52730]

array([ 0.35742188,  0.03369141, -0.03881836,  0.07666016, -0.06079102,
        0.6328125 ,  0.05615234,  0.04345703,  0.00265503, -0.21582031,
        0.40234375, -0.0559082 , -0.15820312,  0.21289062,  0.28710938,
        0.54296875, -0.13085938,  0.14746094,  0.06738281, -0.171875  ,
        0.07373047, -0.0006485 , -0.10986328, -0.13476562,  0.06152344,
       -0.03833008, -0.07519531, -0.00221252,  0.09179688, -0.37890625,
       -0.31054688, -0.07666016, -0.484375  , -0.0546875 , -0.13183594,
       -0.33203125,  0.20996094,  0.25      ,  0.0534668 ,  0.08496094,
       -0.1875    ,  0.09960938,  0.24902344, -0.07714844, -0.01123047,
       -0.06787109,  0.21191406, -0.11865234, -0.01660156,  0.22265625,
       -0.37695312,  0.36914062, -0.51171875,  0.06640625, -0.19726562,
       -0.01818848,  0.0612793 , -0.21582031,  0.13574219, -0.08154297,
        0.18652344,  0.3203125 ,  0.26367188,  0.24609375,  0.01208496,
        0.04931641,  0.18652344,  0.29296875,  0.21289062,  0.06

### generating training data

In [13]:
window_size = 5
vocab_size = len(word2index)
print(vocab_size)

52730


In [14]:
maxlen = max([len(sentence) for sentence in tokenized_indexed_sentences])

In [15]:
tokenized_indexed_sentences = sequence.pad_sequences(tokenized_indexed_sentences)

In [16]:
seq_in = []
seq_out = []
# generating dataset
tokenized_indexed_sentences = [sentence for sentence in tokenized_indexed_sentences if len(sentence) > 0]
for sentence in tokenized_indexed_sentences:
    x = sentence
    y = np.append(sentence[1:], np.array(sentence[len(sentence)-1]))
    seq_in.append(x)
    seq_out.append([new_weights[index] for index in y])

# converting seq_in and seq_out into numpy array
seq_in = np.array(seq_in)
seq_out = np.array(seq_out)
n_samples = len(seq_in)
print ("Number of samples : ", n_samples)

Number of samples :  97974


## Defining model

In [17]:
# Changes to the model to be done here
model = Sequential()
model.add(Embedding(input_dim=new_weights.shape[0], output_dim=new_weights.shape[1], weights=[new_weights], mask_zero=True))
model.add(Bidirectional(LSTM(1024, return_sequences=True), merge_mode="ave"))
model.add(Bidirectional(LSTM(300, return_sequences=True), merge_mode="ave"))
model.compile(loss='cosine_proximity', optimizer='adam',metrics=['accuracy'])
model.summary()

ValueError: Invalid merge mode. Merge mode should be one of {"sum", "mul", "ave", "concat", None}

In [18]:
model_weights_path = "../weights/lstm-3-1024-1024-batchsize-256-epochs-30-Sequence"
if not os.path.exists(model_weights_path):
    os.makedirs(model_weights_path)
checkpoint_path = model_weights_path + '/weights.{epoch:02d}.hdf5'
checkpoint = ModelCheckpoint(filepath=checkpoint_path, verbose=1, save_best_only=False, mode='max')

## Train Model

In [None]:
model.fit(seq_in, seq_out, epochs=1, verbose=30, batch_size=256, callbacks=[checkpoint])

Epoch 1/30


### model predict

In [32]:
start = 0
sentence_test = "In which regions in particular did"
indexed_sentences = embeddings.get_indexed_query(sentence_test)
print("indexed_sentences ",indexed_sentences)
sent = np.array(indexed_sentences)
#pattern = list(seq_in[start])
pattern = list(sent)
print("\"",' '.join(index2word[index] for index in pattern))
for i in range(10):
    prediction = model.predict(np.array([pattern]))
    pred_word = word2vec_model.similar_by_vector(prediction[0][prediction.shape[1] - 1])[0][0]
    sys.stdout.write(pred_word+" ")
    pattern.append(word2index[pred_word])
    pattern = pattern[:len(pattern)]

indexed_sentences  [1, 5, 17, 875, 5, 1707, 9]
" squadstart in which regions in particular did
the what egyptians not egyptians in egyptians cradling them the 

In [None]:
#e_model = embeddings.get_model()

In [None]:
#e_model.similar_by_word("profitabl")

## Accuracy

In [None]:
def accuracy():
    count = 0
    correct = 0
    for sub_sample_in, sub_sample_out in zip(seq_in, seq_out):
        ypred = model.predict_on_batch(np.expand_dims(sub_sample_in, axis=0))[0]
        ytrue = sub_sample_out
        pred_word = word2vec_model.similar_by_vector(ypred)[0][0]
        true_word = word2vec_model.similar_by_vector(ytrue)[0][0]
        similarity = word2vec_model.similarity(pred_word, true_word)
        if similarity == 1:
            correct += 1
        count += 1
    print("Accuracy {0}".format(correct/count))

In [None]:
#seq_out[0]

In [None]:
# accuracy()

In [None]:
#model_results = model_fit_summary.history

In [None]:
#model_results.update(model_fit_summary.params)

In [None]:
#model_results["train_accuracy"] = accuracy()

In [None]:
# n = no. of predictions
# accuracy = accuracy(400)
#print(model_results)

In [None]:
#text_file_path = "../weights/lstm-2-1024-512-batchsize-128-epochs-25/model_results.json"

In [None]:
#with open(text_file_path, "w") as f:
        #json.dump(model_results, f)
        