### Importing required packages

In [1]:
from __future__ import print_function

import json
import os
import numpy as np
import sys
import h5py

from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from keras.layers import Embedding
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM, Bidirectional
from keras.preprocessing import sequence
from intersect_embeddings import Embeddings
from keras.callbacks import ModelCheckpoint

from nltk.tokenize import word_tokenize
import random
from itertools import groupby

Using TensorFlow backend.


## Instantiate Embeddings 

In [2]:
# Build the project-local Embeddings helper used by every later cell.
# NOTE(review): the meaning of the four positional arguments is not visible in
# this notebook; 300 presumably is the embedding width (the weight matrix
# inspected below is 300 columns wide) -- confirm against intersect_embeddings.
embeddings = Embeddings(300, 4, 1, 4)

### Getting data from preprocessing

In [3]:
# Pull the model, the vocabulary mappings and the pre-indexed corpus from the helper.
word2vec_model = embeddings.get_intersected_model()
word2index, index2word = embeddings.get_vocabulary()
# Raw embedding matrix of the word2vec model.
# NOTE(review): `syn0` is the legacy gensim attribute name; newer gensim
# releases expose the same matrix as `wv.vectors` -- confirm the pinned version.
word2vec_weights = word2vec_model.wv.syn0
tokenized_indexed_sentences = embeddings.get_indexed_sentences()

Loading Indexed Sentences...


In [4]:
# Shift every vocabulary index up by one so that index 0 stays free for the
# padding value (the Embedding layer below is built with mask_zero=True).
# NOTE: this cell is not idempotent -- re-running it shifts the indices again.
word2index = dict((word, index + 1) for word, index in word2index.items())
# Rebuild the reverse mapping from the shifted forward mapping.
index2word = dict((index, word) for word, index in word2index.items())

In [5]:
# Inspect the shifted word -> index mapping (this displays the whole vocabulary).
word2index

{'bmi': 33763,
 "'the_warriors'_movie": 47826,
 'kaddu': 38007,
 'aston_villa': 26424,
 'alignment': 9963,
 'old_frisian': 41768,
 'the_first_few_years': 30269,
 'tariah': 33355,
 'undeveloped': 33266,
 'temur_khan': 31423,
 'universal_postal_union': 33915,
 'the_admiral_of': 26364,
 'mid-1980': 35582,
 'b-plug': 43613,
 'the_new_york_cosmos': 29231,
 'sexually': 9178,
 'niet': 33854,
 'the_greatest_uk_science_fiction_series_ever': 28901,
 'avoided': 3926,
 'march_26': 30685,
 'tai-lo': 48160,
 'southern_hills_country_club': 36418,
 'pinkertons': 48061,
 'exile': 6159,
 'mau': 5766,
 'conventa': 36450,
 'experimenters': 30998,
 'hurricane': 6017,
 'novo': 17718,
 'conetemporary': 25138,
 'house_of_dereon': 28979,
 'ora': 50508,
 'five_million': 29109,
 'the_fourt_macedonian_war': 27873,
 'pope_pius_xxiii': 34632,
 '1917-1920': 43813,
 'tom_kenny': 34129,
 'lemon_v_kurtzman': 19837,
 'video-conference': 10772,
 'conquistador': 15291,
 'pillar': 21269,
 'zinke': 20228,
 'prabhakara': 112

In [6]:
# First indexed sentence as delivered by the helper, before the +1 index shift.
tokenized_indexed_sentences[0]

[1, 3, 2206, 9, 388, 498, 93, 108, 5, 0]

In [7]:
# Drop empty sentences and shift every token index by one, mirroring the shift
# applied to word2index so that 0 is reserved for padding.
# NOTE: not idempotent -- re-running this cell shifts the indices again.
tokenized_indexed_sentences = [
    np.asarray(tokens) + 1
    for tokens in tokenized_indexed_sentences
    if len(tokens) != 0
]

In [8]:
# Same sentence after the shift: every index is one higher than before.
tokenized_indexed_sentences[0]

array([   2,    4, 2207,   10,  389,  499,   94,  109,    6,    1])

In [9]:
# A single all-zero row, as wide as the word2vec vectors; it becomes the
# embedding for index 0 once the word2vec matrix is appended in the next cell.
new_weights = np.zeros((1, word2vec_weights.shape[1]))

In [10]:
# Stack the word2vec matrix underneath the zero row so that row i + 1 holds the
# vector of the word originally indexed i.
# NOTE: not idempotent -- re-running this cell appends the matrix a second time.
new_weights = np.concatenate((new_weights, word2vec_weights), axis=0)

In [11]:
# Sanity check: (vocabulary size + 1 padding row, embedding width).
new_weights.shape

(52731, 300)

In [12]:
# Inspect the last row of the embedding matrix. Indexing with -1 instead of the
# literal 52730 keeps the cell valid when the vocabulary size changes.
new_weights[-1]

array([ 0.35742188,  0.03369141, -0.03881836,  0.07666016, -0.06079102,
        0.6328125 ,  0.05615234,  0.04345703,  0.00265503, -0.21582031,
        0.40234375, -0.0559082 , -0.15820312,  0.21289062,  0.28710938,
        0.54296875, -0.13085938,  0.14746094,  0.06738281, -0.171875  ,
        0.07373047, -0.0006485 , -0.10986328, -0.13476562,  0.06152344,
       -0.03833008, -0.07519531, -0.00221252,  0.09179688, -0.37890625,
       -0.31054688, -0.07666016, -0.484375  , -0.0546875 , -0.13183594,
       -0.33203125,  0.20996094,  0.25      ,  0.0534668 ,  0.08496094,
       -0.1875    ,  0.09960938,  0.24902344, -0.07714844, -0.01123047,
       -0.06787109,  0.21191406, -0.11865234, -0.01660156,  0.22265625,
       -0.37695312,  0.36914062, -0.51171875,  0.06640625, -0.19726562,
       -0.01818848,  0.0612793 , -0.21582031,  0.13574219, -0.08154297,
        0.18652344,  0.3203125 ,  0.26367188,  0.24609375,  0.01208496,
        0.04931641,  0.18652344,  0.29296875,  0.21289062,  0.06

### generating training data

In [13]:
# NOTE(review): window_size is not referenced by any other cell visible in this
# notebook -- possibly a leftover from a windowed variant of the model.
window_size = 5
# Number of distinct words (the padding index 0 is not counted).
vocab_size = len(word2index)
print(vocab_size)

52730


In [14]:
# Length of the longest indexed sentence.
# NOTE(review): maxlen is not passed to pad_sequences in the following cell.
maxlen = max(len(tokens) for tokens in tokenized_indexed_sentences)

In [15]:
# Pad all sentences to one common length so they can be stacked into a matrix.
# NOTE(review): called with defaults only; per the Keras documentation this pads
# with 0 at the front, up to the longest sequence -- confirm for the installed version.
tokenized_indexed_sentences = sequence.pad_sequences(tokenized_indexed_sentences)

In [16]:
# Build the (input, target) pairs for next-word training.
#   seq_in  : matrix of padded token indices, one row per sentence
#   seq_out : for every position, the embedding vector of the *following* token;
#             the final position repeats the sentence's last token
# generating dataset
tokenized_indexed_sentences = [sentence for sentence in tokenized_indexed_sentences if len(sentence) > 0]

# converting seq_in and seq_out into numpy array
seq_in = np.array(tokenized_indexed_sentences)
if seq_in.size:
    # Relies on the padding cell above: all rows have the same length, so
    # seq_in is a 2-D index matrix. Shift each row left by one and repeat the
    # last column, then look all vectors up in a single indexing operation
    # instead of a Python loop over every token.
    target_indices = np.concatenate([seq_in[:, 1:], seq_in[:, -1:]], axis=1)
    seq_out = new_weights[target_indices]
else:
    # No sentences at all: keep the empty-array result of the loop-based version.
    seq_out = np.array([])
n_samples = len(seq_in)
print ("Number of samples : ", n_samples)

Number of samples :  97974


## Defining model

In [19]:
# Changes to the model to be done here
model = Sequential()
# Embedding initialised from the word2vec matrix; row 0 is the all-zero padding
# vector and mask_zero=True makes the recurrent layers skip padded positions.
model.add(Embedding(input_dim=new_weights.shape[0], output_dim=new_weights.shape[1], weights=[new_weights], mask_zero=True))
# First recurrent layer: 1024 units per direction. The unit count was missing
# (a syntax error); 1024 matches the recorded summary (output width 1024,
# 10,854,400 parameters) and the "2-1024-300" weights directory loaded below.
model.add(Bidirectional(LSTM(1024, return_sequences=True), merge_mode="ave"))
# Second recurrent layer projects back to the 300-d embedding space, giving one
# predicted word vector per timestep.
model.add(Bidirectional(LSTM(300, return_sequences=True), merge_mode="ave"))
# Resume from the epoch-29 checkpoint of an earlier training run.
model.load_weights("../weights/bidirectional-lstm-2-1024-300-batchsize-256-epochs-30-Sequence/weights.29.hdf5")
model.compile(loss='cosine_proximity', optimizer='adam',metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, None, 300)         15819300  
_________________________________________________________________
bidirectional_3 (Bidirection (None, None, 1024)        10854400  
_________________________________________________________________
bidirectional_4 (Bidirection (None, None, 300)         3180000   
Total params: 29,853,700
Trainable params: 29,853,700
Non-trainable params: 0
_________________________________________________________________


In [18]:
# Directory that receives one weights file per epoch.
# NOTE(review): the directory name ("lstm-3-1024-1024...") does not match the
# architecture whose weights are loaded in the model cell
# ("bidirectional-lstm-2-1024-300...") -- confirm which name is intended.
model_weights_path = "../weights/lstm-3-1024-1024-batchsize-256-epochs-30-Sequence"
if not os.path.exists(model_weights_path):
    os.makedirs(model_weights_path)
# "{epoch:02d}" is left unformatted here; it is passed through to ModelCheckpoint.
checkpoint_path = model_weights_path + '/weights.{epoch:02d}.hdf5'
# save_best_only=False: a file is written after every epoch.
# NOTE(review): mode='max' presumably has no effect while save_best_only is False -- confirm.
checkpoint = ModelCheckpoint(filepath=checkpoint_path, verbose=1, save_best_only=False, mode='max')

## Train Model

In [20]:
# model.fit(seq_in, seq_out, epochs=1, verbose=30, batch_size=256, callbacks=[checkpoint])

### Model prediction

In [45]:
# Greedy generation: feed the query to the model, take the vector predicted at
# the final timestep, map it to its nearest word2vec word, append that word to
# the input and repeat.
start = 0
sentence_test = " who is the"
indexed_sentences = embeddings.get_indexed_query(sentence_test)
print("indexed_sentences ",indexed_sentences)
# Apply the same +1 shift that was applied to the training indices.
sent = np.array(indexed_sentences) + 1
#pattern = list(seq_in[start])
pattern = [token for token in sent]
prompt_words = [index2word[index] for index in pattern]
print("\"",' '.join(prompt_words))
for i in range(2):
    batch = np.array([pattern])
    prediction = model.predict(batch)
    timestep_vectors = prediction[0]
    print(len(timestep_vectors))
    # The output at the last timestep is used as the next-word vector.
    nearest_words = word2vec_model.similar_by_vector(timestep_vectors[-1])
    pred_word = nearest_words[0][0]
    sys.stdout.write(pred_word+" ")
    # Extend the running input with the predicted word's (shifted) index.
    pattern = pattern + [word2index[pred_word]]

indexed_sentences  [1, 12, 8, 2]
" squadstart who is the
4
the 5
the 

In [34]:
# First weight array of layer 1, i.e. the first Bidirectional layer (layer 0 is
# the Embedding). NOTE(review): presumably the forward LSTM's input kernel -- confirm.
model.layers[1].get_weights()[0]

array([[-0.01984103,  0.00866925,  0.00596858, ..., -0.01896135,
         0.01253303, -0.01682685],
       [ 0.02944827,  0.03790607,  0.01673041, ..., -0.03337999,
         0.0015724 ,  0.01733136],
       [-0.00504541,  0.00439127, -0.01940113, ..., -0.02757988,
         0.01667905, -0.00050718],
       ..., 
       [ 0.04402941,  0.03036774,  0.00817353, ...,  0.02618253,
         0.01713058,  0.00941994],
       [-0.00202136,  0.00781708, -0.03179857, ..., -0.01009496,
        -0.00812547, -0.04278539],
       [-0.03006271,  0.02193416, -0.00921515, ...,  0.00953403,
         0.01885799, -0.0187456 ]], dtype=float32)

In [37]:
# Number of sentences in the corpus (matches the sample count printed above).
len(tokenized_indexed_sentences)

97974

In [None]:
#e_model = embeddings.get_model()

In [None]:
#e_model.similar_by_word("profitabl")

## Accuracy

In [None]:
def accuracy(n_samples=None):
    """Measure how often the model's predicted word equals the target word.

    For every sample the model is run on the input indices, the vector it
    produces at the final timestep is mapped to its nearest word2vec word, and
    that word is compared with the nearest word of the target vector at the
    same timestep. The final timestep is the one the prediction cell uses as
    the next-word vector.

    Reads the notebook-level ``seq_in``, ``seq_out``, ``model`` and
    ``word2vec_model``.

    Args:
        n_samples: optional cap on how many leading samples to evaluate;
            ``None`` (default) evaluates all of them.

    Returns:
        Fraction of evaluated samples whose predicted word matched, as a float
        in [0, 1]; 0.0 when no sample was evaluated. The value is also printed.
    """
    inputs, targets = seq_in, seq_out
    if n_samples is not None:
        inputs, targets = inputs[:n_samples], targets[:n_samples]

    count = 0
    correct = 0
    for sub_sample_in, sub_sample_out in zip(inputs, targets):
        ypred = model.predict_on_batch(np.expand_dims(sub_sample_in, axis=0))[0]
        # The model emits one vector per timestep, while similar_by_vector
        # expects a single vector: score the last timestep. atleast_2d keeps
        # this working for a model that returns only one vector per sample.
        pred_vector = np.atleast_2d(ypred)[-1]
        true_vector = np.atleast_2d(sub_sample_out)[-1]
        pred_word = word2vec_model.similar_by_vector(pred_vector)[0][0]
        true_word = word2vec_model.similar_by_vector(true_vector)[0][0]
        # Compare the words themselves; testing a float similarity for == 1 is
        # fragile because of rounding.
        if pred_word == true_word:
            correct += 1
        count += 1

    # float() forces true division on Python 2; guard against an empty dataset.
    result = float(correct) / count if count else 0.0
    print("Accuracy {0}".format(result))
    return result

In [None]:
#seq_out[0]

In [None]:
# accuracy()

In [None]:
#model_results = model_fit_summary.history

In [None]:
#model_results.update(model_fit_summary.params)

In [None]:
#model_results["train_accuracy"] = accuracy()

In [None]:
# n = no. of predictions
# accuracy = accuracy(400)
#print(model_results)

In [None]:
#text_file_path = "../weights/lstm-2-1024-512-batchsize-128-epochs-25/model_results.json"

In [None]:
#with open(text_file_path, "w") as f:
        #json.dump(model_results, f)
        