In [None]:
import numpy as np
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
import pandas as pd
import nltk

In [None]:
# File holding the saved word vectors that later cells index by token.
fname = 'wordVec.bin'
# KeyedVectors.load restores an object previously written with gensim's save().
# NOTE(review): presumably pickle-based, so only load files from a trusted source — confirm.
word_vectors = KeyedVectors.load(fname)

In [None]:
data = pd.read_json('corpus.json')

In [None]:
data = data['Quote'].unique()

In [None]:
data = pd.DataFrame({'Quote':data})

In [None]:
# Preview the first rows of the quotes frame as this cell's displayed output.
data.head()

In [None]:
# Build next-token training pairs from every quote with more than 8 tokens.
#   X[k] = ['STARTPAD', t0, ..., t(n-2)]   input, shifted right by one step
#   Y[k] = [t0, t1, ..., t(n-1)]           target, same length as X[k]
X = []
Y = []
# Walk the column directly: same (index, quote) pairs as iterrows(), without
# building a Series object for every row.
for index, quote in zip(data.index, data['Quote']):

    cur_row = nltk.word_tokenize(quote)

    if len(cur_row) > 8:
        # The last token is dropped unconditionally.
        # NOTE(review): presumably the closing full stop — confirm quotes always end with one.
        remove_full_stop = cur_row[:-1]
        x = ['STARTPAD'] + remove_full_stop[:-1]
        X.append(x)
        Y.append(remove_full_stop)

    # Report progress on every 5000th row whether or not that row was kept
    # (previously the message was skipped when that row was too short).
    if index%5000 == 0 :
        print("--Processed : " + str(index) + " sentences--")

In [None]:
# Replace every token of X and Y by its word-embedding vector.

x_data = []
y_data = []

for sent_idx, (x_tokens, y_tokens) in enumerate(zip(X, Y)):
    # Stack the per-token vectors of one sentence into a single array.
    x_data.append(np.array([word_vectors[token] for token in x_tokens]))
    y_data.append(np.array([word_vectors[token] for token in y_tokens]))

    # Progress message every 5000 sentences.
    if sent_idx % 5000 == 0:
        print("--Processed : " + str(sent_idx) + " sentences--")

In [None]:
# Sentences differ in length, so the per-sentence arrays cannot be stacked
# into one rectangular array. Ask for an object array explicitly: recent NumPy
# releases raise ValueError on ragged input without dtype=object (older ones
# only warned), and the padding step consumes the sequences one at a time.
x_data = np.array(x_data, dtype=object)
y_data = np.array(y_data, dtype=object)

In [None]:
#Padding: bring every sentence to exactly max_len time steps.

from keras.preprocessing.sequence import pad_sequences

max_len = 25

# Vector used to fill the unused trailing time steps of short sentences.
end_pad_vector = word_vectors['ENDPAD']

# pad_sequences defaults to dtype="int32", which would truncate the
# floating-point embedding values (and the pad vector) to integers, so the
# output dtype is requested explicitly. Sentences longer than max_len lose
# their tail (truncating='post'); shorter ones are filled at the end.
x_data = pad_sequences(maxlen=max_len, sequences=x_data, padding="post", value=end_pad_vector, truncating='post', dtype='float32')
y_data = pad_sequences(maxlen=max_len, sequences=y_data, padding="post", value=end_pad_vector, truncating='post', dtype='float32')

In [None]:
from keras.utils import to_categorical

from sklearn.model_selection import train_test_split

# Hold out 10% of the sentences for the final evaluation. The fixed
# random_state makes the shuffle repeatable, so a re-run of the notebook
# scores the model on the same held-out sentences.
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.1, random_state=42)

In [None]:
# Show input/target shapes: training split first, then the held-out split.
for inputs, targets in ((x_train, y_train), (x_test, y_test)):
    print(inputs.shape, targets.shape)

In [None]:
#Building the model

from keras.models import Sequential
from keras.layers import Embedding, LSTM, Flatten,Dense, TimeDistributed, Dropout,Bidirectional
from keras.layers.advanced_activations import LeakyReLU, PReLU, ELU

# Width of one embedding vector, read from the prepared training array so the
# input and output layers always match the loaded word vectors instead of
# relying on a hard-coded size.
embedding_dim = x_train.shape[-1]

# Sequence-to-sequence regressor: for each of the max_len time steps the
# network emits one vector of the same width as the input embeddings.
model = Sequential()
model.add(LSTM(units=100,return_sequences=True,recurrent_dropout=0.1,input_shape=(max_len,embedding_dim)))
model.add(LSTM(units=100,return_sequences=True))
model.add(TimeDistributed(Dense(150,activation='tanh')))
model.add(Dropout(0.2))
# Linear output so the prediction can take any real value per dimension.
model.add(TimeDistributed(Dense(embedding_dim,activation='linear')))

In [None]:
# Configure training: Adam optimizer minimising mean squared error.
# NOTE(review): "accuracy" is tracked alongside a regression loss — confirm it
# is a meaningful metric for real-valued vector targets.
model.compile(optimizer="adam", loss="mean_squared_error", metrics=["accuracy"])

In [None]:
# Print the layer-by-layer overview of the compiled model.
model.summary()

In [None]:
# Train for 5 epochs in mini-batches of 5 sentences; validation_split=0.1 asks
# Keras to hold back a tenth of the training arrays for per-epoch validation.
model.fit(x_train, y_train, batch_size=5, epochs=5, validation_split=0.1, verbose=1)

In [None]:
import os

# Persist the trained model: architecture as JSON, weights as HDF5.
# Create the output directory first — open() does not create missing parent
# directories, and failing here would throw away the finished training run.
os.makedirs("Model", exist_ok=True)

model_json = model.to_json()
with open("Model/model.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save_weights("Model/model.h5")
print("Saved model to disk")

In [None]:
# Score the trained model on the held-out split; evaluate() yields the loss
# followed by the metric configured at compile time.
score, acc = model.evaluate(x_test, y_test, batch_size=15, verbose=2)

# Print both numbers rounded to two decimals.
for template, value in (("score: %.2f", score), ("acc: %.2f", acc)):
    print(template % value)