In [None]:
import os

import nltk
import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
from gensim.models import Word2Vec

In [None]:
# File holding the pre-trained word embeddings, passed unchanged to the loader below.
fname = 'wordVec.bin'
# NOTE(review): KeyedVectors.load expects a file written by gensim's own save();
# presumably 'wordVec.bin' was produced that way rather than being a raw
# word2vec binary — confirm.
word_vectors = KeyedVectors.load(fname)

In [None]:
# Load the text corpus from CSV; a later cell tokenizes each row's 'text' column.
data = pd.read_csv('maroon5.csv')

In [None]:
# Preview the first rows to sanity-check what was loaded.
data.head()

In [None]:
# Collect every word known to the embedding model.
# gensim >= 4.0 removed `KeyedVectors.vocab` in favour of `key_to_index`;
# fall back to the legacy attribute so this cell runs on either version.
if hasattr(word_vectors, 'key_to_index'):
    vocab = list(word_vectors.key_to_index)
else:
    vocab = list(word_vectors.vocab.keys())

In [None]:
# Two-way lookup between vocabulary words and their integer class ids,
# numbered by position in `vocab`.
word_map = {token: idx for idx, token in enumerate(vocab)}   # word -> id
word_map_rev = dict(enumerate(vocab))                        # id -> word

In [None]:
# Cut every text row into fixed-length training windows.
#   X: 11-token input windows taken from the row shifted right by one position
#      ('STARTPAD' is prepended and the final token dropped).
#   Y: the 11-token windows at the same positions of the unshifted row, so
#      Y[k][t] is the token that comes right after X[k][t].
X = []
Y = []
for index, row in data.iterrows():
    cur_row = nltk.word_tokenize(row['text'])
    cur_x = ['STARTPAD'] + cur_row[:-1]   # Shifting it ahead by 1 timestep
    cur_y = cur_row                        # 1 timestep behind cur_x

    # Window end positions start at 10 and advance in strides of 10, so
    # consecutive windows share exactly one token.
    # Only positions with `i + 10 < len(cur_row)` yield a window. Previously the
    # append ran even when that guard failed, which re-added the last window
    # (possibly one cut from an earlier row) or hit an undefined `next_x`.
    # NOTE(review): the guard is stricter than the slice needs (it only requires
    # i < len(cur_row)), so the tail of each row is never used — kept as in the
    # original; confirm whether that is intended.
    for i in range(10, len(cur_row) - 10, 10):
        next_x = cur_x[i-10:i+1]
        next_y = cur_y[i-10:i+1]
        # Skip windows that span a sentence boundary.
        if '.' not in next_y and '.' not in next_x:
            X.append(next_x)
            Y.append(next_y)

In [None]:
# Encode the token windows for the network:
#   every X token -> its embedding vector (looked up in word_vectors)
#   every Y token -> its integer vocabulary id (looked up in word_map)

x_data = [
    np.array([word_vectors[token] for token in window])
    for window in X
]
y_data = [
    np.array([word_map[token] for token in window])
    for window in Y
]


In [None]:
# Stack the per-window arrays into single ndarrays.
# Expected shapes, assuming every window holds exactly 11 tokens:
#   x_data -> (n_windows, 11, embedding_dim), y_data -> (n_windows, 11)
x_data = np.array(x_data)
y_data = np.array(y_data)

In [None]:
from keras.utils import to_categorical
# One-hot encode the integer word ids, one class per vocabulary entry.
y_data = to_categorical(y_data,num_classes = len(vocab))

from sklearn.model_selection import train_test_split

# Hold out 10% of the windows for the final evaluation. A fixed random_state
# makes the split identical on every run, so scores stay comparable after a
# kernel restart.
x_train, x_test, y_train, y_test = train_test_split(
    x_data, y_data, test_size=0.1, random_state=42
)

In [None]:
# Sanity-check the split: (features, labels) shapes for train, then for test.
print(x_train.shape,y_train.shape)
print(x_test.shape,y_test.shape)

In [None]:
#Building the model

from keras.models import Sequential
from keras.layers import Embedding, LSTM, Flatten,Dense, TimeDistributed, Dropout,Bidirectional
# Import the activation layers from the public `keras.layers` package: the
# `keras.layers.advanced_activations` submodule path is not importable on newer
# Keras releases, while `keras.layers` exposes the same names on old and new.
from keras.layers import LeakyReLU, PReLU, ELU

model = Sequential()
# Input: sequences of 11 timesteps, each a 100-dimensional vector.
model.add(LSTM(units=120,return_sequences=True,dropout=0.3,recurrent_dropout=0.3,input_shape=(11,100)))
# Second recurrent layer reads the sequence in both directions.
model.add(Bidirectional(LSTM(units=100,return_sequences=True,dropout=0.3)))
model.add(Dropout(0.25))
# Per-timestep softmax over the whole vocabulary (one word predicted per step).
model.add(TimeDistributed(Dense(len(vocab),activation='softmax')))

In [None]:
# Adam optimizer with categorical cross-entropy (the targets are one-hot encoded);
# accuracy is reported as an additional metric.
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

In [None]:
# Show the layer stack before training.
model.summary()

In [None]:
# Train for 200 epochs with mini-batches of 5 samples; validation_split=0.1
# reserves 10% of the training set for validation.
model.fit(x_train, y_train, batch_size=5, epochs=200, validation_split=0.1, verbose=1,shuffle=True)

In [None]:
# Persist the trained model: architecture as JSON, weights as HDF5.
# Create the output directory first so a long training run is not lost to a
# FileNotFoundError when "Model/" does not exist yet.
os.makedirs("Model", exist_ok=True)
model_json = model.to_json()
with open("Model/model.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save_weights("Model/model.h5")
print("Saved model to disk")

In [None]:
# Score the held-out test split; evaluate() yields the loss followed by the
# accuracy metric configured at compile time.
score, acc = model.evaluate(x_test, y_test, batch_size=15, verbose=2)
print("score: %.2f" % score)
print("acc: %.2f" % acc)

In [None]:
def print_sentence(mat):
    """Print the highest-scoring word of every row in ``mat`` on one line.

    ``mat`` is iterated row by row; the arg-max index of each row is looked up
    in ``word_map_rev``. Each word is preceded by a single space, so the
    printed line begins with a space.
    """
    decoded = (word_map_rev[np.argmax(scores)] for scores in mat)
    print(''.join(' ' + word for word in decoded))

In [None]:
def generate(seed_word='Hey'):
    """Run the model on a seeded random input and print the decoded words.

    The input batch holds one sequence of 11 timesteps with 100 features each.
    Timestep 0 is the embedding of ``seed_word``; the other ten timesteps are
    uniform random values in [0, 1). The model's per-timestep output is decoded
    and printed by ``print_sentence``.

    Parameters
    ----------
    seed_word : str, default 'Hey'
        Word whose embedding seeds the sequence. It is looked up in
        ``word_vectors``, so a word missing from the embeddings fails at lookup.
    """
    words = np.random.random_sample(size=(1,11,100))
    words[0][0] = word_vectors[seed_word]
    k = model.predict(words)
    print_sentence(k[0])

In [None]:
# Generate and print one sample sequence from the trained model.
generate()