In [1]:
import pandas as pd
import numpy as np

In [2]:
#Load the processed sequence data
# NOTE(review): the file carries a .csv extension but is read as a pickle, and the
# directory is spelled 'Processed_data' here while the vocabulary cell reads from
# 'Processed_Data' - confirm both against what process_data.py actually writes.
data_path = 'Processed_data/data_processed.csv'
data = pd.read_pickle(data_path)

#We have loaded the data in the form of sequence matrices
#The raw data set was a set of 37,000 quotes and proverbs : Raw_data/data.json
#Run the script 'process_data.py' to convert the raw data set of quotes to sequence matrices

In [3]:
#Vocabulary Dataset
#Run 'process_data.py' to obtain it

# Keep only the 'word' column of the vocabulary file, as a plain array.
vocab = pd.read_csv('Processed_Data/vocab.csv')['word'].values

# Vocabulary size (number of output classes) and fixed sequence length.
max_len = 30
n_words = len(vocab)

In [4]:
# Preview the first five rows of the sequence table.
data.head(n=5)

Unnamed: 0,X,Y
0,"[23649, 6395, 23362, 29104, 14769, 4501, 26129...","[6395, 23362, 29104, 14769, 4501, 26129, 18217..."
1,"[23649, 24495, 26895, 22088, 11786, 1120, 2451...","[24495, 26895, 22088, 11786, 1120, 24519, 1996..."
2,"[23649, 14233, 18516, 25586, 4006, 7759, 24928...","[14233, 18516, 25586, 4006, 7759, 24928, 32413..."
3,"[23649, 24072, 5439, 20298, 30530, 4911, 21714...","[24072, 5439, 20298, 30530, 4911, 21714, 22063..."
4,"[23649, 14233, 11412, 1261, 20298, 24519, 1716...","[14233, 11412, 1261, 20298, 24519, 17165, 1235..."


Each X training point is a sequence of word IDs, with the start of the sentence marked by the 'STARTPAD' token [ID: 23649].
Each input word represents one time step, t(n).
So, t(1) is the start of the sentence.
The Y value for the input word at time step t(n) is the word at the next time step, t(n+1).

In [5]:
# Turn every per-row list in the X and Y columns into a numpy array.
for column in ('X', 'Y'):
    data[column] = data[column].apply(np.array)

In [6]:
# Pull the input and target columns out of the frame as object arrays of sequences.
X, Y = data['X'].values, data['Y'].values

In [7]:
#Pad (with 0) or truncate every sequence to max_len words, so that both arrays get the shape (35275,30)

from keras.preprocessing.sequence import pad_sequences

# Same padding policy for inputs and targets: fill/cut at the end of the sequence.
# NOTE(review): the fill value 0 may also be a real vocabulary id - confirm
# against the vocabulary produced by process_data.py.
pad_options = dict(maxlen=max_len, padding="post", truncating='post', value=0)

X = pad_sequences(sequences=X, **pad_options)
Y = pad_sequences(sequences=Y, **pad_options)


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [8]:
# Both padded arrays should report the same (sentences, max_len) shape.
for padded in (X, Y):
    print(padded.shape)

# Number of sentences = length of the first axis.
n_sentences = len(X)
print(n_sentences)

(35275, 30)
(35275, 30)
35275


In [9]:
#Split data

from keras.utils import to_categorical

from sklearn.model_selection import train_test_split

# Fixed seed so the train/test/validation partition is identical after every
# kernel restart (the original split was unseeded and changed on each run).
SPLIT_SEED = 42

# 70% of the sentences go to training, 30% are held out.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.3, random_state=SPLIT_SEED)

# 30% of the held-out part becomes the validation set, the rest stays as test.
x_test, x_val, y_test, y_val = train_test_split(
    x_test, y_test, test_size=0.3, random_state=SPLIT_SEED)

print(x_train.shape,y_train.shape)
print(x_test.shape,y_test.shape)
print(x_val.shape,y_val.shape)

# One-hot encode the validation targets (one class per vocabulary word).
# NOTE(review): this materialises a dense array with an extra axis of length
# n_words for the whole validation set, which can need many GB of memory.
y_val = to_categorical(y_val,num_classes=n_words)

(24692, 30) (24692, 30)
(7408, 30) (7408, 30)
(3175, 30) (3175, 30)


In [10]:
#Custom Generator to feed into the model

from keras.utils import to_categorical

def generator(batch_size):
    """Yield an endless stream of random training batches.

    Parameters
    ----------
    batch_size : int
        Number of sentences drawn (uniformly, with replacement) per batch.

    Yields
    ------
    tuple
        ``(x, y)`` where ``x`` holds ``batch_size`` rows of ``x_train`` and
        ``y`` is the one-hot encoding (``n_words`` classes) of the matching
        rows of ``y_train``.
    """
    while True:
        # Sample all row indices for this batch in one call.
        index = np.random.randint(0, x_train.shape[0], size=batch_size)

        # Fancy indexing returns NEW arrays on every iteration. The previous
        # version filled and re-yielded one shared buffer; because
        # fit_generator prefetches batches into a queue, already-queued inputs
        # could be overwritten by later iterations and no longer match their
        # (freshly allocated) one-hot targets. Indexing also removes the
        # hard-coded sequence width of 30.
        yield x_train[index], to_categorical(y_train[index], num_classes=n_words)

### Training the model

In [11]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Flatten,Dense, TimeDistributed, Dropout

#Next-word model: embedding -> LSTM over the whole sequence -> per-time-step softmax

model = Sequential([
    # Map each word id to a 100-dimensional vector.
    Embedding(input_dim=n_words, output_dim=100, input_length=max_len),
    # return_sequences=True keeps one output per time step.
    LSTM(units=100, return_sequences=True, recurrent_dropout=0.2),
    TimeDistributed(Dropout(0.2)),
    # Probability distribution over the vocabulary at every time step.
    TimeDistributed(Dense(n_words, activation='softmax')),
])

model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

In [12]:
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 30, 100)           3243500   
_________________________________________________________________
lstm_1 (LSTM)                (None, 30, 100)           80400     
_________________________________________________________________
time_distributed_1 (TimeDist (None, 30, 100)           0         
_________________________________________________________________
time_distributed_2 (TimeDist (None, 30, 32435)         3275935   
Total params: 6,599,835
Trainable params: 6,599,835
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
batch_size = 15

# steps_per_epoch is chosen so one epoch draws about as many random samples as
# there are training sentences.
# validation_data is an in-memory (x, y) tuple, so it is evaluated in full
# after each epoch; validation_steps only applies when the validation data is
# a generator, so the unused magic number 211 has been dropped.
model.fit_generator(
    generator(batch_size),
    steps_per_epoch=x_train.shape[0] // batch_size,
    epochs=10,
    verbose=1,
    validation_data=(x_val, y_val),
)

Epoch 1/10
  62/1646 [>.............................] - ETA: 2:43:37 - loss: 8.1575 - acc: 0.2067