In [14]:
import tensorflow as tf
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dense, LSTM
from tensorflow.keras.preprocessing.text import tokenizer_from_json
import json
import time

In [15]:
# import preprocessed data

# Each tokenizer file is parsed with json.load and the result is handed to
# tokenizer_from_json to rebuild the tokenizer object.
# The `with` statement closes the file on exit, so no explicit close() is needed.
with open('./preprocessed_data/questions.json', 'r') as f:
    json_data = json.load(f)
    question_corpus = tokenizer_from_json(json_data)

with open('./preprocessed_data/answers.json', 'r') as f:
    json_data = json.load(f)
    answer_corpus = tokenizer_from_json(json_data)

# Archive of the preprocessed arrays; later cells read 'arr_0' (questions)
# and 'arr_1' (answers) from it.
npzfile = np.load('./preprocessed_data/data.npz')

In [16]:
# define encoder

def create_encoder(inputdim, embeddingsize, inputlen, n_units):
    """Build the encoder model: token ids -> final LSTM states.

    Args:
        inputdim: vocabulary size; the embedding table gets ``inputdim + 1`` rows.
        embeddingsize: dimension of each word vector produced by the embedding.
        inputlen: number of token ids in every input sequence.
        n_units: number of LSTM units, i.e. the size of each state vector.

    Returns:
        A Keras ``Model`` mapping an ``(inputlen,)`` sequence to ``[h, c]``,
        the final hidden and cell states of the LSTM.
    """
    question_tokens = Input(shape=(inputlen,))

    # Embedding turns every token id into a dense vector of size `embeddingsize`.
    question_vectors = Embedding(inputdim + 1, embeddingsize)(question_tokens)

    # return_state=True makes the LSTM return (output, h, c). Only the two
    # state vectors are kept; the output itself is discarded.
    recurrent_layer = LSTM(n_units, return_state=True)
    _, final_h, final_c = recurrent_layer(question_vectors)

    return Model(inputs=question_tokens, outputs=[final_h, final_c])

In [17]:
# define decoder. notice that this model is only used in training

def create_decoder(inputdim, embeddingsize,inputlen, n_units):
    """Build the training-time decoder: answer tokens + encoder states -> word probabilities.

    Args:
        inputdim: vocabulary size; the embedding table and the softmax layer
            both use ``inputdim + 1`` entries.
        embeddingsize: dimension of each word vector produced by the embedding.
        inputlen: number of token ids in every answer sequence.
        n_units: number of LSTM units; must equal the size of the state
            vectors passed in as initial state.

    Returns:
        A Keras ``Model`` taking ``[answer_tokens, state_h, state_c]`` and
        returning, for every timestep, a softmax distribution over
        ``inputdim + 1`` token ids.
    """
    # input of answers
    decoder_input = Input((inputlen,))
    # input of encoder state vectors
    initial_stateh = Input((n_units,))
    initial_statec = Input((n_units,))
    encoder_state = [initial_stateh,initial_statec]
    # vectorize input answers.
    # No `input_length` is passed: the sequence length already comes from
    # `decoder_input`, and the former hard-coded value of 1 contradicted it.
    decoder_embed = Embedding(inputdim+1, embeddingsize)(decoder_input)
    decoder = LSTM(n_units, return_sequences = True, return_state = True)
    # return_sequences=True yields one output per timestep; the final
    # states h & c are not needed by the training model
    decoder_output, _, _ = decoder(decoder_embed,initial_state = encoder_state)
    # softmax layer: per-timestep probability distribution over the vocabulary
    decoder_dense = Dense(inputdim+1, activation = 'softmax')
    decoder_output_ = decoder_dense(decoder_output)

    decoder=Model([decoder_input,initial_stateh,initial_statec],decoder_output_)

    return decoder

In [18]:
# define hyperparameters

# Sequence lengths are taken from the second axis of the preprocessed arrays.
QuestionLen = npzfile['arr_0'].shape[1]
AnswerLen = npzfile['arr_1'].shape[1]

# Vocabulary size is fixed by hand. len(question_corpus.word_index)+1 would be
# the natural value, but 'num_words' did not seem to limit the tokenizer's
# word_index, so the number is assigned manually.
VocabSize = 8000

# Training / model sizes.
BatchSize = 32
EmbeddingSize = 128
N_Unit = 256

In [19]:
# Build the encoder over question sequences.
encoder = create_encoder(
    inputdim=VocabSize,
    embeddingsize=EmbeddingSize,
    inputlen=QuestionLen,
    n_units=N_Unit,
)

In [20]:
# Print the encoder's layers with their output shapes and parameter counts.
encoder.summary()

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         [(None, 15)]              0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 15, 128)           1024128   
_________________________________________________________________
lstm_2 (LSTM)                [(None, 256), (None, 256) 394240    
Total params: 1,418,368
Trainable params: 1,418,368
Non-trainable params: 0
_________________________________________________________________


In [21]:
# Build the training decoder over answer sequences.
decoder = create_decoder(
    inputdim=VocabSize,
    embeddingsize=EmbeddingSize,
    inputlen=AnswerLen,
    n_units=N_Unit,
)

In [22]:
# Print the decoder's layers with their output shapes, parameter counts and connections.
decoder.summary()

Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_6 (InputLayer)            [(None, 16)]         0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 16, 128)      1024128     input_6[0][0]                    
__________________________________________________________________________________________________
input_7 (InputLayer)            [(None, 256)]        0                                            
__________________________________________________________________________________________________
input_8 (InputLayer)            [(None, 256)]        0                                            
____________________________________________________________________________________________

In [23]:
# Optimizer shared by encoder and decoder, plus the training loss.
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)


def loss_function(real, pred):
    """Return the mean categorical cross-entropy between targets and predictions.

    Args:
        real: one-hot encoded target tokens.
        pred: predicted probability distributions, same shape as ``real``.

    Returns:
        Scalar tensor: the cross-entropy averaged over every position.
    """
    per_position = tf.keras.losses.categorical_crossentropy(real, pred)
    return tf.reduce_mean(per_position)

In [24]:
# define the training step

@tf.function
def train_step(enc_inp,dec_inp,targ):
    """Run one optimisation step on a single batch and return its loss.

    Args:
        enc_inp: batch of question token ids fed to the encoder.
        dec_inp: batch of answer token ids fed to the decoder.
        targ: one-hot encoded target tokens for the decoder output.

    Returns:
        The scalar loss computed by ``loss_function`` for this batch.
    """
    with tf.GradientTape() as tape:
        # The encoder's final states seed the decoder's LSTM.
        state_h, state_c = encoder(enc_inp)
        predicted = decoder([dec_inp, state_h, state_c])
        step_loss = loss_function(targ, predicted)

    # Both models are updated together from the same loss.
    trainable = encoder.trainable_variables + decoder.trainable_variables
    grads = tape.gradient(step_loss, trainable)
    optimizer.apply_gradients(zip(grads, trainable))

    return step_loss

In [25]:
# Training arrays: questions, decoder inputs and decoder targets.
inputq = npzfile['arr_0']
inputa = npzfile['arr_1']

# Teacher forcing: the target at step t is the answer token at step t+1,
# i.e. the answers shifted one step to the left; the last column stays 0.
targa = np.zeros_like(inputa)
targa[:, :-1] = inputa[:, 1:]

In [None]:
# use onehot encoding to vectorize the target data
def onehotencoding(matrix,dim):
    """One-hot encode a 2-D matrix of integer indices.

    Args:
        matrix: 2-D array-like of integer indices with shape (rows, cols);
            anything ``np.asarray`` can convert is accepted.
        dim: length of the one-hot axis; every index must be valid for an
            axis of this length.

    Returns:
        A float64 ndarray of shape (rows, cols, dim) holding a 1 at
        ``[i, j, matrix[i, j]]`` and 0 everywhere else.

    Raises:
        IndexError: if an index is out of range for ``dim``.
    """
    indices = np.asarray(matrix)
    rows, cols = indices.shape
    onehot = np.zeros((rows, cols, dim))
    # One vectorised assignment instead of a Python loop over every element:
    # the row/column grids broadcast against `indices` to address one cell
    # per (row, col) position.
    row_idx = np.arange(rows)[:, None]
    col_idx = np.arange(cols)[None, :]
    onehot[row_idx, col_idx, indices] = 1
    return onehot

In [None]:
# Build the tf.data input pipeline for training.
# The shuffle buffer spans the whole training set.
BufferSize = len(npzfile['arr_0'])
dataset = tf.data.Dataset.from_tensor_slices((inputq, inputa, targa))
# drop_remainder=True keeps only full batches of BatchSize examples.
dataset = dataset.shuffle(BufferSize).batch(BatchSize, drop_remainder=True)

In [26]:
# train the model

Epochs = 30
# Number of full batches per epoch (the dataset is batched with drop_remainder=True).
steps_per_epoch = len(inputq)//BatchSize
overalltime=0

for epoch in range(Epochs):
    start=time.time()
    total_loss=0

    # The batch variables have their own names on purpose: using
    # inputq/inputa/targa as loop targets would overwrite the full training
    # arrays with the last batch, so re-running this cell (or the dataset
    # cell) would silently train on a single batch.
    for batch_q, batch_a, batch_targ in dataset.take(steps_per_epoch):
        # targets are one-hot encoded per batch, over VocabSize+1 classes
        batch_targ_onehot=onehotencoding(batch_targ,VocabSize+1)
        batch_loss = train_step(batch_q,batch_a,batch_targ_onehot)
        total_loss += batch_loss

    # average loss over the batches of this epoch
    print('Epoch {} Loss {:.4f}'.format(epoch + 1, total_loss / steps_per_epoch))

    stop=time.time()
    timetaken=stop-start
    print('Time taken for 1 epoch {} sec\n'.format(timetaken))

    overalltime+=timetaken

print('Overall time taken {} min\n'.format(overalltime/60))

Epoch 1 Batch 0 Loss 4.7217
Epoch 1 Batch 1000 Loss 2.4261
Epoch 1 Batch 2000 Loss 2.4073
Epoch 1 Loss 2.3059
Time taken for 1 epoch 279.8809278011322 sec

Epoch 2 Batch 0 Loss 2.0750
Epoch 2 Batch 1000 Loss 2.5129
Epoch 2 Batch 2000 Loss 2.4733
Epoch 2 Loss 2.1048
Time taken for 1 epoch 269.29883885383606 sec

Epoch 3 Batch 0 Loss 2.0025
Epoch 3 Batch 1000 Loss 2.2912
Epoch 3 Batch 2000 Loss 1.8121
Epoch 3 Loss 2.0210
Time taken for 1 epoch 270.02436327934265 sec

Epoch 4 Batch 0 Loss 1.8412
Epoch 4 Batch 1000 Loss 1.9852
Epoch 4 Batch 2000 Loss 2.1131
Epoch 4 Loss 1.9631
Time taken for 1 epoch 272.2730939388275 sec

Epoch 5 Batch 0 Loss 1.9290
Epoch 5 Batch 1000 Loss 1.8835
Epoch 5 Batch 2000 Loss 1.6499
Epoch 5 Loss 1.9134
Time taken for 1 epoch 271.13661074638367 sec

Epoch 6 Batch 0 Loss 1.4894
Epoch 6 Batch 1000 Loss 1.7291
Epoch 6 Batch 2000 Loss 2.3504
Epoch 6 Loss 1.8753
Time taken for 1 epoch 271.6530210971832 sec

Epoch 7 Batch 0 Loss 1.6108
Epoch 7 Batch 1000 Loss 1.8915
Ep

In [27]:
# save parameters after training
# Encoder and decoder weights go to separate files under ./trained_model/.
# NOTE(review): presumably the directory must already exist — confirm before running.
encoder.save_weights('./trained_model/lstm_enc_weights.h5')
decoder.save_weights('./trained_model/lstm_dec_weights.h5')