In [1]:
import tensorflow as tf
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dense, GRU
from tensorflow.keras.preprocessing.text import tokenizer_from_json
import json
import time

In [2]:
# import preprocessed data
# each JSON file is parsed with json.load and the result is handed to
# tokenizer_from_json to rebuild the tokenizer for questions / answers.
# the `with` statement closes the file on exit, so no explicit close() is needed.

with open('./preprocessed_data/questions.json', 'r') as f:
    json_data = json.load(f)
    question_corpus = tokenizer_from_json(json_data)

with open('./preprocessed_data/answers.json', 'r') as f:
    json_data = json.load(f)
    answer_corpus = tokenizer_from_json(json_data)

# archive holding the training arrays; they are read later by key ('arr_0', 'arr_1')
npzfile = np.load('./preprocessed_data/data.npz')

In [3]:
# define encoder

def create_encoder(inputdim, embeddingsize, inputlen, n_units):
    """Build the encoder model: token ids -> embedding -> GRU -> state vector.

    Args:
        inputdim: vocabulary size. The embedding table gets inputdim+1 rows
            because word labels start at 1 and 0 stands for padding.
        embeddingsize: dimension of the word vectors.
        inputlen: length of each input sequence.
        n_units: number of GRU units, i.e. the size of the state vector.

    Returns:
        A Keras Model that maps a sequence of inputlen token ids to the
        state returned by the GRU.
    """
    tokens = Input((inputlen,))
    # one extra row so that label 0 (padding) has its own embedding
    embedded = Embedding(inputdim+1, embeddingsize)(tokens)
    # return_state=True makes the layer return (output, state)
    gru_layer = GRU(n_units, return_state = True)
    # the output is discarded; only the state vector is needed downstream
    state = gru_layer(embedded)[1]

    return Model(tokens, state)

In [4]:
# define decoder. notice that this model is only used in training

def create_decoder(inputdim, embeddingsize, inputlen, n_units):
    """Build the training-time decoder model.

    Args:
        inputdim: vocabulary size. Both the embedding table and the softmax
            layer use inputdim+1 entries so that label 0 (padding) is
            treated as a class of its own.
        embeddingsize: dimension of the word vectors.
        inputlen: length of each answer sequence fed to the decoder.
        n_units: number of GRU units; also the size of the initial state input.

    Returns:
        A Keras Model taking [answer token ids, initial state] and returning
        a softmax distribution over inputdim+1 labels for every timestep.
    """
    # the two model inputs: answer tokens and the state vector from the encoder
    answer_tokens = Input((inputlen,))
    state_in = Input((n_units,))

    # vectorize the answer tokens
    embedded = Embedding(inputdim+1, embeddingsize)(answer_tokens)
    gru_layer = GRU(n_units, return_sequences = True, return_state = True)
    # the layer returns (sequence output, state); the state is not needed in training
    sequence_out = gru_layer(embedded, initial_state = state_in)[0]
    # softmax over the labels at every timestep
    projection = Dense(inputdim+1, activation = 'softmax')
    token_probs = projection(sequence_out)

    return Model([answer_tokens, state_in], token_probs)

In [5]:
# define hyperparameters

BatchSize = 32 # we choose a small size because of the oom issue
N_Unit = 256 # passed as n_units to both create_encoder and create_decoder
EmbeddingSize = 128 # passed as embeddingsize, i.e. the dimension of the word vectors
VocabSize = 8000
# theoretically, the vocabulary size should be len(question_corpus.word_index)+1.
# however, the tokenizer's word_index does not seem to be limited by 'num_words',
# so the number is assigned manually here.
# NOTE(review): the same VocabSize is used for both the encoder and the decoder;
# confirm that question and answer tokenizers were built with the same limit.
QuestionLen = npzfile['arr_0'].shape[1] # second dimension of the question array
AnswerLen = npzfile['arr_1'].shape[1] # second dimension of the answer array

In [6]:
# build the encoder from the hyperparameters defined above
encoder = create_encoder(
    inputdim=VocabSize,
    embeddingsize=EmbeddingSize,
    inputlen=QuestionLen,
    n_units=N_Unit,
)

In [7]:
# print the encoder's layers, output shapes and parameter counts
encoder.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 12)]              0         
_________________________________________________________________
embedding (Embedding)        (None, 12, 128)           1024128   
_________________________________________________________________
gru (GRU)                    [(None, 256), (None, 256) 296448    
Total params: 1,320,576
Trainable params: 1,320,576
Non-trainable params: 0
_________________________________________________________________


In [8]:
# build the training decoder from the hyperparameters defined above
decoder = create_decoder(
    inputdim=VocabSize,
    embeddingsize=EmbeddingSize,
    inputlen=AnswerLen,
    n_units=N_Unit,
)

In [9]:
# print the decoder's layers, output shapes and parameter counts
decoder.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 11)]         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 11, 128)      1024128     input_2[0][0]                    
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, 256)]        0                                            
__________________________________________________________________________________________________
gru_1 (GRU)                     [(None, 11, 256), (N 296448      embedding_1[0][0]                
                                                                 input_3[0][0]              

In [10]:
# define the optimizer and loss function
optimizer = tf.keras.optimizers.Adam(1e-3)
def loss_function(real, pred):
    """Return the mean categorical cross-entropy between targets and predictions.

    Args:
        real: target labels, one-hot encoded (categorical_crossentropy expects
            one-hot targets, which is why the targets are one-hot encoded
            before being passed in).
        pred: predicted distributions from the decoder.

    Returns:
        A scalar tensor: the cross-entropy averaged over all entries.
    """
    return tf.reduce_mean(tf.keras.losses.categorical_crossentropy(real, pred))

In [11]:
# define the training step

@tf.function
def train_step(enc_inp,dec_inp,targ):
    """Run one optimization step on a single batch.

    Args:
        enc_inp: batch of question sequences fed to the encoder.
        dec_inp: batch of answer sequences fed to the decoder.
        targ: one-hot encoded target sequences for the loss.

    Returns:
        The loss computed for this batch.
    """
    # record the forward pass so gradients can be taken afterwards
    with tf.GradientTape() as tape:
        # the encoder state is used directly as the decoder's initial state
        state = encoder(enc_inp)
        probs = decoder([dec_inp, state])
        loss = loss_function(targ, probs)

    # encoder and decoder are trained jointly
    params = encoder.trainable_variables + decoder.trainable_variables
    grads = tape.gradient(loss, params)
    optimizer.apply_gradients(zip(grads, params))

    return loss

In [12]:
# get the training data
inputq = npzfile['arr_0']
inputa = npzfile['arr_1']
# target data for teacher forcing: the input answers shifted one timestep to
# the left, with the freed last column filled with 0
targa = np.pad(inputa[:, 1:], ((0, 0), (0, 1)), mode='constant', constant_values=0)

In [13]:
# use onehot encoding to vectorize the target data
def onehotencoding(matrix,dim):
    """One-hot encode a 2-D matrix of integer labels.

    Args:
        matrix: 2-D array-like of integer labels with shape (rows, cols).
            Anything np.asarray can convert is accepted.
        dim: size of the one-hot axis; each label is used as an index into it.

    Returns:
        A float64 numpy array of shape (rows, cols, dim) in which
        out[i, j, matrix[i, j]] is 1 and every other entry is 0.

    Raises:
        IndexError: if a label is not a valid index for an axis of size dim.
    """
    labels = np.asarray(matrix)
    rows, cols = labels.shape
    onehot = np.zeros((rows, cols, dim))
    # one fancy-indexed assignment sets every (i, j, label) entry at once,
    # replacing the per-element Python loop
    row_idx = np.arange(rows)[:, None]
    col_idx = np.arange(cols)[None, :]
    onehot[row_idx, col_idx, labels] = 1
    return onehot

In [14]:
# create tensorflow dataset pipeline for faster processing
# the shuffle buffer spans the whole data set
BufferSize = len(npzfile['arr_0'])
dataset = tf.data.Dataset.from_tensor_slices((inputq, inputa, targa))
# shuffle first, then cut into batches; an incomplete last batch is dropped
dataset = dataset.shuffle(BufferSize).batch(BatchSize, drop_remainder=True)

In [15]:
# train the model

Epochs = 30
steps_per_epoch = len(inputq)//BatchSize
overalltime=0

for epoch in range(Epochs):
    start=time.time()
    total_loss=0

    # the batch variables have their own names on purpose: reusing
    # inputq/inputa/targa as loop variables would overwrite the full training
    # arrays, so re-running this cell would compute steps_per_epoch from a
    # single batch
    for batch_q, batch_a, batch_targ in dataset.take(steps_per_epoch):
        # the loss expects one-hot targets with VocabSize+1 classes (label 0 included)
        targa_onehot = onehotencoding(batch_targ, VocabSize+1)
        batch_loss = train_step(batch_q, batch_a, targa_onehot)
        total_loss += batch_loss

    # report the average batch loss of this epoch
    print('Epoch {} Loss {:.4f}'.format(epoch + 1, total_loss / steps_per_epoch))

    stop=time.time()
    timetaken=stop-start
    print('Time taken for 1 epoch {} sec\n'.format(timetaken))

    overalltime+=timetaken

print('Overall time taken {} min\n'.format(overalltime/60))

Epoch 1 Loss 2.2898
Time taken for 1 epoch 237.3299663066864 sec

Epoch 2 Loss 1.9766
Time taken for 1 epoch 222.4675109386444 sec

Epoch 3 Loss 1.8645
Time taken for 1 epoch 223.07515931129456 sec

Epoch 4 Loss 1.7674
Time taken for 1 epoch 222.40845394134521 sec

Epoch 5 Loss 1.6776
Time taken for 1 epoch 222.17128920555115 sec

Epoch 6 Loss 1.5927
Time taken for 1 epoch 222.36492943763733 sec

Epoch 7 Loss 1.5131
Time taken for 1 epoch 222.143385887146 sec

Epoch 8 Loss 1.4389
Time taken for 1 epoch 221.87543511390686 sec

Epoch 9 Loss 1.3692
Time taken for 1 epoch 222.6896731853485 sec

Epoch 10 Loss 1.3040
Time taken for 1 epoch 223.28253650665283 sec

Epoch 11 Loss 1.2417
Time taken for 1 epoch 222.8508541584015 sec

Epoch 12 Loss 1.1836
Time taken for 1 epoch 222.83259224891663 sec

Epoch 13 Loss 1.1285
Time taken for 1 epoch 223.16070771217346 sec

Epoch 14 Loss 1.0768
Time taken for 1 epoch 222.4200155735016 sec

Epoch 15 Loss 1.0297
Time taken for 1 epoch 222.71728372573853 sec

In [16]:
# save parameters after training
# the '_30' suffix in the file names presumably refers to the number of training epochs (Epochs = 30)
# NOTE(review): the ./trained_model/ directory presumably has to exist already — confirm before running
encoder.save_weights('./trained_model/gru_enc_weights_30.h5')
decoder.save_weights('./trained_model/gru_dec_weights_30.h5')