In [38]:
import warnings
warnings.filterwarnings('ignore')

import gc
import numpy as np
import math
import os
import pickle
import time

from keras.initializers import glorot_uniform as keras_glorot_uniform
from keras.initializers import orthogonal as keras_orthogonal
from keras.initializers import uniform as keras_uniform
from keras.layers import Input
from keras.layers.core import Dense
from keras.layers.core import Masking
from keras.layers.embeddings import Embedding
from keras.layers.normalization import BatchNormalization
from keras.layers.recurrent import LSTM
from keras.models import Model
from keras.utils import np_utils

In [39]:
# Prepeare data set

with open('data/words.pickle', 'rb') as f:
    words = pickle.load(f)
    
enc_input = np.load('data/enc_input.npy')
dec_input = np.load('data/dec_input.npy')
lbl_input = np.load('data/lbl_input.npy')
maxlen_e, maxlen_d = np.load('data/maxlen.npy')


# Split data for training and test
sep_idx = int(enc_input.shape[0] * 0.95)
enc_train, enc_test = np.vsplit(enc_input, [sep_idx])
dec_train, dec_test = np.vsplit(dec_input, [sep_idx])
lbl_train, lbl_test = np.vsplit(lbl_input, [sep_idx])

print(enc_train.shape) # ?? 3rd dimension needed ??

num = enc_train.shape[0]
enc_train = enc_train.reshape(num, maxlen_e)
dec_train = dec_train.reshape(num, maxlen_d)
lbl_train = lbl_train.reshape(num, maxlen_d)

print(enc_train.shape)


(78881, 50, 1)
(78881, 50)


In [43]:

class Dialog:
    
    def __init__(
        self,
        maxlen_e,
        maxlen_d,
        num_input,
        num_output,
        dim_vec,
        dim_hidden_vec
    ):
        self.maxlen_e = maxlen_e
        self.maxlen_d = maxlen_d
        self.num_input = num_input
        self.num_output = num_output
        self.dim_vec = dim_vec
        self.dim_hidden_vec = dim_hidden_vec
        
        
        
    def create_model(self):        # Used in train() and prediction
        
        print('\n...Start creating models.\n')
        
        #
        # Encoder model creation
        #
        
        enc_input = Input(shape=(self.maxlen_e,), dtype='int32', name='encoder_input')
        tf_tensor = Embedding(
                              input_dim=self.num_input, 
                              output_dim=self.num_output, 
                              mask_zero=True, # ID 0 in Input data becomes a padding.(LSTM layer just passes the previous values)
                              embeddings_initializer=keras_uniform(seed=12345)
                             )(enc_input)
        
        # axis -1 counts dimension from the highest rank
        tf_tensor = BatchNormalization(axis=-1)(tf_tensor)
        
        # When all IDs in a timestep is equal to mask_value, skipped in all downstream layers
        # Maybe input data unlikely has a sequence whose all values are maske_value as corpus is parsed in such a way
        tf_tensor = Masking(mask_value=0.0)(tf_tensor)
        
        enc_output, enc_hidden_state, enc_cell_state = LSTM(
                                                            units=self.dim_hidden_vec,
                                                            kernel_initializer=keras_glorot_uniform(seed=12345),
                                                            # Coefficient to the orthogonal matrix is 1.0
                                                            recurrent_initializer=keras_orthogonal(gain=1.0, seed=12345),
                                                            dropout=0.5,
                                                            recurrent_dropout=0.5,
                                                            return_state=True,
                                                           )(tf_tensor)
        
        enc_model  = Model(inputs=enc_input, outputs=[enc_output, enc_hidden_state, enc_cell_state])

        print('Encoder model created.\n')
        
        #
        # Decoder training model creation
        #
        
        dec_input = Input(shape=(self.maxlen_d,), dtype='int32', name='decoder_input')
        tf_tensor = Embedding(
                              input_dim=self.num_input,
                              output_dim=self.dim_vec,
                              mask_zero=True,
                              embeddings_initializer=keras_uniform(seed=12345)
                             )(dec_input)
        
        tf_tensor = BatchNormalization(axis=-1)(tf_tensor)
        dec_LSTM_input = Masking(mask_value=0.0)(tf_tensor)
        
        # dec_LSTM is used later again
        dec_LSTM = LSTM(
                        units=self.dim_hidden_vec,
                        kernel_initializer=keras_glorot_uniform(seed=12345),
                        recurrent_initializer=keras_orthogonal(gain=1.0, seed=12345),
                        dropout=0.5,
                        recurrent_dropout=0.5,
                        return_state=True,
                        return_sequences=True,
                       )
        tf_tensor, _, _ = dec_LSTM(dec_LSTM_input, initial_state=[enc_hidden_state, enc_cell_state])
        
        # Densely connected NN after LSTM is used later again
        dec_Dense = Dense(
                          units=self.num_output,
                          activation='softmax',
                          kernel_initializer=keras_glorot_uniform(seed=12345)
                         ) 
        dec_output = dec_Dense(tf_tensor)
        
        model = Model(inputs=[enc_input, dec_input], outputs=dec_output)
        model.compile(loss='categorical_crossentropy', optimizer='Adam', metrics=['categorical_accuracy'])
        
        print('Decoder training model created.\n')
        
        #
        # Decoder model creation
        #
        
        dec_hidd_state_input = Input(shape=(self.dim_hidden_vec,), name='hidd_state_input')
        dec_cell_state_input = Input(shape=(self.dim_hidden_vec,), name='cell_state_input')
        
        
        tf_tensor, dec_hidden_state, dec_cell_state = dec_LSTM(
                                                                dec_LSTM_input, 
                                                                initial_state=[dec_hidd_state_input, dec_cell_state_input]
                                                                )
        dec_response = dec_Dense(tf_tensor)
        
        dec_model = Model(
                          inputs =[dec_input, dec_hidd_state_input, dec_cell_state_input],
                          outputs=[dec_response, dec_hidden_state, dec_cell_state]
                         )
        
        print('Decoder model created.\n')
        
        return model, enc_model, dec_model
    


    def train(self, enc_train, dec_train, lbl_train, epochs, batch_size, param_file):

        model, _, _ = self.create_model()

        if os.path.isfile(param_file):
            model.load_weights(param_file)

        for v in [enc_train, dec_train, lbl_train]:
            np.random.seed(12345)
            np.random.shuffle(v)
            
        
        # Split data for training and validation
        sep_idx = int(enc_train.shape[0] * 0.9)
        enc_train, enc_validate = np.vsplit(enc_input, [sep_idx])
        dec_train, dec_validate = np.vsplit(dec_input, [sep_idx])
        lbl_train, lbl_validate = np.vsplit(lbl_input, [sep_idx])
        
        
        # ?? Dimension changed from (N, maxlen) (N, maxlen, 1) ??
        enc_train    = enc_train.reshape(enc_train.shape[0], self.maxlen_e)
        enc_validate = enc_validate.reshape(enc_validate.shape[0], self.maxlen_e)
        dec_train    = dec_train.reshape(dec_train.shape[0], self.maxlen_d)
        dec_validate = dec_validate.reshape(dec_validate.shape[0], self.maxlen_d)
        lbl_train    = lbl_train.reshape(lbl_train.shape[0], self.maxlen_d)
        lbl_validate = lbl_validate.reshape(lbl_validate.shape[0], self.maxlen_d)
        

        early_stop_cap = 10000
        
        for epoch_idx in range(0, epochs):
            
            print('Epoch ', epoch_idx+1, '/', epochs) 
            validate_perplexity = self.on_batch(
                                                model, 
                                                epoch_idx, 
                                                enc_train,
                                                dec_train,
                                                lbl_train,
                                                enc_validate,
                                                dec_validate,
                                                lbl_validate,
                                                batch_size
                                               )
            #EarlyStopping
            if epoch_idx == 0 or validate_perplexity <= early_stop_cap:
                  early_stop_cap = validate_perplexity
            else:
                print('Early stopping\n') 
                break
                  
        return model



    def on_batch(self,     # Used in train()
         model, 
         epoch_idx, 
         enc_train,
         dec_train,
         lbl_train,
         enc_validate,
         dec_validate,
         lbl_validate,
         batch_size
    ):
                 
        list_loss = []
        list_acc  = []
        num_train = enc_train.shape[0]

        time_start = time.time()
        
        for step in range(0, math.ceil(num_train / batch_size)):
            if step > 0:break
            
            s = batch_size * step
            e = min(num_train, batch_size * (step + 1))
            
            enc_batch = enc_train[s:e, :]
            dec_batch = dec_train[s:e, :]
            lbl_batch = lbl_train[s:e, :]

            
            # turns IDs into binary values
            lbl_batch = np_utils.to_categorical(lbl_batch, self.num_output)
            
            # train_on_batch returns [Loss scalar, Accuracy scalar]
            result = model.train_on_batch([enc_batch, dec_batch], lbl_batch)
            
            list_loss.append(result[0])
            list_acc.append(result[1])
            
            print(
                  '\n', e, '/', num_train, '\t', str(int(time.time() - time_start)) + 'sec',
                  '\t', 'Loss :', "{0:.4f}".format(np.average(list_loss)),
                  '\t', 'Acc :',  "{0:.4f}".format(np.average(list_acc))
                 )
            
            del enc_batch, dec_batch, lbl_batch
        
        del list_loss, list_acc
        
        return self.eval_perplexity(
                    model,
                    enc_validate,
                    dec_validate,
                    lbl_validate,
                    batch_size
                )



    def eval_perplexity(self,    # Used in on_batch() and test
        model,
        enc_validate,
        dec_validate,
        lbl_validate,
        batch_size
    ):
        
        list_loss = []
        num_validate = enc_validate.shape[0]
        
        num_loss = 0
        sum_loss = 0

        time_start = time.time()
        
        for step in range(0, math.ceil(num_validate / batch_size)):
            if step > 1:break
            
            s = batch_size * step
            e = min(num_validate, batch_size * (step + 1))
            
            enc_batch = enc_validate[s:e, :]
            dec_batch = dec_validate[s:e, :]
            lbl_batch = lbl_validate[s:e, :]

            
            # turns IDs into binary values
            lbl_batch = np_utils.to_categorical(lbl_batch, self.num_output)
            
            # Reshape the dimension into (1, N * maxlen) for cross entropy Loss dot product
            lbl_batch = lbl_batch.reshape(1,(e-s) * self.maxlen_d * self.num_output)
            
# #             mask = np.zeros((e-s, self.maxlen_d, self.num_output), dtype=np.float32)
# #             print(mask.shape)

            for i in range(0,e-s):
                num_dim = self.maxlen_d - list(dec_batch[i, :]).count(0.)
                num_loss += num_dim   # Only take the number of tokens(IDs) excluding the paddings

            Y1 = model.predict_on_batch([enc_batch, dec_batch])
            Y2 = np.maximum(Y1, 1e-7)   # Replace the values less than 1e-7 with 1e-7
            Y2 = np.log(Y2)
            Y3 = Y2.reshape(1, (e-s) * self.maxlen_d * self.num_output)
            
            loss = np.dot(Y3, lbl_batch.T) # Shape (1, 1)
            sum_loss += loss[0, 0]
            
            perplexity = np.exp(sum_loss / num_loss)
            
            print(
                  '\n', e, '/', num_validate, '\t', str(int(time.time() - time_start)) + 'sec',
                  '\t', 'Perplexity :',  "{0:.4f}".format(perplexity)
                 )
            
            del enc_batch, dec_batch, lbl_batch
            del Y1, Y2, Y3
            gc.collect()
        
        return 10000
    
        
    
    def sampling_detoknize(self):
        pass
    

In [44]:
epochs = 1
batch_size = 100
param_file = 'param_seq2seq.hdf5'

num_input = len(words)
num_output = num_input
dim_vec = 400
dim_hidden_vec = int(dim_vec * 2)


prediction = Dialog(maxlen_e, maxlen_d, num_input, num_output, dim_vec, dim_hidden_vec)

model = prediction.train(enc_train, dec_train, lbl_train, epochs, batch_size, param_file)



...Start creating models.

Encoder model created.

Decoder training model created.

Decoder model created.

Epoch  1 / 1

 100 / 70992 	 67sec 	 Loss : 8.8770 	 Acc : 0.0000

 100 / 12041 	 14sec 	 Perplexity : 0.0000

 200 / 12041 	 28sec 	 Perplexity : 0.0000

 300 / 12041 	 39sec 	 Perplexity : 0.0000

 400 / 12041 	 50sec 	 Perplexity : 0.0000

 500 / 12041 	 62sec 	 Perplexity : 0.0000

 600 / 12041 	 73sec 	 Perplexity : 0.0000

 700 / 12041 	 85sec 	 Perplexity : 0.0000

 800 / 12041 	 96sec 	 Perplexity : 0.0000

 900 / 12041 	 108sec 	 Perplexity : 0.0000

 1000 / 12041 	 119sec 	 Perplexity : 0.0000

 1100 / 12041 	 130sec 	 Perplexity : 0.0000

 1200 / 12041 	 141sec 	 Perplexity : 0.0000

 1300 / 12041 	 153sec 	 Perplexity : 0.0000

 1400 / 12041 	 165sec 	 Perplexity : 0.0000

 1500 / 12041 	 177sec 	 Perplexity : 0.0000

 1600 / 12041 	 188sec 	 Perplexity : 0.0000

 1700 / 12041 	 200sec 	 Perplexity : 0.0000

 1800 / 12041 	 212sec 	 Perplexity : 0.0000

 1900 / 12041