In [23]:
import os
import time

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Dense, GRU
from tensorflow.keras.models import Model

# Data Preprocessing

In [24]:
# Tab-separated sentence-pair file, resolved relative to the working directory.
data_path = "cmn.txt"

In [25]:
# Number of sentence pairs kept from the top of the data file.
NUM_SAMPLES = 10_000

In [26]:
# Read only the first NUM_SAMPLES sentence pairs instead of loading the whole
# file and slicing it afterwards. Restricting to the first two tab-separated
# columns keeps the two-name column assignment below valid even if the file
# carries an extra trailing column.
df = pd.read_table(data_path, header=None, usecols=[0, 1], nrows=NUM_SAMPLES)
df.columns = ['english', 'chinese']
df['english'] = df['english'].apply(lambda x: '\t'+x+'\n') # add \t and \n as bos and eos symbol

English_texts = df.english.values.tolist()
Chinese_texts = df.chinese.values.tolist()

# Character vocabularies, sorted so the char -> index mapping is stable.
# str.join assembles the combined text in a single pass; summing the strings
# re-copies the accumulated text on every addition.
English_corpus = sorted(set(''.join(English_texts)))
Chinese_corpus = sorted(set(''.join(Chinese_texts)))

In [27]:
# The longest sentence on each side fixes the padded sequence length.
IN_LEN = max(map(len, Chinese_texts))
OUT_LEN = max(map(len, English_texts))
# One-hot width is the number of distinct characters in each language.
INPUT_DICTSIZE, OUTPUT_DICTSIZE = len(Chinese_corpus), len(English_corpus)

In [28]:
# Hyper-parameters: GRU state width, mini-batch size, embedding width.
# (EmbedSize is not referenced by the cells visible in this notebook section.)
N_unit, BatchSize, EmbedSize = 256, 64, 128

In [29]:
# Zero-initialised one-hot buffers laid out as (sample, time step, character id).
# Positions past the end of a sentence simply stay all-zero.
_enc_shape = (NUM_SAMPLES, IN_LEN, INPUT_DICTSIZE)
_dec_shape = (NUM_SAMPLES, OUT_LEN, OUTPUT_DICTSIZE)
encoder_input = np.zeros(_enc_shape, dtype=np.float32)
decoder_input = np.zeros(_dec_shape, dtype=np.float32)
decoder_output = np.zeros(_dec_shape, dtype=np.float32)

In [30]:
# Forward (char -> index) and reverse (index -> char) lookup tables, built from
# the position of each character in its sorted vocabulary list.
input_dict = dict(zip(Chinese_corpus, range(len(Chinese_corpus))))
input_dict_reverse = dict(enumerate(Chinese_corpus))
target_dict = dict(zip(English_corpus, range(len(English_corpus))))
target_dict_reverse = dict(enumerate(English_corpus))

In [31]:
# One-hot encode every Chinese sentence: at time step t, switch on the slot
# belonging to the t-th character.
for seq_index, seq in enumerate(Chinese_texts):
    char_ids = np.fromiter((input_dict[ch] for ch in seq), dtype=np.intp)
    encoder_input[seq_index, np.arange(char_ids.size), char_ids] = 1

In [32]:
# One-hot encode every English sentence twice: once as decoder input, and once
# shifted one step to the left as the decoder target (so the target at step t
# is the input character at step t+1).
for seq_index, seq in enumerate(English_texts):
    char_ids = np.fromiter((target_dict[ch] for ch in seq), dtype=np.intp)
    steps = np.arange(char_ids.size)
    decoder_input[seq_index, steps, char_ids] = 1
    decoder_output[seq_index, steps[:-1], char_ids[1:]] = 1

# Training Model

In [33]:
# define encoder

def create_encoder(inputdim, inputlen, n_units):
    """Build the encoder: a single GRU that turns a sequence into its final state.

    Args:
        inputdim: size of the last axis of the input (width of one time step).
        inputlen: number of time steps in the input sequence.
        n_units: number of GRU units, i.e. the width of the returned state.

    Returns:
        A Keras ``Model`` mapping an ``(inputlen, inputdim)`` sequence to the
        GRU state returned alongside the layer output.
    """
    source_seq = Input(shape=(inputlen, inputdim))
    gru_layer = GRU(n_units, return_state=True)
    # return_state=True makes the layer return (output, state); only the state
    # is exposed by the model.
    _layer_output, final_state = gru_layer(source_seq)
    return Model(source_seq, final_state)

In [34]:
# Define the decoder. Note that this model is only used during training.

def create_decoder(inputdim,inputlen, n_units):
    """Build the training-time decoder: a GRU followed by a per-step softmax.

    Args:
        inputdim: size of the last axis of the decoder input; also the number
            of units in the softmax output layer.
        inputlen: number of time steps in the decoder input.
        n_units: number of GRU units; also the width of the ``initial_state``
            input.

    Returns:
        A Keras ``Model`` mapping ``[decoder_input, initial_state]`` to the
        softmax output at every time step.
    """
    # the answer sequence fed to the decoder
    answer_seq = Input(shape=(inputlen, inputdim))
    # the state vector coming from the encoder
    state_in = Input(shape=(n_units,))
    gru_layer = GRU(n_units, return_sequences=True, return_state=True)
    # return_state=True also yields the final state, which training ignores
    step_outputs, _final_state = gru_layer(answer_seq, initial_state=state_in)
    step_probs = Dense(inputdim, activation='softmax')(step_outputs)
    return Model([answer_seq, state_in], step_probs)

In [35]:
# Encoder consumes one-hot Chinese sentences padded to IN_LEN steps.
encoder = create_encoder(inputdim=INPUT_DICTSIZE, inputlen=IN_LEN, n_units=N_unit)

In [36]:
# Print the layer-by-layer structure and parameter counts of the encoder.
encoder.summary()

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         [(None, 20, 2621)]        0         
_________________________________________________________________
gru_2 (GRU)                  [(None, 256), (None, 256) 2211072   
Total params: 2,211,072
Trainable params: 2,211,072
Non-trainable params: 0
_________________________________________________________________


In [37]:
# Decoder consumes one-hot English sentences padded to OUT_LEN steps.
decoder = create_decoder(inputdim=OUTPUT_DICTSIZE, inputlen=OUT_LEN, n_units=N_unit)

In [38]:
# Print the layer-by-layer structure and parameter counts of the decoder.
decoder.summary()

Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            [(None, 32, 75)]     0                                            
__________________________________________________________________________________________________
input_6 (InputLayer)            [(None, 256)]        0                                            
__________________________________________________________________________________________________
gru_3 (GRU)                     [(None, 32, 256), (N 255744      input_5[0][0]                    
                                                                 input_6[0][0]                    
__________________________________________________________________________________________________
dense_1 (Dense)                 (None, 32, 75)       19275       gru_3[0][0]                

In [39]:
# define the optimizer and loss function
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)


def loss_function(real, pred):
    """Categorical cross-entropy between one-hot targets and predictions.

    Args:
        real: target distribution (one-hot along the last axis).
        pred: predicted probabilities with the same shape as ``real``.

    Returns:
        The result of ``tf.keras.losses.categorical_crossentropy`` as is; no
        further reduction is applied here.
    """
    return tf.keras.losses.categorical_crossentropy(real, pred)

In [40]:
# define the training step

@tf.function
def train_step(enc_inp,dec_inp,dec_out):
    """Run one optimisation step on a batch and return its mean loss.

    Args:
        enc_inp: batch of encoder inputs.
        dec_inp: batch of decoder inputs.
        dec_out: batch of decoder targets.

    Returns:
        Scalar tensor: the mean of the loss values computed for the batch.
    """
    # Record the forward pass so gradients can flow through both models.
    with tf.GradientTape() as tape:
        context_state = encoder(enc_inp)
        batch_loss = loss_function(dec_out, decoder([dec_inp, context_state]))

    # Encoder and decoder weights are updated together by the shared optimizer.
    trainable = encoder.trainable_variables + decoder.trainable_variables
    grads = tape.gradient(batch_loss, trainable)
    optimizer.apply_gradients(zip(grads, trainable))

    return tf.reduce_mean(batch_loss)

In [41]:
# define the validation loss

def validation_loss(enc_inp,dec_inp,dec_out):
    """Forward pass only: compute the loss of the current models on a batch.

    Args:
        enc_inp: batch of encoder inputs.
        dec_inp: batch of decoder inputs.
        dec_out: batch of decoder targets.

    Returns:
        Whatever ``loss_function`` returns for the batch. Unlike ``train_step``,
        the value is not averaged here.
    """
    context_state = encoder(enc_inp)
    return loss_function(dec_out, decoder([dec_inp, context_state]))

In [42]:
# create tensorflow dataset pipeline for faster processing
# training set
# Slice the three aligned arrays sample-wise, shuffle with a buffer of
# NUM_SAMPLES elements, then group into fixed-size batches (the last partial
# batch is dropped).
dataset_train = (
    tf.data.Dataset
    .from_tensor_slices((encoder_input, decoder_input, decoder_output))
    .shuffle(NUM_SAMPLES)
    .batch(BatchSize, drop_remainder=True)
)

In [43]:
# train the model

# Number of passes over dataset_train.
Epochs = 100
# Number of full batches per epoch; used both to bound the inner loop and to
# average the accumulated loss.
trainstep_epoch = NUM_SAMPLES//BatchSize
# Running total of wall-clock seconds spent across all epochs.
overalltime=0

for epoch in range(Epochs):
    start=time.time()
    # Sum of the per-batch mean losses returned by train_step for this epoch.
    total_loss=0
    # NOTE(review): valid_loss is reset every epoch but never updated or read
    # in this loop.
    valid_loss=0
    
    # One optimisation step per batch; the enumerate index `batch` is unused.
    for (batch, (inputq,inputa,targa)) in enumerate(dataset_train.take(trainstep_epoch)):
        batch_loss = train_step(inputq,inputa,targa)
        total_loss += batch_loss
    #print(total_loss)
    # Report the epoch's average loss per batch (epochs are shown 1-based).
    print('Epoch {} Loss {:.3f}'.format(epoch+1,total_loss/trainstep_epoch))
    
    stop=time.time()
    timetaken=stop-start
    print('Time taken for 1 epoch {} sec\n'.format(timetaken))
    
    overalltime+=timetaken
    
# Total training time, converted from seconds to minutes.
print('Overall time taken {} min\n'.format(overalltime/60))

Epoch 1 Loss 2.110
Time taken for 1 epoch 9.173510074615479 sec

Epoch 2 Loss 1.618
Time taken for 1 epoch 7.700208425521851 sec

Epoch 3 Loss 1.500
Time taken for 1 epoch 7.196533918380737 sec

Epoch 4 Loss 1.423
Time taken for 1 epoch 7.318495750427246 sec

Epoch 5 Loss 1.348
Time taken for 1 epoch 6.894978046417236 sec

Epoch 6 Loss 1.275
Time taken for 1 epoch 7.033463001251221 sec

Epoch 7 Loss 1.213
Time taken for 1 epoch 7.469824552536011 sec

Epoch 8 Loss 1.155
Time taken for 1 epoch 7.172989845275879 sec

Epoch 9 Loss 1.103
Time taken for 1 epoch 7.340726137161255 sec

Epoch 10 Loss 1.054
Time taken for 1 epoch 7.375998258590698 sec

Epoch 11 Loss 1.007
Time taken for 1 epoch 7.183519601821899 sec

Epoch 12 Loss 0.966
Time taken for 1 epoch 7.123806476593018 sec

Epoch 13 Loss 0.928
Time taken for 1 epoch 7.119802236557007 sec

Epoch 14 Loss 0.894
Time taken for 1 epoch 7.241326332092285 sec

Epoch 15 Loss 0.863
Time taken for 1 epoch 7.037534236907959 sec

Epoch 16 Loss 0.833

In [44]:
# save parameters after training
# The target directory may not exist on a fresh checkout; create it first so
# the weight files can be written.
os.makedirs('./data', exist_ok=True)
encoder.save_weights('./data/gru_enc_test.h5')
decoder.save_weights('./data/gru_dec_test.h5')