In [15]:
import tensorflow as tf
import numpy as np
import pandas as pd
import time
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dense, GRU

In [16]:
# Path of the sentence-pair file read with pd.read_table, and the number of
# leading rows of that file to keep.
data_path = 'cmn.txt'
NUM_SAMPLES = 10_000

In [17]:
# Load the first NUM_SAMPLES sentence pairs. Only the first two tab-separated
# columns are read, so a file that carries extra trailing columns still loads
# instead of failing on the two-name column assignment below.
df = pd.read_table(data_path, header=None, usecols=[0, 1]).iloc[:NUM_SAMPLES]
df.columns = ['english', 'chinese']
# Wrap every English sentence in '\t' (begin-of-sequence) and '\n' (end-of-sequence).
df['english'] = df['english'].apply(lambda x: '\t' + x + '\n')

English_texts = df['english'].tolist()
Chinese_texts = df['chinese'].tolist()

# Character vocabularies, sorted so that index assignment is deterministic.
# Each is built from a single join (linear in corpus size) rather than by
# repeatedly concatenating sentences, which copies the growing string each time.
English_corpus = sorted(set(''.join(English_texts)))
Chinese_corpus = sorted(set(''.join(Chinese_texts)))

In [18]:
# Character <-> index lookup tables. The "input" side is built from the Chinese
# vocabulary and the "target" side from the English vocabulary; indices follow
# the order of the corresponding corpus list.
input_dict = dict(zip(Chinese_corpus, range(len(Chinese_corpus))))
input_dict_reverse = dict(enumerate(Chinese_corpus))
target_dict = dict(zip(English_corpus, range(len(English_corpus))))
target_dict_reverse = dict(enumerate(English_corpus))

In [19]:
# Longest sentence, in characters, on each side of the corpus.
IN_LEN = max(map(len, Chinese_texts))
OUT_LEN = max(map(len, English_texts))
# Vocabulary sizes (number of distinct characters).
INPUT_DICTSIZE = len(Chinese_corpus)
OUTPUT_DICTSIZE = len(English_corpus)
# N_unit is passed to the GRU layers as their unit count. BatchSize and
# EmbedSize are not referenced by the cells visible here.
N_unit, BatchSize, EmbedSize = 256, 64, 128

In [20]:
def create_encoder(inputdim, inputlen, n_units):
    """Build the encoder model: a single GRU that emits only its final state.

    Args:
        inputdim: size of the last axis of the input.
        inputlen: number of time steps in the input.
        n_units: number of GRU units.

    Returns:
        A Keras ``Model`` mapping an input of shape ``(inputlen, inputdim)``
        per sample to the GRU state.
    """
    source_seq = Input(shape=(inputlen, inputdim))
    gru_layer = GRU(n_units, return_state=True)
    # With return_state=True the layer yields (output, state); only the state is used.
    _unused_output, final_state = gru_layer(source_seq)

    return Model(inputs=source_seq, outputs=final_state)

In [21]:
def create_decoder(inputdim, inputlen, n_units):
    """Build the single-step decoder model.

    Per the original author's note, this differs slightly from the model used
    for training: it takes one time step at a time together with an explicit
    initial state, and returns the GRU state alongside the prediction.

    Args:
        inputdim: size of the last axis of the decoder input; also the width
            of the softmax output.
        inputlen: accepted for signature compatibility; not used in the body.
        n_units: number of GRU units, and the width of the state input.

    Returns:
        A Keras ``Model`` taking ``[step_input, state_input]`` and returning
        ``[softmax_output, gru_state]``.
    """
    # The one-step token input is created before the state input, as in the
    # original definition.
    step_input = Input(shape=(1, inputdim))
    state_input = Input(shape=(n_units,))

    gru_layer = GRU(n_units, return_sequences=True, return_state=True)
    # The returned state is what the caller feeds back in on the next step.
    step_output, next_state = gru_layer(step_input, initial_state=state_input)

    softmax_layer = Dense(inputdim, activation='softmax')
    token_probs = softmax_layer(step_output)

    return Model(inputs=[step_input, state_input], outputs=[token_probs, next_state])

In [22]:
# Encoder over the Chinese vocabulary with a fixed window of IN_LEN time steps.
encoder = create_encoder(inputdim=INPUT_DICTSIZE, inputlen=IN_LEN, n_units=N_unit)

In [23]:
# Load the encoder weights from an HDF5 file.
# NOTE(review): presumably saved from a model with this same architecture and
# vocabulary — confirm against the training notebook.
encoder.load_weights('./data/gru_enc_test.h5')

In [24]:
# Single-step decoder over the English vocabulary.
decoder = create_decoder(inputdim=OUTPUT_DICTSIZE, inputlen=OUT_LEN, n_units=N_unit)

In [25]:
# Load the decoder weights from an HDF5 file.
# NOTE(review): presumably saved from a model with this same architecture and
# vocabulary — confirm against the training notebook.
decoder.load_weights('./data/gru_dec_test.h5')

In [26]:
def onehot(sequence, length=None, depth=None):
    """One-hot encode a sequence of integer indices as a single-sample batch.

    Args:
        sequence: iterable of integer class indices, one per time step.
        length: number of time steps in the output; defaults to ``IN_LEN``.
        depth: size of the last (class) axis; defaults to ``INPUT_DICTSIZE``.

    Returns:
        A float32 array of shape ``(1, length, depth)``. Time steps past the
        end of ``sequence`` stay all-zero; items beyond ``length`` are dropped.
    """
    # The module-level defaults are resolved at call time, so existing
    # one-argument calls behave exactly as before.
    if length is None:
        length = IN_LEN
    if depth is None:
        depth = INPUT_DICTSIZE

    encoded = np.zeros((1, length, depth), dtype='float32')
    # Truncate to the window size: writing past the last time step would
    # otherwise raise IndexError for over-long input.
    for step, index in enumerate(list(sequence)[:length]):
        encoded[0, step, index] = 1

    return encoded

In [27]:
def evaluate(sentence):
    """Translate a Chinese sentence to English by greedy character-level decoding.

    Characters missing from ``input_dict`` are skipped, and input longer than
    ``IN_LEN`` known characters is truncated to fit the encoder's fixed window.

    Args:
        sentence: the source text.

    Returns:
        The decoded string. Decoding stops at the first predicted '\\n' or
        after ``OUT_LEN`` steps, whichever comes first; the '\\n' is not included.
    """
    # Map known characters to indices; unknown characters are dropped.
    char_ids = [input_dict[char] for char in sentence if char in input_dict]
    # The one-hot buffer has exactly IN_LEN time steps; without this cap a long
    # user input would raise IndexError while being encoded.
    char_ids = char_ids[:IN_LEN]

    encoder_inputs = tf.convert_to_tensor(onehot(char_ids))
    hidden_state = encoder(encoder_inputs)

    # Decoding starts from the '\t' begin-of-sequence marker.
    decoder_inputs = np.zeros((1, 1, OUTPUT_DICTSIZE), dtype='float32')
    decoder_inputs[0, 0, target_dict['\t']] = 1

    decoded_chars = []
    for _ in range(OUT_LEN):
        # One decoder step: prediction for the next character plus the new state.
        pred, hidden_state = decoder([decoder_inputs, hidden_state])
        pred_ind = int(np.argmax(np.squeeze(pred)))
        next_char = target_dict_reverse[pred_ind]

        if next_char == '\n':
            break

        decoded_chars.append(next_char)
        # Feed the predicted character back in as the next one-hot input.
        decoder_inputs = np.zeros((1, 1, OUTPUT_DICTSIZE), dtype='float32')
        decoder_inputs[0, 0, pred_ind] = 1

    return ''.join(decoded_chars)

In [28]:
# Interactive loop: read a sentence, print its translation, stop on 'quit'.
inputs = input('User :> ')
while inputs != 'quit':
    result = evaluate(inputs)

    print('Bot :> ' + result)

    inputs = input('User :> ')

User :>  我迷失了。


Bot :> I lost.


User :>  我们来试试。


Bot :> We try.


User :>  滾！


Bot :> Get away!


User :>  我是個英雄。


Bot :> I'm a hero.


User :>  噢，真的嗎?


Bot :> Oh, really?


User :>  時光飛逝。


Bot :> Time flies.


User :>  为什么我们应该要帮助？


Bot :> Why should I throw atreed?


User :>  他是个有教养的人。


Bot :> He is a learned man.


User :>  我想要聽聽你的意見。


Bot :> I want your opinion.


User :>  我想找女朋友。


Bot :> I want a few inda saud.


User :>  祝您有一個愉快的聖誕節。


Bot :> Have a good Christmas.


User :>  我應該把我要洗的衣服放在哪裡?


Bot :> Where should I like to go out?


User :>  第一次世界大戰於1914年爆發。


Bot :> Were late for London nawe?


User :>  quit
