In [None]:
# !pip install pyvi

In [1]:
import numpy as np
import pandas as pd
import string
import nltk
import re
from pyvi import ViTokenizer
from keras.models import Model
from keras.layers import Dense, Embedding, LSTM, Input
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer

In [2]:
import os
# Folder holding one dialogue file per topic.
# NOTE: `dir` shadows the Python builtin of the same name; the name is kept
# because the data-loading cell below reads it.
dir = 'data/chatbot'
# os.listdir() returns entries in arbitrary order; sort them so the order in
# which the dialogue files are read is reproducible across machines and runs.
filenames = sorted(os.listdir(dir))
print(filenames)

['bạn bè.txt', 'các câu hỏi phức tạp.txt', 'du lịch.txt', 'gia đình.txt', 'giải trí.txt', 'học tập.txt', 'nghề nghiệp.txt', 'nghỉ lễ.txt', 'người yêu.txt', 'robot.txt', 'shoping.txt', 'sở thích.txt', 'tdtu.txt', 'thông tin cá nhân.txt', 'trò chuyện về đi ăn.txt', 'tán gẫu.txt', 'đất nước.txt', 'địa chỉ.txt']


In [5]:
# Read every dialogue file and split each line into a (question, answer) pair.
# The text before the first '__eou__' marker is the question, the text between
# the first and second marker (or end of line) is the answer.
Q = []
A = []
for filename in filenames:
    with open(os.path.join(dir, filename), encoding="utf8") as f:
        for line in f:
            parts = line.split('__eou__')
            if len(parts) < 2:
                # No separator on this line (e.g. a blank line): there is no
                # answer to pair with, so skip it instead of failing with
                # an IndexError.
                continue
            Q.append(parts[0])
            A.append(parts[1])
print('Number of QA pairs:', len(Q))

Number of QA pairs: 5855


In [4]:
def normalize(sent):
    """Lower-case `sent`, strip ASCII punctuation, then word-segment it.

    Every character listed in `string.punctuation` is removed before the text
    is handed to `ViTokenizer.tokenize`, whose result is returned unchanged.
    """
    punctuation_pattern = re.compile('[%s]' % re.escape(string.punctuation))
    lowered = sent.lower()
    without_punctuation = punctuation_pattern.sub('', lowered)
    return ViTokenizer.tokenize(without_punctuation)

In [3]:
# Stop words are held in a set: membership tests on the NumPy string array
# returned by loadtxt compare against every element on each lookup.
# atleast_1d/ravel keep this working if the file yields a 0-d or 2-d array.
# The encoding is given explicitly, matching how the dialogue files are opened.
stop_words = set(
    np.atleast_1d(
        np.loadtxt('data/vietnamese-stopwords-dash.txt', dtype=str, encoding='utf8')
    ).ravel().tolist()
)


def clean_stop_words(sent):
    """Return `sent` with every token that appears in `stop_words` removed.

    Tokens are obtained by splitting on whitespace; the surviving tokens are
    re-joined with single spaces. The result is an empty string when every
    token is a stop word (or `sent` is empty).
    """
    return ' '.join(w for w in sent.split() if w not in stop_words)

In [6]:
# Normalize both sides of every pair. Stop words are stripped from the
# questions only; answers keep all of their words.
questions = [clean_stop_words(normalize(raw_question)) for raw_question in Q]
answers = [normalize(raw_answer) for raw_answer in A]

In [7]:
# Wrap every answer in explicit '<start>' / '<end>' markers.
# The guard makes this cell safe to re-run: an answer that already carries both
# markers is left untouched instead of being wrapped a second time. Normalized
# answers cannot contain the markers on their own because normalize() removes
# the '<' and '>' characters.
answers = [
    answer
    if answer.startswith('<start> ') and answer.endswith(' <end>')
    else '<start> ' + answer + ' <end>'
    for answer in answers
]

In [8]:
# Fit one shared tokenizer on questions and answers together.
# filters='' disables the tokenizer's own character filtering, so the
# '<start>' / '<end>' markers survive as tokens.
tokenizer = Tokenizer(filters='')
training_texts = questions + answers
tokenizer.fit_on_texts(training_texts)
# +1 because word indices start at 1; index 0 is left for padding.
VOCAB_SIZE = 1 + len(tokenizer.word_index)
print('VOCAB SIZE: {}'.format(VOCAB_SIZE))
vocab = list(tokenizer.word_index)  # list of tokens

VOCAB SIZE: 4927


In [9]:
# Stop-word removal can leave a question with no tokens at all; such pairs are
# dropped from the training data (the matching answer is dropped too).
tokenized_questions = tokenizer.texts_to_sequences(questions)
tokenized_answers = tokenizer.texts_to_sequences(answers)

# Positions of the questions that ended up empty.
indices = [i for i, seq in enumerate(tokenized_questions) if len(seq) == 0]
empty_positions = set(indices)

# Filter plain Python lists instead of np.delete on an object array:
# np.array(..., dtype=object) turns into a 2-D array when all sequences happen
# to have the same length, and np.delete without `axis` would then flatten it.
tokenized_questions = [
    seq for i, seq in enumerate(tokenized_questions) if i not in empty_positions
]
tokenized_answers = [
    seq for i, seq in enumerate(tokenized_answers) if i not in empty_positions
]
print('Number of train data after cleaning:', len(tokenized_questions))

Number of train data after cleaning: 5377


In [10]:
# Encoder input: every question right-padded with zeros to the longest question.
maxlen_questions = max(map(len, tokenized_questions))
padded_questions = pad_sequences(
    tokenized_questions,
    maxlen=maxlen_questions,
    padding='post',
)
encoder_input_data = np.array(padded_questions)
print('Encoder input data shape:', encoder_input_data.shape)
print('Max length of encoder input data:', maxlen_questions)

Encoder input data shape: (5377, 18)
Max length of encoder input data: 18


In [11]:
# Decoder input: every answer (including its '<start>' / '<end>' markers)
# right-padded with zeros to the longest answer.
maxlen_answers = max([len(x) for x in tokenized_answers])
padded_answers = pad_sequences(tokenized_answers, maxlen=maxlen_answers, padding='post')
decoder_input_data = np.array(padded_answers)
# The labels previously said "Encoder" although this cell reports decoder data.
print('Decoder input data shape:', decoder_input_data.shape)
print('Max length of decoder input data:', maxlen_answers)

Encoder input data shape: (5377, 113)
Max length of encoder input data: 113


In [12]:
# Decoder target: each answer shifted left by one step (its first token, the
# '<start>' marker, is dropped), padded to maxlen_answers and one-hot encoded.
# The shifted sequences go into a new list so that `tokenized_answers` is not
# modified: re-running this cell no longer strips one more token each time.
shifted_answers = [seq[1:] for seq in tokenized_answers]
padded_answers = pad_sequences(shifted_answers, maxlen=maxlen_answers, padding='post')
one_hot_answers = to_categorical(padded_answers, VOCAB_SIZE, dtype='uint8')
# np.asarray does not copy when given an ndarray, which avoids holding a second
# copy of this (samples x maxlen_answers x VOCAB_SIZE) tensor in memory.
decoder_output_data = np.asarray(one_hot_answers)
print('Decoder ouput data shape:', decoder_output_data.shape)

Decoder ouput data shape: (5377, 113, 4927)


In [None]:
# Encoder: token ids -> 200-d embeddings (index 0 masked as padding) -> LSTM.
# Only the final hidden and cell states are kept; they seed the decoder.
encoder_inputs = Input(shape=(None,))
encoder_embedding = Embedding(VOCAB_SIZE, 200, mask_zero=True)(encoder_inputs)
encoder_lstm = LSTM(200, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

In [None]:
# Decoder: variable-length sequences of token ids.
decoder_inputs = Input(shape=(None,))
# 200-d embeddings; mask_zero=True marks index 0 (the padding value) as masked.
decoder_embedding = Embedding(VOCAB_SIZE, 200, mask_zero=True) (decoder_inputs)
# The LSTM and Dense layers are kept in their own variables because
# make_inference_models() calls these same layer objects again.
decoder_lstm = LSTM(200, return_state=True, return_sequences=True)
# Start decoding from the encoder's final states; the decoder's own final
# states are not needed here and are discarded.
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
# Softmax over the whole vocabulary, applied to the LSTM output at every step.
decoder_dense = Dense(VOCAB_SIZE, activation='softmax') 
output = decoder_dense(decoder_outputs)

In [None]:
# Training model: (question ids, answer ids) -> per-step distribution over the vocabulary.
model = Model([encoder_inputs, decoder_inputs], output)
# categorical_crossentropy matches the one-hot encoded decoder targets.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()

Model: "model_4"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
input_6 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, None, 200)    985400      input_5[0][0]                    
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, None, 200)    985400      input_6[0][0]                    
____________________________________________________________________________________________

In [None]:
# Train on (encoder input, decoder input) -> decoder target.
# Original author's note: "epochs>=10, run 2 times" - presumably 5 epochs in a
# single run is not enough and the reported results came from more training;
# TODO confirm the intended total and set `epochs` accordingly rather than
# relying on re-running this cell.
model.fit([encoder_input_data , decoder_input_data], decoder_output_data, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f3ddb710350>

In [None]:
def make_inference_models():
    """Build the encoder and decoder models used for step-by-step generation.

    Both models reuse the layers created for the training model
    (`encoder_inputs`, `encoder_states`, `decoder_inputs`, `decoder_embedding`,
    `decoder_lstm`, `decoder_dense`).

    Returns:
        (encoder_model, decoder_model) where
        - encoder_model maps question token ids to the encoder's [h, c] states;
        - decoder_model maps [token ids, h, c] to
          [vocabulary distribution, new h, new c].
    """
    encoder_model = Model(encoder_inputs, encoder_states)

    # Placeholders through which the caller feeds the LSTM state at each step.
    state_h_in = Input(shape=(200,))
    state_c_in = Input(shape=(200,))
    state_inputs = [state_h_in, state_c_in]

    lstm_sequence, next_h, next_c = decoder_lstm(
        decoder_embedding, initial_state=state_inputs
    )
    token_probabilities = decoder_dense(lstm_sequence)

    decoder_model = Model(
        [decoder_inputs, *state_inputs],
        [token_probabilities, next_h, next_c],
    )
    return encoder_model, decoder_model

In [None]:
def format_question(sent:str):
    """Turn a raw question into a padded row of token ids for the encoder.

    The sentence goes through the same preprocessing as the training questions
    (normalize, then stop-word removal). Words the tokenizer has never seen are
    skipped; looking them up directly in `word_index` would raise KeyError.

    Returns:
        The result of `pad_sequences` for this single question, zero-padded at
        the end to `maxlen_questions` (all zeros if no known word remains).
    """
    words = clean_stop_words(normalize(sent)).split()
    word_index = tokenizer.word_index
    tokens_list = [word_index[w] for w in words if w in word_index]
    return pad_sequences([tokens_list], maxlen=maxlen_questions, padding='post')

In [None]:
enc_model, dec_model = make_inference_models()

In [None]:
def ask(question):
    """Generate a reply to `question` by greedy decoding.

    The question is encoded once with `enc_model`; `dec_model` is then called
    one token at a time, always feeding back the most probable token, until it
    emits '<end>' or `maxlen_answers + 1` steps have been taken.

    Returns:
        The reply as a string with word-segmentation underscores replaced by
        spaces and its first character upper-cased. Empty if nothing was decoded.
    """
    states_values = enc_model.predict(format_question(question))
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = tokenizer.word_index['<start>']

    decoded_words = []
    # Bound the number of decoding steps rather than the number of decoded
    # words: a prediction of index 0 (padding) maps to no word, so a word-count
    # limit alone could loop forever.
    for _ in range(maxlen_answers + 1):
        dec_outputs, h, c = dec_model.predict([target_seq] + list(states_values))
        sampled_word_index = int(np.argmax(dec_outputs[0, -1, :]))
        # index_word is the tokenizer's reverse mapping; None for index 0.
        sampled_word = tokenizer.index_word.get(sampled_word_index)

        if sampled_word == '<end>':
            break
        if sampled_word is not None:
            decoded_words.append(sampled_word)

        # Feed the chosen token and the updated states into the next step.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_word_index
        states_values = [h, c]

    reply = ' '.join(decoded_words).replace('_', ' ')
    # Slicing keeps this safe when the reply is empty.
    return reply[:1].upper() + reply[1:]

In [60]:
# Sample questions to try the trained chatbot on.
sample_questions = [
    'Bạn sinh năm bao nhiêu?',
    'Có người yêu chưa?',
    'Shop tư vấn cho mình một số mẫu giày thích hợp để đi chơi được không?',
    'Quê bạn ở đâu?',
    'Bạn thường làm gì trong thời gian rảnh?',
    'Bạn có muốn đi xem phim vào cuối tuần này hay không?',
    'Bạn thích môn thể thao gì?',
]
for sample_question in sample_questions:
    print(ask(sample_question))

Mình sinh năm 2001  
Mình chưa có người yêu  
Mình nghĩ là được  
Quê mình ở tiền giang  
Mình thích chơi game  
Mình đi xem phim  
Mình thích chơi game  
