# Sequence2Sequence - Mô hình LSTM

In [None]:
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from collections import Counter

from underthesea import word_tokenize 

from keras import Input, Model
from keras.layers import Embedding, LSTM, Dense
from keras_preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences

from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import RMSprop

import warnings
warnings.filterwarnings("ignore")

# Tiền xử lý dữ liệu

In [None]:
# Load the conversation data, keeping only the columns at positions 1 and 2 of the CSV.
df = pd.read_csv(
    './data.csv',
    usecols=[1, 2],
)

In [None]:
# Extract the question column ('user_a') and the answer column ('user_b') as arrays.
data_questions, data_answers = df['user_a'].values, df['user_b'].values

In [None]:
# Helper that removes special characters from a sentence.
def clean_text(sent):
    """Return `sent` with punctuation / special characters deleted.

    Every character matched by the character class below is removed; all
    other characters (letters, digits, whitespace) are left untouched.
    Note that inside the class `,-.` is a range that covers `,`, `-` and `.`.
    """
    special_chars = re.compile(r'[!“”"#$%&\'()*+,-./:;<=>?@[\]^_`{|}~]')
    return special_chars.sub('', sent)

# Helper that normalises a sentence and applies Vietnamese word segmentation.
def clean_and_word_segmentation(sent):
    """Lower-case `sent`, strip special characters, then word-segment it.

    The cleaned text is passed to `word_tokenize` with `format='text'` and the
    result of that call is returned unchanged.
    """
    lowered = sent.lower()
    cleaned = clean_text(lowered)
    return word_tokenize(cleaned, format='text')

In [None]:
# Count the whitespace-separated words of every (cleaned) question and tally
# how many questions have each length. The bar chart that visualises this
# distribution is currently commented out below.
count_words_ques = []
for question in data_questions:
    count_words_ques.append(len(clean_text(question).split()))
counter_words_ques = Counter(count_words_ques)

# list_count_word = []
# list_count_sent = []
# for i in counter_words_ques.items():
#     #print(i)
#     list_count_word.append(i[0])
#     list_count_sent.append(i[1])
    
# # ========== draw ========== #
# plt.figure(figsize=(15,10))
# plt.bar(list_count_word,list_count_sent)
# plt.title('The graph shows the number of words in sentences')
# plt.xlabel('Number of words in a sentence')
# plt.ylabel('Number of sentences')
# plt.xticks(range(min(list_count_word), max(list_count_word),2))
# plt.show()

In [None]:
len(count_words_ques), len(data_questions)

(7804, 7804)

In [None]:
# Drop every question/answer pair whose question has more than 30 words.
# (Despite the names, the lists are filtered, not sorted; original order is kept.)
kept_pairs = [
    (question, answer)
    for question, answer, n_words in zip(data_questions, data_answers, count_words_ques)
    if n_words <= 30
]
sorted_ques = [question for question, _ in kept_pairs]
sorted_ans = [answer for _, answer in kept_pairs]
        
# print('len sorted_ques:', len(sorted_ques))
# print('len sorted_ans:', len(sorted_ans))
# sorted_ques[:2], sorted_ans[:2]

In [None]:
# Clean and word-segment the kept questions and answers.
# Answers are additionally wrapped in <START> ... <END> markers for the decoder.
questions = list(map(clean_and_word_segmentation, sorted_ques))
answers = [f'<START> {clean_and_word_segmentation(answ)} <END>' for answ in sorted_ans]

questions[:2], answers[:2]

(['có bao_nhiêu loại tour khác nhau', 'tour hạng vip có_giá bao_nhiêu'],
 ['<START> có ba loại tour khác nhau hạng vip hạng tiêu_chuẩn và hạng phổ_thông <END>',
  '<START> tour hạng vip có_giá là 900 đơn_vị_tiền_tệ <END>'])

In [None]:
# Fit one shared tokenizer on questions and answers.
# filters='' and lower=False: the text is already cleaned, and the word-segmentation
# underscores as well as the <START>/<END> markers must be kept as they are.
tokenizer = Tokenizer(lower=False, filters='')
tokenizer.fit_on_texts([*questions, *answers])

In [None]:
# for word_index in tokenizer.word_index.items():
#     print(word_index)

In [None]:
# One extra slot on top of the tokenizer vocabulary: id 0 is what the padded
# sequences are filled with (and what the embeddings mask via mask_zero=True).
VOCAB_SIZE = 1 + len(tokenizer.word_index)
print(f'Vocabulary size : {VOCAB_SIZE}')

Vocabulary size : 5043


In [None]:
np.shape(questions)

(7792,)

In [None]:
# Encoder input: map each question to word ids and pad with 0 at the end
# up to a fixed length.
# NOTE(review): questions were filtered at <= 30 words but are fixed to 15 tokens
# here — confirm that shortening longer questions is intended.
maxlen_questions = 15
tokenized_questions = tokenizer.texts_to_sequences(questions)
encoder_inp = pad_sequences(
    tokenized_questions,
    maxlen=maxlen_questions,
    padding='post',
)

# Preview: array shape, raw text, id sequences and padded rows of the first 5 samples.
for preview in (encoder_inp.shape, questions[:5], tokenized_questions[:5], encoder_inp[:5]):
    print(preview)

(7792, 15)
['có bao_nhiêu loại tour khác nhau', 'tour hạng vip có_giá bao_nhiêu', 'giá của tour hạng tiêu_chuẩn là gì', 'tour hạng phổ_thông có_giá là bao_nhiêu', 'tour nào có số_lượng người tham_gia tối_đa']
[[4, 26, 211, 16, 165, 157], [16, 40, 121, 141, 26], [65, 38, 16, 40, 126, 10, 7], [16, 40, 168, 141, 10, 26], [16, 13, 4, 209, 29, 218, 281]]
[[  4  26 211  16 165 157   0   0   0   0   0   0   0   0   0]
 [ 16  40 121 141  26   0   0   0   0   0   0   0   0   0   0]
 [ 65  38  16  40 126  10   7   0   0   0   0   0   0   0   0]
 [ 16  40 168 141  10  26   0   0   0   0   0   0   0   0   0]
 [ 16  13   4 209  29 218 281   0   0   0   0   0   0   0   0]]


In [None]:
# Decoder input: the full "<START> ... <END>" answers as word ids, padded with 0
# at the end up to the length of the longest answer.
tokenized_answers = tokenizer.texts_to_sequences(answers)
maxlen_answers = np.max(list(map(len, tokenized_answers)))
decoder_inp = pad_sequences(
    tokenized_answers,
    maxlen=maxlen_answers,
    padding='post',
)

# Preview: array shape, then text / ids / padded row of the first answer.
for preview in (decoder_inp.shape, answers[0], tokenized_answers[0], decoder_inp[0]):
    print(preview)

(7792, 50)
<START> có ba loại tour khác nhau hạng vip hạng tiêu_chuẩn và hạng phổ_thông <END>
[1, 4, 287, 211, 16, 165, 157, 40, 121, 40, 126, 42, 40, 168, 2]
[  1   4 287 211  16 165 157  40 121  40 126  42  40 168   2   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0]


In [None]:
# Decoder target for teacher forcing: every answer shifted left by one step
# (the leading <START> id is dropped), so position t of the target is the token
# that follows position t of `decoder_inp`.
# The shifted sequences go into a NEW list instead of overwriting
# `tokenized_answers` in place; otherwise re-running this cell would strip one
# more token from every answer each time.
target_sequences = [sequence[1:] for sequence in tokenized_answers]

padded_answers = pad_sequences(target_sequences, maxlen=maxlen_answers, padding='post')
# One-hot encode the targets for the 'categorical_crossentropy' loss.
# NOTE(review): this tensor has shape (samples, maxlen_answers, VOCAB_SIZE) and is
# therefore very large; integer targets with a sparse categorical loss would avoid
# materialising it — consider switching both together.
decoder_final_output = to_categorical(padded_answers, VOCAB_SIZE)

print(decoder_final_output.shape)
print(target_sequences[0])
print(padded_answers[0])
print(decoder_final_output.shape)

(7792, 50, 5043)
[4, 287, 211, 16, 165, 157, 40, 121, 40, 126, 42, 40, 168, 2]
[  4 287 211  16 165 157  40 121  40 126  42  40 168   2   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0]
(7792, 50, 5043)


In [None]:
VOCAB_SIZE

5043

In [None]:
# ---- Encoder ----
# Variable-length sequence of word ids -> 200-dim embeddings (id 0 is masked)
# -> LSTM. Only the final hidden/cell states are kept; the LSTM output is discarded.
enc_inputs = Input(shape=(None,))
enc_embedding = Embedding(VOCAB_SIZE, 200, mask_zero=True)(enc_inputs)
_, state_h, state_c = LSTM(200, return_state=True)(enc_embedding)
enc_states = [state_h, state_c]

# ---- Decoder ----
# Separate embedding for the decoder input; the LSTM layer object is kept in
# `dec_lstm` because make_inference_models() calls the same layer again.
dec_inputs = Input(shape=(None,))
dec_embedding = Embedding(VOCAB_SIZE, 200, mask_zero=True)(dec_inputs)
dec_lstm = LSTM(200, return_state=True, return_sequences=True)

# During training the decoder starts from the encoder's final states and returns
# an output for every time step; its own states are not needed here.
dec_outputs, _, _ = dec_lstm(dec_embedding, initial_state=enc_states)
# Softmax over the vocabulary at every time step (layer also reused at inference).
dec_dense = Dense(VOCAB_SIZE, activation='softmax')
output = dec_dense(dec_outputs)

In [None]:
# Training model: (question ids, decoder input ids) -> per-step word distribution.
model = Model(inputs=[enc_inputs, dec_inputs], outputs=output)
model.compile(
    optimizer=RMSprop(),
    loss='categorical_crossentropy',
)
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 embedding (Embedding)       (None, None, 200)            1008600   ['input_1[0][0]']             
                                                                                                  
 embedding_1 (Embedding)     (None, None, 200)            1008600   ['input_2[0][0]']             
                                                                                              

In [None]:
# Train on (encoder input, decoder input) -> one-hot shifted answers,
# then save the trained model to an HDF5 file.
model.fit(
    x=[encoder_inp, decoder_inp],
    y=decoder_final_output,
    batch_size=100,
    epochs=100,
)
model.save('sequence2sequence_lstm.h5')

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [None]:
import pickle

# Serialise the trained model with pickle. The file is opened in a `with` block
# so the handle is flushed and closed even if pickling raises.
# NOTE(review): the model is already saved via model.save(...) after training;
# confirm this pickled copy is still needed.
with open('chatbot.sav', 'wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
def make_inference_models():
    """Build the encoder and decoder models used for step-by-step inference.

    Both models reuse the layers/tensors created for the training graph
    (`enc_inputs`, `enc_states`, `dec_inputs`, `dec_embedding`, `dec_lstm`,
    `dec_dense`), so they share the trained weights.

    Prints a summary of each model.

    Returns:
        (enc_model, dec_model):
            enc_model maps `enc_inputs` to the encoder states [h, c];
            dec_model maps [dec_inputs, h, c] to [word probabilities, new h, new c].
    """
    # The decoder LSTM states become explicit inputs so they can be fed back
    # between decoding steps (200 = LSTM size used in the training graph).
    prev_h = Input(shape=(200,))
    prev_c = Input(shape=(200,))

    lstm_sequence, next_h, next_c = dec_lstm(dec_embedding,
                                             initial_state=[prev_h, prev_c])
    word_probs = dec_dense(lstm_sequence)

    decoder_inference = Model(
        inputs=[dec_inputs, prev_h, prev_c],
        outputs=[word_probs, next_h, next_c])
    print('Inference decoder:')
    decoder_inference.summary()

    print('Inference encoder:')
    encoder_inference = Model(inputs=enc_inputs, outputs=enc_states)
    encoder_inference.summary()

    return encoder_inference, decoder_inference

def str_to_tokens(sentence):
    """Convert a raw user sentence into a padded encoder input.

    The sentence is cleaned and word-segmented the same way as the training
    questions, each known word is mapped to its id in `tokenizer.word_index`,
    and the id list is padded at the end to `maxlen_questions`.

    Words that are not in the vocabulary are skipped. (The previous check
    `result != ''` never filtered anything, so `None` was appended for unknown
    words and then handed to `pad_sequences`.)

    Args:
        sentence: raw question text.

    Returns:
        Whatever `pad_sequences` returns for the single id sequence.
    """
    words = clean_and_word_segmentation(sentence).split()
    word_index = tokenizer.word_index
    tokens_list = [word_index[word] for word in words if word in word_index]
    return pad_sequences([tokens_list], maxlen=maxlen_questions, padding='post')

# Build the inference encoder/decoder once; the call also prints both model summaries.
enc_model, dec_model = make_inference_models()

Inference decoder:
Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_2 (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 embedding_1 (Embedding)     (None, None, 200)            1008600   ['input_2[0][0]']             
                                                                                                  
 input_5 (InputLayer)        [(None, 200)]                0         []                            
                                                                                                  
 input_6 (InputLayer)        [(None, 200)]                0         []                            
                                                                         

In [None]:
def _decode_answer(question):
    """Greedy-decode the bot's reply to `question`.

    The question is encoded with `enc_model`; the decoder is then run one token
    at a time starting from <START>, always picking the most probable word,
    until <END> is produced or the step limit is reached.

    Returns:
        The decoded words, each followed by a single space.
    """
    # Reverse lookup built once per question instead of scanning the whole
    # vocabulary at every decoding step.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    states_values = enc_model.predict(str_to_tokens(question))
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = tokenizer.word_index['<START>']

    decoded_translation = ''
    # Bound the number of decoding STEPS (not emitted words): if the model
    # predicts id 0 (padding, no word) nothing is emitted, and a word-count
    # limit alone would never stop the loop.
    for _ in range(maxlen_answers + 1):
        dec_outputs, h, c = dec_model.predict([target_seq] + states_values)
        sampled_word_index = int(np.argmax(dec_outputs[0, -1, :]))
        sampled_word = index_to_word.get(sampled_word_index)

        if sampled_word == '<END>':
            break
        if sampled_word is not None:
            decoded_translation += f'{sampled_word} '

        # Feed the sampled token and the updated states into the next step.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_word_index
        states_values = [h, c]
    return decoded_translation


def chatbot(input=None):
    """Run the chat loop and print the bot's answers.

    Args:
        input: where questions come from.
            * None (default): questions are read interactively with the
              builtin `input()`; an empty line ends the chat.
            * a callable: called with no arguments to obtain each question;
              returning '' ends the chat.
            * a string: that single question is answered, then the chat ends.

    Previously the parameter was required (so `chatbot()` raised TypeError) and
    was never re-read inside the loop (so any non-empty string looped forever).
    """
    # The parameter name shadows the builtin, so fetch the real one explicitly.
    import builtins

    if input is None:
        ask = builtins.input
    elif callable(input):
        ask = input
    else:
        pending = iter([input])
        ask = lambda: next(pending, '')

    print('Bot: Xin chào!')

    while True:
        input_question = ask()

        if input_question == '':
            print('Bot answer: bye')
            break

        decoded_translation = _decode_answer(input_question)
        print('User: ', input_question)
        print('Bot answer:', decoded_translation, '\n')

In [None]:
# chat with bot
chatbot()

Bot: Xin chào!
User:  tên gì
Bot answer: hùng  

User:  có học đại học không
Bot answer: có  

User:  có ở kí túc xá trường không?
Bot answer: có  

User:  trường ở đâu
Bot answer: quận 7  

User:  học mấy năm rồi?
Bot answer: 2 năm  

User:  cao nhiêu?
Bot answer: 1 m7  

Bot answer: bye


In [None]:
# Save both inference models to HDF5 files so they can be loaded without retraining.
for inference_model, h5_path in ((enc_model, "enc_model.h5"), (dec_model, "dec_model.h5")):
    inference_model.save(h5_path)



In [None]:
# enc_model = keras.models.load_model("./enc_model.h5")
# dec_model =  keras.models.load_model("./dec_model.h5")

In [None]:
# Save the word -> id mapping; `with` guarantees the file is flushed and closed.
with open("tokenizer_word_index.sav", "wb") as word_index_file:
    pickle.dump(tokenizer.word_index, word_index_file)

In [None]:
# Save the maximum answer length (used as the decoding limit in chatbot);
# `with` guarantees the file is flushed and closed.
with open("maxlen_answers.sav", "wb") as maxlen_file:
    pickle.dump(maxlen_answers, maxlen_file)