In [1]:
#https://github.com/Pawandeep-prog/chatbot

import pandas as pd
import os
import glob
import numpy as np
from keras.layers import Input,Embedding,LSTM,Dense
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Flatten
from tensorflow.keras import layers , activations , models , preprocessing , utils
import tensorflow as tf
from  sklearn.model_selection import train_test_split
import re

In [2]:
path = os.getcwd()
datapath = glob.glob(os.path.join(path,"data/*.txt"))
fulldata = []
for i in datapath:
    with open(i,'r',encoding='utf-8') as f:
#     lines = [i.strip() for i in f.readlines()]
        lines = f.readlines()
    for j in lines:
        fulldata.append(j)
print(fulldata[:5])

['Thích đánh lộn không? __eou__ Ngon nhà vô __eou__\n', 'Solo yasua không __eou__ Chấp lun 2 mạng đầu __eou__\n', 'Mai đi picnic không? __eou__ Mai bận học rồi __eou__\n', 'Mai học ca mấy vậy? __eou__ Mai học ca 3 __eou__\n', 'Còn tiền không? __eou__ Còn chết liền __eou__\n']


In [3]:
data = [i.split('__eou__') for i in fulldata]
data_question = [i[0] for i in data]
data_answer = [i[1] for i in data]
print(len(data_question))
print(data_answer[:5])

5855
[' Ngon nhà vô ', ' Chấp lun 2 mạng đầu ', ' Mai bận học rồi ', ' Mai học ca 3 ', ' Còn chết liền ']


In [4]:
def clean_text(text):
  text_ = text.lower()
  text_ = re.sub(r'([^\s\w]|_)+', '', text_)
  return text_.strip()

In [5]:
#Làm sạch text
for i in range(len(data_question)):
  data_question[i] =clean_text(data_question[i])

for i in range(len(data_answer)):
  data_answer[i] = clean_text(data_answer[i])

In [6]:
# đếm số lần xuất hiện của 1 từ trong data_question và data_answer
word2count = {}

for i in data_question:
  for word in i.split():
    if word not in word2count:
      word2count[word] = 1
    else:
      word2count[word] +=1

for i in data_answer:
  for word in i.split():
    if word not in word2count:
      word2count[word] = 1
    else:
      word2count[word] +=1


In [7]:
vocab = {}
word_index = 0

for word, count in word2count.items():
    vocab[word] = word_index
    word_index += 1

In [8]:
# Gắn <begin> và <end>
for i in range(len(data_answer)):
  data_answer[i] = '<begin> ' + data_answer[i] + ' <end>'
print(data_answer[:5])

['<begin> ngon nhà vô <end>', '<begin> chấp lun 2 mạng đầu <end>', '<begin> mai bận học rồi <end>', '<begin> mai học ca 3 <end>', '<begin> còn chết liền <end>']


In [9]:
tokens = ['<pad>','<end>','<begin>','<out>']
x = len(vocab)
for token in tokens:
    vocab[token] = x
    x += 1
    
vocab['ạ'] = vocab['<pad>']
vocab['<pad>'] = 0

In [10]:
inv_vocab = {w:v for v,w in vocab.items()}

In [11]:
encoder_inp = []
for i in data_question:
  lst = []
  for word in i.split():
    if word not in vocab:
      lst.append(vocab['<out>'])
    else:
      lst.append(vocab[word])
  encoder_inp.append(lst)

encoder_inp = pad_sequences(encoder_inp,13, padding= 'post', truncating= 'post')
encoder_inp.shape

(5855, 13)

In [12]:
decoder_inp = []
for i in data_answer:
  lst = []
  for word in i.split():
    if word not in vocab:
      lst.append(vocab['<out>'])
    else:
      lst.append(vocab[word])
  decoder_inp.append(lst)

decoder_inp = pad_sequences(decoder_inp,13, padding= 'post', truncating= 'post')
decoder_inp.shape

(5855, 13)

In [13]:
decoder_final = []
for i in decoder_inp:
    decoder_final.append(i[1:]) 

decoder_final = pad_sequences(decoder_final,13, padding= 'post', truncating= 'post')
print(decoder_final.shape)

from keras.utils import to_categorical
decoder_final= to_categorical(decoder_final, len(vocab))
print(decoder_final.shape)

(5855, 13)
(5855, 13, 3549)


In [14]:
from keras.models import Model,Sequential
from keras.layers.wrappers import Bidirectional
from keras.layers import Dense,Conv1D,MaxPooling1D,Flatten,Embedding

enc_inp = Input(shape = (13,))
dec_inp = Input(shape = (13,))
vocab_size=len(vocab)

model = Sequential()
embed = Embedding(vocab_size+1,output_dim=50,input_length = 13,trainable=True)

enc_embed = embed(enc_inp)
enc_lstm = LSTM(400,return_sequences=True,return_state=True)
enc_op, h, c = enc_lstm(enc_embed)
enc_states = [h, c]

dec_embed = embed(dec_inp)
dec_lstm = LSTM(400, return_sequences=True, return_state=True)
dec_op, _, _ = dec_lstm(dec_embed, initial_state=enc_states)

dense = Dense(vocab_size, activation='softmax')

dense_op = dense(dec_op)

model = Model([enc_inp, dec_inp], dense_op)

model.compile(loss = 'categorical_crossentropy',optimizer = 'adam',metrics = ['accuracy'])
model.summary()


Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 13)]         0                                            
__________________________________________________________________________________________________
input_1 (InputLayer)            [(None, 13)]         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 13, 50)       177500      input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
lstm (LSTM)                     [(None, 13, 400), (N 721600      embedding[0][0]       

In [15]:
model.fit([encoder_inp, decoder_inp],decoder_final,epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x16e8de23af0>

In [32]:
## Seq2Seq
#Encoder model
enc_model = Model([enc_inp], enc_states)

#Decoder model
decoder_state_input_h = Input(shape=(400, ))
decoder_state_input_c = Input(shape=(400, ))

decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_outputs, state_h, state_c = dec_lstm(dec_embed, initial_state=decoder_states_inputs)

decoder_states = [state_h, state_c]

dec_model = Model([dec_inp] + decoder_states_inputs, [decoder_outputs] + decoder_states)

In [82]:
inv_vocab = {w:v for v,w in vocab.items()}
prepro1 = ""
while prepro1 != 'bye':
    prepro1 = str(input("You: "))

    prepro1 = clean_text(prepro1)

    prepro = [prepro1]

    txt = []

    for x in prepro:
        lst = []
        for y in x.split():
            try:
                lst.append(vocab[y])
            except:
                lst.append(vocab['<out>'])
    txt.append(lst)

    txt = pad_sequences(txt, 13, padding= 'post')

    stat = enc_model.predict(txt)

    empty_target_seq = np.zeros( (1,1) )
    empty_target_seq[0, 0] = vocab['<begin>']

    stop_condition = False
    decoded_translation = ''

    while not stop_condition:
        dec_outputs, h, c = dec_model.predict([empty_target_seq]+ stat)

        decoder_concat_input = dense(dec_outputs)

        sampled_word_index = np.argmax(decoder_concat_input[0,-1,:])

        sampled_word = inv_vocab[sampled_word_index] + ' '

        if sampled_word != '<end> ':
            decoded_translation += sampled_word
    
        if sampled_word == '<end> ' or len(decoded_translation.split()) > 15:
            stop_condition = True

        empty_target_seq = np.zeros( (1, 1) )
        empty_target_seq[0, 0] = sampled_word_index

        stat = [h,c]

    print("Bot: ", decoded_translation)
    print("=====================================")

You: bạn học gì
Bot:  minh hoc công nghê thông tin 
You: bạn học ở đâu
Bot:  minh hoc công nghê thông tin 
You: crush ai khôn
Bot:  có chứ 
You: cậu có thể nói gì đó tốt hơn không
Bot:  dạ còn ạ 
You: đi đâu đấy
Bot:  đi về nhà 
You: có thích đi du lịch không
Bot:  ok anh 
You: bạn thích đi du lịch ở đâu
Bot:  minh thich lord of the ring 
You: bye
Bot:  đi về nhà 
