# Module

In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Dataset

In [None]:
# Download the chatbot question/answer corpus straight from GitHub.
# The preview printed below shows three columns: Q (question), A (answer), label.
df = pd.read_csv('https://raw.githubusercontent.com/songys/Chatbot_data/master/ChatbotData.csv')
df.head()

Unnamed: 0,Q,A,label
0,12시 땡!,하루가 또 가네요.,0
1,1지망 학교 떨어졌어,위로해 드립니다.,0
2,3박4일 놀러가고 싶다,여행은 언제나 좋죠.,0
3,3박4일 정도 놀러가고 싶다,여행은 언제나 좋죠.,0
4,PPL 심하네,눈살이 찌푸려지죠.,0


In [None]:
df.shape

(11823, 3)

In [None]:
df.isnull().sum()

Q        0
A        0
label    0
dtype: int64

# Data preprocessing

In [None]:
# Pull the two text columns out in one vectorised step instead of one
# scalar ``iloc`` lookup per row. Positional access is kept on purpose:
# column 0 feeds ``texts`` (questions), column 1 feeds ``pairs`` (answers).
texts = df.iloc[:, 0].tolist()
pairs = df.iloc[:, 1].tolist()

In [None]:
# Strip special characters.
# Matches every character that is NOT an ASCII digit, a Hangul compatibility
# jamo, a precomposed Hangul syllable or a space.
_DISALLOWED_CHARS = re.compile(r'[^0-9ㄱ-ㅎㅏ-ㅣ가-힣 ]')


def clean_sentence(sentence):
    """Return ``sentence`` keeping only digits, Hangul characters and spaces.

    Latin letters, punctuation and every other symbol are deleted (not
    replaced by a space), so surrounding spaces are left exactly as they were.
    """
    return _DISALLOWED_CHARS.sub('', sentence)

In [None]:
!pip install konlpy



In [None]:
from konlpy.tag import Okt
okt = Okt()

In [None]:
# Morpheme segmentation
def process_morph(sentence):
    """Split ``sentence`` with ``okt.morphs`` and re-join the pieces with single spaces."""
    morphemes = okt.morphs(sentence)
    return ' '.join(morphemes)

In [None]:
def clean_and_morph(sentence, is_question=True):
    """Clean a sentence, split it into morphemes and add decoder markers for answers.

    Args:
        sentence: Raw text to process.
        is_question: Selects the return shape (see below).

    Returns:
        For a question, the processed sentence as a single string.
        For an answer, a tuple ``(decoder_input, decoder_target)`` where the
        decoder input is prefixed with ``'<START> '`` and the decoder target
        is suffixed with ``' <END>'``.
    """
    # Character filtering first, then morpheme segmentation.
    processed = process_morph(clean_sentence(sentence))

    if not is_question:
        # START goes on the decoder input, END on the decoder output.
        return ('<START> ' + processed, processed + ' <END>')
    return processed

In [None]:
def preprocess(texts, pairs):
    """Run ``clean_and_morph`` over every question and every answer.

    Args:
        texts: Iterable of raw question strings.
        pairs: Iterable of raw answer strings.

    Returns:
        A tuple ``(questions, answer_in, answer_out)`` of three lists:
        processed questions, decoder inputs (``<START>``-prefixed answers)
        and decoder targets (``<END>``-suffixed answers).
    """
    # Questions: one processed string per input text.
    questions = [clean_and_morph(text, is_question=True) for text in texts]

    # Answers: each call yields a (decoder_input, decoder_target) pair.
    answer_pairs = [clean_and_morph(pair, is_question=False) for pair in pairs]
    answer_in = [decoder_in for decoder_in, _ in answer_pairs]
    answer_out = [decoder_out for _, decoder_out in answer_pairs]

    return questions, answer_in, answer_out

In [None]:
questions, answer_in, answer_out = preprocess(texts, pairs)

In [None]:
print(questions[:5])

['12시 땡', '1 지망 학교 떨어졌어', '3 박 4일 놀러 가고 싶다', '3 박 4일 정도 놀러 가고 싶다', '심하네']


In [None]:
print(answer_in[:5])

['<START> 하루 가 또 가네요', '<START> 위로 해 드립니다', '<START> 여행 은 언제나 좋죠', '<START> 여행 은 언제나 좋죠', '<START> 눈살 이 찌푸려지죠']


In [None]:
print(answer_out[:5])

['하루 가 또 가네요 <END>', '위로 해 드립니다 <END>', '여행 은 언제나 좋죠 <END>', '여행 은 언제나 좋죠 <END>', '눈살 이 찌푸려지죠 <END>']


In [None]:
all_sentences = questions + answer_in + answer_out

In [None]:
# Tokenizer
# filters='' disables the tokenizer's punctuation stripping and lower=False its
# lower-casing, so the '<START>' / '<END>' markers survive as whole tokens.
# Words that were not seen while fitting are mapped to '<OOV>'.
tokenizer = Tokenizer(filters='', lower=False, oov_token='<OOV>')

In [None]:
tokenizer.fit_on_texts(all_sentences)

In [None]:
# Peek at the first vocabulary entries. The limit is tested after printing,
# so the entries with indices 1 through 11 are shown before the loop stops.
for word, idx in tokenizer.word_index.items():
    print(f'{word}\t\t => \t{idx}')
    if idx > 10:
        break

<OOV>		 => 	1
<START>		 => 	2
<END>		 => 	3
이		 => 	4
을		 => 	5
거		 => 	6
가		 => 	7
예요		 => 	8
사람		 => 	9
요		 => 	10
에		 => 	11


In [None]:
len(tokenizer.word_index)

12637

In [None]:
# Encode the three corpora (questions, decoder inputs, decoder targets) as
# lists of vocabulary indices using the fitted tokenizer.
question_sequence, answer_in_sequence, answer_out_sequence = map(
    tokenizer.texts_to_sequences,
    (questions, answer_in, answer_out),
)

In [None]:
# Bring every sequence to exactly 30 tokens: shorter ones are padded at the
# end, longer ones are cut at the end.
question_padded, answer_in_padded, answer_out_padded = (
    pad_sequences(sequence, maxlen=30, truncating='post', padding='post')
    for sequence in (question_sequence, answer_in_sequence, answer_out_sequence)
)

In [None]:
question_padded.shape

(11823, 30)

In [None]:
print('인코더의 입력의 크기(shape) :',question_padded.shape)
print('디코더의 입력의 크기(shape) :',answer_in_padded.shape)
print('디코더의 레이블의 크기(shape) :',answer_out_padded.shape)

인코더의 입력의 크기(shape) : (11823, 30)
디코더의 입력의 크기(shape) : (11823, 30)
디코더의 레이블의 크기(shape) : (11823, 30)


In [None]:
# One more than the number of known tokens: the tokenizer's indices start at 1
# (see the listing printed earlier), which leaves index 0 free — presumably for
# the zero padding added by pad_sequences.
vocab_size = len(tokenizer.word_index) + 1
print("단어 집합의 크기 : ",vocab_size)

단어 집합의 크기 :  12638


In [None]:
word_to_index = tokenizer.word_index
index_to_word = tokenizer.index_word

In [None]:
index_to_word[1]

'<OOV>'

# 모형

In [None]:
VOCAB_SIZE = len(tokenizer.word_index)+1

## Encoder

In [None]:
class Encoder(tf.keras.Model):
    """Embedding -> dropout -> LSTM encoder.

    Besides the final LSTM states it also returns the per-step LSTM outputs,
    which the decoder's attention layer consumes.
    """

    def __init__(self, units, vocab_size, embedding_dim, time_steps):
        """Create the layers.

        Args:
            units: Size of the LSTM state.
            vocab_size: Number of rows in the embedding table.
            embedding_dim: Width of each embedding vector.
            time_steps: Passed to the embedding layer as ``input_length``.
        """
        super().__init__()
        self.embedding = tf.keras.layers.Embedding(
            vocab_size,
            embedding_dim,
            input_length=time_steps,
            name='Embedding',
        )
        self.dropout = tf.keras.layers.Dropout(0.2, name='Dropout')
        # (attention) return_sequences=True so every time step's output is
        # available, not only the last one.
        self.lstm = tf.keras.layers.LSTM(
            units,
            return_state=True,
            return_sequences=True,
            name='LSTM',
        )

    def call(self, inputs):
        """Encode token indices.

        Returns:
            ``(sequence_output, [hidden_state, cell_state])`` — the LSTM's
            per-step outputs and its final states.
        """
        embedded = self.dropout(self.embedding(inputs))
        sequence_output, hidden_state, cell_state = self.lstm(embedded)
        # (attention) the per-step outputs are returned alongside the states.
        return sequence_output, [hidden_state, cell_state]

## Decoder

In [None]:
class Decoder(tf.keras.Model):
    """LSTM decoder that attends over the encoder outputs before predicting tokens."""

    def __init__(self, units, vocab_size, embedding_dim, time_steps):
        """Create the layers.

        Args:
            units: Size of the LSTM state.
            vocab_size: Number of rows in the embedding table and width of
                the softmax output.
            embedding_dim: Width of each embedding vector.
            time_steps: Passed to the embedding layer as ``input_length``.
        """
        super().__init__()
        self.embedding = tf.keras.layers.Embedding(
            vocab_size,
            embedding_dim,
            input_length=time_steps,
            name='Embedding',
        )
        self.dropout = tf.keras.layers.Dropout(0.2, name='Dropout')
        self.lstm = tf.keras.layers.LSTM(
            units,
            return_state=True,
            return_sequences=True,
            name='LSTM',
        )
        self.attention = tf.keras.layers.Attention(name='Attention')
        self.dense = tf.keras.layers.Dense(vocab_size, activation='softmax', name='Dense')

    def call(self, inputs, initial_state):
        """Decode one or more steps.

        Args:
            inputs: Pair ``(encoder_inputs, decoder_inputs)``; the first item
                is what the attention layer attends over (the caller passes
                the encoder's per-step outputs), the second the decoder token
                indices.
            initial_state: ``[hidden_state, cell_state]`` used to start the LSTM.

        Returns:
            ``(probabilities, hidden_state, cell_state)`` — the softmax output
            for every decoder step and the LSTM's final states.
        """
        # (attention) the encoder outputs travel in together with the tokens.
        encoder_inputs, decoder_inputs = inputs

        embedded = self.dropout(self.embedding(decoder_inputs))
        x, hidden_state, cell_state = self.lstm(embedded, initial_state=initial_state)

        # (attention) Build the sequence of *previous* hidden states: the
        # initial hidden state followed by every LSTM output except the last,
        # so position t holds the state that preceded step t.
        first_state = initial_state[0][:, tf.newaxis, :]
        key_value = tf.concat([first_state, x[:, :-1, :]], axis=1)

        # Attend over the encoder outputs; ``key_value`` is passed in the
        # first (query) position of the attention layer's input list.
        attention_matrix = self.attention([key_value, encoder_inputs])

        # Join the attention result with the decoder outputs feature-wise.
        merged = tf.concat([x, attention_matrix], axis=-1)

        return self.dense(merged), hidden_state, cell_state

In [None]:
class Seq2Seq(tf.keras.Model):
    """Encoder/decoder wrapper with a teacher-forced training path and a greedy inference path.

    The ``training`` flag of ``call`` selects the path:

    * ``training=True``  — ``inputs`` is ``(encoder_inputs, decoder_inputs)``
      and the decoder output for every step is returned.
    * ``training=False`` — ``inputs`` is the encoder input only; tokens are
      generated one at a time starting from ``start_token``. This path builds
      a ``(1, 1)`` start tensor and reshapes the result to
      ``(1, time_steps)``, i.e. it handles a single sequence per call.
    """

    def __init__(self, units, vocab_size, embedding_dim, time_steps, start_token, end_token):
        """Store the decoding settings and build the encoder and decoder.

        Args:
            units: LSTM state size for both sub-models.
            vocab_size: Vocabulary size for both sub-models.
            embedding_dim: Embedding width for both sub-models.
            time_steps: Maximum number of tokens generated at inference time.
            start_token: Index fed to the decoder as the first input.
            end_token: Index that stops generation once predicted.
        """
        super(Seq2Seq, self).__init__()
        self.start_token = start_token
        self.end_token = end_token
        self.time_steps = time_steps

        self.encoder = Encoder(units, vocab_size, embedding_dim, time_steps)
        self.decoder = Decoder(units, vocab_size, embedding_dim, time_steps)


    def call(self, inputs,training=False,mask=None):
        """Run the training or the inference path (see the class docstring).

        ``mask`` is accepted for signature compatibility and is not used.
        """
        if training:
            encoder_inputs, decoder_inputs = inputs
            # (attention) the encoder also returns its per-step outputs
            encoder_outputs, context_vector = self.encoder(encoder_inputs)
            # (attention) the decoder receives those outputs with its tokens
            decoder_outputs, _, _ = self.decoder((encoder_outputs, decoder_inputs), initial_state=context_vector)
            return decoder_outputs
        else:
            x = inputs
            # (attention) the encoder also returns its per-step outputs
            encoder_outputs, context_vector = self.encoder(x)
            # Token indices are integers. Use int32 from the start so the loop
            # variable keeps one dtype: every later iteration feeds back the
            # int32 argmax result.
            target_seq = tf.constant([[self.start_token]], dtype=tf.int32)
            results = tf.TensorArray(tf.int32, self.time_steps)

            for i in tf.range(self.time_steps):
                decoder_output, decoder_hidden, decoder_cell = self.decoder((encoder_outputs, target_seq), initial_state=context_vector)
                # Greedy choice: most probable token of this step.
                decoder_output = tf.cast(tf.argmax(decoder_output, axis=-1), dtype=tf.int32)
                decoder_output = tf.reshape(decoder_output, shape=(1, 1))
                results = results.write(i, decoder_output)

                # The end token is written before stopping, so it is part of
                # the returned sequence.
                if decoder_output == self.end_token:
                    break

                # Feed the prediction and the new states into the next step.
                target_seq = decoder_output
                context_vector = [decoder_hidden, decoder_cell]

            return tf.reshape(results.stack(), shape=(1, self.time_steps))

## 변환된 index를 다시 단어로 변환

In [None]:
def convert_index_to_text(indexs, end_token, index_word=None):
    """Turn a sequence of token indices back into a space-separated sentence.

    Conversion stops at the first occurrence of ``end_token`` (which is not
    emitted). Every processed index contributes its word followed by one
    space; an index that is not positive or is missing from the vocabulary
    contributes an empty word, i.e. just the space.

    Args:
        indexs: Iterable of integer token indices.
        end_token: Index that terminates the sentence.
        index_word: Optional mapping from index to word. Defaults to the
            module-level ``tokenizer.index_word``.

    Returns:
        The reconstructed sentence; each word is followed by a space, so a
        non-empty result ends with a trailing space.
    """
    if index_word is None:
        index_word = tokenizer.index_word

    words = []
    for index in indexs:
        if index == end_token:
            # End marker reached: stop converting.
            break
        # ``.get`` instead of ``[...]``: an unknown index must fall back to
        # an empty word rather than raise KeyError.
        word = index_word.get(index) if index > 0 else None
        words.append(word if word is not None else '')

    return ''.join(word + ' ' for word in words)

# 학습

In [None]:
# Special-token indices, looked up in the fitted tokenizer.
START_TOKEN = tokenizer.word_index['<START>']
END_TOKEN = tokenizer.word_index['<END>']

# Model dimensions.
UNITS = 128
EMBEDDING_DIM = 100
TIME_STEPS = 30  # same value as the maxlen=30 used when padding the sequences

# Sizes derived from the data.
VOCAB_SIZE = len(tokenizer.word_index) + 1
DATA_LENGTH = len(questions)
SAMPLE_SIZE = 3

In [None]:
model = Seq2Seq(UNITS, VOCAB_SIZE, EMBEDDING_DIM, TIME_STEPS, START_TOKEN, END_TOKEN)

In [None]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['acc'])

In [None]:
# Write the weights to disk each time the training loss reaches a new best.
checkpoint_path = 'training_checkpoint.h5'
mc = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    monitor='loss',
    save_best_only=True,
    save_weights_only=True,
    verbose=1,
)

# Stop training once the training loss has not improved for 3 epochs.
es = tf.keras.callbacks.EarlyStopping(
    monitor='loss',
    mode='min',
    patience=3,
    verbose=1,
)

In [None]:
model.fit([question_padded, answer_in_padded],answer_out_padded,epochs=350,batch_size=64, callbacks=[mc,es])

Epoch 1/350
Epoch 1: loss improved from inf to 2.18599, saving model to training_checkpoint.ckpt
Epoch 2/350
Epoch 2: loss improved from 2.18599 to 1.28227, saving model to training_checkpoint.ckpt
Epoch 3/350
Epoch 3: loss improved from 1.28227 to 1.21081, saving model to training_checkpoint.ckpt
Epoch 4/350
Epoch 4: loss improved from 1.21081 to 1.16675, saving model to training_checkpoint.ckpt
Epoch 5/350
Epoch 5: loss improved from 1.16675 to 1.12749, saving model to training_checkpoint.ckpt
Epoch 6/350
Epoch 6: loss improved from 1.12749 to 1.08598, saving model to training_checkpoint.ckpt
Epoch 7/350
Epoch 7: loss improved from 1.08598 to 1.04612, saving model to training_checkpoint.ckpt
Epoch 8/350
Epoch 8: loss improved from 1.04612 to 1.00993, saving model to training_checkpoint.ckpt
Epoch 9/350
Epoch 9: loss improved from 1.00993 to 0.97732, saving model to training_checkpoint.ckpt
Epoch 10/350
Epoch 10: loss improved from 0.97732 to 0.94728, saving model to training_checkpoi

<keras.callbacks.History at 0x7fc708daf730>

In [None]:
def make_prediction(model, question_inputs):
    """Call ``model`` in inference mode and return its output as a flat NumPy array.

    Args:
        model: Callable accepting ``inputs=`` and ``training=`` keywords.
        question_inputs: Value forwarded to the model as ``inputs``.

    Returns:
        The model output converted with ``np.asarray`` and flattened to 1-D.
    """
    predicted = model(inputs=question_inputs, training=False)
    # Collapse whatever shape the model produced into a single axis.
    return np.asarray(predicted).reshape(-1)

In [None]:
# Pre-processing for a free-text question typed by the user
def make_question(sentence):
    """Convert one raw question into a padded index array ready for the model.

    The sentence is cleaned and morpheme-split as a question, encoded with
    the fitted tokenizer and padded/truncated at the end to 30 tokens.
    """
    cleaned = clean_and_morph(sentence)
    # The tokenizer expects a list of texts, hence the one-element list.
    encoded = tokenizer.texts_to_sequences([cleaned])
    return pad_sequences(encoded, maxlen=30, truncating='post', padding='post')

In [None]:
def run_chatbot(question):
    """Answer ``question``: pre-process it, run the model and decode the predicted indices."""
    predicted_indices = make_prediction(model, make_question(question))
    return convert_index_to_text(predicted_indices, END_TOKEN)

In [None]:
# Interactive loop: answer each line the user types until they enter 'q'.
user_input = input('<< 말을 걸어 보세요!\n')
while user_input != 'q':
    print('>> 챗봇 응답: {}'.format(run_chatbot(user_input)))
    user_input = input('<< 말을 걸어 보세요!\n')

<< 말을 걸어 보세요!
커피를 마시고 싶습니다
>> 챗봇 응답: 카페인 이 필요한 시간 인가 봐요 
<< 말을 걸어 보세요!
q


In [None]:
model.save_weights('training_checkpoint.h5')

In [None]:
model.load_weights('training_checkpoint.h5')

In [None]:
!nvidia-smi

Thu Jul 20 14:20:34 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   41C    P0    40W / 300W |   2990MiB / 16384MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces