<a href="https://colab.research.google.com/github/nureeee/DeepLearning/blob/main/Seq2Seq_%EA%B8%B0%EB%B3%B8_%EA%B5%AC%ED%98%84.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np
from tensorflow.keras.layers import LSTM

In [2]:
# Random toy batch used by the LSTM demos that follow.
sample_train = np.random.randn(
    1,  # N: number of samples in the batch
    4,  # L: sequence length (time steps)
    5,  # I: input features per time step
)

In [3]:
# With both flags switched off the layer returns a single tensor: the hidden
# state of the final time step, one row of 3 units per sample.
last_hidden_state = LSTM(
    units=3,
    return_sequences=False,
    return_state=False,
)(sample_train)
print(last_hidden_state)

tf.Tensor([[-0.06553418  0.05300762 -0.00716654]], shape=(1, 3), dtype=float32)


In [4]:
# return_state=True makes the layer return three tensors: its regular output,
# the final hidden state and the final cell state. Because return_sequences is
# False, the regular output is itself the final hidden state, so the first two
# printed values are identical.
hidden_states, last_hidden_state, last_cell_state = LSTM(
    units=3,
    return_sequences=False,
    return_state=True,
)(sample_train)

print('hidden_states : {}'.format(hidden_states))
print('last_hidden_state : {}'.format(last_hidden_state))
print('last_cell_state : {}'.format(last_cell_state))

hidden_states : [[ 0.13284302 -0.04286626  0.06677939]]
last_hidden_state : [[ 0.13284302 -0.04286626  0.06677939]]
last_cell_state : [[ 0.22402963 -0.11064757  0.15431193]]


In [5]:
# return_sequences=True keeps the hidden state of every time step, so the
# result gains a time axis between the batch axis and the unit axis.
hidden_states = LSTM(
    units=3,
    return_sequences=True,
    return_state=False,
)(sample_train)
print('hidden_states : {} / shape : {}'.format(hidden_states, hidden_states.shape))

hidden_states : [[[ 0.01692073 -0.02748816 -0.4977551 ]
  [ 0.17771466  0.07629572 -0.31636286]
  [-0.12964407  0.06626214 -0.07307925]
  [-0.06907882 -0.13199747 -0.173571  ]]] / shape : (1, 4, 3)


In [3]:
! pip install konlpy

Collecting konlpy
  Downloading konlpy-0.5.2-py2.py3-none-any.whl (19.4 MB)
[K     |████████████████████████████████| 19.4 MB 8.7 MB/s 
Collecting colorama
  Downloading colorama-0.4.4-py2.py3-none-any.whl (16 kB)
Collecting JPype1>=0.7.0
  Downloading JPype1-1.3.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (448 kB)
[K     |████████████████████████████████| 448 kB 56.3 MB/s 
Collecting beautifulsoup4==4.6.0
  Downloading beautifulsoup4-4.6.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 5.9 MB/s 
Installing collected packages: JPype1, colorama, beautifulsoup4, konlpy
  Attempting uninstall: beautifulsoup4
    Found existing installation: beautifulsoup4 4.6.3
    Uninstalling beautifulsoup4-4.6.3:
      Successfully uninstalled beautifulsoup4-4.6.3
Successfully installed JPype1-1.3.0 beautifulsoup4-4.6.0 colorama-0.4.4 konlpy-0.5.2


In [4]:
import random # 나중에 데이터 셔플링 할 예정
import tensorflow as tf
from konlpy.tag import Okt

In [5]:
# Shared configuration constants.
# Intended epoch count (the training cell visible in this notebook sets its
# own EPOCHS constant rather than reading this one).
num_epochs = 200
# Vocabulary cap: used for the tokenizer's num_words, both embedding tables
# and the width of the decoder's output layer.
vocab_size = 2_000

In [7]:
class Encoder(tf.keras.Model):
    """Encode a batch of token-id sequences into a final LSTM state.

    Token ids are embedded into 64-dimensional vectors and passed through a
    512-unit LSTM. Only the final hidden state and cell state are returned.
    """

    def __init__(self):
        super().__init__()

        # Embedding table with `vocab_size` rows (module-level constant) and
        # an embedding width of 64.
        self.emb = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=64)
        # return_state=True -> the layer yields (output, hidden state, cell state).
        self.lstm = tf.keras.layers.LSTM(
            units=512,
            return_sequences=False,
            return_state=True,
        )

    def call(self, x, training=False):
        """Return the final ``(hidden_state, cell_state)`` pair for ``x``.

        ``training`` is accepted but not used by this method.
        """
        embedded = self.emb(x)

        # With return_sequences=False the first output is the last hidden
        # state again, so it is discarded.
        _unused_output, hidden_state, cell_state = self.lstm(embedded)

        return hidden_state, cell_state


In [8]:
class Decoder(tf.keras.Model):
    """Decode token ids into per-step vocabulary distributions.

    The decoder embeds its input tokens (width 64), runs a 512-unit LSTM that
    starts from a caller-supplied state, and projects every step's hidden
    state onto `vocab_size` softmax probabilities.
    """

    def __init__(self):
        super().__init__()
        self.emb = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=64)
        # return_sequences=True keeps one hidden state per input step;
        # return_state=True additionally returns the final hidden/cell state.
        self.lstm = tf.keras.layers.LSTM(
            units=512,
            return_sequences=True,
            return_state=True,
        )

        self.dense = tf.keras.layers.Dense(units=vocab_size, activation='softmax')

    def call(self, inputs, training=False):
        """Run one decoding pass.

        Args:
            inputs: a 3-item sequence ``(token_ids, hidden_state, cell_state)``;
                the two states initialise the LSTM.
            training: accepted but not used by this method.

        Returns:
            ``(probabilities, hidden_state, cell_state)`` where the states are
            the LSTM's final states after consuming ``token_ids``.
        """
        token_ids, hidden_state, cell_state = inputs
        embedded = self.emb(token_ids)

        # step_outputs: the hidden state at every position of the sequence.
        step_outputs, hidden_state, cell_state = self.lstm(
            embedded, initial_state=[hidden_state, cell_state]
        )

        probabilities = self.dense(step_outputs)

        return probabilities, hidden_state, cell_state

In [9]:
class Seq2seq(tf.keras.Model):
    """Encoder-decoder model with teacher-forced training and greedy decoding.

    Args:
        sos: integer id of the start-of-sequence token fed to the decoder as
            the first input during inference.
        eos: integer id of the end-of-sequence token; greedy decoding stops
            once it is produced.
        max_len: maximum number of tokens generated during inference and the
            width of the returned id matrix. Defaults to 64, the length that
            was previously hard-coded.
    """

    def __init__(self, sos, eos, max_len=64):
        super(Seq2seq, self).__init__()
        self.sos = sos
        self.eos = eos
        self.max_len = max_len

        self.enc = Encoder()
        self.dec = Decoder()

    def call(self, inputs, training=False):
        """Run the model.

        Training mode (``training=True``): ``inputs`` is ``(x, y)`` where ``x``
        feeds the encoder and ``y`` is the decoder input sequence. Returns the
        decoder's per-step vocabulary probabilities.

        Inference mode: ``inputs`` is the encoder input only. The reshapes to
        ``(1, 1)`` mean a single sequence is decoded at a time. Returns an
        int32 tensor of shape ``(1, max_len)`` holding the generated ids.
        """
        if training:
            x, y = inputs  # x feeds the encoder, y feeds the decoder
            h, c = self.enc(x)
            # Decoder output has one distribution per decoder step:
            # (N, decoder length, vocab_size).
            y, _, __ = self.dec((y, h, c))
            return y
        else:
            x = inputs
            h, c = self.enc(x)

            # Start decoding from the start-of-sequence token as a (1, 1) batch.
            y = tf.convert_to_tensor(self.sos)
            y = tf.reshape(y, (1, 1))

            seq = tf.TensorArray(tf.int32, self.max_len)

            for idx in tf.range(self.max_len):

                # Feed the previous token and state, take the most likely token.
                y, h, c = self.dec([y, h, c])
                y = tf.cast(tf.argmax(y, axis=-1), dtype=tf.int32)

                y = tf.reshape(y, (1, 1))

                seq = seq.write(idx, y)

                # Compare a scalar so the loop condition is unambiguous.
                if y[0, 0] == self.eos:
                    break

            # NOTE(review): slots after an early break are never written; the
            # reshape relies on stack() still yielding max_len entries - confirm.
            return tf.reshape(seq.stack(), (1, self.max_len))
                

In [10]:
@tf.function
def train_step(model, inputs, labels, loss_object, optimizer, train_loss, train_accuracy):
    """Run one teacher-forced optimisation step and update the running metrics.

    Args:
        model: model called as ``model([inputs, decoder_inputs], training=True)``.
        inputs: encoder input batch.
        labels: 2-D batch of target id sequences. Everything but the last
            column is fed to the decoder; everything but the first column is
            the prediction target.
        loss_object: Keras loss object; must accept ``sample_weight`` when called.
        optimizer: optimizer used to apply the gradients.
        train_loss: metric accumulating the step loss.
        train_accuracy: metric accumulating token accuracy.

    Padded target positions are excluded from both the loss and the accuracy.
    Id 0 is treated as padding, matching the ``value=0`` that this notebook
    passes to ``pad_sequences``. Without the mask, the padding that fills most
    of each target sequence would dominate the reported accuracy.
    """
    output_labels = labels[:, 1:]
    shifted_labels = labels[:, :-1]

    # 1.0 for real tokens, 0.0 for padding.
    pad_mask = tf.cast(tf.not_equal(output_labels, 0), tf.float32)

    with tf.GradientTape() as tape:
        predictions = model([inputs, shifted_labels], training=True)
        loss = loss_object(output_labels, predictions, sample_weight=pad_mask)

    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    train_loss(loss)
    train_accuracy(output_labels, predictions, sample_weight=pad_mask)
    
@tf.function
def test_step(model, inputs):
    """Call ``model`` on ``inputs`` with ``training=False`` and return its output."""
    predicted = model(inputs, training=False)
    return predicted


In [11]:
from konlpy.tag import Okt

# Path of the chatbot corpus, opened and read line by line in a later cell.
dataset_file = 'chatbot_data.csv'
# KoNLPy Okt analyzer; its morphs() method is applied to every corpus line.
okt = Okt()

In [20]:
# Read the raw lines first. The encoding is stated explicitly because the
# corpus is Korean text and the platform default encoding is not guaranteed
# to be UTF-8.
with open(dataset_file, 'r', encoding='utf-8') as file:
  lines = file.readlines()

# Tokenize after the file has been closed: each line is split into morphemes,
# which are then re-joined with single spaces.
seq = [" ".join(okt.morphs(line)) for line in lines]

In [21]:
# The corpus alternates question / answer: even positions hold questions,
# odd positions hold the matching answers.
questions = seq[0::2]
# Each answer is prefixed with a tab, which later serves as the decoder's
# start-of-sequence token.
answers = ['\t ' + answer for answer in seq[1::2]]

print(questions[:3])
print(answers[:3])

['아이스 아메리카노 하나요 \n', '저 카푸치노 로 주문 할게요 \n', '저 도장 다 모았는데 나중 에 써도 되나요 ? \n']
['\t 테이크아웃 하실 건가 요 ? \n', '\t 시럽 은 얼마나 뿌려 드릴 까요 ? \n', '\t 네 다음 에 써도 됩니다 \n']


In [22]:
# Number of question/answer pairs (one entry of `questions` per pair).
num_samples = len(questions)
print(num_samples)

500


In [23]:
# Build a shuffled list of sample indices. The global RNG is seeded with a
# fixed value so the resulting order (and the split derived from it) is
# reproducible.
random.seed(0)
term = list(range(num_samples))
random.shuffle(term)

print(term)

[419, 459, 130, 431, 370, 26, 201, 56, 366, 108, 231, 326, 118, 153, 493, 311, 333, 24, 367, 17, 150, 295, 247, 44, 274, 285, 481, 420, 164, 100, 199, 196, 405, 62, 452, 436, 415, 58, 393, 64, 492, 98, 188, 433, 307, 127, 404, 210, 277, 256, 116, 27, 343, 84, 134, 177, 109, 417, 67, 317, 490, 129, 68, 270, 1, 390, 438, 28, 451, 139, 355, 160, 168, 413, 264, 354, 339, 291, 137, 174, 306, 105, 332, 15, 225, 359, 469, 279, 391, 187, 429, 251, 289, 377, 496, 262, 192, 103, 142, 22, 383, 444, 167, 184, 292, 14, 136, 203, 331, 165, 237, 321, 131, 369, 288, 375, 157, 146, 450, 235, 125, 206, 497, 437, 265, 43, 205, 69, 38, 297, 2, 449, 219, 53, 499, 380, 480, 117, 259, 477, 93, 176, 202, 52, 211, 10, 4, 221, 39, 351, 88, 283, 254, 65, 356, 115, 70, 5, 352, 101, 121, 208, 345, 476, 77, 462, 486, 30, 185, 25, 491, 263, 275, 180, 54, 194, 80, 186, 173, 213, 3, 238, 21, 362, 353, 474, 495, 12, 119, 236, 83, 126, 9, 410, 193, 473, 255, 458, 135, 434, 224, 232, 371, 379, 220, 357, 302, 281, 457, 38

In [24]:
# Empty containers for the train/test questions and answers.
train_q, train_a = [], []
test_q, test_a = [], []

In [25]:
# Hold out 20% of the pairs for testing.
test_ratio = 0.2
test_cnt = int(len(questions) * test_ratio)

# `term` is the shuffled index list: its first `test_cnt` entries form the
# test set and the remainder forms the training set.
train_indices = term[test_cnt:]
test_indices = term[:test_cnt]

# Build the four lists from scratch instead of appending to lists created in
# another cell, so re-running this cell does not duplicate every sample.
train_q = [questions[idx] for idx in train_indices]
train_a = [answers[idx] for idx in train_indices]

test_q = [questions[idx] for idx in test_indices]
test_a = [answers[idx] for idx in test_indices]


In [26]:
# Preview the first three held-out questions and their reference answers.
test_q[:3], test_a[:3]

(['사이 즈 업 해서 주세요 \n',
  '캐러멜 드리블 이랑 통 잡아 칩이요 \n',
  '시즌 메뉴 와 함께 구성 되어 있는 세트 메뉴 가 있나요 ? \n'],
 ['\t 네 결제 는 어떻게 도 와 드릴 까요 ? \n',
  '\t 6700원 결제 도 와 드리겠습니다 \n',
  '\t 네 치즈 케이크 와 시즌 메뉴 두 잔 으로 구성 된 세트 메뉴 있습니다 \n'])

In [27]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Word-level tokenizer capped at `vocab_size` words. The filter list strips
# punctuation but contains neither '\t' nor '\n', so both survive as ordinary
# tokens and can act as the start / end markers of a sentence.
tokenizer = Tokenizer(
    num_words=vocab_size,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~',
)

In [28]:
# Build the vocabulary from the training questions and answers only; the
# test sentences are not passed to fit_on_texts.
tokenizer.fit_on_texts([*train_q, *train_a])
print(tokenizer.word_index)

{'\n': 1, '\t': 2, '네': 3, '주세요': 4, '로': 5, '아메리카노': 6, '는': 7, '에': 8, '아이스': 9, '도': 10, '요': 11, '잔': 12, '이': 13, '한': 14, '드릴': 15, '까요': 16, '은': 17, '입니다': 18, '사이즈': 19, '가': 20, '있나요': 21, '결제': 22, '수': 23, '하나': 24, '있습니다': 25, '와': 26, '드시고': 27, '해주세요': 28, '할게요': 29, '으로': 30, '라테': 31, '추가': 32, '따뜻한': 33, '주문': 34, '사용': 35, '음료': 36, '되나요': 37, '여기': 38, '아니요': 39, '거': 40, '얼마': 41, '개': 42, '그럼': 43, '카드': 44, '랑': 45, '드리겠습니다': 46, '케이크': 47, '어떤': 48, '걸': 49, '포인트': 50, '가시나요': 51, '한잔': 52, '할인': 53, '적립': 54, '다': 55, '커피': 56, '더': 57, '인가요': 58, '쿠폰': 59, '가요': 60, '드릴게요': 61, '티': 62, '건': 63, '가능합니다': 64, '알겠습니다': 65, '에서': 66, '가능한가요': 67, '매장': 68, '를': 69, '진동': 70, '면': 71, '벨': 72, '안': 73, '번호': 74, '만': 75, '에요': 76, '메뉴': 77, '하나요': 78, '디카': 79, '페인': 80, '건가': 81, '샷': 82, '있어요': 83, '됩니다': 84, '테이크아웃': 85, '예요': 86, '스무디': 87, '게': 88, '카페라테': 89, '두': 90, '같이': 91, '자몽': 92, '하고': 93, '치즈케이크': 94, '제일': 95, '뭐': 96, '카페모카': 97, '기프티콘': 98, '세트':

In [29]:
# Convert every text collection into lists of integer token ids using the
# fitted tokenizer.
train_q_seq, train_a_seq, test_q_seq, test_a_seq = (
    tokenizer.texts_to_sequences(texts)
    for texts in (train_q, train_a, test_q, test_a)
)

In [30]:
# Encoder inputs: zeros are added in front ('pre') up to a fixed length of 64.
X_train, X_test = (
    pad_sequences(sequences, value=0, padding='pre', maxlen=64)
    for sequences in (train_q_seq, test_q_seq)
)

# Decoder targets: zeros are added at the end ('post') up to a fixed length
# of 65, one more than the encoder length because the training step slices
# off one position to form the decoder input and the prediction target.
y_train, y_test = (
    pad_sequences(sequences, value=0, padding='post', maxlen=65)
    for sequences in (train_a_seq, test_a_seq)
)

In [31]:
# Inspect one padded pair: the question is padded at the front, the answer at the end.
X_train[0], y_train[0]

(array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0, 85, 12, 30,  4,  1], dtype=int32),
 array([  2, 627, 628, 629,  73,  66, 630, 631,  35, 113,  23, 378,   1,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
       dtype=int32))

In [32]:
# Training pipeline: shuffle with a 1024-element buffer, then batches of 32.
train_ds = (
    tf.data.Dataset.from_tensor_slices((X_train, y_train))
    .shuffle(1024)
    .batch(32)
    .prefetch(1024)
)
# Test pipeline: batches of 1 in the original order, because the model's
# inference path decodes a single sequence at a time.
test_ds = (
    tf.data.Dataset.from_tensor_slices((X_test, y_test))
    .batch(1)
    .prefetch(1024)
)

In [33]:
# The tab token marks the start of an answer and the newline token marks its
# end; their tokenizer ids are handed to the model as sos / eos.
sos_id = tokenizer.word_index["\t"]
eos_id = tokenizer.word_index["\n"]
model = Seq2seq(sos=sos_id, eos=eos_id)

# Loss and optimizer are created with their default settings.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy()
optimizer = tf.keras.optimizers.Adam()

# Running metrics, reported and reset by the training loop.
train_loss = tf.keras.metrics.Mean(name="train_loss")
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')

In [None]:
EPOCHS = 200
for epoch in range(EPOCHS):
  # One full pass over the training batches.
  for seqs, labels in train_ds:
    train_step(model, seqs, labels, loss_object, optimizer, train_loss, train_accuracy)

  # Report the metrics accumulated during this epoch (accuracy as a percentage).
  epoch_report = "Epoch : {}, Loss : {:.3f}, Accuracy : {:.3f}".format(
      epoch + 1,
      train_loss.result(),
      train_accuracy.result() * 100,
  )
  print(epoch_report)

  # Start the next epoch with fresh accumulators.
  train_loss.reset_states()
  train_accuracy.reset_states()

In [None]:
# Decode every held-out question and print it next to the reference answer.
# The decoded strings use their own names so the `test_q` / `test_a` lists
# built by the split cell are not overwritten; the tokenization cell can then
# still be re-run afterwards.
for test_seq, test_labels in test_ds:
  prediction = test_step(model, test_seq)

  # Map the id sequences back to text for display.
  question_text = tokenizer.sequences_to_texts(test_seq.numpy())
  answer_text = tokenizer.sequences_to_texts(test_labels.numpy())
  predicted_text = tokenizer.sequences_to_texts(prediction.numpy())

  print("______")
  print("질문 : \t{}".format(question_text))
  print("실제 대답 : {}".format(answer_text))
  print("챗봇 대답 : {}".format(predicted_text))
