# chatbot

In [1]:
import os
import re
import random
import gensim
import numpy as np
import pandas as pd
import tensorflow as tf
from konlpy.tag import Mecab
from tqdm import tqdm

<br><br><br><br>

## 1. 데이터 다운로드
[songys/Chatbot_data](https://github.com/songys/Chatbot_data)

In [2]:
csv_path = os.getenv('HOME') + '/aiffel/gd_12_transformer_chatbot/data/ChatbotData.csv'
csv = pd.read_csv(csv_path)
csv.head()

Unnamed: 0,Q,A,label
0,12시 땡!,하루가 또 가네요.,0
1,1지망 학교 떨어졌어,위로해 드립니다.,0
2,3박4일 놀러가고 싶다,여행은 언제나 좋죠.,0
3,3박4일 정도 놀러가고 싶다,여행은 언제나 좋죠.,0
4,PPL 심하네,눈살이 찌푸려지죠.,0


In [3]:
questions = csv['Q']
answers = csv['A']

In [4]:
questions

0                         12시 땡!
1                    1지망 학교 떨어졌어
2                   3박4일 놀러가고 싶다
3                3박4일 정도 놀러가고 싶다
4                        PPL 심하네
                  ...           
11818             훔쳐보는 것도 눈치 보임.
11819             훔쳐보는 것도 눈치 보임.
11820                흑기사 해주는 짝남.
11821    힘든 연애 좋은 연애라는게 무슨 차이일까?
11822                 힘들어서 결혼할까봐
Name: Q, Length: 11823, dtype: object

<br><br><br><br>

## 2. 데이터 정제

In [5]:
def preprocess_sentence(sentence):
    sentence = sentence.lower()
    sentence = re.sub(r"[^a-zA-Z가-힣0-9?.!,]+", " ", sentence)
    return sentence

<br><br><br><br>

## 3. 데이터 토큰화

In [6]:
mecab = Mecab()

In [7]:
def build_corpus(src_data, tgt_data):
    mecab_src_corpus, mecab_tgt_corpus = [], []
    mecab_src_len_list, mecab_tgt_len_list = [], []
    
    for s, t in zip(src_data, tgt_data):
        s = mecab.morphs(preprocess_sentence(s))
        t = mecab.morphs(preprocess_sentence(t))
        
        mecab_src_corpus.append(s)
        mecab_tgt_corpus.append(t)
        
        mecab_src_len_list.append(len(s))
        mecab_tgt_len_list.append(len(t))

    mecab_num_tokens = mecab_src_len_list + mecab_tgt_len_list
    
    mean_len = np.mean(mecab_num_tokens)
    max_len = np.max(mecab_num_tokens)
    mid_len = np.median([mean_len, max_len])
    print(f'mid_len : {mid_len}')
    
    src_corpus, tgt_corpus = [], []
    for q, a in zip(mecab_src_corpus, mecab_tgt_corpus):
        if len(q) <= mid_len and len(a) <= mid_len:
            if q not in src_corpus and a not in tgt_corpus:
                src_corpus.append(q)
                tgt_corpus.append(a)
    
    return src_corpus, tgt_corpus

In [8]:
que_corpus, ans_corpus = build_corpus(questions, answers)

mid_len : 23.849149961938593


In [9]:
que_corpus[:5]

[['12', '시', '땡', '!'],
 ['1', '지망', '학교', '떨어졌', '어'],
 ['3', '박', '4', '일', '놀', '러', '가', '고', '싶', '다'],
 ['ppl', '심하', '네'],
 ['sd', '카드', '망가졌', '어']]

In [10]:
ans_corpus[:5]

[['하루', '가', '또', '가', '네요', '.'],
 ['위로', '해', '드립니다', '.'],
 ['여행', '은', '언제나', '좋', '죠', '.'],
 ['눈살', '이', '찌푸려', '지', '죠', '.'],
 ['다시', '새로', '사', '는', '게', '마음', '편해요', '.']]

In [11]:
len(que_corpus), len(ans_corpus)

(7637, 7637)

<br><br><br><br>

## 4. Augmentation
[Kyubyong/wordvectors - korean(w)](https://github.com/Kyubyong/wordvectors)

In [12]:
word2vec_path = os.getenv('HOME') + '/aiffel/gd_12_transformer_chatbot/data/ko/ko.bin'
word2vec = gensim.models.Word2Vec.load(word2vec_path)

In [13]:
def lexical_sub(sentence, word2vec):
    try:
        _from = random.choice(sentence)
        _to = word2vec.most_similar(_from)[0][0]
    except:
        return sentence
    
    res = []
    for x in sentence:
        if x is _from: res.append(_to)
        else: res.append(x)

    return res

In [14]:
arg_que_corpus = [lexical_sub(x, word2vec) for x in que_corpus]
arg_ans_corpus = [lexical_sub(x, word2vec) for x in ans_corpus]

  after removing the cwd from sys.path.


In [15]:
for i in range(5):
    print(f"Q : {' '.join(que_corpus[i])} / {' '.join(arg_que_corpus[i])}")
    print(f"A : {' '.join(ans_corpus[i])} / {' '.join(arg_ans_corpus[i])}")

Q : 12 시 땡 ! / 12 시 땡 캐치
A : 하루 가 또 가 네요 . / 하루 가 또 가 네요 는데
Q : 1 지망 학교 떨어졌 어 / 1 지망 학교 떨어졌 어
A : 위로 해 드립니다 . / 위로 해 드립니다 는데
Q : 3 박 4 일 놀 러 가 고 싶 다 / 3 박 4 일 살 러 가 고 싶 다
A : 여행 은 언제나 좋 죠 . / 여행 은 언제나 괜찮 죠 .
Q : ppl 심하 네 / ppl 강하 네
A : 눈살 이 찌푸려 지 죠 . / 눈살 이 찌푸려 지 죠 .
Q : sd 카드 망가졌 어 / sd 카드 망가졌 어서
A : 다시 새로 사 는 게 마음 편해요 . / 다시 새로 사 는 게 괴로움 편해요 .


In [16]:
que_corpus = que_corpus + arg_que_corpus + que_corpus
ans_corpus = ans_corpus + ans_corpus + arg_ans_corpus

In [17]:
len(que_corpus), len(ans_corpus)

(22911, 22911)

<br><br><br><br>

## 5. 데이터 벡터화

In [18]:
#  타겟 데이터 전체에 <start> 토큰과 <end> 토큰을 추가
ans_corpus = [["<start>"] + ans + ["<end>"] for ans in ans_corpus]
ans_corpus[:3]

[['<start>', '하루', '가', '또', '가', '네요', '.', '<end>'],
 ['<start>', '위로', '해', '드립니다', '.', '<end>'],
 ['<start>', '여행', '은', '언제나', '좋', '죠', '.', '<end>']]

In [19]:
# 전체 데이터에 대한 단어사전 구축 및 벡터화
data = que_corpus + ans_corpus

tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=None, filters=' ', oov_token='<unk>')
tokenizer.fit_on_texts(data)
tensor = tokenizer.texts_to_sequences(data)
tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding='post')

In [20]:
VOCAB_SIZE = len(tokenizer.index_word) + 2
VOCAB_SIZE

7125

In [21]:
# 단어사전 확인
for idx in tokenizer.index_word:
    print(idx, ":", tokenizer.index_word[idx])
    if idx >= 5: break

1 : <unk>
2 : .
3 : <start>
4 : <end>
5 : 이


In [22]:
print(tensor.shape, len(data))

(45822, 25) 45822


In [23]:
enc_train, dec_train = tensor[:22911], tensor[22911:]

In [24]:
enc_train.shape, dec_train.shape

((22911, 25), (22911, 25))

<br><br><br><br>

## 6. 훈련하기

### 6-1. Positional Encoding

In [25]:
# pos - 단어가 위치한 Time-step(각각의 토큰의 위치정보값이며 정수값을 의미)
# d_model - 모델의 Embedding 차원 수
# i - Encoding차원의 index

def positional_encoding(pos, d_model):
    def cal_angle(position, i):
        return position / np.power(10000, int(i)/d_model)  # np.power(a,b) > a^b(제곱)
    
    def get_posi_angle_vec(position):
        return [cal_angle(position, i) for i in range(d_model)]
    
    sinusoid_table = np.array([get_posi_angle_vec(pos_i) for pos_i in range(pos)])
    
    # 배열의 짝수 인덱스(2i)에는 사인 함수 적용
    sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])
    # 배열의 홀수 인덱스(2i+1)에는 코사인 함수 적용
    sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])
    
    return sinusoid_table

### 3-2. Multi-Head Attention

In [26]:
class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model
        
        self.depth = d_model // self.num_heads
        
        self.W_q = tf.keras.layers.Dense(d_model)  # Linear Layer
        self.W_k = tf.keras.layers.Dense(d_model)
        self.W_v = tf.keras.layers.Dense(d_model)
        
        self.linear = tf.keras.layers.Dense(d_model)
        
    def scaled_dot_product_attention(self, Q, K, V, mask):
        d_k = tf.cast(K.shape[-1], tf.float32)
        
        # Scaled QK 값 구하기
        QK = tf.matmul(Q, K, transpose_b=True)
        scaled_qk = QK / tf.math.sqrt(d_k)
        
        if mask is not None:
            scaled_qk += (mask * -1e9)
        
        # 1. Attention Weights 값 구하기 -> attentions
        attentions = tf.nn.softmax(scaled_qk, axis=-1)
        # 2. Attention 값을 V에 곱하기 -> out
        out = tf.matmul(attentions, V)
        return out, attentions
    
    def split_heads(self, x):
        """
        Embedding된 입력을 head의 수로 분할하는 함수
        
        x: [ batch x length x emb ]
        return: [ batch x length x heads x self.depth ]
        """
        bsz = x.shape[0]
        split_x = tf.reshape(x, (bsz, -1, self.num_heads, self.depth))
        split_x = tf.transpose(split_x, perm=[0, 2, 1, 3])
        return split_x
    
    def combine_heads(self, x):
        """
        분할된 Embedding을 하나로 결합하는 함수
        
        x: [ batch x length x heads x self.depth ]
        return: [ batch x length x emb ]
        """
        bsz = x.shape[0]
        combined_x = tf.transpose(x, perm=[0, 2, 1, 3])
        combined_x = tf.reshape(combined_x, (bsz, -1, self.d_model))
        return combined_x
    
    def call(self, Q, K, V, mask):
        """
        Step 1: Linear_in(Q, K, V) -> WQ, WK, WV
        Step 2: Split Heads(WQ, WK, WV) -> WQ_split, WK_split, WV_split
        Step 3: Scaled Dot Product Attention(WQ_split, WK_split, WV_split)
                 -> out, attention_weights
        Step 4: Combine Heads(out) -> out
        Step 5: Linear_out(out) -> out
        """
        WQ = self.W_q(Q)
        WK = self.W_k(K)
        WV = self.W_v(V)
        
        WQ_splits = self.split_heads(WQ)
        WK_splits = self.split_heads(WK)
        WV_splits = self.split_heads(WV)
        
        out, attention_weights = self.scaled_dot_product_attention(
            WQ_splits, WK_splits, WV_splits, mask
        )
        
        out = self.combine_heads(out)
        out = self.linear(out)
        
        return out, attention_weights

### 3-3. Position-wise Feed-Forward Network

In [27]:
class PoswiseFeedForwardNet(tf.keras.layers.Layer):
    def __init__(self, d_model, d_ff):
        super(PoswiseFeedForwardNet, self).__init__()
        self.w_1 = tf.keras.layers.Dense(d_ff, activation='relu')
        self.w_2 = tf.keras.layers.Dense(d_model)
        
    def call(self, x):
        out = self.w_1(x)
        out = self.w_2(out)
        return out

### 3-4. Encoder Layer

In [28]:
class EncoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, n_heads, d_ff, dropout):
        super(EncoderLayer, self).__init__()
        self.enc_self_attn = MultiHeadAttention(d_model, n_heads)
        self.ffn = PoswiseFeedForwardNet(d_model, d_ff)
        
        self.norm_1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.norm_2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        
        self.do = tf.keras.layers.Dropout(dropout)
        
    def call(self, x, mask):
        # Multi-Head Attention
        residual = x
        out = self.norm_1(x)
        out, enc_attn = self.enc_self_attn(out, out, out, mask)
        out = self.do(out)
        out += residual
        
        # Position-Wise Feed Forward Network
        residual = out
        out = self.norm_2(out)
        out = self.ffn(out)
        out = self.do(out)
        out += residual
        
        return out, enc_attn

### 3-5. Decoder Layer

In [29]:
class DecoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(DecoderLayer, self).__init__()

        self.dec_self_attn = MultiHeadAttention(d_model, num_heads)
        self.enc_dec_attn = MultiHeadAttention(d_model, num_heads)

        self.ffn = PoswiseFeedForwardNet(d_model, d_ff)

        self.norm_1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.norm_2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.norm_3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.do = tf.keras.layers.Dropout(dropout)
        
    def call(self, x, enc_out, causality_mask, padding_mask):
        # Masked Multi-Head Attention
        residual = x
        out = self.norm_1(x)
        #out, dec_attn = self.dec_self_attn(out, out, out, causality_mask)
        out, dec_attn = self.dec_self_attn(out, out, out, padding_mask)
        out = self.do(out)
        out += residual
        
        # Multi-Head Attention
        residual = out
        out = self.norm_2(out)
        #out, dec_enc_attn = self.enc_dec_attn(out, enc_out, enc_out, padding_mask)
        out, dec_enc_attn = self.enc_dec_attn(out, enc_out, enc_out, causality_mask)
        out = self.do(out)
        out += residual

        # Position-Wise Feed Forward Network
        residual = out
        out = self.norm_3(out)
        out = self.ffn(out)
        out = self.do(out)
        out += residual

        return out, dec_attn, dec_enc_attn

### 3-6. Encoder

In [30]:
class Encoder(tf.keras.Model):
    def __init__(self, n_layers, d_model, n_heads, d_ff, dropout):
        super(Encoder, self).__init__()
        self.n_layers = n_layers
        self.enc_layers = [EncoderLayer(d_model, n_heads, d_ff, dropout) for _ in range(n_layers)]
        self.do = tf.keras.layers.Dropout(dropout)
        
    def call(self, x, mask):
        out = x
        enc_attns = list()
        for i in range(self.n_layers):
            out, enc_attn = self.enc_layers[i](out, mask)
            enc_attns.append(enc_attn)
            
        return out, enc_attns

### 3-7. Decoder

In [31]:
class Decoder(tf.keras.Model):
    def __init__(self, n_layers, d_model, n_heads, d_ff, dropout):
        super(Decoder, self).__init__()
        self.n_layers = n_layers
        self.dec_layers = [DecoderLayer(d_model, n_heads, d_ff, dropout) for _ in range(n_layers)]
        
    def call(self, x, enc_out, causality_mask, padding_mask):
        out = x
        dec_attns = list()
        dec_enc_attns = list()
        for i in range(self.n_layers):
            out, dec_attn, dec_enc_attn = self.dec_layers[i](out, enc_out, causality_mask, padding_mask)
            dec_attns.append(dec_attn)
            dec_enc_attns.append(dec_enc_attn)
        
        return out, dec_attns, dec_enc_attns

### 3-8. Transformer

In [69]:
class Transformer(tf.keras.Model):
    def __init__(self, n_layers, d_model, n_heads, d_ff, src_vocab_size, tgt_vocab_size,
                 pos_len, dropout=0.2, shared=True):
        super(Transformer, self).__init__()
        self.d_model = tf.cast(d_model, tf.float32)
        
        # 1. Embedding Layer 정의
        self.enc_emb = tf.keras.layers.Embedding(src_vocab_size, d_model)
        self.dec_emb = tf.keras.layers.Embedding(tgt_vocab_size, d_model)
        
        # 2. Positional Encoding 정의
        self.pos_encoding = positional_encoding(pos_len, d_model)
        # 6. Dropout 정의
        self.do = tf.keras.layers.Dropout(dropout)
        
        # 3. Encoder / Decoder 정의
        self.encoder = Encoder(n_layers, d_model, n_heads, d_ff, dropout)
        self.decoder = Decoder(n_layers, d_model, n_heads, d_ff, dropout)
        
        # 4. Output Linear 정의
        self.fc = tf.keras.layers.Dense(tgt_vocab_size)
        
        # 5. Shared Weights
        self.shared = shared
        
        if shared:
            self.fc.set_weights(tf.transpose(self.dec_emb.weights))
        
        
    def embedding(self, emb, x):
        """
        입력된 정수 배열을 Embedding + Pos Encoding
        + Shared일 경우 Scaling 작업 포함

        x: [ batch x length ]
        return: [ batch x length x emb ]
        """
        seq_len = x.shape[1]
        out = emb(x)
        
        if self.shared:
            out *= tf.math.sqrt(self.d_model)
        
        out += self.pos_encoding[np.newaxis, ...][:, :seq_len, :]
        out = self.do(out)
        
        return out
    
    def call(self, enc_in, dec_in, enc_mask, causality_mask, dec_mask):
        # Step 1: Embedding(enc_in, dec_in) -> enc_in, dec_in
        enc_in = self.embedding(self.enc_emb, enc_in)
        dec_in = self.embedding(self.dec_emb, dec_in)
        # Step 2: Encoder(enc_in, enc_mask) -> enc_out, enc_attns
        enc_out, enc_attns = self.encoder(enc_in, enc_mask)
        # Step 3: Decoder(dec_in, enc_out, mask) -> dec_out, dec_attns, dec_enc_attns
        dec_out, dec_attns, dec_enc_attns = self.decoder(dec_in, enc_out, causality_mask, dec_mask)
        # Step 4: Out Linear(dec_out) -> logits
        logits = self.fc(dec_out)
        return logits, enc_attns, dec_attns, dec_enc_attns

### 3-9. Masking

In [33]:
# Attention을 할 때에 <PAD> 토큰에도 Attention을 주는 것을 방지해 주는 역할
def generate_padding_mask(seq):
    seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
    return seq[:, tf.newaxis, tf.newaxis, :]

def generate_causality_mask(src_len, tgt_len):
    mask = 1 - np.cumsum(np.eye(src_len, tgt_len), 0)
    return tf.cast(mask, tf.float32)

def generate_masks(src, tgt):
    enc_mask = generate_padding_mask(src)
    dec_mask = generate_padding_mask(tgt)

    dec_enc_causality_mask = generate_causality_mask(tgt.shape[1], src.shape[1])
    dec_enc_mask = tf.maximum(enc_mask, dec_enc_causality_mask)

    dec_causality_mask = generate_causality_mask(tgt.shape[1], tgt.shape[1])
    dec_mask = tf.maximum(dec_mask, dec_causality_mask)

    return enc_mask, dec_enc_mask, dec_mask

In [34]:
class LearningRateScheduler(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000):
        super(LearningRateScheduler, self).__init__()
        self.d_model = d_model
        self.warmup_steps = warmup_steps
    
    def __call__(self, step):
        arg1 = step ** -0.5
        arg2 = step * (self.warmup_steps ** -1.5)
        
        return (self.d_model ** -0.5) * tf.math.minimum(arg1, arg2)

In [35]:
learning_rate = LearningRateScheduler(512)
optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

In [36]:
# Loss 함수 정의
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    # Masking 되지 않은 입력의 개수로 Scaling하는 과정
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_sum(loss_)/tf.reduce_sum(mask)

In [37]:
@tf.function()
def train_step(src, tgt, model, optimizer):
    gold = tgt[:, 1:]
        
    enc_mask, dec_enc_mask, dec_mask = generate_masks(src, tgt)

    # 계산된 loss에 tf.GradientTape()를 적용해 학습을 진행합니다.
    with tf.GradientTape() as tape:
        predictions, enc_attns, dec_attns, dec_enc_attns = model(src, tgt, enc_mask, dec_enc_mask, dec_mask)
        loss = loss_function(gold, predictions[:, :-1])

    # 최종적으로 optimizer.apply_gradients()가 사용됩니다. 
    gradients = tape.gradient(loss, model.trainable_variables)    
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    
    return loss, enc_attns, dec_attns, dec_enc_attns

In [39]:
transformer1 = Transformer(
    n_layers=6,
    d_model=512,
    n_heads=8,
    d_ff=2048,
    src_vocab_size=VOCAB_SIZE,
    tgt_vocab_size=VOCAB_SIZE,
    pos_len=200,
    dropout=0.3,
    shared=True
)

EPOCHS = 20
BATCH_SIZE = 64

for epoch in range(EPOCHS):
    total_loss = 0

    idx_list = list(range(0, enc_train.shape[0], BATCH_SIZE))
    random.shuffle(idx_list)
    t = tqdm(idx_list)

    for (batch, idx) in enumerate(t):
        batch_loss, enc_attns, dec_attns, dec_enc_attns = train_step(enc_train[idx:idx+BATCH_SIZE],
                                                                     dec_train[idx:idx+BATCH_SIZE],
                                                                     transformer1,
                                                                     optimizer)

        total_loss += batch_loss

        t.set_description_str('Epoch %2d' % (epoch + 1))
        t.set_postfix_str('Loss %.4f' % (total_loss.numpy() / (batch + 1)))

Epoch  1: 100%|██████████| 358/358 [01:10<00:00,  5.08it/s, Loss 5.9654] 
Epoch  2: 100%|██████████| 358/358 [00:55<00:00,  6.50it/s, Loss 3.5200]
Epoch  3: 100%|██████████| 358/358 [00:55<00:00,  6.45it/s, Loss 2.0371]
Epoch  4: 100%|██████████| 358/358 [00:55<00:00,  6.42it/s, Loss 1.1068]
Epoch  5: 100%|██████████| 358/358 [00:55<00:00,  6.40it/s, Loss 0.8367]
Epoch  6: 100%|██████████| 358/358 [00:56<00:00,  6.38it/s, Loss 0.7352]
Epoch  7: 100%|██████████| 358/358 [00:56<00:00,  6.37it/s, Loss 0.6916]
Epoch  8: 100%|██████████| 358/358 [00:56<00:00,  6.37it/s, Loss 0.6709]
Epoch  9: 100%|██████████| 358/358 [00:56<00:00,  6.37it/s, Loss 0.6439]
Epoch 10: 100%|██████████| 358/358 [00:56<00:00,  6.37it/s, Loss 0.6494]
Epoch 11: 100%|██████████| 358/358 [00:56<00:00,  6.35it/s, Loss 0.6552]
Epoch 12: 100%|██████████| 358/358 [00:56<00:00,  6.37it/s, Loss 0.6262]
Epoch 13: 100%|██████████| 358/358 [00:56<00:00,  6.37it/s, Loss 0.5617]
Epoch 14: 100%|██████████| 358/358 [00:56<00:00,  

<br><br><br><br>

## 7. 성능 측정하기

In [40]:
examples = [
    "지루하다, 놀러가고 싶어.",
    "오늘 일찍 일어났더니 피곤하다.",
    "간만에 여자친구랑 데이트 하기로 했어.",
    "집에 있는다는 소리야."
]

In [59]:
# 번역 생성 함수
def evaluate(sentence, model, tokenizer):
    mecab = Mecab()
    sentence = mecab.morphs(preprocess_sentence(sentence))
    sentence = tokenizer.texts_to_sequences(sentence)
    _input = tf.keras.preprocessing.sequence.pad_sequences([sentence], maxlen=enc_train.shape[-1], padding='post')
    
    ids = []
    output = tf.expand_dims([tokenizer.word_index['<start>']], 0)
    
    for i in range(dec_train.shape[-1]):
        enc_padding_mask, combined_mask, dec_padding_mask = generate_masks(_input, output)
        print(enc_padding_mask.shape, combined_mask.shape, dec_padding_mask.shape)
        
        predictions, enc_attns, dec_attns, dec_enc_attns = model(_input, output, enc_padding_mask, combined_mask, dec_padding_mask)

        predicted_id = tf.argmax(tf.math.softmax(predictions, axis=-1)[0, -1]).numpy().item()
        
        # 숫자를 문자열로 복원
        if tokenizer.word_index['<end>'] == predicted_id:
            result = ' '.join(tokenizer.sequences_to_texts(idx))
            return pieces, result, enc_attns, dec_attns, dec_enc_attns

        ids.append(predicted_id)
        output = tf.concat([output, tf.expand_dims([predicted_id], 0)], axis=-1)
    
    result = ' '.join(tokenizer.sequences_to_texts(idx))
    
    return pieces, result, enc_attns, dec_attns, dec_enc_attns

In [67]:
# 번역 생성
def translate(sentence, model, tokenizer):
    pieces, result, enc_attns, dec_attns, dec_enc_attns = evaluate(sentence, model, tokenizer)
    return result

In [68]:
# transformer1
for sen in examples:
    print(f'Q : {sen}')
    print(f'A : {translate(sen, transformer1, tokenizer)}')

Q : 지루하다, 놀러가고 싶어.
(1, 1, 1, 25, 1) (1, 1, 1, 25, 25) (1, 1, 1, 1)


InvalidArgumentError: Input to reshape is a tensor with 512 values, but the requested shape has 25 [Op:Reshape]

InvalidArgumentError: Input to reshape is a tensor with 512 values, but the requested shape has 25 [Op:Reshape]