# Env*

In [2]:
# imports
import argparse
import os
import random
import shutil
import json
import zipfile
import math
import copy
import collections
import re

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import sentencepiece as spm
import tensorflow as tf
import tensorflow.keras.backend as K

from tqdm.notebook import tqdm, trange

In [None]:
# environment configuration shared by the rest of the notebook
args = argparse.Namespace(
    # random seed value
    seed=1234,
)

print(args)

In [None]:
# random seed 설정
def set_seed(seed):
    """
    Seed the Python, NumPy and TensorFlow random number generators.
    :param seed: seed value passed unchanged to each generator
    """
    # same order as before: Python's random, then NumPy, then TensorFlow
    for seeder in (random.seed, np.random.seed, tf.random.set_seed):
        seeder(seed)

In [None]:
# check GPU usage
!nvidia-smi

In [None]:
# root data directory; the last expression lists its contents as a quick check
data_dir = '/content/drive/MyDrive/Data/nlp'
os.listdir(data_dir)

In [None]:
# directory for the songys-chat artifacts (weights, training log)
sychat_dir = os.path.join(data_dir, "songys-chat")
# exist_ok=True creates the directory only when it is missing, without a
# separate isdir() check that could race with another process
os.makedirs(sychat_dir, exist_ok=True)
os.listdir(sychat_dir)

# Vocabulary*

In [None]:
# load the SentencePiece model file kowiki/kowiki_32000.model as the vocabulary
vocab = spm.SentencePieceProcessor()
vocab.load(os.path.join(data_dir, 'kowiki', 'kowiki_32000.model'))

# Transformer

In [None]:
# Transformer hyper-parameters, attached to the shared args namespace
args.d_model = 256  # d_model: model hidden dimension
args.n_head = 4  # n_head: number of attention heads
args.d_head = 64  # d_head: dimension of each attention head
args.dropout = 0.1  # dropout: dropout rate
args.d_ff = 1024  # d_ff: feed forward inner dimension
args.norm_eps = 1e-9  # norm_eps: layer normalization epsilon
args.n_layer = 3  # n_layer: number of encoder layers and of decoder layers
args.n_seq = 128  # n_seq: maximum sequence length (size of the position table)
args.n_vocab = len(vocab)  # n_vocab: vocabulary size
args.i_pad = vocab.pad_id()  # i_pad: pad token id reported by the vocab

# display the resulting namespace
args

In [None]:
def get_pad_mask(tokens, i_pad=0):
    """
    Build the padding mask for a batch of token ids.
    :param tokens: tokens (bs, n_seq)
    :param i_pad: id of pad
    :return mask: float32 pad mask with an extra axis at position 1 (pad: 1, other: 0)
    """
    # 1.0 where the token is the pad id, 0.0 elsewhere
    is_pad = tf.cast(tf.math.equal(tokens, i_pad), tf.float32)
    # add a length-1 axis so the mask broadcasts over the query positions
    return tf.expand_dims(is_pad, axis=1)

In [None]:
def get_causal_mask(tokens, i_pad=0):
    """
    Build the combined causal + padding mask for a batch of token ids.
    :param tokens: tokens (bs, n_seq)
    :param i_pad: id of pad
    :return mask: causal and pad mask (causal or pad: 1, other: 0)
    """
    n_seq = tf.shape(tokens)[1]
    # lower triangle (diagonal included) = positions each query may attend to
    visible = tf.linalg.band_part(tf.ones((n_seq, n_seq)), -1, 0)
    # invert it so future positions become 1, then add the batch axis
    future = tf.expand_dims(1 - visible, axis=0)
    # a position is masked when it is either in the future or a pad token
    return tf.maximum(future, get_pad_mask(tokens, i_pad))

In [None]:
class ScaleDotProductAttention(tf.keras.layers.Layer):
    """
    Scaled dot-product attention: softmax(Q K^T / sqrt(d_k) - 1e9 * mask) V.
    """
    def __init__(self, name="scale_dot_product_attention"):
        """
        Constructor
        :param name: layer name
        """
        super().__init__(name=name)

    def call(self, inputs):
        """
        Run the layer
        :param inputs: Q, K, V, attn_mask tuple
        :return attn_out: attention output (weighted sum of V)
        """
        Q, K, V, attn_mask = inputs
        # scale factor sqrt(d_k), where d_k is the last dimension of K
        d_k = tf.cast(tf.shape(K)[-1], tf.float32)
        scores = tf.math.divide(tf.matmul(Q, K, transpose_b=True), tf.math.sqrt(d_k))
        # push masked positions (mask == 1) down by 1e9 so softmax gives them ~0
        scores -= 1.e9 * attn_mask
        attn_prob = tf.nn.softmax(scores, axis=-1)
        return tf.matmul(attn_prob, V)

In [None]:
class MultiHeadAttention(tf.keras.layers.Layer):
    """
    Multi Head Attention Class
    """
    def __init__(self, args, name="multi_head_attention"):
        """
        Constructor
        :param args: Args object (reads d_model, n_head, d_head)
        :param name: layer name
        """
        super().__init__(name=name)

        self.d_model = args.d_model
        self.n_head = args.n_head
        self.d_head = args.d_head

        proj_dim = self.n_head * self.d_head
        # input projections for Q, K, V
        self.W_Q = tf.keras.layers.Dense(proj_dim)
        self.W_K = tf.keras.layers.Dense(proj_dim)
        self.W_V = tf.keras.layers.Dense(proj_dim)
        # attention applied to every head at once
        self.attention = ScaleDotProductAttention(name="self_attention")
        # output projection back to d_model
        self.W_O = tf.keras.layers.Dense(self.d_model)

    def _split_heads(self, projected, seq_len):
        """
        Reshape (bs, seq_len, n_head * d_head) into (bs, n_head, seq_len, d_head).
        :param projected: output of one of the input projections
        :param seq_len: sequence length of the tensor that was projected
        """
        per_head = tf.reshape(projected, [-1, seq_len, self.n_head, self.d_head])
        return tf.transpose(per_head, [0, 2, 1, 3])

    def call(self, inputs):
        """
        Run the layer
        :param inputs: Q, K, V, attn_mask tuple
        :return attn_out: attention output projected to d_model
        """
        Q, K, V, attn_mask = inputs
        q_len = tf.shape(Q)[1]
        # project and split into heads
        Q_m = self._split_heads(self.W_Q(Q), q_len)            # (bs, n_head, Q_len, d_head)
        K_m = self._split_heads(self.W_K(K), tf.shape(K)[1])   # (bs, n_head, K_len, d_head)
        V_m = self._split_heads(self.W_V(V), tf.shape(V)[1])   # (bs, n_head, K_len, d_head)
        # add a head axis so the same mask broadcasts over all heads
        attn_mask_m = tf.expand_dims(attn_mask, axis=1)
        attn_out_m = self.attention((Q_m, K_m, V_m, attn_mask_m))  # (bs, n_head, Q_len, d_head)
        # move heads next to d_head and concatenate them
        attn_out_t = tf.transpose(attn_out_m, perm=[0, 2, 1, 3])   # (bs, Q_len, n_head, d_head)
        merged = tf.reshape(attn_out_t, [-1, q_len, self.n_head * self.d_head])
        # (bs, Q_len, n_head * d_head) -> (bs, Q_len, d_model)
        return self.W_O(merged)

In [None]:
class PositionWiseFeedForward(tf.keras.layers.Layer):
    """
    Position-wise feed forward block: Dense(d_ff, relu) followed by Dense(d_model).
    """
    def __init__(self, args, name="feed_forward"):
        """
        Constructor
        :param args: Args object (reads d_ff, d_model)
        :param name: layer name
        """
        super().__init__(name=name)

        self.W_1 = tf.keras.layers.Dense(args.d_ff, activation=tf.nn.relu)
        self.W_2 = tf.keras.layers.Dense(args.d_model)

    def call(self, inputs):
        """
        Run the layer
        :param inputs: inputs
        :return ff_val: feed forward output
        """
        # expand to d_ff with relu, then project back to d_model
        return self.W_2(self.W_1(inputs))

In [None]:
class EncoderLayer(tf.keras.layers.Layer):
    """
    Encoder layer: self attention and feed forward, each followed by
    dropout, a residual connection and layer normalization.
    """
    def __init__(self, args, name='encoder_layer'):
        """
        Constructor
        :param args: Args object
        :param name: layer name
        """
        super().__init__(name=name)

        self.self_attention = MultiHeadAttention(args)
        self.norm1 = tf.keras.layers.LayerNormalization(epsilon=args.norm_eps)

        self.ffn = PositionWiseFeedForward(args)
        self.norm2 = tf.keras.layers.LayerNormalization(epsilon=args.norm_eps)

        # one Dropout layer is reused for both sub-blocks
        self.dropout = tf.keras.layers.Dropout(args.dropout)

    def call(self, inputs):
        """
        Run the layer
        :param inputs: enc_hidden, self_mask tuple
        :return enc_out: EncoderLayer output
        """
        enc_hidden, self_mask = inputs
        # self attention sub-block (Q = K = V = enc_hidden), then add & norm
        attended = self.self_attention((enc_hidden, enc_hidden, enc_hidden, self_mask))
        hidden = self.norm1(enc_hidden + self.dropout(attended))

        # feed forward sub-block, then add & norm
        transformed = self.ffn(hidden)
        return self.norm2(hidden + self.dropout(transformed))

In [None]:
class DecoderLayer(tf.keras.layers.Layer):
    """
    Decoder layer: self attention, encoder-decoder attention and feed forward,
    each followed by dropout, a residual connection and layer normalization.
    """
    def __init__(self, args, name='decoder_layer'):
        """
        Constructor
        :param args: Args object
        :param name: layer name
        """
        super().__init__(name=name)

        self.self_attention = MultiHeadAttention(args)
        self.norm1 = tf.keras.layers.LayerNormalization(epsilon=args.norm_eps)

        self.ende_attn = MultiHeadAttention(args)
        self.norm2 = tf.keras.layers.LayerNormalization(epsilon=args.norm_eps)

        self.ffn = PositionWiseFeedForward(args)
        self.norm3 = tf.keras.layers.LayerNormalization(epsilon=args.norm_eps)

        # one Dropout layer is reused for all three sub-blocks
        self.dropout = tf.keras.layers.Dropout(args.dropout)

    def call(self, inputs):
        """
        Run the layer
        :param inputs: dec_hidden, enc_out, self_mask, ende_mask tuple
        :return dec_out: DecoderLayer output
        """
        dec_hidden, enc_out, self_mask, ende_mask = inputs
        # self attention over the decoder states, then add & norm
        self_attended = self.self_attention((dec_hidden, dec_hidden, dec_hidden, self_mask))
        hidden = self.norm1(dec_hidden + self.dropout(self_attended))

        # decoder states query the encoder output, then add & norm
        cross_attended = self.ende_attn((hidden, enc_out, enc_out, ende_mask))
        hidden2 = self.norm2(hidden + self.dropout(cross_attended))

        # feed forward, then add & norm
        transformed = self.ffn(hidden2)
        return self.norm3(hidden2 + self.dropout(transformed))

In [None]:
class SharedEmbedding(tf.keras.layers.Layer):
    """
    Weight Shared Embedding Class

    A single (n_vocab, d_model) matrix is used for the token embedding lookup
    and, transposed, for the projection of hidden states onto the vocabulary.
    """
    def __init__(self, args, name='weight_shared_embedding'):
        """
        Constructor
        :param args: Args object (reads n_vocab, d_model)
        :param name: layer name
        """
        super().__init__(name=name)

        self.n_vocab = args.n_vocab
        self.d_model = args.d_model

    def build(self, input_shape):
        """
        Create the shared weight
        :param input_shape: Tensor Shape (not used)
        """
        with tf.name_scope('shared_embedding_weight'):
            # The weight name is passed by keyword: the positional parameter
            # order of add_weight differs between Keras versions, and a
            # positional name collides with shape= on versions where shape
            # comes first.
            self.shared_weights = self.add_weight(
                name='weights',
                shape=[self.n_vocab, self.d_model],
                initializer=tf.keras.initializers.TruncatedNormal(stddev=self.d_model ** -0.5)
            )

    def call(self, inputs, mode='embedding'):
        """
        Run the layer
        :param inputs: token ids (mode='embedding') or hidden states (mode='linear')
        :param mode: 'embedding' for the lookup, 'linear' for the output projection
        :return: embedding or linear result
        :raises ValueError: if mode is neither 'embedding' nor 'linear'
        """
        if mode == 'embedding':
            return self._embedding(inputs)
        if mode == 'linear':
            return self._linear(inputs)
        raise ValueError(f'mode {mode} is not valid.')

    def _embedding(self, inputs):
        """
        Embedding lookup
        :param inputs: token ids
        :return: embedding vectors scaled by d_model ** 0.5
        """
        # lookup by gather
        embed = tf.gather(self.shared_weights, tf.cast(inputs, tf.int32))
        # multiply by d_model ** 0.5
        embed *= self.d_model ** 0.5
        return embed

    def _linear(self, inputs):  # (bs, n_seq, d_model)
        """
        Linear projection with the transposed embedding matrix
        :param inputs: hidden states whose last dimension is d_model
        :return: scores over the vocabulary (last dimension n_vocab)
        """
        # inputs @ shared_weights^T
        return tf.matmul(inputs, self.shared_weights, transpose_b=True)

In [None]:
class PositionalEmbedding(tf.keras.layers.Layer):
    """
    Positional Embedding Class (fixed sinusoid table, not trainable)
    """
    def __init__(self, args, name='position_embedding'):
        """
        Constructor
        :param args: Args object (reads n_seq, d_model)
        :param name: layer name
        """
        super().__init__(name=name)

        table = PositionalEmbedding.get_sinusoid_encoding(args.n_seq, args.d_model)
        # frozen embedding initialised with the sinusoid table
        self.embedding = tf.keras.layers.Embedding(args.n_seq, args.d_model, trainable=False, weights=[table])

    def call(self, inputs):
        """
        Run the layer
        :param inputs: tokens; only their shape is used
        :return embed: positional embedding lookup result
        """
        # exclusive cumulative sum of ones along axis 1 gives 0, 1, 2, ... per row
        ones = tf.ones_like(inputs)
        position = tf.cast(tf.math.cumsum(ones, axis=1, exclusive=True), tf.int32)
        return self.embedding(position)

    @staticmethod
    def get_sinusoid_encoding(n_seq, d_model):
        """
        Build the sinusoid encoding table
        :param n_seq: number of positions (rows)
        :param d_model: model hidden dimension (columns)
        :return: positional encoding table as a float32 tensor
        """
        dims = np.arange(d_model)
        # exponent 2 * (i // 2) / d_model: each even/odd column pair shares one value
        exs = 2 * (dims // 2) / d_model
        angles = np.power(10000, exs)
        # column vector of positions 0 .. n_seq - 1
        pos = np.array([[i] for i in range(n_seq)])
        raw = pos / angles
        # sin on even columns, cos on odd columns
        pos_encoding = np.where(dims % 2 == 0, np.sin(raw), np.cos(raw))
        return tf.cast(pos_encoding, tf.float32)

In [None]:
class Transformer(tf.keras.Model):
    """
    Transformer Class (encoder-decoder with a weight-shared embedding)
    """
    def __init__(self, args, name='transformer'):
        """
        Constructor
        :param args: Args object
        :param name: layer name
        """
        super().__init__(name=name)

        self.i_pad = args.i_pad
        self.embedding = SharedEmbedding(args)
        self.position = PositionalEmbedding(args)

        self.encoder_layers = [EncoderLayer(args, name=f'encoder_layer_{i}') for i in range(args.n_layer)]
        self.decoder_layers = [DecoderLayer(args, name=f'decoder_layer_{i}') for i in range(args.n_layer)]

        self.dropout = tf.keras.layers.Dropout(args.dropout)

    def _embed(self, tokens):
        """
        Token embedding plus positional embedding, followed by dropout.
        :param tokens: token ids
        """
        return self.dropout(self.embedding(tokens) + self.position(tokens))

    def call(self, inputs):
        """
        Run the model
        :param inputs: enc_tokens, dec_tokens tuple
        :return logits: next-token prediction logits for dec_tokens
        """
        enc_tokens, dec_tokens = inputs
        # encoder self attention: hide pad tokens
        enc_self_mask = get_pad_mask(enc_tokens, self.i_pad)
        # decoder self attention: hide pad tokens and future positions
        dec_self_mask = get_causal_mask(dec_tokens, self.i_pad)
        # encoder-decoder attention: hide encoder pad tokens
        enc_dec_mask = get_pad_mask(enc_tokens, self.i_pad)

        # encoder stack
        enc_hidden = self._embed(enc_tokens)
        for encoder_layer in self.encoder_layers:
            enc_hidden = encoder_layer((enc_hidden, enc_self_mask))

        # decoder stack, attending to the final encoder output
        dec_hidden = self._embed(dec_tokens)
        for decoder_layer in self.decoder_layers:
            dec_hidden = decoder_layer((dec_hidden, enc_hidden, dec_self_mask, enc_dec_mask))

        # project onto the vocabulary with the shared embedding (mode=linear)
        return self.embedding(dec_hidden, mode='linear')

In [None]:
def build_model(args):
    """
    Build the functional Keras model around the Transformer.
    :param args: Args object
    :return model: model mapping (enc_inputs, dec_inputs) to softmax probabilities
    """
    # variable-length token id inputs
    enc_inputs = tf.keras.layers.Input((None,))  # (bs, ?)
    dec_inputs = tf.keras.layers.Input((None,))  # (bs, ?)

    logits = Transformer(args)((enc_inputs, dec_inputs))
    # the model outputs probabilities, not logits
    y_pred = tf.keras.layers.Softmax(name="lm")(logits)

    return tf.keras.Model(inputs=(enc_inputs, dec_inputs), outputs=y_pred)

# Data*

In [None]:
# download the dataset file and list the working directory
!wget https://github.com/songys/Chatbot_data/raw/master/ChatbotData.csv
os.listdir('./')

# Loss & Acc*

In [None]:
def lm_loss(y_true, y_pred):
    """
    Sparse categorical cross entropy averaged over non-pad positions.
    :param y_true: label ids (id 0 is treated as pad)
    :param y_pred: predicted probabilities
    :return loss: mean loss over the positions whose label is not 0
    """
    # per-position loss, no reduction
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(reduction=tf.keras.losses.Reduction.NONE)
    token_loss = loss_fn(y_true, y_pred)
    # 0 where the label is 0 (pad), 1 elsewhere
    keep = tf.cast(tf.not_equal(y_true, 0), tf.float32)
    # the denominator is clamped to 1 so an all-pad batch does not divide by zero
    return tf.reduce_sum(token_loss * keep) / tf.maximum(1., tf.reduce_sum(keep))

In [None]:
def lm_acc(y_true, y_pred):
    """
    Token accuracy computed over non-pad positions.
    :param y_true: label ids (id 0 is treated as pad)
    :param y_pred: predicted probabilities
    :return accuracy: fraction of non-pad positions predicted correctly
    """
    labels = tf.cast(y_true, tf.float32)
    # predicted class = argmax over the last axis
    predicted = tf.cast(tf.argmax(y_pred, axis=-1), tf.float32)
    # 0 where the label is 0 (pad), 1 elsewhere
    keep = tf.cast(tf.not_equal(labels, 0), tf.float32)
    hits = tf.cast(tf.equal(labels, predicted), tf.float32) * keep
    # the denominator is clamped to 1 so an all-pad batch does not divide by zero
    return tf.reduce_sum(hits) / tf.maximum(1., tf.reduce_sum(keep))

# Scheduler*

In [None]:
class InverseSquareRootSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    """
    Inverse square root learning rate schedule with linear warmup:
    lr = d_model ** -0.5 * min(step ** -0.5, step * warmup_steps ** -1.5)
    """
    def __init__(self, d_model, warmup_steps=4000):
        """
        Constructor
        :param d_model: model hidden dimension
        :param warmup_steps: warmup steps
        """
        super().__init__()

        self.d_model = tf.cast(d_model, tf.float32)
        self.warmup_steps = tf.cast(warmup_steps, tf.float32)

    def __call__(self, step_num):
        """
        Compute the learning rate
        :param step_num: current step number (int or float, scalar or tensor)
        :return: learning rate as a float32 tensor
        """
        # tf.math.rsqrt only accepts floating point input, while an optimizer
        # may pass its integer iteration counter; cast once up front
        step_num = tf.cast(step_num, tf.float32)
        # arg1 = step_num ** -0.5 (decay branch; inf at step 0, where arg2 = 0 wins)
        arg1 = tf.math.rsqrt(step_num)
        # arg2 = step_num * warmup_steps ** -1.5 (linear warmup branch)
        arg2 = step_num * (self.warmup_steps**-1.5)
        # the smaller branch applies: warmup first, decay afterwards
        arg = tf.math.minimum(arg1, arg2)
        # lr = d_model ** -0.5 * arg
        lr = tf.math.rsqrt(self.d_model) * arg
        return lr

In [None]:
# sanity check: learning rate at step 10 with a 100-step warmup
schedule = InverseSquareRootSchedule(args.d_model, warmup_steps=100)
schedule(float(10))

In [None]:
# compute the learning rate for steps 0..999 with a 100-step warmup
test_schedule = InverseSquareRootSchedule(args.d_model, warmup_steps=100)
lrs = []
for step in range(1000):
    lrs.append(test_schedule(float(step)).numpy())

# draw the learning rate curve
plt.plot(lrs, 'r-', label='learning_rate')
plt.xlabel('Step')
plt.show()

# 실습
- Transformer를 이용한 chatbot을 학습하세요.

## Train 데이터 생성

In [None]:
# load the chatbot data from the CSV file in the working directory
df_train = pd.read_csv('ChatbotData.csv')
df_train

In [None]:
# drop rows that contain null values
df_train = df_train.dropna()
df_train

In [None]:
# Q length: number of SentencePiece pieces per question, and the maximum
q_length = df_train["Q"].astype("str").apply(lambda x:len(vocab.encode_as_pieces(x)))
q_length.head(10), q_length.max()

In [None]:
# A length: number of SentencePiece pieces per answer, and the maximum
a_length = df_train["A"].astype("str").apply(lambda x:len(vocab.encode_as_pieces(x)))
a_length.head(10), a_length.max()

In [None]:
def make_data(df, vocab, n_enc_seq, n_dec_seq):
    """
    Build the chat training arrays from the 'Q' and 'A' columns of a data frame.
    :param df: data frame with 'Q' and 'A' columns
    :param vocab: vocab used to encode the text into ids
    :param n_enc_seq: number of encoder sequence
    :param n_dec_seq: number of decoder sequence
    :return enc_inputs: encoder input data
    :return dec_inputs: decoder input data ([BOS] + answer)
    :return dec_labels: decoder label data (answer + [EOS])
    """
    def pad(ids, length):
        # right-pad with id 0 up to length; lists already that long are unchanged
        return ids + [0] * (length - len(ids))

    enc_inputs, dec_inputs, dec_labels = [], [], []
    for _, row in tqdm(df.iterrows(), total=len(df)):
        # tokenize and truncate; the answer keeps one slot free for [BOS] or [EOS]
        question_ids = vocab.encode_as_ids(row['Q'])[:n_enc_seq]
        answer_ids = vocab.encode_as_ids(row['A'])[:n_dec_seq - 1]
        enc_inputs.append(pad(question_ids, n_enc_seq))
        # decoder input and label are the same answer shifted by one position
        dec_inputs.append(pad([vocab.bos_id()] + answer_ids, n_dec_seq))
        dec_labels.append(pad(answer_ids + [vocab.eos_id()], n_dec_seq))
    # to numpy array
    return np.array(enc_inputs), np.array(dec_inputs), np.array(dec_labels)

In [None]:
# build the training arrays with encoder length 27 and decoder length 40
train_enc_inputs, train_dec_inputs, train_dec_labels = make_data(df_train, vocab, 27, 40)
train_enc_inputs, train_dec_inputs, train_dec_labels

## Modeling

In [None]:
# build the model and render its graph to model.png
model = build_model(args)
tf.keras.utils.plot_model(model, 'model.png', show_shapes=True)

In [None]:
# smoke test: forward pass of the freshly built model on the first 4 samples
model.predict((train_enc_inputs[:4], train_dec_inputs[:4]))

## Train

In [None]:
# build a new model instance for training and render its graph to model.png
model = build_model(args)
tf.keras.utils.plot_model(model, 'model.png', show_shapes=True)

In [None]:
# Adam driven by the inverse square root schedule with a 1000-step warmup
learning_rate = InverseSquareRootSchedule(args.d_model, warmup_steps=1000)
optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

In [None]:
# compile with the pad-masked loss and accuracy
model.compile(loss=lm_loss, optimizer=optimizer, metrics=[lm_acc])

In [None]:
# stop when 'lm_acc' has not improved for 30 epochs
# NOTE(review): no explicit mode is given here, unlike the checkpoint below —
# confirm Keras infers "max" for this metric name
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='lm_acc', patience=30)
# keep only the weights of the epoch with the highest 'lm_acc'
save_weights = tf.keras.callbacks.ModelCheckpoint(os.path.join(sychat_dir, "transformer.hdf5"),
                                                  monitor='lm_acc',
                                                  verbose=1,
                                                  save_best_only=True,
                                                  mode="max",
                                                  save_freq="epoch",
                                                  save_weights_only=True)
# write the training log to transformer.csv
csv_logger = tf.keras.callbacks.CSVLogger(os.path.join(sychat_dir, "transformer.csv"))

In [None]:
# train for up to 100 epochs with batch size 256; no validation data is passed
history = model.fit((train_enc_inputs, train_dec_inputs),
                    train_dec_labels,
                    epochs=100,
                    batch_size=256,
                    callbacks=[early_stopping, save_weights, csv_logger])

In [None]:
# plot the training curves side by side
plt.figure(figsize=(12, 4))

# left: training loss
plt.subplot(1, 2, 1)
plt.plot(history.history['loss'], 'b-', label='loss')

# right: training accuracy
plt.subplot(1, 2, 2)
plt.plot(history.history['lm_acc'], 'g-', label='accuracy')

plt.show()

## BLEU

In [None]:
import nltk.translate.bleu_score as bleu
from nltk import ngrams

## 평가

In [None]:
# build a new model instance and load the saved weights into it
model = build_model(args)
model.load_weights(os.path.join(sychat_dir, "transformer.hdf5"))

In [None]:
# compile with the pad-masked loss and accuracy before calling evaluate()
model.compile(loss=lm_loss, optimizer=optimizer, metrics=[lm_acc])

In [None]:
# 100개만 확인
valid_enc_inputs = train_enc_inputs[:100]
valid_dec_inputs = train_dec_inputs[:100]
valid_dec_labels = train_dec_labels[:100]

In [None]:
# 평가
model.evaluate((valid_enc_inputs, valid_dec_inputs), valid_dec_labels, batch_size=128)

In [None]:
# reference sequences: label ids of each row up to (not including) the first EOS
references = []
for row in valid_dec_labels:
    ids = []
    for i in row:
        # stop at EOS; everything after it is dropped
        if i == vocab.eos_id():
            break
        ids.append(int(i))
    # convert ids to pieces — presumably returns a list of pieces; verify
    string = vocab.id_to_piece(ids)
    references.append(string)
references

In [None]:
# 예측
y_pred = model.predict((valid_enc_inputs, valid_dec_inputs))
y_pred

In [None]:
# greedy decoding
y_pred_class = tf.argmax(y_pred, axis=-1).numpy()
y_pred_class

In [None]:
# candidate sequences: predicted ids of each row up to (not including) the first EOS
candidates = []
for row in y_pred_class:
    ids = []
    for i in row:
        # stop at the first predicted EOS
        if i == vocab.eos_id():
            break
        ids.append(int(i))
    # convert ids to pieces — presumably returns a list of pieces; verify
    string = vocab.id_to_piece(ids)
    candidates.append(string)
candidates

In [1]:
# sentence-level BLEU of every candidate against its single reference
bleu_scores = []
for reference, candidate in zip(references, candidates):
    bleu_score = bleu.sentence_bleu([reference], candidate)
    bleu_scores.append(bleu_score)
    print(bleu_score, ":", reference, "/", candidate)
# mean BLEU over all pairs
np.mean(bleu_scores)

NameError: ignored

## 배포

In [None]:
# build a new model instance and load the saved weights for chatting
model = build_model(args)
model.load_weights(os.path.join(sychat_dir, "transformer.hdf5"))

In [None]:
def do_chat(vocab, model, n_dec_seq, string):
    """
    Answer one question by greedy decoding, one token per model call.
    :param vocab: vocab (provides encode_as_ids, bos_id, eos_id, decode_ids)
    :param model: model whose predict() takes (encoder ids, decoder ids)
    :param n_dec_seq: number of decoder sequence; at most n_dec_seq - 1 tokens are generated
    :param string: input string
    :return: decoded answer string
    """
    # question ids
    q_id = vocab.encode_as_ids(string)
    # the answer starts with [BOS] and grows by one token per step
    a_id = [vocab.bos_id()]

    for step in range(n_dec_seq - 1):
        outputs = model.predict((np.array([q_id]), np.array([a_id])))
        # the output at index step is the prediction that follows the current last token
        word_id = int(np.argmax(outputs[0][step]))
        if word_id == vocab.eos_id():
            break
        a_id.append(word_id)

    # drop the leading [BOS] before decoding
    return vocab.decode_ids(a_id[1:])

In [None]:
# single example: generate an answer with a decoder length limit of 40
string = '남에게 피해주지 않는 건 기본이죠.'
do_chat(vocab, model, 40, string)

In [None]:
# interactive chat: keep answering until an empty line is entered
while True:
    string = input('질문 > ').strip()
    if not string:
        break
    predict_str = do_chat(vocab, model, 40, string)
    print(f'답변 > {predict_str}')