# Env*

In [2]:
# imports
import argparse
import os
import random
import shutil
import json
import zipfile
import math
import copy
import collections
import re

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import sentencepiece as spm
import tensorflow as tf
import tensorflow.keras.backend as K

from tqdm.notebook import tqdm, trange

In [None]:
# Environment configuration
args = {
    # random seed value
    "seed": 1234
}
# Wrap the dict in a Namespace so settings are read as attributes (e.g. args.seed)
args = argparse.Namespace(**args)

print(args)

In [4]:
# Random seed configuration
def set_seed(seed):
    """
    Seed the random number generators of Python's `random`, NumPy and TensorFlow.
    :param seed: seed value handed unchanged to each generator
    """
    # Same order as before: random -> numpy -> tensorflow
    for seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
        seed_fn(seed)

In [None]:
# Check GPU usage (IPython shell escape: runs the `nvidia-smi` command)
!nvidia-smi

In [None]:
# Data directory
# NOTE(review): hard-coded Google Drive path — presumably requires Drive to be
# mounted in Colab; adjust when running elsewhere.
data_dir = '/content/drive/MyDrive/문서/강의계획서/삼성전기/삼성전기.20220228/data'
# Show the directory contents
os.listdir(data_dir)

In [None]:
# Directory for the chat data; create it when missing.
# exist_ok=True replaces the separate isdir() check and avoids the
# check-then-create race.
chat_dir = os.path.join(data_dir, "chat")
os.makedirs(chat_dir, exist_ok=True)
os.listdir(chat_dir)

# Vocabulary*

In [None]:
# Load the SentencePiece model stored under <data_dir>/kowiki
vocab = spm.SentencePieceProcessor()
vocab.load(os.path.join(data_dir, 'kowiki', 'kowiki_32000.model'))

# Tutorial

In [9]:
# Input sentences: each entry is a [source, target] pair
sentences = [
    ['나는 오늘 기분이 좋아', '네가 기분이 좋으니 나도 좋아'],
    ['나는 오늘 행복해', '나도 행복하다'],
]

In [None]:
# Build the train source / target token-id lists
# NOTE(review): `tarin_tgt_ids` is a misspelling of `train_tgt_ids`; the name is
# kept as-is because the following cell reads it under this spelling.
train_src_ids, tarin_tgt_ids = [], []
for pair in sentences:
    # pair[0] is the source sentence, pair[1] the target sentence
    train_src_ids.append(vocab.encode_as_ids(pair[0]))
    tarin_tgt_ids.append(vocab.encode_as_ids(pair[1]))

train_src_ids, tarin_tgt_ids

In [None]:
# Build train enc_inputs, dec_inputs and dec_labels
train_enc_inputs, train_dec_inputs, train_dec_labels = [], [], []
for source_id, target_id in zip(train_src_ids, tarin_tgt_ids):
    train_enc_inputs.append(source_id)
    # Decoder input is the target prefixed with BOS; the label is the target
    # suffixed with EOS, i.e. the decoder input shifted by one position.
    train_dec_inputs.append([vocab.bos_id()] + target_id)
    train_dec_labels.append(target_id + [vocab.eos_id()])

train_enc_inputs, train_dec_inputs, train_dec_labels

In [None]:
# Pad every encoder input row with 0 up to length 5
# (a row already longer than 5 is left as-is: [0] * negative yields an empty list)
for row in train_enc_inputs:
    row += [0] * (5 - len(row))

# Pad every decoder input row with 0 up to length 10
for row in train_dec_inputs:
    row += [0] * (10 - len(row))

# Pad every decoder label row with 0 up to length 10
for row in train_dec_labels:
    row += [0] * (10 - len(row))

train_enc_inputs, train_dec_inputs, train_dec_labels

In [None]:
# Convert the padded lists to numpy arrays
train_enc_inputs = np.array(train_enc_inputs)
train_dec_inputs = np.array(train_dec_inputs)
train_dec_labels = np.array(train_dec_labels)

train_enc_inputs, train_dec_inputs, train_dec_labels

In [14]:
# Embedding layer that turns token ids into 4-dimensional vectors
embedding = tf.keras.layers.Embedding(len(vocab), 4)

In [None]:
# Word vectors for the encoder inputs
enc_hidden = embedding(train_enc_inputs)  # (bs, n_seq, 4)
enc_hidden

In [16]:
# Encoder LSTM: returns the output at every time step plus the final hidden and cell states
enc_lstm = tf.keras.layers.LSTM(units=5, return_sequences=True, return_state=True)

In [None]:
# Run the encoder LSTM
enc_hidden, enc_h_state, enc_c_state = enc_lstm(enc_hidden)
enc_hidden, enc_h_state, enc_c_state

In [None]:
# Word vectors for the decoder inputs (same embedding layer as the encoder)
dec_hidden = embedding(train_dec_inputs)  # (bs, n_seq, 4)
dec_hidden

In [19]:
# Decoder LSTM: returns the output at every time step
dec_lstm = tf.keras.layers.LSTM(units=5, return_sequences=True)

In [None]:
# Run the decoder LSTM, initialised with the encoder's final hidden and cell states
dec_hidden = dec_lstm(dec_hidden, initial_state=[enc_h_state, enc_c_state])
dec_hidden

Attention score: $e = s^Th \in \mathbb{R}^{m \times n}$

In [None]:
# Attention score: dot product of every decoder output with every encoder output
attn_score = tf.matmul(dec_hidden, enc_hidden, transpose_b=True)
attn_score

Attention prob: $\alpha = softmax(e) \in \mathbb{R}^{m \times n}$

In [None]:
# Attention probability: softmax of the scores over the last axis
attn_prob = tf.nn.softmax(attn_score, axis=-1)
attn_prob

Attention output: $a = h \alpha^T \in \mathbb{R}^{h \times m}$

In [None]:
# Attention output written as in the formula (both operands transposed in their last two axes)
attn_out = tf.matmul(enc_hidden, attn_prob, transpose_a=True, transpose_b=True)
attn_out

In [None]:
# Swap the last two axes of the attention output
tf.transpose(attn_out, perm=[0, 2, 1])

In [None]:
# Attention output in transposed form: attn_prob @ enc_hidden gives one
# weighted sum of encoder outputs per decoder position
attn_out = tf.matmul(attn_prob, enc_hidden)
attn_out

In [None]:
# Concatenate the attention output and the decoder output along the last axis
cat_hidden = tf.concat([attn_out, dec_hidden], axis=-1)
cat_hidden

In [27]:
# Next-word prediction layer: softmax over the vocabulary
linear = tf.keras.layers.Dense(len(vocab), activation=tf.nn.softmax)

In [None]:
# Run next-word prediction
y_pred = linear(cat_hidden)
y_pred

In [None]:
# Cross-entropy loss (no padding mask is applied here, so pad positions count too)
tf.keras.losses.SparseCategoricalCrossentropy()(train_dec_labels, y_pred)

In [None]:
# Model hyper-parameters: vocabulary size and model (embedding / LSTM) dimension
args.n_vocab = len(vocab)
args.d_model = 32
args

## 실습
- 아래 모델에 Dot Product Attention을 적용해 보세요.

In [31]:
def build_model(args):
    """
    Build an LSTM encoder-decoder model with dot-product attention.
    :param args: configuration object providing `n_vocab` (vocabulary size) and
                 `d_model` (embedding size, also used as the LSTM unit count)
    :return model: tf.keras.Model taking (enc_inputs, dec_inputs) and producing
                   per-position softmax probabilities over the vocabulary
    """
    enc_inputs = tf.keras.layers.Input((None,), name="enc_inputs")  # (bs, n_enc_seq)
    dec_inputs = tf.keras.layers.Input((None,), name="dec_inputs")  # (bs, n_dec_seq)
    ################################
    # Embedding
    ################################
    # A single embedding layer is shared by the encoder and the decoder
    embedding = tf.keras.layers.Embedding(args.n_vocab, args.d_model, name="embedding")
    enc_hidden = embedding(enc_inputs)  # (bs, n_enc_seq, d_model)
    dec_hidden = embedding(dec_inputs)  # (bs, n_dec_seq, d_model)
    ################################
    # Encoder
    ################################
    # return_sequences=True is required: attention needs one encoder output per
    # time step. Without it enc_hidden is only the last output, (bs, units), and
    # the attention matmuls below no longer operate over encoder positions.
    enc_lstm = tf.keras.layers.LSTM(units=args.d_model, return_sequences=True, return_state=True, name="enc_lstm")
    enc_hidden, enc_h_state, enc_c_state = enc_lstm(enc_hidden)  # (bs, n_enc_seq, units), (bs, units), (bs, units)
    ################################
    # Decoder
    ################################
    # The decoder starts from the encoder's final hidden and cell states
    dec_lstm = tf.keras.layers.LSTM(units=args.d_model, return_sequences=True, name="dec_lstm")
    dec_hidden = dec_lstm(dec_hidden, initial_state=[enc_h_state, enc_c_state])  # (bs, n_dec_seq, units)
    ################################
    # Attention
    ################################
    attn_score = tf.matmul(dec_hidden, enc_hidden, transpose_b=True)  # (bs, n_dec_seq, n_enc_seq)
    attn_prob = tf.nn.softmax(attn_score, axis=-1)  # (bs, n_dec_seq, n_enc_seq)
    attn_out = tf.matmul(attn_prob, enc_hidden)  # (bs, n_dec_seq, units)
    cat_hidden = tf.concat([attn_out, dec_hidden], axis=-1)  # (bs, n_dec_seq, units * 2)
    ################################
    # Next word prediction
    ################################
    linear = tf.keras.layers.Dense(args.n_vocab, activation=tf.nn.softmax, name="out_linear")
    y_pred = linear(cat_hidden)  # (bs, n_dec_seq, n_vocab)
    ################################
    # Model
    ################################
    model = tf.keras.Model(inputs=(enc_inputs, dec_inputs), outputs=y_pred)
    return model

In [None]:
# Build the model and render its graph (with shapes) to model.png
model = build_model(args)
tf.keras.utils.plot_model(model, 'model.png', show_shapes=True)

In [None]:
# Run a forward pass of the model on the tutorial arrays
model.predict((train_enc_inputs, train_dec_inputs))

# 실습
- 아래 데이터를 이용해서 attention이 적용된 대화모델 프로젝트를 구성해 보세요.

In [34]:
# Input sentences
sentences = [
    ['안녕 만나서 반가워', '안녕하세요 반갑습니다'],
    ['너는 누구니', '저는 마음을 주는 위로봇 입니다'],
]

# Data*

In [None]:
# Download the file and list the current directory
!wget https://github.com/songys/Chatbot_data/raw/master/ChatbotData.csv
os.listdir('./')

# Loss & Acc*

In [None]:
def lm_loss(y_true, y_pred):
    """
    Sparse categorical cross-entropy averaged over non-pad positions only.
    :param y_true: ground-truth ids (positions equal to 0 are treated as pad)
    :param y_pred: predicted values
    :return loss: loss value with the pad positions excluded
    """
    # Per-position loss, no reduction
    ce_fn = tf.keras.losses.SparseCategoricalCrossentropy(reduction=tf.keras.losses.Reduction.NONE)
    per_position = ce_fn(y_true, y_pred)
    # 1.0 where the label is a real token, 0.0 where it is pad
    is_token = tf.not_equal(y_true, 0)
    weights = tf.cast(is_token, tf.float32)
    # Zero out the pad positions
    per_position = per_position * weights
    # Divide by the number of real tokens (at least 1 to avoid division by zero)
    n_tokens = tf.maximum(1., tf.reduce_sum(weights))
    return tf.reduce_sum(per_position) / n_tokens

In [None]:
def lm_acc(y_true, y_pred):
    """
    Accuracy computed over non-pad positions only.
    :param y_true: ground-truth ids (positions equal to 0 are treated as pad)
    :param y_pred: predicted values; the class is taken as argmax over the last axis
    :return accuracy: accuracy value with the pad positions excluded
    """
    labels = tf.cast(y_true, tf.float32)
    # Predicted class per position
    predicted = tf.cast(tf.argmax(y_pred, axis=-1), tf.float32)
    # 1.0 where the label is a real token, 0.0 where it is pad
    weights = tf.cast(tf.not_equal(labels, 0), tf.float32)
    # Correct predictions, with pad positions zeroed out
    hits = tf.cast(tf.equal(labels, predicted), tf.float32) * weights
    # Divide by the number of real tokens (at least 1 to avoid division by zero)
    n_tokens = tf.maximum(1., tf.reduce_sum(weights))
    return tf.reduce_sum(hits) / n_tokens

# 실습
- 전체 데이터를 이용해 attention이 적용된 chatbot을 학습해보세요.

## Train 데이터 생성

## Modeling

## Train

## 평가

## 배포