# Env*

In [2]:
# imports
import argparse
import os
import random
import shutil
import json
import zipfile
import math
import copy
import collections
import re

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import sentencepiece as spm
import tensorflow as tf
import tensorflow.keras.backend as K

from tqdm.notebook import tqdm, trange

In [None]:
# environment configuration
args = argparse.Namespace(
    # random seed value
    seed=1234,
)

print(args)

In [4]:
# random seed 설정
def set_seed(seed):
    """
    Seed the Python, NumPy and TensorFlow random number generators.
    :param seed: seed value handed to each generator
    """
    # same order as before: python -> numpy -> tensorflow
    for seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
        seed_fn(seed)

In [None]:
# gpu 사용량 확인
!nvidia-smi

In [None]:
# data dir
data_dir = '/content/drive/MyDrive/문서/강의계획서/삼성전기/삼성전기.20220228/data'
os.listdir(data_dir)

In [None]:
# directory that holds the chat model artifacts (weights, training log)
chat_dir = os.path.join(data_dir, "chat")
# exist_ok avoids the separate isdir check and the gap between checking and creating
os.makedirs(chat_dir, exist_ok=True)
os.listdir(chat_dir)

# Vocabulary*

In [None]:
# vocab loading
vocab = spm.SentencePieceProcessor()
vocab.load(os.path.join(data_dir, 'kowiki', 'kowiki_32000.model'))

# Tutorial

In [9]:
# 입력 문장
sentences = [
    ['나는 오늘 기분이 좋아', '네가 기분이 좋으니 나도 좋아'],
    ['나는 오늘 행복해', '나도 행복하다'],
]

In [None]:
# build the train source / target id sequences
# NOTE(review): `tarin_tgt_ids` looks like a typo of `train_tgt_ids`; the spelling is kept
# because another cell in this notebook reads the variable under this exact name
train_src_ids, tarin_tgt_ids = [], []
for pair in sentences:
    # pair[0] is the source sentence, pair[1] is the target sentence
    train_src_ids.append(vocab.encode_as_ids(pair[0]))
    tarin_tgt_ids.append(vocab.encode_as_ids(pair[1]))

train_src_ids, tarin_tgt_ids

In [None]:
train_src_ids

In [None]:
# build train enc_inputs, dec_inputs and dec_labels
train_enc_inputs, train_dec_inputs, train_dec_labels = [], [], []

for source_id, target_id in zip(train_src_ids, tarin_tgt_ids):
    # the same list object is appended (no copy), so in-place edits of these rows
    # are also visible through train_src_ids
    train_enc_inputs.append(source_id)
    # decoder input: target ids prefixed with BOS
    train_dec_inputs.append([vocab.bos_id()] + target_id)
    # decoder label: target ids followed by EOS
    train_dec_labels.append(target_id + [vocab.eos_id()])

train_enc_inputs, train_dec_inputs, train_dec_labels

In [None]:
# right-pad every encoder input with 0 up to length 5
for row in train_enc_inputs:
    row.extend([0] * (5 - len(row)))

# right-pad every decoder input with 0 up to length 10
for row in train_dec_inputs:
    row.extend([0] * (10 - len(row)))

# right-pad every decoder label with 0 up to length 10
for row in train_dec_labels:
    row.extend([0] * (10 - len(row)))

train_enc_inputs, train_dec_inputs, train_dec_labels

In [None]:
# numpy array로 변환
train_enc_inputs = np.array(train_enc_inputs)
train_dec_inputs = np.array(train_dec_inputs)
train_dec_labels = np.array(train_dec_labels)

train_enc_inputs, train_dec_inputs, train_dec_labels

In [15]:
# 단어를 벡터로 변환
embedding = tf.keras.layers.Embedding(len(vocab), 4, mask_zero=True)

In [None]:
# encoder inputs 단어 벡터
enc_hidden = embedding(train_enc_inputs)  # (bs, n_seq, 4)
enc_hidden

In [None]:
enc_mask = embedding.compute_mask(train_enc_inputs)
enc_mask

In [18]:
# encoder LSTM
enc_lstm = tf.keras.layers.LSTM(units=5, return_state=True)

In [None]:
# encoder LSTM 실행
enc_hidden, enc_h_state, enc_c_state = enc_lstm(enc_hidden, mask=enc_mask)
enc_hidden, enc_h_state, enc_c_state

In [None]:
# decoder inputs 단어 벡터
dec_hidden = embedding(train_dec_inputs)  # (bs, n_seq, 4)
dec_hidden

In [None]:
dec_mask = embedding.compute_mask(train_dec_inputs)
dec_mask

In [22]:
# decoder LSTM
dec_lstm = tf.keras.layers.LSTM(units=5, return_sequences=True)

In [None]:
# decoder LSTM 실행
dec_hidden = dec_lstm(dec_hidden, mask=dec_mask, initial_state=[enc_h_state, enc_c_state])
dec_hidden

In [24]:
# 다음단어 예측 layer
linear = tf.keras.layers.Dense(len(vocab), activation=tf.nn.softmax)

In [None]:
# 다음단어 예측 실행
y_pred = linear(dec_hidden)
y_pred

In [None]:
# CE loss
tf.keras.losses.SparseCategoricalCrossentropy()(train_dec_labels, y_pred)

In [None]:
args.n_vocab = len(vocab)
args.d_model = 32
args

In [28]:
def build_model(args):
    """
    Build the LSTM encoder-decoder (seq2seq) model.
    :param args: namespace providing n_vocab (vocabulary size) and
                 d_model (embedding size, also used as the LSTM unit count)
    :return model: Keras model mapping (enc_inputs, dec_inputs) to a
                   per-position softmax over the vocabulary
    """
    enc_inputs = tf.keras.layers.Input((None,), name="enc_inputs")  # (bs, n_enc_seq)
    dec_inputs = tf.keras.layers.Input((None,), name="dec_inputs")  # (bs, n_dec_seq)

    # one embedding shared by encoder and decoder; id 0 (pad) is masked
    embedding = tf.keras.layers.Embedding(args.n_vocab, args.d_model, mask_zero=True)

    # encoder: only the final hidden / cell states are used
    enc_hidden = embedding(enc_inputs)  # (bs, n_enc_seq, d_model)
    enc_lstm = tf.keras.layers.LSTM(units=args.d_model, return_state=True)
    _, enc_h_state, enc_c_state = enc_lstm(enc_hidden)

    # decoder: starts from the encoder states and emits one vector per position
    dec_hidden = embedding(dec_inputs)  # (bs, n_dec_seq, d_model)
    dec_lstm = tf.keras.layers.LSTM(units=args.d_model, return_sequences=True)
    dec_hidden = dec_lstm(dec_hidden, initial_state=[enc_h_state, enc_c_state])

    # next-token distribution over the vocabulary
    linear = tf.keras.layers.Dense(args.n_vocab, activation=tf.nn.softmax)
    y_pred = linear(dec_hidden)  # (bs, n_dec_seq, n_vocab)

    model = tf.keras.Model(inputs=(enc_inputs, dec_inputs), outputs=y_pred)
    return model

In [None]:
model = build_model(args)
tf.keras.utils.plot_model(model, 'model.png', show_shapes=True)

In [None]:
model.predict((train_enc_inputs, train_dec_inputs))

# 실습
- 아래 데이터를 이용해서 대화모델 프로젝트를 구성해 보세요.

In [31]:
# 입력 문장
sentences = [
    ['안녕 만나서 반가워', '안녕하세요 반갑습니다'],
    ['너는 누구니', '저는 마음을 주는 위로봇 입니다'],
]

# Data*

In [None]:
# 파일 다운로드 및 목록 확인
!wget https://github.com/songys/Chatbot_data/raw/master/ChatbotData.csv
os.listdir('./')

# Loss & Acc*

In [33]:
def lm_loss(y_true, y_pred):
    """
    Sparse categorical cross-entropy averaged over non-pad positions only.
    :param y_true: label ids; positions equal to 0 are treated as pad
    :param y_pred: predictions handed to SparseCategoricalCrossentropy
    :return loss: sum of the non-pad losses divided by max(1, number of non-pad positions)
    """
    # element-wise loss, no reduction yet
    cross_entropy = tf.keras.losses.SparseCategoricalCrossentropy(reduction=tf.keras.losses.Reduction.NONE)
    per_token = cross_entropy(y_true, y_pred)
    # 0 where the label is pad (id 0), 1 everywhere else
    keep = tf.cast(tf.not_equal(y_true, 0), tf.float32)
    # zero out pad positions, then average over the kept ones
    kept_total = tf.reduce_sum(per_token * keep)
    kept_count = tf.maximum(1., tf.reduce_sum(keep))
    return kept_total / kept_count

In [34]:
def lm_acc(y_true, y_pred):
    """
    Accuracy computed over non-pad positions only.
    :param y_true: label ids; positions equal to 0 are treated as pad
    :param y_pred: predictions; the class is taken with argmax over the last axis
    :return accuracy: number of non-pad matches divided by max(1, number of non-pad positions)
    """
    labels = tf.cast(y_true, tf.float32)
    # predicted class per position
    predicted = tf.cast(tf.argmax(y_pred, axis=-1), tf.float32)
    # 0 where the label is pad (id 0), 1 everywhere else
    keep = tf.cast(tf.not_equal(labels, 0), tf.float32)
    # 1 where prediction equals label, with pad positions zeroed out
    hits = tf.cast(tf.equal(labels, predicted), tf.float32) * keep
    return tf.reduce_sum(hits) / tf.maximum(1., tf.reduce_sum(keep))

# Small Data Project

## Train 데이터 생성

In [None]:
# data load
df_train = pd.read_csv('ChatbotData.csv')
df_train

In [None]:
# null 제거
df_train = df_train.dropna()
df_train

In [None]:
# Q 길이
q_length = df_train["Q"].astype("str").apply(lambda x:len(vocab.encode_as_pieces(x)))
q_length.head(10), q_length.max()

In [None]:
# A 길이
a_length = df_train["A"].astype("str").apply(lambda x:len(vocab.encode_as_pieces(x)))
a_length.head(10), a_length.max()

In [None]:
# 랜덤하게 10개만 확인
df_train = df_train.sample(10)
df_train

### Tutorial

In [40]:
n_enc_seq = 27
n_dec_seq = 37

n_enc_max = n_enc_seq
n_dec_max = n_dec_seq - 1  # [BOS] or [EOS]
df = df_train

In [None]:
# 데이터 생성
for i, row in tqdm(df.iterrows(), total=len(df)):
    Q = row['Q']
    A = row['A']
    print(Q, '/', A)

In [None]:
# 데이터 생성
for i, row in tqdm(df.iterrows(), total=len(df)):
    Q = row['Q']
    A = row['A']
    # tokenize
    tokens_q = vocab.encode_as_pieces(Q)
    tokens_a = vocab.encode_as_pieces(A)
    # 최대 길이로 자르기
    tokens_q = tokens_q[:n_enc_max]
    tokens_a = tokens_a[:n_dec_max]
    print(tokens_q, '/', tokens_a)

In [None]:
# 데이터 생성
for i, row in tqdm(df.iterrows(), total=len(df)):
    Q = row['Q']
    A = row['A']
    # tokenize
    tokens_q = vocab.encode_as_pieces(Q)
    tokens_a = vocab.encode_as_pieces(A)
    # 최대 길이로 자르기
    tokens_q = tokens_q[:n_enc_max]
    tokens_a = tokens_a[:n_dec_max]
    # input & label 정의
    enc_input = tokens_q
    dec_input = [vocab.bos_id()] + tokens_a
    dec_label = tokens_a + [vocab.eos_id()]
    print(enc_input, '/', dec_input, '/', dec_label)

In [None]:
enc_inputs = []
dec_inputs = []
dec_labels = []
# build the data row by row
for i, row in tqdm(df.iterrows(), total=len(df)):
    Q = row['Q']
    A = row['A']
    # tokenize to ids
    tokens_q = vocab.encode_as_ids(Q)
    tokens_a = vocab.encode_as_ids(A)
    # truncate to the maximum lengths
    tokens_q = tokens_q[:n_enc_max]
    tokens_a = tokens_a[:n_dec_max]
    # define input & label: decoder input starts with BOS, decoder label ends with EOS
    enc_input = tokens_q
    dec_input = [vocab.bos_id()] + tokens_a
    dec_label = tokens_a + [vocab.eos_id()]
    # right-pad with 0 up to the fixed sequence lengths
    enc_input += [0] * (n_enc_seq - len(enc_input))
    dec_input += [0] * (n_dec_seq - len(dec_input))
    dec_label += [0] * (n_dec_seq - len(dec_label))
    print(enc_input, '/', dec_input, '/', dec_label)
    # store the values
    enc_inputs.append(enc_input)
    dec_inputs.append(dec_input)
    dec_labels.append(dec_label)

### 실습
- 아래 함수를 완성하세요.

In [45]:
def make_data(df, vocab, n_enc_seq, n_dec_seq):
    """
    Build chat training data from question / answer pairs.
    :param df: data frame with a 'Q' (question) and an 'A' (answer) column
    :param vocab: tokenizer providing encode_as_ids, bos_id and eos_id
    :param n_enc_seq: number of encoder sequence
    :param n_dec_seq: number of decoder sequence
    :return enc_inputs: encoder input data
    :return dec_inputs: decoder input data
    :return dec_labels: decoder label data
    """
    n_enc_max = n_enc_seq
    n_dec_max = n_dec_seq - 1  # one slot is reserved for [BOS] or [EOS]
    # inputs & labels
    enc_inputs = []
    dec_inputs = []
    dec_labels = []

    bos_id = vocab.bos_id()
    eos_id = vocab.eos_id()

    for question, answer in zip(df['Q'], df['A']):
        # tokenize to ids and truncate to the maximum lengths
        tokens_q = vocab.encode_as_ids(str(question))[:n_enc_max]
        tokens_a = vocab.encode_as_ids(str(answer))[:n_dec_max]
        # decoder input starts with BOS, decoder label ends with EOS
        enc_input = list(tokens_q)
        dec_input = [bos_id] + list(tokens_a)
        dec_label = list(tokens_a) + [eos_id]
        # right-pad with 0 up to the fixed sequence lengths
        enc_input += [0] * (n_enc_seq - len(enc_input))
        dec_input += [0] * (n_dec_seq - len(dec_input))
        dec_label += [0] * (n_dec_seq - len(dec_label))
        # store the values
        enc_inputs.append(enc_input)
        dec_inputs.append(dec_input)
        dec_labels.append(dec_label)

    # to numpy array
    enc_inputs = np.array(enc_inputs)
    dec_inputs = np.array(dec_inputs)
    dec_labels = np.array(dec_labels)
    return enc_inputs, dec_inputs, dec_labels

In [None]:
train_enc_inputs, train_dec_inputs, train_dec_labels = make_data(df_train, vocab, 27, 37)
train_enc_inputs, train_dec_inputs, train_dec_labels

## Modeling

In [None]:
args.n_vocab = len(vocab)
args.d_model = 256
args

In [None]:
def build_model(args):
    """
    Build the LSTM encoder-decoder (seq2seq) model.
    :param args: namespace providing n_vocab (vocabulary size) and
                 d_model (embedding size, also used as the LSTM unit count)
    :return model: Keras model mapping (enc_inputs, dec_inputs) to a
                   per-position softmax over the vocabulary
    """
    enc_inputs = tf.keras.layers.Input((None,), name="enc_inputs")  # (bs, n_enc_seq)
    dec_inputs = tf.keras.layers.Input((None,), name="dec_inputs")  # (bs, n_dec_seq)

    # one embedding shared by encoder and decoder; id 0 (pad) is masked
    embedding = tf.keras.layers.Embedding(args.n_vocab, args.d_model, mask_zero=True)

    # encoder: only the final hidden / cell states are used
    enc_hidden = embedding(enc_inputs)  # (bs, n_enc_seq, d_model)
    enc_lstm = tf.keras.layers.LSTM(units=args.d_model, return_state=True)
    _, enc_h_state, enc_c_state = enc_lstm(enc_hidden)

    # decoder: starts from the encoder states and emits one vector per position
    dec_hidden = embedding(dec_inputs)  # (bs, n_dec_seq, d_model)
    dec_lstm = tf.keras.layers.LSTM(units=args.d_model, return_sequences=True)
    dec_hidden = dec_lstm(dec_hidden, initial_state=[enc_h_state, enc_c_state])

    # next-token distribution over the vocabulary
    linear = tf.keras.layers.Dense(args.n_vocab, activation=tf.nn.softmax)
    y_pred = linear(dec_hidden)  # (bs, n_dec_seq, n_vocab)

    model = tf.keras.Model(inputs=(enc_inputs, dec_inputs), outputs=y_pred)
    return model

In [None]:
model = build_model(args)
tf.keras.utils.plot_model(model, 'model.png', show_shapes=True)

In [None]:
model.predict((train_enc_inputs[:4], train_dec_inputs[:4]))

## Train

In [None]:
model = build_model(args)
tf.keras.utils.plot_model(model, 'model.png', show_shapes=True)

In [51]:
model.compile(loss=lm_loss, optimizer="adam", metrics=[lm_acc])

In [52]:
# stop when lm_acc has not improved for 30 epochs; the direction is given explicitly
# (same as the checkpoint below) instead of relying on automatic mode inference for a custom metric
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='lm_acc', mode="max", patience=30)
# keep only the weights of the epoch with the best lm_acc
save_weights = tf.keras.callbacks.ModelCheckpoint(os.path.join(chat_dir, "lstm.hdf5"),
                                                  monitor='lm_acc',
                                                  verbose=1,
                                                  save_best_only=True,
                                                  mode="max",
                                                  save_freq="epoch",
                                                  save_weights_only=True)
# per-epoch training log
csv_logger = tf.keras.callbacks.CSVLogger(os.path.join(chat_dir, "lstm.csv"))

In [None]:
history = model.fit((train_enc_inputs, train_dec_inputs),
                    train_dec_labels,
                    epochs=100,
                    batch_size=64,
                    callbacks=[early_stopping, save_weights, csv_logger])

In [None]:
plt.figure(figsize=(12, 4))

plt.subplot(1, 2, 1)
plt.plot(history.history['loss'], 'b-', label='loss')

plt.subplot(1, 2, 2)
plt.plot(history.history['lm_acc'], 'g-', label='accuracy')

plt.show()

## BLEU

In [55]:
candidate = 'the the the the the the the'.split()
references = [
    'the cat is on the mat'.split(),
    'there is a cat on the mat'.split()
]

In [56]:
def get_ngram(tokens, ngram):
    """
    Count every contiguous n-gram of a token sequence.
    :param tokens: sequence of tokens
    :param ngram: n-gram size
    :return counter: defaultdict mapping each n-gram (as a tuple) to its count
    """
    counter = collections.defaultdict(int)
    # slide a window of size `ngram` over the tokens
    windows = (tuple(tokens[start:start + ngram])
               for start in range(len(tokens) - ngram + 1))
    for window in windows:
        counter[window] += 1
    return counter

In [None]:
get_ngram(candidate, 1)

In [58]:
def get_ngram_clip(candidate, references, ngram):
    """
    Clip candidate n-gram counts by the counts found in the references.
    :param candidate: candidate tokens
    :param references: list of reference token lists
    :param ngram: n-gram size
    :return clip_counter: defaultdict mapping each candidate n-gram to
                          min(candidate count, largest count in any single reference)
    """
    # largest count of each n-gram within any single reference
    ref_max = collections.defaultdict(int)
    for reference in references:
        for gram, count in get_ngram(reference, ngram).items():
            if count > ref_max[gram]:
                ref_max[gram] = count

    # candidate counts, capped by the reference maximum (0 when never seen)
    clipped = collections.defaultdict(int)
    for gram, count in get_ngram(candidate, ngram).items():
        clipped[gram] = min(count, ref_max.get(gram, 0))
    return clipped

In [None]:
get_ngram_clip(candidate, references, 1)

In [60]:
def get_ngram_precision(candidate, references, ngram):
    """
    Modified n-gram precision of a candidate against the references.
    :param candidate: candidate tokens
    :param references: list of reference token lists
    :param ngram: n-gram size
    :return precision: clipped n-gram count divided by the candidate n-gram count
    """
    # clipped matches
    matched = sum(get_ngram_clip(candidate, references, ngram).values())
    # all n-grams of the candidate
    total = sum(get_ngram(candidate, ngram).values())
    # guard against a zero denominator
    denominator = total if total > 1 else 1
    return matched / denominator

In [None]:
get_ngram_precision(candidate, references, 1)

In [62]:
def get_brevity_penalty(candidate, references):
    """
    BLEU brevity penalty of a candidate against the references.
    The reference length is the one closest to the candidate length
    (ties go to the shorter reference), not simply the shortest one.
    :param candidate: candidate tokens
    :param references: list of reference token lists
    :return bp: 0 for an empty candidate, 1 when the candidate is longer than
                the chosen reference, exp(1 - ref_len / can_len) otherwise
    """
    can_len = len(candidate)
    if can_len == 0:  # empty string
        return 0

    # closest reference length; the tuple key breaks ties toward the shorter reference
    ref_len = min((len(reference) for reference in references),
                  key=lambda length: (abs(length - can_len), length))

    if can_len > ref_len:
        return 1
    return np.exp(1 - ref_len / can_len)

In [None]:
get_brevity_penalty(candidate, references)

In [64]:
def get_bleu_score(candidate, references, weights=[0.25, 0.25, 0.25, 0.25]):
    """
    BLEU score of a candidate against the references.
    :param candidate: candidate tokens
    :param references: list of reference token lists
    :param weights: one weight per n-gram order, starting at 1-gram
    :return score: brevity penalty times exp of the weighted sum of log precisions;
                   an order whose precision is 0 contributes 0 to that sum
    """
    brevity = get_brevity_penalty(candidate, references)  # BP
    log_terms = []
    for order, weight in enumerate(weights, start=1):
        precision = get_ngram_precision(candidate, references, order)
        # log(0) is undefined, so a zero precision adds nothing to the sum
        log_terms.append(weight * np.log(precision) if precision != 0 else 0)
    return brevity * np.exp(np.sum(log_terms))

In [None]:
get_bleu_score(candidate, references)

In [66]:
import nltk.translate.bleu_score as bleu
from nltk import ngrams

In [None]:
bleu.sentence_bleu(references, candidate)

In [68]:
candidate = 'It is a guide to action which ensures that the military always obeys the commands of the party'.split()
references = [
    'It is a guide to action that ensures that the military will forever heed Party commands'.split(),
    'It is the guiding principle which guarantees the military forces always being under the command of the Party'.split(),
    'It is the practical guide for the army always to heed the directions of the party'.split()
]

In [None]:
get_bleu_score(candidate, references)

In [None]:
bleu.sentence_bleu(references, candidate)

## 평가

In [71]:
model = build_model(args)
model.load_weights(os.path.join(chat_dir, "lstm.hdf5"))

In [72]:
model.compile(loss=lm_loss, optimizer="adam", metrics=[lm_acc])

In [73]:
# 100개만 확인
valid_enc_inputs = train_enc_inputs[:100]
valid_dec_inputs = train_dec_inputs[:100]
valid_dec_labels = train_dec_labels[:100]

In [None]:
# 평가
model.evaluate((valid_enc_inputs, valid_dec_inputs), valid_dec_labels, batch_size=128)

In [None]:
# build the reference piece sequences from the label ids
references = []
for row in valid_dec_labels:
    ids = []
    for i in row:
        # keep only the ids that come before the first EOS
        if i == vocab.eos_id():
            break
        ids.append(int(i))
    # ids are converted back to pieces with id_to_piece
    string = vocab.id_to_piece(ids)
    references.append(string)
references

In [None]:
# 예측
y_pred = model.predict((valid_enc_inputs, valid_dec_inputs))
y_pred

In [None]:
# greedy decoding
y_pred_class = tf.argmax(y_pred, axis=-1).numpy()
y_pred_class

In [None]:
# build the candidate piece sequences from the predicted class ids
candidates = []
for row in y_pred_class:
    ids = []
    for i in row:
        # keep only the ids that come before the first predicted EOS
        if i == vocab.eos_id():
            break
        ids.append(int(i))
    # ids are converted back to pieces with id_to_piece
    string = vocab.id_to_piece(ids)
    candidates.append(string)
candidates

In [None]:
bleu_scores = []
for reference, candidate in zip(references, candidates):
    bleu_score = bleu.sentence_bleu([reference], candidate)
    bleu_scores.append(bleu_score)
    print(bleu_score, ":", reference, "/", candidate)
np.mean(bleu_scores)

## 배포

In [80]:
model = build_model(args)
model.load_weights(os.path.join(chat_dir, "lstm.hdf5"))

In [81]:
def do_chat(vocab, model, n_dec_seq, string):
    """
    seq2seq chat: generate a reply with greedy decoding, one token per model call
    :param vocab: vocab
    :param model: model
    :param n_dec_seq: number of dec sequence; at most n_dec_seq - 1 tokens are generated
    :param string: input string
    :return predict_str: decoded reply
    """
    # question: pieces first, then their ids
    question_ids = [vocab.piece_to_id(piece) for piece in vocab.encode_as_pieces(string)]

    # answer so far, starting with BOS only
    answer_ids = [vocab.bos_id()]

    for step in range(n_dec_seq - 1):
        outputs = model.predict((np.array([question_ids]), np.array([answer_ids])))
        # most probable token at the position that was just fed
        next_id = int(np.argmax(outputs[0][step]))
        if next_id == vocab.eos_id():
            break
        answer_ids.append(next_id)

    # drop the leading BOS before decoding
    return vocab.decode_ids(answer_ids[1:])

In [None]:
string = '남에게 피해주지 않는 건 기본이죠.'
do_chat(vocab, model, 40, string)

In [None]:
# interactive chat: an empty line ends the session
while True:
    string = input('질문 > ').strip()
    if not string:
        break
    predict_str = do_chat(vocab, model, 40, string)
    print(f'답변 > {predict_str}')

# 실습
- 전체 데이터를 이용해 chatbot을 학습해보세요.

## Train 데이터 생성

## Modeling

## Train

## 평가

## 배포