# Env

In [None]:
# imports
import argparse
import os
import random
import shutil
import json
import zipfile
import math
import copy
import collections
import re

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import sentencepiece as spm
import tensorflow as tf
import tensorflow.keras.backend as K

from tqdm.notebook import tqdm, trange

In [None]:
# environment configuration
# seed: random seed value
args = argparse.Namespace(seed=1234)

print(args)

Namespace(seed=1234)


In [None]:
# set random seeds
def set_seed(seed):
    """
    Seed Python's random module, NumPy and TensorFlow with the same value.

    NOTE(review): no call to set_seed is visible in this notebook, so
    args.seed may never actually be applied -- confirm.
    :param seed: seed value passed unchanged to each library
    """
    for seeder in (random.seed, np.random.seed, tf.random.set_seed):
        seeder(seed)

In [None]:
# data dir (absolute path under /content/drive)
# NOTE(review): presumably requires Google Drive to be mounted first; no mount
# step is visible in this notebook -- confirm.
data_dir = '/content/drive/MyDrive/문서/강의계획서/삼성전기/삼성전기.20220228/data'
# list the directory contents (raises if the path does not exist)
os.listdir(data_dir)

# Vocabulary*

In [None]:
# vocab loading: SentencePiece model stored at <data_dir>/kowiki/kowiki_32000.model
vocab = spm.SentencePieceProcessor()
vocab.load(os.path.join(data_dir, 'kowiki', 'kowiki_32000.model'))

# Simple Project

## Data

In [None]:
# input sentences
sentences = [
    "나는 학생 입니다",
    "당신은 수학 선생님 입니다",
    "나는 선생님 입니다",
    "당신은 수학 학생 입니다",
]

# answer pieces (labeled 1 by make_data), listed with and without the "▁" prefix
targets = {
    "▁학생", "학생",
    "▁선생님", "선생님",
    "▁수학", "수학",
}

In [None]:
# label id -> label string ("기타" = other, "명사" = noun)
id_to_label = {0: "기타", 1: "명사"}

In [None]:
# the first two sentences form the training split
train_sentences = sentences[:2]

train_sentences

In [None]:
# the third sentence forms the validation split
valid_sentences = sentences[2:3]

valid_sentences

In [None]:
# everything from the fourth sentence on forms the test split
test_sentences = sentences[3:]

test_sentences

## Config

In [None]:
args.n_seq = 6  # input sentence length (tokens per sentence)
args.n_vocab = len(vocab)  # number of word vectors (vocabulary size)
args.d_model = 4  # word-vector dimension
args.n_out = 2  # number of outputs
args

## Train, Valid, Test 데이터 생성

In [None]:
def make_data(sentences, n_seq=6):
    """
    Build token-id inputs and per-token labels for a list of sentences.

    Each sentence is split into pieces with the module-level ``vocab``,
    truncated to ``n_seq`` pieces and right-padded with '[PAD]'. A piece is
    labeled 1 when it is a member of the module-level ``targets`` set and 0
    otherwise; padding positions are labeled by the same rule.
    :param sentences: iterable of sentence strings
    :param n_seq: number of tokens kept per sentence
    :return: (inputs, labels) numpy arrays with one row per sentence
    """
    inputs, labels = [], []

    for sentence in sentences:
        pieces = vocab.encode_as_pieces(sentence)[:n_seq]
        pieces += ['[PAD]'] * (n_seq - len(pieces))
        inputs.append(vocab.piece_to_id(pieces))

        # labels are derived from the already truncated/padded pieces, so they
        # always have the same length as the ids and need no second
        # truncate/pad pass
        labels.append([1 if piece in targets else 0 for piece in pieces])

    return np.array(inputs), np.array(labels)

In [None]:
# encode the training sentences into id / label arrays and display them
train_inputs, train_labels = make_data(train_sentences, n_seq=args.n_seq)
train_inputs, train_labels

In [None]:
# encode the validation sentences into id / label arrays and display them
valid_inputs, valid_labels = make_data(valid_sentences, n_seq=args.n_seq)
valid_inputs, valid_labels

In [None]:
# encode the test sentences into id / label arrays and display them
test_inputs, test_labels = make_data(test_sentences, n_seq=args.n_seq)
test_inputs, test_labels

## Modeling

In [None]:
# convert input tokens to vectors
embedding = tf.keras.layers.Embedding(args.n_vocab, args.d_model)
hidden = embedding(train_inputs)
hidden

In [None]:
# embedding weight (first entry of the layer's weight list)
weight = embedding.get_weights()[0]
weight

In [None]:
# look the rows up directly with numpy indexing (compare the two results)
weight[train_inputs], hidden

In [None]:
# RNN, CNN, Attention, Linear ....

In [None]:
# predict other(0) / noun(1) probabilities from each token vector
linear = tf.keras.layers.Dense(2, activation=tf.nn.softmax)
outputs = linear(hidden)
outputs

In [None]:
# dense layer weight and bias
# (this rebinds `weight`, which held the embedding matrix until now)
weight, bias = linear.get_weights()
weight, bias

In [None]:
# Wx + b computed with numpy
logits = np.matmul(hidden, weight) + bias
logits

In [None]:
# preparation for the softmax computation: exp(x') / sum(exp(x))
numerator = np.exp(logits)
# sum over axis 2, keeping the axis so the division broadcasts per position
denominator = np.sum(numerator, axis=2, keepdims=True)
numerator, denominator

In [None]:
# compare the two results (manual softmax vs. the Dense layer output)
probs = numerator / denominator
probs, outputs

In [None]:
def build_model(args):
    """
    Minimal model that merely runs: token ids -> embedding -> softmax dense.
    :param args: namespace providing n_seq, n_vocab, d_model and n_out
    :return: an uncompiled tf.keras.Model
    """
    token_ids = tf.keras.layers.Input((args.n_seq,))  # (bs, n_seq)
    # map every token id to its vector
    to_vector = tf.keras.layers.Embedding(args.n_vocab, args.d_model)
    vectors = to_vector(token_ids)  # (bs, n_seq, d_model)
    ########################################################
    # RNN, CNN, Attention, Dense ...
    ########################################################
    # predict the class probabilities from each token vector
    classifier = tf.keras.layers.Dense(args.n_out, activation=tf.nn.softmax)
    probabilities = classifier(vectors)  # (bs, n_seq, n_out)
    # declare the model to be trained
    return tf.keras.Model(inputs=token_ids, outputs=probabilities)

In [None]:
# create the model
model = build_model(args)
# render the model graph (with shapes) to model.png
tf.keras.utils.plot_model(model, 'model.png', show_shapes=True)

In [None]:
# run the freshly built (not yet trained) model on up to four training rows
model.predict(train_inputs[:4])

## Train

In [None]:
# create a fresh model for training
model = build_model(args)
# render the model graph (with shapes) to model.png
tf.keras.utils.plot_model(model, 'model.png', show_shapes=True)

In [None]:
# define the model loss, optimizer and metric
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# stop training once val_accuracy has not improved for 20 epochs
# (mode is spelled out so it matches the checkpoint callback below)
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy',
                                                  mode="max",
                                                  patience=20)
# keep only the weights of the epoch with the best val_accuracy
save_weights = tf.keras.callbacks.ModelCheckpoint("sequence_class.hdf5",
                                                  monitor='val_accuracy',
                                                  verbose=1,
                                                  save_best_only=True,
                                                  mode="max",
                                                  save_freq="epoch",
                                                  save_weights_only=True)
# write per-epoch metrics to a CSV file
csv_logger = tf.keras.callbacks.CSVLogger("sequence_class.csv")

In [None]:
# train the model
# (up to 100 epochs; the early_stopping callback may end training sooner)
history = model.fit(train_inputs, train_labels,
                    epochs=100,
                    batch_size=16,
                    validation_data=(valid_inputs, valid_labels),
                    callbacks=[early_stopping, save_weights, csv_logger])

In [None]:
# plot the training curves side by side: loss (left) and accuracy (right)
plt.figure(figsize=(12, 4))

# left panel: training vs. validation loss per epoch
plt.subplot(1, 2, 1)
plt.plot(history.history['loss'], 'b-', label='loss')
plt.plot(history.history['val_loss'], 'r--', label='val_loss')
plt.xlabel('Epoch')
plt.legend()

# right panel: training vs. validation accuracy per epoch
plt.subplot(1, 2, 2)
plt.plot(history.history['accuracy'], 'g-', label='acc')
plt.plot(history.history['val_accuracy'], 'k--', label='val_acc')
plt.xlabel('Epoch')
plt.legend()

plt.show()

## 평가

In [None]:
# create a fresh model for evaluation
model = build_model(args)
# load the model weights from the checkpoint file
model.load_weights("sequence_class.hdf5")

In [None]:
# define the model loss, optimizer and metric used by the evaluation below
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# evaluate the model on the test split
model.evaluate(test_inputs, test_labels)

## 배포

In [None]:
# create a fresh model for inference
model = build_model(args)
# load the model weights from the checkpoint file
model.load_weights("sequence_class.hdf5")

In [None]:
# input sentence to run inference on
string = '수학 당신은 선생님 입니다'

In [None]:
# convert the input to token ids, truncated to the model's sequence length
infer_input = vocab.encode_as_ids(string)[:args.n_seq]
# right-pad with the '[PAD]' id, mirroring make_data, so the length matches
# the (n_seq,) Input layer the model was built with
infer_input += [vocab.piece_to_id('[PAD]')] * (args.n_seq - len(infer_input))
infer_input

In [None]:
# convert to a numpy array; wrapping in a list adds the leading batch axis
infer_inputs = np.array([infer_input])
infer_inputs

In [None]:
# infer other(0) / noun(1) probabilities
y_preds = model.predict(infer_inputs)
y_preds

In [None]:
# take the index of the highest probability along axis 2 as the predicted class
y_pred_class = np.argmax(y_preds, axis=2)
y_pred_class

In [None]:
# print the label string for every predicted class id
for row in y_pred_class:
    for val in row:
        print(val, ':', id_to_label[val])

# 실습
- 아래 입력 문장과 출력 정답을 이용해 간단한 프로젝트를 구성해 보세요.

## Data

In [None]:
# input sentences
sentences = [
    "이것은 책상 입니다",
    "저것은 책상 의자 입니다",
]

# answer pieces, listed with and without the "▁" prefix
targets = {
    "▁책상", "책상",
    "▁의자", "의자",
}

## Config

## Vocabulary

## Train, Valid, Test 데이터 생성

## Modeling

## Train

## 평가

## 배포