# Env*

In [2]:
# imports
import argparse
import os
import random
import shutil
import json
import zipfile
import math
import copy
import collections
import re

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import sentencepiece as spm
import tensorflow as tf
import tensorflow.keras.backend as K

from tqdm.notebook import tqdm, trange

In [None]:
# Runtime configuration
# All settings live on a single Namespace object so they can be read as
# attributes (args.seed, ...).
args = argparse.Namespace(
    seed=1234,  # random seed value
)

print(args)

In [4]:
# Random seed setup
def set_seed(seed):
    """
    Seed the Python, NumPy and TensorFlow random number generators.
    :param seed: seed value passed unchanged to each generator
    """
    # NOTE(review): no call to set_seed is visible in this file - confirm it
    # is invoked (e.g. set_seed(args.seed)) before relying on reproducibility.
    for seeder in (random.seed, np.random.seed, tf.random.set_seed):
        seeder(seed)

In [None]:
# Check GPU usage (IPython shell escape)
!nvidia-smi

In [None]:
# Root data directory (presumably a Colab-mounted Google Drive path - adjust
# to your environment); list its contents to verify it is reachable
data_dir = '/content/drive/MyDrive/문서/강의계획서/삼성전기/삼성전기.20220228/data'
os.listdir(data_dir)

In [None]:
# Working directory for the kowiki data; create it when it does not exist.
# exist_ok=True avoids the check-then-create race of a separate isdir() test.
kowiki_dir = os.path.join(data_dir, "kowiki")
os.makedirs(kowiki_dir, exist_ok=True)
os.listdir(kowiki_dir)

# Vocabulary*

In [8]:
# Load the SentencePiece model that serves as the vocabulary
vocab = spm.SentencePieceProcessor()
vocab.load(os.path.join(data_dir, 'kowiki', 'kowiki_32000.model'))

True

# Tutorial

In [9]:
# Input sentences
sentences = [
    '나는 오늘 기분이 좋아 나는 오늘 우울해',
    '나는 오늘 행복해 나는 오늘 즐거워'
]

In [None]:
# Build the training data: encode every sentence into token ids
train_ids = [vocab.encode_as_ids(sentence) for sentence in sentences]

train_ids

In [None]:
# Split the ids into model inputs and labels:
#   input = [BOS] + ids, label = ids + [EOS]
# so the label at each position is the token that follows the input token.
train_inputs = [[vocab.bos_id()] + train_id for train_id in train_ids]
train_labels = [train_id + [vocab.eos_id()] for train_id in train_ids]

train_inputs, train_labels

In [None]:
# Make every sequence the same length (max length 10) by right-padding
# inputs and labels with 0, in place
for rows in (train_inputs, train_labels):
    for row in rows:
        row.extend([0] * (10 - len(row)))

train_inputs, train_labels

In [None]:
# Convert the train inputs and labels to numpy arrays
train_inputs = np.array(train_inputs)
train_labels = np.array(train_labels)

train_inputs, train_labels

In [None]:
# Convert token ids to vectors (embedding size 4)
embedding = tf.keras.layers.Embedding(len(vocab), 4)
hidden = embedding(train_inputs)  # (bs, n_seq, 4)
hidden

In [None]:
# Run the LSTM; return_sequences=True keeps one output per time step
lstm = tf.keras.layers.LSTM(units=3, return_sequences=True)
hidden = lstm(hidden)
hidden

In [None]:
# Predict the next token: softmax over the whole vocabulary at every position
linear = tf.keras.layers.Dense(len(vocab), activation=tf.nn.softmax)
y_pred = linear(hidden)
y_pred

In [None]:
# Cross-entropy loss between the integer labels and the predicted distribution
tf.keras.losses.SparseCategoricalCrossentropy()(train_labels, y_pred)

In [None]:
# Model hyper-parameters: vocabulary size and hidden size
args.n_vocab = len(vocab)
args.d_model = 32
args

In [19]:
def build_model(args):
    """
    Build a next-token language model: Embedding -> LSTM -> softmax Dense.
    :param args: namespace providing n_vocab and d_model
    :return model: tf.keras.Model mapping token ids to next-token probabilities
    """
    token_ids = tf.keras.layers.Input((None,))  # (bs, n_seq)
    # token ids -> vectors
    embedded = tf.keras.layers.Embedding(args.n_vocab, args.d_model)(token_ids)  # (bs, n_seq, d_model)
    # unidirectional LSTM emitting one vector per time step
    encoded = tf.keras.layers.LSTM(units=args.d_model, return_sequences=True)(embedded)  # (bs, n_seq, d_model)
    # next-token probability distribution
    probs = tf.keras.layers.Dense(args.n_vocab, activation=tf.nn.softmax)(encoded)  # (bs, n_seq, n_vocab)
    # model to train
    return tf.keras.Model(inputs=token_ids, outputs=probs)

In [None]:
# Build the model and render its architecture (with shapes) to model.png
model = build_model(args)
tf.keras.utils.plot_model(model, 'model.png', show_shapes=True)

In [None]:
# Sanity check: run the freshly built (untrained) model on the inputs
model.predict(train_inputs)

# 실습
- 아래 데이터를 이용해서 언어모델 프로젝트를 구성해 보세요.

In [22]:
# Input sentences
sentences = [
    '영화 재미있어 영화 너무 신났어',
    '영화 너무 재미있어 영화 지루하고 너무 재미없어',
]

# Data*

In [23]:
# Path of the zipped kowiki corpus
kowiki_file = os.path.join(kowiki_dir, "kowiki.txt.zip")

In [None]:
# Peek at the wiki corpus: print its first 101 lines (indices 0 through 100)
with zipfile.ZipFile(kowiki_file) as z, z.open("kowiki.txt") as f:
    for i, line in enumerate(f):
        if i > 100:
            break
        # the zip member yields raw bytes
        line = line.decode('utf-8').strip()
        print(line)

# Loss & Acc*

## Tutorial

In [None]:
# Assume labels and predictions for a task that predicts one of 4 tokens
# Labels: random ids in 1..3, positions 8 onward set to 0 (padding)
y_true = np.random.randint(1, 4, (1, 20)).astype(np.float32)
y_true[:, 8:] = 0
# Predictions
y_pred = np.random.random((1, 20, 4)).astype(np.float32)
y_pred = tf.nn.softmax(y_pred, axis=-1).numpy()  # convert to probabilities

y_true, y_pred

In [None]:
# Default loss (reduced over every position, padding included)
loss = sparse_entropy = tf.keras.losses.SparseCategoricalCrossentropy()(y_true, y_pred)
loss

In [None]:
# Compute the loss per position (no reduction)
loss = tf.keras.losses.SparseCategoricalCrossentropy(reduction=tf.keras.losses.Reduction.NONE)(y_true, y_pred)
loss

In [None]:
# Mask: 0 where the label is 0 (padding), 1 otherwise
mask = tf.cast(tf.not_equal(y_true, 0), tf.float32)
mask

In [None]:
# Zero out the loss at the masked positions
loss *= mask
loss

In [None]:
# Mean over the non-masked positions; maximum(1, .) avoids division by zero
loss = tf.reduce_sum(loss) / tf.maximum(1., tf.reduce_sum(mask))
loss

## 실습
- 아래 함수를 완료하세요.

In [31]:
def lm_loss(y_true, y_pred):
    """
    Compute the cross-entropy loss while excluding pad positions.
    :param y_true: ground-truth token ids (0 marks padding)
    :param y_pred: predicted probability distribution over the vocabulary
    :return loss: mean loss over the non-pad positions
    """
    # per-position loss (no reduction)
    loss = tf.keras.losses.SparseCategoricalCrossentropy(
        reduction=tf.keras.losses.Reduction.NONE)(y_true, y_pred)
    # 0 where the label is pad (id 0), 1 otherwise
    mask = tf.cast(tf.not_equal(y_true, 0), tf.float32)
    # pad positions contribute nothing to the sum
    loss *= mask
    # average over real tokens only; maximum(1, .) avoids division by zero
    loss = tf.reduce_sum(loss) / tf.maximum(1., tf.reduce_sum(mask))
    return loss

In [None]:
# Check the lm_loss function
loss = lm_loss(y_true, y_pred)
loss

## Tutorial

In [None]:
# Assume labels and predictions for a task that predicts one of 4 tokens
# Labels: random ids in 1..3, positions 8 onward set to 0 (padding)
y_true = np.random.randint(1, 4, (1, 20)).astype(np.float32)
y_true[:, 8:] = 0
# Predictions
y_pred = np.random.random((1, 20, 4)).astype(np.float32)
y_pred = tf.nn.softmax(y_pred, axis=-1).numpy()  # convert to probabilities

y_true, y_pred

In [None]:
# Cast the labels to float32 so they can be compared with the predicted class
y_true = tf.cast(y_true, tf.float32)
y_true

In [None]:
# Predicted class: index of the highest probability at each position
y_class = tf.cast(tf.argmax(y_pred, axis=-1), tf.float32)
y_class

In [None]:
# Compare predictions with the labels (1 where they match, 0 otherwise)
matches = tf.cast(tf.equal(y_true, y_class), tf.float32)
matches

In [None]:
# Mask: 0 where the label is 0 (padding), 1 otherwise
mask = tf.cast(tf.not_equal(y_true, 0), tf.float32)
mask

In [None]:
# Zero out the matches at the masked positions
matches *= mask
matches

In [None]:
# Accuracy over the non-masked positions; maximum(1, .) avoids division by zero
accuracy = tf.reduce_sum(matches) / tf.maximum(1., tf.reduce_sum(mask))
accuracy

## 실습
- 아래 함수를 완료하세요.

In [40]:
def lm_acc(y_true, y_pred):
    """
    Compute the accuracy while excluding pad positions.
    :param y_true: ground-truth token ids (0 marks padding)
    :param y_pred: predicted probability distribution over the vocabulary
    :return accuracy: fraction of non-pad positions predicted correctly
    """
    y_true = tf.cast(y_true, tf.float32)
    # predicted class = index of the highest probability
    y_class = tf.cast(tf.argmax(y_pred, axis=-1), tf.float32)
    # 1 where the prediction equals the label, 0 otherwise
    matches = tf.cast(tf.equal(y_true, y_class), tf.float32)
    # 0 where the label is pad (id 0), 1 otherwise
    mask = tf.cast(tf.not_equal(y_true, 0), tf.float32)
    # pad positions never count as correct
    matches *= mask
    # average over real tokens only; maximum(1, .) avoids division by zero
    accuracy = tf.reduce_sum(matches) / tf.maximum(1., tf.reduce_sum(mask))
    return accuracy

In [None]:
# Check the lm_acc function
accuracy = lm_acc(y_true, y_pred)
print(accuracy)

# Small Data Project

## Train 데이터 생성

In [42]:
# Read the wiki corpus into documents: consecutive non-empty lines form one
# document and blank lines separate documents.
docs, doc = [], []
with zipfile.ZipFile(kowiki_file) as z:
    with z.open("kowiki.txt") as f:
        for line in f:
            line = line.decode('utf-8').strip()
            if line:
                doc.append(line)
            elif doc:
                docs.append(doc)
                doc = []
# Flush the last document: without this it is silently dropped whenever the
# file does not end with a blank line.
if doc:
    docs.append(doc)
    doc = []
len(docs)

1274146

In [None]:
# Inspect one document (a list of lines)
docs[3]

In [None]:
# Small-data project: train on the first document only
docs_train = docs[:1]
docs_train

### Tutorial

In [None]:
# The single training document used in the walkthrough below
doc = docs_train[0]
doc

In [None]:
# Walkthrough: turn one document into fixed-length (input, label) sequences.
# Lines are accumulated into a chunk until it holds at least n_max tokens or
# the document ends; each chunk becomes one training sample.
inputs, labels = [], []
n_seq = 128
n_max = n_seq - 1  # one position is reserved for [BOS] / [EOS]

chunk, chunk_len = [], 0
for i, line in enumerate(doc):
    tokens = vocab.encode_as_ids(line)
    # store the line's tokens in the chunk
    chunk.append(tokens)
    chunk_len += len(tokens)
    # when the chunk reached the max length or this is the last line
    if n_max <= chunk_len or i >= len(doc) - 1:
        # flatten the chunk into token_ids
        token_ids = []
        for tokens in chunk:
            token_ids.extend(tokens)
        # tokens beyond n_max are discarded
        token_ids = token_ids[:n_max]
        print(len(token_ids), token_ids)
        # input: [BOS] + token_ids, right-padded with 0 to n_seq
        input_id = [vocab.bos_id()] + token_ids
        input_id += [0] * (n_seq - len(input_id))
        inputs.append(input_id)
        # label: token_ids + [EOS], right-padded with 0 to n_seq
        label_id = token_ids + [vocab.eos_id()]
        label_id += [0] * (n_seq - len(label_id))
        labels.append(label_id)
        # clear the chunk
        chunk, chunk_len = [], 0

### 실습
- 아래 함수를 완성하세요.

In [53]:
def make_data(docs, vocab, n_seq):
    """
    Convert documents into fixed-length language-model training sequences.

    Within each document, lines are accumulated into a chunk until the chunk
    holds at least n_seq - 1 tokens or the document ends. Each chunk yields
    one sample; tokens beyond n_seq - 1 are discarded.
    :param docs: list of documents, each a list of text lines
    :param vocab: tokenizer providing encode_as_ids(), bos_id() and eos_id()
    :param n_seq: length of every produced sequence (padding id is 0)
    :return inputs: array of [BOS] + token_ids, right-padded with 0
    :return labels: array of token_ids + [EOS], right-padded with 0
    """
    inputs, labels = [], []
    n_max = n_seq - 1  # one position is reserved for [BOS] / [EOS]
    bos_id, eos_id = vocab.bos_id(), vocab.eos_id()

    for doc in docs:
        chunk, chunk_len = [], 0
        last = len(doc) - 1
        for i, line in enumerate(doc):
            line_tokens = vocab.encode_as_ids(line)
            chunk.append(line_tokens)
            chunk_len += len(line_tokens)
            # keep accumulating until the chunk is full or the document ends
            if chunk_len < n_max and i < last:
                continue
            # flatten the chunk and drop whatever exceeds n_max tokens
            token_ids = [t for tokens in chunk for t in tokens][:n_max]
            chunk, chunk_len = [], 0
            if not token_ids:
                # nothing to learn from an empty chunk
                continue
            pad = [0] * (n_max - len(token_ids))
            inputs.append([bos_id] + token_ids + pad)
            labels.append(token_ids + [eos_id] + pad)

    inputs = np.array(inputs)
    labels = np.array(labels)
    return inputs, labels

In [None]:
# Build the training arrays from the training documents (sequence length 128)
train_inputs, train_labels = make_data(docs_train, vocab, 128)
train_inputs, train_labels

## Modeling

In [55]:
# Model hyper-parameters for the project: vocabulary size and hidden size
args.n_vocab = len(vocab)
args.d_model = 256
args

Namespace(d_model=256, n_vocab=32007, seed=1234)

In [56]:
def build_model(args):
    """
    Build a next-token language model: Embedding -> LSTM -> softmax Dense.
    :param args: namespace providing n_vocab and d_model
    :return model: tf.keras.Model mapping token ids to next-token probabilities
    """
    inputs = tf.keras.layers.Input((None,))  # (bs, n_seq)
    # convert token ids to vectors
    embedding = tf.keras.layers.Embedding(args.n_vocab, args.d_model)
    hidden = embedding(inputs)  # (bs, n_seq, d_model)
    # LSTM emitting one vector per time step
    lstm = tf.keras.layers.LSTM(units=args.d_model, return_sequences=True)
    hidden = lstm(hidden)  # (bs, n_seq, d_model)
    # next-token probability distribution
    linear = tf.keras.layers.Dense(args.n_vocab, activation=tf.nn.softmax)
    y_pred = linear(hidden)  # (bs, n_seq, n_vocab)

    # model to train
    model = tf.keras.Model(inputs=inputs, outputs=y_pred)
    return model

In [None]:
# Build the model and render its architecture (with shapes) to model.png
model = build_model(args)
tf.keras.utils.plot_model(model, 'model.png', show_shapes=True)

In [None]:
# Sanity check: run the untrained model on the first 4 training samples
model.predict(train_inputs[:4])

## Train

In [None]:
# Create a fresh model instance for training
model = build_model(args)
tf.keras.utils.plot_model(model, 'model.png', show_shapes=True)

In [60]:
# Train with the pad-aware loss and accuracy defined in this notebook
model.compile(loss=lm_loss, optimizer="adam", metrics=[lm_acc])

In [61]:
# Stop when the training accuracy has not improved for 5 epochs. mode="max"
# is spelled out so both callbacks treat lm_acc the same way instead of
# relying on name-based 'auto' inference.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='lm_acc', mode="max", patience=5)
# Save the weights at the end of an epoch only when lm_acc reaches a new best
save_weights = tf.keras.callbacks.ModelCheckpoint(os.path.join(kowiki_dir, "lm.hdf5"),
                                                  monitor='lm_acc',
                                                  verbose=1,
                                                  save_best_only=True,
                                                  mode="max",
                                                  save_freq="epoch",
                                                  save_weights_only=True)
# Write the per-epoch metrics to a CSV file
csv_logger = tf.keras.callbacks.CSVLogger(os.path.join(kowiki_dir, "lm.csv"))

In [None]:
# Train for at most 100 epochs; early stopping may end training sooner
history = model.fit(train_inputs, train_labels,
                    epochs=100,
                    batch_size=64,
                    callbacks=[early_stopping, save_weights, csv_logger])

In [None]:
# Plot the training curves: loss on the left, accuracy on the right
plt.figure(figsize=(12, 4))

plt.subplot(1, 2, 1)
plt.plot(history.history['loss'], 'b-', label='loss')
plt.xlabel('epoch')
plt.legend()  # without a legend the label= argument is never displayed

plt.subplot(1, 2, 2)
plt.plot(history.history['lm_acc'], 'g-', label='accuracy')
plt.xlabel('epoch')
plt.legend()

plt.show()

## 평가

In [64]:
# Rebuild the model and restore the weights saved by the checkpoint callback
model = build_model(args)
model.load_weights(os.path.join(kowiki_dir, "lm.hdf5"))

In [65]:
# Compile so evaluate() reports the pad-aware loss and accuracy
model.compile(loss=lm_loss, optimizer="adam", metrics=[lm_acc])

In [66]:
# NOTE(review): the "validation" set is a slice of the training data, so the
# scores below measure fit to the training set, not generalization.
valid_inputs = train_inputs[:100]
valid_labels = train_labels[:100]

In [None]:
# Loss and accuracy on the validation slice
model.evaluate(valid_inputs, valid_labels, batch_size=128)

In [None]:
# Predicted next-token distributions, used below to compute the perplexity
y_pred = model.predict(valid_inputs)
y_pred

In [None]:
# One-hot encode the labels, then clear the column of id 0 so padding
# positions become all-zero rows
y_true = tf.one_hot(valid_labels, args.n_vocab).numpy()
y_true[:, :, 0] = 0
y_true.shape, y_true

In [None]:
# Keep only the probability predicted for the true token; all else becomes 0
y_prob1 = y_true * y_pred
y_prob1.shape, y_prob1

In [None]:
# Sum over the vocabulary axis: probability of the true token per position
# (0 at padding positions)
y_prob2 = np.sum(y_prob1, axis=-1)
y_prob2.shape, y_prob2

In [None]:
# Replace 0 with 1 so that log() gives 0 (instead of -inf) at those positions
y_prob2[y_prob2 == 0] = 1
y_prob2.shape, y_prob2

In [None]:
# Negative log-likelihood per position
logppl1 = - np.log(y_prob2)
logppl1.shape, logppl1

In [None]:
logppl = np.mean(logppl1)
logppl

In [None]:
# Perplexity = exp(mean negative log-likelihood)
ppl = np.exp(logppl)
ppl

## 배포

In [76]:
# Rebuild the model and restore the trained weights for inference
model = build_model(args)
model.load_weights(os.path.join(kowiki_dir, "lm.hdf5"))

In [77]:
def do_next(vocab, model, string, top_k=10):
    """
    Predict the most probable next tokens for a prompt.
    :param vocab: tokenizer providing encode_as_ids(), bos_id() and id_to_piece()
    :param model: model whose predict() returns per-position token probabilities
    :param string: prompt text
    :param top_k: number of candidates to return (default 10)
    :return next_prob: list of (piece, probability), most probable first
    """
    if top_k <= 0:
        # a slice of [-0:] would return every token instead of none
        return []

    token_id = [vocab.bos_id()] + vocab.encode_as_ids(string)
    # the output at the last input position is the next-token distribution
    last_idx = len(token_id) - 1

    # verbose=0: no progress bar for a single-sample prediction
    result = model.predict(np.array([token_id]), verbose=0)
    prob = result[0][last_idx]
    # argsort is ascending: take the last top_k ids and reverse them
    best_ids = np.argsort(prob)[-top_k:][::-1]

    return [(vocab.id_to_piece(int(i)), prob[i]) for i in best_ids]

In [None]:
# Example: next-token candidates for a fixed prompt
string = '대한민국'
do_next(vocab, model, string)

In [None]:
# Interactive loop: print next-token candidates for each prompt;
# an empty prompt ends the loop
while True:
    string = input('시작 문장 > ').strip()
    if not string:
        break
    next_prob = do_next(vocab, model, string)
    for w, p in next_prob:
        print(f'{w}: {p}')
    print()

In [80]:
def do_generate(vocab, model, n_seq, string):
    """
    Generate text by repeatedly sampling the next token from the model.
    :param vocab: tokenizer providing encode_as_ids(), decode_ids(), bos_id(),
                  eos_id() and len()
    :param model: model whose predict() returns per-position token probabilities
    :param n_seq: sequence length limit; at most n_seq - 1 tokens are produced
                  (prompt included)
    :param string: prompt text
    :return predict_str: decoded prompt followed by the generated tokens
    """
    n_max = n_seq - 1  # maximum number of tokens, [BOS] excluded
    n_vocab = len(vocab)
    eos_id = vocab.eos_id()
    token_id = [vocab.bos_id()] + vocab.encode_as_ids(string)

    # token_id is [BOS] + tokens, so it holds len(token_id) - 1 tokens
    while len(token_id) - 1 < n_max:
        # verbose=0: avoid one progress bar per generated token
        outputs = model.predict(np.array([token_id]), verbose=0)
        # distribution at the last input position = next-token distribution
        prob = outputs[0][len(token_id) - 1]
        # sample from the distribution (np.argmax(prob) would decode greedily)
        word_id = int(np.random.choice(n_vocab, 1, p=prob)[0])
        if word_id == eos_id:
            break
        token_id.append(word_id)

    # drop [BOS] before decoding
    predict_str = vocab.decode_ids(token_id[1:])
    return predict_str

In [None]:
# Example: generate text from a fixed prompt (sequence length limit 64)
string = '대한민국'
do_generate(vocab, model, 64, string)

In [None]:
# Interactive loop: generate text for each prompt;
# an empty prompt ends the loop
while True:
    string = input('시작 문장 > ').strip()
    if not string:
        break
    predict_str = do_generate(vocab, model, 64, string)
    print(predict_str)

# 실습
- 100,000 docs 데이터를 이용해 Language Model을 학습해보세요.

## Train 데이터 생성

## Modeling

## Train

## 평가

## 배포