# Env*

In [None]:
# imports
import argparse
import os
import random
import shutil
import json
import zipfile
import math
import copy
import collections
import re

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import sentencepiece as spm
import tensorflow as tf
import tensorflow.keras.backend as K

from tqdm.notebook import tqdm, trange

In [None]:
# Environment configuration: one namespace that collects the notebook's
# settings (further fields are attached to it in later cells).
args = argparse.Namespace(
    seed=1234,  # random seed value
)

print(args)

In [None]:
# Random seed setup
def set_seed(seed):
    """Seed Python's `random`, NumPy and TensorFlow with the same value.

    Args:
        seed: Seed value handed to all three random number generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)


# Apply the configured seed; defining the helper alone leaves every RNG unseeded.
set_seed(args.seed)

In [None]:
# gpu 사용량 확인
!nvidia-smi

In [None]:
# data dir
# NOTE(review): absolute Google Drive path from a Colab session — presumably
# Drive has to be mounted first; adjust the path when running elsewhere.
data_dir = '/content/drive/MyDrive/문서/강의계획서/삼성전기/삼성전기.20220228/data'
os.listdir(data_dir)

In [None]:
# Working directory for the NSMC artifacts (checkpoints, training log).
# exist_ok=True makes the creation idempotent and removes the
# check-then-create race of a separate isdir() test.
nsmc_dir = os.path.join(data_dir, "nsmc")
os.makedirs(nsmc_dir, exist_ok=True)
os.listdir(nsmc_dir)

# Vocabulary*

In [None]:
# vocab loading: read the SentencePiece model file kowiki/kowiki_32000.model
# located under data_dir
vocab = spm.SentencePieceProcessor()
vocab.load(os.path.join(data_dir, 'kowiki', 'kowiki_32000.model'))

# Tutorial

In [None]:
# Input sentences
sentences = [
    '나는 오늘 기분이 좋아',
    '나는 오늘 우울해'
]

# Target labels, one per sentence
labels = [1, 0]  # positive (1), negative (0)

In [None]:
# Build the training inputs: token ids of every sentence, truncated and
# zero-padded to one fixed length so the rows stack into a rectangular array.
n_seq = 5  # fixed sequence length (maximum length 5)
train_inputs = []
for sentence in sentences:
    # Truncate first: a row longer than n_seq would make the array ragged.
    token = vocab.encode_as_ids(sentence)[:n_seq]
    # Right-pad with id 0 up to n_seq.
    token = token + [0] * (n_seq - len(token))
    train_inputs.append(token)

# train label
train_labels = labels

# Convert the training inputs to a numpy array
train_inputs = np.array(train_inputs)

# Convert the training labels to a numpy array
train_labels = np.array(train_labels)

train_inputs, train_labels

In [None]:
# Convert the input token ids to vectors: the embedding table has one
# 4-dimensional row per vocabulary entry
embedding = tf.keras.layers.Embedding(len(vocab), 4)
hidden = embedding(train_inputs)
hidden

In [None]:
weight, = embedding.get_weights()
weight.shape

In [None]:
# RNN, CNN

In [None]:
# Collapse the sequence axis by keeping, for every embedding dimension, the
# maximum value over all word vectors (this yields one sentence vector)
pool = tf.keras.layers.GlobalMaxPool1D()
hidden_pool = pool(hidden)
hidden_pool

In [None]:
# Predict the positive (1) / negative (0) probabilities from the sentence
# vector: a dense layer with 2 outputs followed by softmax
linear = tf.keras.layers.Dense(2, activation=tf.nn.softmax)
y_pred = linear(hidden_pool)
y_pred

In [None]:
weight, bias = linear.get_weights()
weight.shape, bias.shape

In [None]:
# Cross-entropy (CE) loss computed by Keras from the integer labels and the
# predicted probabilities
tf.keras.losses.SparseCategoricalCrossentropy()(train_labels, y_pred)

In [None]:
# Manual CE loss, step 1: one-hot encode the integer labels over 2 classes
y_true = tf.one_hot(train_labels, 2)
y_true

In [None]:
# Manual CE loss, step 2: element-wise -y_true * log(y_pred); since y_true is
# one-hot, only the true-class entry of each row can be non-zero
loss1 = - y_true * tf.math.log(y_pred)
loss1

In [None]:
# Manual CE loss, step 3: sum over the last (class) axis -> one loss per sentence
loss2 = tf.reduce_sum(loss1, axis=-1)
loss2

In [None]:
# Manual CE loss, step 4: average the per-sentence losses into one scalar
loss = tf.reduce_mean(loss2)
loss

In [None]:
# Hyper-parameters read by build_model()
args.n_vocab = len(vocab)  # number of embedding rows (vocabulary size)
args.d_model = 32  # embedding width
args.n_out = 2  # number of output classes
args

In [None]:
def build_model(args):
    """Build the sentence classifier: embedding -> global max pool -> softmax.

    Args:
        args: Namespace providing `n_vocab` (embedding rows), `d_model`
            (embedding width) and `n_out` (number of output classes).

    Returns:
        A `tf.keras.Model` that maps a batch of token-id sequences to
        `n_out` softmax outputs per sequence.
    """
    token_ids = tf.keras.layers.Input((None,))

    embedded = tf.keras.layers.Embedding(args.n_vocab, args.d_model)(token_ids)
    # Placeholder: an RNN / CNN / attention / linear encoder would go here.
    pooled = tf.keras.layers.GlobalMaxPooling1D()(embedded)
    probabilities = tf.keras.layers.Dense(args.n_out, activation=tf.nn.softmax)(pooled)

    return tf.keras.Model(inputs=token_ids, outputs=probabilities)

In [None]:
model = build_model(args)
tf.keras.utils.plot_model(model, 'model.png', show_shapes=True)

In [None]:
model.predict(train_inputs)

# 실습
- 아래 데이터를 이용해서 문장을 긍정/부정으로 분류하는 프로젝트를 구성해보세요.

In [None]:
# Input sentences
sentences = [
    '영화 너무 재미있어',
    '영화 지루해서 너무 재미없어'
]

# Target labels, one per sentence
labels = [1, 0]  # positive (1), negative (0)

# Data*

In [None]:
!wget https://github.com/e9t/nsmc/raw/master/ratings_train.txt
!wget https://github.com/e9t/nsmc/raw/master/ratings_test.txt

In [None]:
id_to_label = {0: "부정", 1: "긍정"}

# EDA

In [None]:
df_train = pd.read_csv('ratings_train.txt', delimiter='\t')
df_train

## char length

In [None]:
# Length of every document in characters (values are cast to str first)
train_length = df_train["document"].astype("str").apply(len)
train_length.head(10)

In [None]:
# Histogram of the character lengths over the range 0-200 with 200 bins
plt.figure(figsize=(8, 4))
plt.hist(train_length, bins=200, range=[0, 200], facecolor='r', label='char')
plt.title('Length of char')  # title typo ("Lengh") fixed
plt.xlabel('Number of char')
plt.ylabel('Count of review')
plt.show()

In [None]:
# Summary statistics of the character lengths: max, min, mean, standard deviation
print(f"char 길이 최대:    {np.max(train_length):4d}")
print(f"char 길이 최소:    {np.min(train_length):4d}")
print(f"char 길이 평균:    {np.mean(train_length):7.2f}")
print(f"char 길이 표준편차: {np.std(train_length):7.2f}")

In [None]:
# Quartiles of the character lengths, the inter-quartile range, and the
# value Q3 + 1.5 * IQR
percentile25, percentile50, percentile75 = np.percentile(train_length, [25, 50, 75])
percentileIQR = percentile75 - percentile25
percentileMAX = percentile75 + percentileIQR * 1.5
print(f"char 25/100분위:  {percentile25:7.2f}")
print(f"char 50/100분위:  {percentile50:7.2f}")
print(f"char 75/100분위:  {percentile75:7.2f}")
print(f"char IQR:        {percentileIQR:7.2f}")
print(f"char MAX/100분위: {percentileMAX:7.2f}")

In [None]:
plt.figure(figsize=(4, 6))
plt.boxplot(train_length, labels=['char counts'], showmeans=True)
plt.show()

## word length

In [None]:
# Length of every document in whitespace-separated words
train_length = df_train["document"].astype("str").apply(lambda x:len(x.split()))
train_length.head(10)

In [None]:
# Histogram of the word counts over the range 0-50 with 50 bins
plt.figure(figsize=(8, 4))
plt.hist(train_length, bins=50, range=[0, 50], facecolor='r', label='word')
plt.title('Length of word')  # title typo ("Lengh") fixed
plt.xlabel('Number of word')
plt.ylabel('Count of review')
plt.show()

In [None]:
# Summary statistics of the word counts: max, min, mean, standard deviation
print(f"word 길이 최대:    {np.max(train_length):4d}")
print(f"word 길이 최소:    {np.min(train_length):4d}")
print(f"word 길이 평균:    {np.mean(train_length):7.2f}")
print(f"word 길이 표준편차: {np.std(train_length):7.2f}")

In [None]:
# Quartiles of the word counts, the inter-quartile range, and the
# value Q3 + 1.5 * IQR
percentile25, percentile50, percentile75 = np.percentile(train_length, [25, 50, 75])
percentileIQR = percentile75 - percentile25
percentileMAX = percentile75 + percentileIQR * 1.5
print(f"word 25/100분위:  {percentile25:7.2f}")
print(f"word 50/100분위:  {percentile50:7.2f}")
print(f"word 75/100분위:  {percentile75:7.2f}")
print(f"word IQR:        {percentileIQR:7.2f}")
print(f"word MAX/100분위: {percentileMAX:7.2f}")

In [None]:
plt.figure(figsize=(4, 6))
plt.boxplot(train_length, labels=['word counts'], showmeans=True)
plt.show()

## token length

In [None]:
# Length of every document in SentencePiece pieces (tokens)
train_length = df_train["document"].astype("str").apply(lambda x:len(vocab.encode_as_pieces(x)))
train_length.head(10)

In [None]:
# Histogram of the token counts over the range 0-150 with 150 bins.
# The texts previously said "word" (copied from the word-length cell) although
# train_length holds token counts here.
plt.figure(figsize=(8, 4))
plt.hist(train_length, bins=150, range=[0, 150], facecolor='r', label='token')
plt.title('Length of token')
plt.xlabel('Number of token')
plt.ylabel('Count of review')
plt.show()

In [None]:
# Summary statistics of the token counts: max, min, mean, standard deviation
print(f"token 길이 최대:    {np.max(train_length):4d}")
print(f"token 길이 최소:    {np.min(train_length):4d}")
print(f"token 길이 평균:    {np.mean(train_length):7.2f}")
print(f"token 길이 표준편차: {np.std(train_length):7.2f}")

In [None]:
# Quartiles of the token counts, the inter-quartile range, and the
# value Q3 + 1.5 * IQR
percentile25, percentile50, percentile75 = np.percentile(train_length, [25, 50, 75])
percentileIQR = percentile75 - percentile25
percentileMAX = percentile75 + percentileIQR * 1.5
print(f"token 25/100분위:  {percentile25:7.2f}")
print(f"token 50/100분위:  {percentile50:7.2f}")
print(f"token 75/100분위:  {percentile75:7.2f}")
print(f"token IQR:        {percentileIQR:7.2f}")
print(f"token MAX/100분위: {percentileMAX:7.2f}")

In [None]:
plt.figure(figsize=(4, 6))
plt.boxplot(train_length, labels=['token counts'], showmeans=True)
plt.show()

## Label Count

In [None]:
label_count = df_train['label'].value_counts()
label_count

In [None]:
# label count
print(f"부정 리뷰 개수: {label_count[0]}")
print(f"긍정 리뷰 개수: {label_count[1]}")

In [None]:
# Korean font setup
plt.rc('font', family="NanumBarunGothic")
plt.rcParams["axes.unicode_minus"] = False # keeps the minus sign from breaking when the Korean font is used
# value_counts() orders its result by frequency, not by label, so the counts
# are looked up by label to guarantee each bar sits under the right caption.
plt.bar(["부정", "긍정"], [label_count.get(label, 0) for label in (0, 1)])
plt.show()

## Word Cloud

In [None]:
# Keep only the string reviews (drops missing values such as NaN);
# isinstance is the idiomatic type check.
train_review = [review for review in df_train['document'] if isinstance(review, str)]
train_review[:10]

In [None]:
from wordcloud import WordCloud

In [None]:
" ".join(train_review[:10])

In [None]:
# Variant with an explicit font file (Windows path), kept for reference:
# wordcloud = WordCloud(width=800, height=800, font_path=r"C:\Windows\Fonts\malgun.ttf").generate(" ".join(train_review))
# NOTE(review): no font_path is passed below — presumably the default font has
# no Hangul glyphs, so Korean words may render as empty boxes; confirm.
wordcloud = WordCloud(width=800, height=800).generate(" ".join(train_review))
plt.figure(figsize=(10, 10))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

# Small Data Project

## Train, Test 데이터 생성

In [None]:
df_train = pd.read_csv('ratings_train.txt', delimiter='\t')
df_train

In [None]:
df_train = df_train.dropna()
df_train

In [None]:
# Keep a 10-row sample; random_state ties it to the configured seed so the
# same rows are drawn on every run.
df_train = df_train.sample(10, random_state=args.seed)
df_train

In [None]:
df_test = pd.read_csv('ratings_test.txt', delimiter='\t')
df_test

In [None]:
df_test = df_test.dropna()
df_test

In [None]:
# Keep a 10-row sample; random_state ties it to the configured seed so the
# same rows are drawn on every run.
df_test = df_test.sample(10, random_state=args.seed)
df_test

In [None]:
def make_data(df, vocab, n_seq, pad_id=0, verbose=True):
    """Convert a review DataFrame into fixed-length token-id and label arrays.

    Args:
        df: DataFrame with a "document" (text) and a "label" column.
        vocab: Tokenizer exposing `encode_as_ids(text)`, returning a list of ids.
        n_seq: Length every row is truncated / right-padded to.
        pad_id: Id used for padding (default 0, the value used previously).
        verbose: When True (default, the previous behaviour) print the label,
            length and ids of every row; pass False for large frames.

    Returns:
        Tuple `(inputs, labels)` of numpy arrays: one row of `n_seq` ids per
        document and one label per document.

    Raises:
        ValueError: If `n_seq` is negative.
    """
    if n_seq < 0:
        # A negative length would slice from the end and give uneven rows.
        raise ValueError(f"n_seq must be >= 0, got {n_seq}")

    inputs, labels = [], []

    # Iterating the two columns directly avoids building a Series per row,
    # which is what makes iterrows() slow on large frames.
    for label, document in tqdm(zip(df["label"], df["document"]), total=len(df)):
        token = vocab.encode_as_ids(document)[:n_seq]
        token = token + [pad_id] * (n_seq - len(token))
        if verbose:
            print(label, len(token), token)
        inputs.append(token)
        labels.append(label)
    inputs = np.array(inputs)
    labels = np.array(labels)
    return inputs, labels

In [None]:
# Encode the sampled training reviews.
# NOTE(review): the sequence length 47 is hard-coded — presumably taken from
# the token-length EDA above; confirm.
train_inputs, train_labels = make_data(df_train, vocab, 47)
train_inputs, train_labels

In [None]:
# Encode the sampled test reviews with the same sequence length as the
# training data (47)
test_inputs, test_labels = make_data(df_test, vocab, 47)
test_inputs, test_labels

## Modeling

In [None]:
# Hyper-parameters read by build_model() for the small-data project
args.n_vocab = len(vocab)  # number of embedding rows (vocabulary size)
args.d_model = 256  # embedding width
args.n_out = 2  # number of output classes
args

In [None]:
def build_model(args):
    """Assemble the review classifier (embedding, max pooling, softmax head).

    Args:
        args: Namespace with `n_vocab` (vocabulary size), `d_model`
            (embedding width) and `n_out` (number of classes).

    Returns:
        A `tf.keras.Model` taking a batch of token-id sequences and returning
        `n_out` softmax outputs for each sequence.
    """
    model_input = tf.keras.layers.Input((None,))

    # Token ids -> one d_model-wide vector per token.
    features = tf.keras.layers.Embedding(args.n_vocab, args.d_model)(model_input)
    # Placeholder: an RNN / CNN / attention / linear encoder would go here.
    # Max over the sequence axis -> one vector per sentence.
    features = tf.keras.layers.GlobalMaxPooling1D()(features)
    model_output = tf.keras.layers.Dense(args.n_out, activation=tf.nn.softmax)(features)

    return tf.keras.Model(inputs=model_input, outputs=model_output)

In [None]:
model = build_model(args)
tf.keras.utils.plot_model(model, 'model.png', show_shapes=True)

In [None]:
model.predict(train_inputs[:4])

In [None]:
model.predict(test_inputs[:4])

## Train

In [None]:
model = build_model(args)
tf.keras.utils.plot_model(model, 'model.png', show_shapes=True)

In [None]:
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

In [None]:
# Training callbacks.
# Stop training once val_accuracy has not improved for 20 epochs.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=20)
# After each epoch, write the weights (weights only) to nsmc_dir/nsmc.hdf5,
# but only when val_accuracy is the best seen so far.
save_weights = tf.keras.callbacks.ModelCheckpoint(os.path.join(nsmc_dir, "nsmc.hdf5"),
                                                  monitor='val_accuracy',
                                                  verbose=1,
                                                  save_best_only=True,
                                                  mode="max",
                                                  save_freq="epoch",
                                                  save_weights_only=True)
# Log the per-epoch metrics to nsmc_dir/nsmc.csv.
csv_logger = tf.keras.callbacks.CSVLogger(os.path.join(nsmc_dir, "nsmc.csv"))

In [None]:
# Train for at most 100 epochs with the callbacks defined above.
# NOTE(review): the test split doubles as validation data, so early stopping
# and checkpoint selection are tuned on the same data that is evaluated later
# — presumably acceptable for a tutorial; confirm.
history = model.fit(train_inputs, train_labels,
                    epochs=100,
                    batch_size=64,
                    validation_data=(test_inputs, test_labels),
                    callbacks=[early_stopping, save_weights, csv_logger])

In [None]:
# Training curves: loss on the left, accuracy on the right
plt.figure(figsize=(12, 4))

plt.subplot(1, 2, 1)
plt.plot(history.history['loss'], 'b-', label='loss')
plt.plot(history.history['val_loss'], 'r--', label='val_loss')
plt.legend()  # without a legend the label= arguments are never displayed

plt.subplot(1, 2, 2)
plt.plot(history.history['accuracy'], 'g-', label='accuracy')
plt.plot(history.history['val_accuracy'], 'k--', label='val_accuracy')
plt.legend()

plt.show()

## 평가

In [None]:
model = build_model(args)
model.load_weights(os.path.join(nsmc_dir, "nsmc.hdf5"))

In [None]:
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

In [None]:
model.evaluate(test_inputs, test_labels)

In [None]:
y_pred = model.predict(test_inputs)
y_pred

In [None]:
y_class = np.argmax(y_pred, axis=-1)
y_class

In [None]:
# 2x2 confusion matrix of integer counts, initialised to zero.
# The np.int alias was deprecated in NumPy 1.20 and removed in 1.24; the
# builtin int selects the same dtype.
cf_matrix = np.zeros((2, 2), dtype=int)
cf_matrix

In [None]:
# Tally every (true label, predicted class) pair into the confusion matrix.
# Dedicated loop names keep the notebook-level y_true / y_pred arrays from
# being overwritten by scalars.
for true_label, pred_label in zip(test_labels, y_class):
    cf_matrix[int(true_label), int(pred_label)] += 1
cf_matrix

In [None]:
# Confusion-matrix cells, read as cf_matrix[true label, predicted class]
tp = cf_matrix[1, 1]
tn = cf_matrix[0, 0]
fp = cf_matrix[0, 1]
fn = cf_matrix[1, 0]

# Counts are integers, so max(..., 1) only guards against a zero denominator.
accuracy = (tp + tn) / max((tp + tn + fp + fn), 1)
print(f'accuracy: {accuracy}')
precision = (tp) / max((tp + fp), 1)
print(f'precision: {precision}')
recall = (tp) / max((tp + fn), 1)
print(f'recall: {recall}')
# F1 is the harmonic mean of precision and recall. Their sum lies in [0, 2],
# so the denominator must only be protected against zero: clamping it to at
# least 1 under-reports F1 whenever precision + recall < 1.
pr_sum = precision + recall
f1 = 2 * (precision * recall) / pr_sum if pr_sum > 0 else 0.0
print(f'f1: {f1}')

## 배포

In [None]:
model = build_model(args)
model.load_weights(os.path.join(nsmc_dir, "nsmc.hdf5"))

In [None]:
string = "영화 너무 심심해"

In [None]:
vocab.encode_as_pieces(string)

In [None]:
infer_input = vocab.encode_as_ids(string)
infer_input = np.array([infer_input])
infer_input

In [None]:
train_inputs.shape, infer_input.shape

In [None]:
y_pred = model.predict(infer_input)
y_pred

In [None]:
y_class = np.argmax(y_pred, axis=-1)[0]
y_class

In [None]:
print(f"{string} : {id_to_label[y_class]}")

In [None]:
def do_predict(model, string, n_seq=None):
    """Classify one sentence and return its label text.

    Args:
        model: Model whose `predict` returns one row of class scores per
            input row.
        string: Raw sentence to classify.
        n_seq: Optional fixed length. When given, the token ids are truncated
            and zero-padded to this length, the same preparation `make_data`
            applies to the training rows. When None (default) the ids are
            fed unchanged, as before.

    Returns:
        The `id_to_label` entry of the highest-scoring class.
    """
    token = vocab.encode_as_ids(string)
    if n_seq is not None:
        token = token[:n_seq]
        token = token + [0] * (n_seq - len(token))
    if not token:
        # Input such as whitespace may produce no ids at all; feed a single
        # pad id so the model still receives a non-empty sequence.
        token = [0]
    infer_input = np.array([token])

    y_pred = model.predict(infer_input)
    y_class = np.argmax(y_pred, axis=-1)[0]
    return id_to_label[y_class]

In [None]:
do_predict(model, string)

In [None]:
# Interactive loop: classify every typed sentence; an empty line ends it.
while True:
    print("input> ", end="")
    string = input()
    if not string:
        break
    result = do_predict(model, string)
    print(result)

# 실습
- 전체 데이터를 이용해 nsmc를 학습해보세요.

## Train, Test 데이터 생성

## Modeling

## Train

## 평가

## 배포

# Feature

In [None]:
feature_inputs = train_inputs[:1000]
feature_labels = train_labels[:1000]
feature_inputs, feature_labels

In [None]:
model.summary()

In [None]:
feature_input = model.input
feature_input

In [None]:
# The pooling layer's output is the sentence feature vector. The layer is
# looked up by type: its auto-generated name ("global_max_pooling1d_<k>")
# depends on how many such layers were created earlier in the session, so a
# hard-coded name fails as soon as cells are re-run.
pool_layer = next(layer for layer in model.layers
                  if isinstance(layer, tf.keras.layers.GlobalMaxPooling1D))
feature_output = pool_layer.output
feature_output

In [None]:
feature_model = tf.keras.Model(inputs=feature_input, outputs=feature_output)

In [None]:
y_feature = feature_model.predict(feature_inputs)
y_feature.shape

In [None]:
from sklearn.decomposition import PCA

In [None]:
def plot_feature(feature, labels):
    """Scatter-plot feature vectors after projecting them to 2-D with PCA.

    Args:
        feature: 2-D array-like with one feature vector per row.
        labels: Iterable of class ids aligned with the rows of `feature`;
            label 0 is drawn in green, every other label in orange.
    """
    plt.figure(figsize=(8, 8))

    f_pos = PCA(n_components=2).fit_transform(feature)

    # One colour per point; as with zip(), surplus rows or labels are ignored.
    colors = ["green" if label == 0 else "orange" for label in labels][:len(f_pos)]
    f_pos = f_pos[:len(colors)]

    # A single scatter call instead of one call (and one artist) per point.
    plt.scatter(f_pos[:, 0], f_pos[:, 1], c=colors)

    plt.show()

In [None]:
plot_feature(y_feature, feature_labels)