# Env*

In [None]:
# imports
import argparse
import os
import random
import shutil
import json
import zipfile
import math
import copy
import collections
import re

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import sentencepiece as spm
import tensorflow as tf
import tensorflow.keras.backend as K

from tqdm.notebook import tqdm, trange

In [None]:
# 환경 설정
args = {
    # random seed value
    "seed": 1234
}
args = argparse.Namespace(**args)

print(args)

In [None]:
# random seed 설정
def set_seed(seed):
    """Seed the Python, NumPy and TensorFlow random generators.

    Args:
        seed: value handed unchanged to each of the three seeding functions.

    NOTE(review): no cell in the visible part of this notebook calls this
    function, so ``args.seed`` appears to be unused; call
    ``set_seed(args.seed)`` before sampling data or building models if
    reproducible runs are wanted.
    """
    for seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
        seed_fn(seed)

In [None]:
# gpu 사용량 확인
!nvidia-smi

In [None]:
# data dir
data_dir = '/content/drive/MyDrive/문서/강의계획서/삼성전기/삼성전기.20220228/data'
os.listdir(data_dir)

In [None]:
sts_dir = os.path.join(data_dir, "kosts")
if not os.path.isdir(sts_dir):
    os.makedirs(sts_dir)
os.listdir(sts_dir)

# Vocabulary*

In [None]:
# vocab loading
vocab = spm.SentencePieceProcessor()
vocab.load(os.path.join(data_dir, 'kowiki', 'kowiki_32000.model'))

# Tutorial

In [None]:
# 입력 문장
sentences = [
    ['나는 오늘 기분이 좋아', '나는 오늘 우울해'],
    ['나는 오늘 행복해', '나는 오늘 즐거워'],
]

# 출력 정답
labels = [0, 1]  # 같음(1), 다름(0)

In [None]:
# 학습용 입력 데이터 생성
train_inputs_1, train_inputs_2 = [], []
for pair in sentences:
    train_inputs_1.append(vocab.encode_as_ids(pair[0]))
    train_inputs_2.append(vocab.encode_as_ids(pair[1]))

# train label
train_labels = labels

train_inputs_1, train_inputs_2, labels

In [None]:
# 문장의 길이를 모두 동일하게 변경 (최대길이 5)
for row in train_inputs_1:
    row += [0] * (5 - len(row))

# 문장의 길이를 모두 동일하게 변경 (최대길이 4)
for row in train_inputs_2:
    row += [0] * (4 - len(row))

train_inputs_1, train_inputs_2, labels

In [None]:
# train inputs을 numpy array로 변환
train_inputs_1 = np.array(train_inputs_1)
train_inputs_2 = np.array(train_inputs_2)

# 학습용 정답을 numpy array로 변환
train_labels = np.array(train_labels)

train_inputs_1, train_inputs_2, train_labels

In [None]:
# Map every token id to a 4-dimensional vector (second Embedding argument)
embedding = tf.keras.layers.Embedding(len(vocab), 4)
hidden_1 = embedding(train_inputs_1)  # (bs, n_seq_1, 4)
hidden_2 = embedding(train_inputs_2)  # (bs, n_seq_2, 4)
hidden_1, hidden_2

In [None]:
# RNN, CNN

In [None]:
# Take the maximum over the sequence axis for each embedding dimension,
# collapsing the word vectors into one sentence vector
pool = tf.keras.layers.GlobalMaxPool1D()
hidden_pool_1 = pool(hidden_1)  # (bs, 4)
hidden_pool_2 = pool(hidden_2)  # (bs, 4)
hidden_pool_1, hidden_pool_2

In [None]:
# distance
distance = hidden_pool_1 - hidden_pool_2
distance

In [None]:
# Predict different(0) / same(1) probabilities from the difference of the
# two sentence vectors
linear = tf.keras.layers.Dense(2, activation=tf.nn.softmax)
y_pred = linear(distance)
y_pred

In [None]:
weight, bias = linear.get_weights()
weight.shape, bias.shape

In [None]:
# CE loss
tf.keras.losses.SparseCategoricalCrossentropy()(train_labels, y_pred)

In [None]:
args.n_vocab = len(vocab)
args.d_model = 32
args.n_out = 2
args

In [None]:
def build_model_type1(args):
    """Build the type-1 sentence-pair classifier.

    Both token-id inputs pass through one shared embedding, are max-pooled
    over the sequence axis, and the element-wise difference of the two pooled
    vectors is fed to a softmax dense layer.

    Args:
        args: namespace providing ``n_vocab``, ``d_model`` and ``n_out``.

    Returns:
        A ``tf.keras.Model`` taking ``(inputs_1, inputs_2)`` and returning
        softmax outputs of width ``args.n_out``.
    """
    sent_a = tf.keras.layers.Input((None,))
    sent_b = tf.keras.layers.Input((None,))

    # A single embedding table is shared by both sentences.
    shared_embedding = tf.keras.layers.Embedding(args.n_vocab, args.d_model)
    embedded_a = shared_embedding(sent_a)
    embedded_b = shared_embedding(sent_b)
    # (placeholder: an RNN / CNN / attention / linear encoder could go here)

    max_pool = tf.keras.layers.GlobalMaxPooling1D()
    pooled_a = max_pool(embedded_a)
    pooled_b = max_pool(embedded_b)

    classifier = tf.keras.layers.Dense(args.n_out, activation=tf.nn.softmax)
    probs = classifier(pooled_a - pooled_b)

    return tf.keras.Model(inputs=(sent_a, sent_b), outputs=probs)

In [None]:
model = build_model_type1(args)
tf.keras.utils.plot_model(model, 'model.png', show_shapes=True)

In [None]:
model.predict((train_inputs_1, train_inputs_2))

# 실습
- 아래 데이터를 이용해서 두 문장이 같은지 여부를 추론하는 프로젝트를 구성해 보세요.

In [None]:
# Input sentence pairs
sentences = [
    ['영화 재미있어', '영화 너무 신났어'],
    ['영화 너무 재미있어', '영화 지루하고 너무 재미없어'],
]

# Target labels: same(1), different(0).
# The first pair is a paraphrase and the second pair has opposite meanings,
# so the labels are [1, 0] (they were previously reversed).
labels = [1, 0]  # 같음(1), 다름(0)

# Data*

In [None]:
!wget https://github.com/kakaobrain/KorNLUDatasets/raw/master/KorSTS/sts-train.tsv
!wget https://github.com/kakaobrain/KorNLUDatasets/raw/master/KorSTS/sts-dev.tsv
!wget https://github.com/kakaobrain/KorNLUDatasets/raw/master/KorSTS/sts-test.tsv

In [None]:
id_to_label = {0: "다른 질문", 1: "같은 질문"}

In [None]:
def make_binalry_data(df, upper=3.0, lower=2.0):
    """Turn similarity scores into binary labels and drop unlabeled rows.

    Rows with ``score >= upper`` get label 1 and rows with ``score <= lower``
    get label 0; rows in between stay unlabeled and are removed.

    Args:
        df: DataFrame with a numeric ``score`` column. It is not modified.
        upper: minimum score that is labeled 1.
        lower: maximum score that is labeled 0.

    Returns:
        A labeled copy of ``df`` without rows that contain any NaN.
    """
    df = df.copy()

    # Honor the caller's thresholds (they used to be hard-coded to 3.0 / 2.0,
    # which made the parameters have no effect).
    df.loc[df['score'] >= upper, 'label'] = 1
    df.loc[df['score'] <= lower, 'label'] = 0

    nan_cnt = df['label'].isnull().sum()
    print(f'{nan_cnt} rows dropped')
    # NOTE: dropna() removes rows with NaN in *any* column, so more rows than
    # the reported count are dropped if the input still has missing values.
    df = df.dropna()

    return df

In [None]:
# `error_bad_lines` was removed in pandas 2.0; `on_bad_lines='skip'` is the
# equivalent way to skip malformed rows.
df_train = pd.read_csv('sts-train.tsv', delimiter='\t', on_bad_lines='skip')
df_train = df_train.dropna()
df_train = make_binalry_data(df_train)
df_train

# EDA

In [None]:
# `error_bad_lines` was removed in pandas 2.0; `on_bad_lines='skip'` is the
# equivalent way to skip malformed rows.
df_train = pd.read_csv('sts-train.tsv', delimiter='\t', on_bad_lines='skip')
df_train = df_train.dropna()
df_train

In [None]:
# data 합치기
q_train = pd.Series(df_train['sentence1'].tolist() + df_train['sentence2'].tolist()).astype(str)
q_train.head(10)

## char length

In [None]:
# document 길이 데이터
train_length = q_train.apply(len)
train_length.head(10)

In [None]:
plt.figure(figsize=(8, 4))
plt.hist(train_length, bins=100, range=[0, 100], facecolor='r', label='char')
plt.title('Lengh of char')
plt.xlabel('Number of char')
plt.ylabel('Count of review')
plt.show()

In [None]:
# 데이터 길이
print(f"char 길이 최대:    {np.max(train_length):4d}")
print(f"char 길이 최소:    {np.min(train_length):4d}")
print(f"char 길이 평균:    {np.mean(train_length):7.2f}")
print(f"char 길이 표준편차: {np.std(train_length):7.2f}")

In [None]:
percentile25 = np.percentile(train_length, 25)
percentile50 = np.percentile(train_length, 50)
percentile75 = np.percentile(train_length, 75)
percentileIQR = percentile75 - percentile25
percentileMAX = percentile75 + percentileIQR * 1.5
print(f"char 25/100분위:  {percentile25:7.2f}")
print(f"char 50/100분위:  {percentile50:7.2f}")
print(f"char 75/100분위:  {percentile75:7.2f}")
print(f"char IQR:        {percentileIQR:7.2f}")
print(f"char MAX/100분위: {percentileMAX:7.2f}")

In [None]:
plt.figure(figsize=(4, 6))
plt.boxplot(train_length, labels=['char counts'], showmeans=True)
plt.show()

## word length

In [None]:
# document 길이 데이터
train_length = q_train.apply(lambda x:len(x.split()))
train_length.head(10)

In [None]:
plt.figure(figsize=(8, 4))
plt.hist(train_length, bins=40, range=[0, 40], facecolor='r', label='word')
plt.title('Lengh of word')
plt.xlabel('Number of word')
plt.ylabel('Count of review')
plt.show()

In [None]:
# 데이터 길이
print(f"word 길이 최대:    {np.max(train_length):4d}")
print(f"word 길이 최소:    {np.min(train_length):4d}")
print(f"word 길이 평균:    {np.mean(train_length):7.2f}")
print(f"word 길이 표준편차: {np.std(train_length):7.2f}")

In [None]:
percentile25 = np.percentile(train_length, 25)
percentile50 = np.percentile(train_length, 50)
percentile75 = np.percentile(train_length, 75)
percentileIQR = percentile75 - percentile25
percentileMAX = percentile75 + percentileIQR * 1.5
print(f"word 25/100분위:  {percentile25:7.2f}")
print(f"word 50/100분위:  {percentile50:7.2f}")
print(f"word 75/100분위:  {percentile75:7.2f}")
print(f"word IQR:        {percentileIQR:7.2f}")
print(f"word MAX/100분위: {percentileMAX:7.2f}")

In [None]:
plt.figure(figsize=(4, 6))
plt.boxplot(train_length, labels=['word counts'], showmeans=True)
plt.show()

## token length

In [None]:
# document 길이 데이터
train_length = q_train.apply(lambda x:len(vocab.encode_as_pieces(x)))
train_length.head(10)

In [None]:
plt.figure(figsize=(8, 4))
# train_length holds SentencePiece token counts here, so label the plot as
# tokens (the labels were copied from the word-length plot).
plt.hist(train_length, bins=50, range=[0, 50], facecolor='r', label='token')
plt.title('Length of token')
plt.xlabel('Number of token')
plt.ylabel('Count of review')
plt.show()

In [None]:
# 데이터 길이
print(f"token 길이 최대:    {np.max(train_length):4d}")
print(f"token 길이 최소:    {np.min(train_length):4d}")
print(f"token 길이 평균:    {np.mean(train_length):7.2f}")
print(f"token 길이 표준편차: {np.std(train_length):7.2f}")

In [None]:
percentile25 = np.percentile(train_length, 25)
percentile50 = np.percentile(train_length, 50)
percentile75 = np.percentile(train_length, 75)
percentileIQR = percentile75 - percentile25
percentileMAX = percentile75 + percentileIQR * 1.5
print(f"token 25/100분위:  {percentile25:7.2f}")
print(f"token 50/100분위:  {percentile50:7.2f}")
print(f"token 75/100분위:  {percentile75:7.2f}")
print(f"token IQR:        {percentileIQR:7.2f}")
print(f"token MAX/100분위: {percentileMAX:7.2f}")

In [None]:
plt.figure(figsize=(4, 6))
plt.boxplot(train_length, labels=['token counts'], showmeans=True)
plt.show()

## Label Count

In [None]:
df_train.loc[df_train['score'] >= 3.0, 'label'] = 1
df_train.loc[df_train['score'] <= 2.0, 'label'] = 0
df_train['label'] = df_train['label'].fillna(2)
df_train

In [None]:
label_count = df_train['label'].value_counts()
label_count

In [None]:
# label count: label 1 means "same question", label 0 means "different
# question" (see id_to_label), so index the counts accordingly.
print(f"같은 질문 개수: {label_count[1]}")
print(f"다른 질문 개수: {label_count[0]}")

In [None]:
# Korean font setup
plt.rc('font', family="NanumBarunGothic")
plt.rcParams["axes.unicode_minus"] = False  # keep the minus sign renderable with a Korean font
# value_counts() is ordered by frequency, so pick the counts explicitly in the
# order of the tick labels: same (1), different (0), unlabeled (filled with 2).
plt.bar(["같음", "다름", "NaN"], label_count.reindex([1, 0, 2], fill_value=0))
plt.show()

## Word Cloud

In [None]:
train_sentence = [sentence for sentence in q_train if type(sentence) is str]
train_sentence[:10]

In [None]:
from wordcloud import WordCloud

In [None]:
" ".join(train_sentence[:10])

In [None]:
# wordcloud = WordCloud(width=800, height=800, font_path=r"C:\Windows\Fonts\malgun.ttf").generate(" ".join(train_sentence))
wordcloud = WordCloud(width=800, height=800).generate(" ".join(train_sentence))
plt.figure(figsize=(10, 10))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

# Small Data Project (Type1)

## Train, Test 데이터 생성

In [None]:
# `error_bad_lines` was removed in pandas 2.0; `on_bad_lines='skip'` is the
# equivalent way to skip malformed rows.
df_train = pd.read_csv('sts-train.tsv', delimiter='\t', on_bad_lines='skip')
df_train = df_train.dropna()
df_train = make_binalry_data(df_train)
df_train

In [None]:
df_train = df_train.sample(10)
df_train

In [None]:
# `error_bad_lines` was removed in pandas 2.0; `on_bad_lines='skip'` is the
# equivalent way to skip malformed rows.
df_dev = pd.read_csv('sts-dev.tsv', delimiter='\t', on_bad_lines='skip')
df_dev = df_dev.dropna()
df_dev = make_binalry_data(df_dev)
df_dev

In [None]:
df_dev = df_dev.sample(10)
df_dev

In [None]:
# `error_bad_lines` was removed in pandas 2.0; `on_bad_lines='skip'` is the
# equivalent way to skip malformed rows.
df_test = pd.read_csv('sts-test.tsv', delimiter='\t', on_bad_lines='skip')
df_test = df_test.dropna()
df_test = make_binalry_data(df_test)
df_test

In [None]:
df_test = df_test.sample(10)
df_test

In [None]:
def oversample(df):
    """Balance classes by adding resampled rows to the smaller classes.

    For every ``label`` group, prints the label and its size, then draws
    (with replacement) as many extra rows as the group is short of the
    largest class. The extra rows are appended after the original frame.

    Args:
        df: DataFrame with a ``label`` column.

    Returns:
        A new DataFrame: the original rows followed by the sampled rows.
    """
    target_size = df['label'].value_counts().max()
    pieces = [df]
    for label_value, group in df.groupby('label'):
        group_size = len(group)
        print(label_value, group_size)
        shortfall = target_size - group_size
        pieces.append(group.sample(shortfall, replace=True))
    return pd.concat(pieces)

In [None]:
def undersample(df):
    """Balance classes by shrinking every class to the smallest class size.

    For every ``label`` group, prints the label and its size, then keeps a
    random subset of ``n_min`` distinct rows, where ``n_min`` is the size of
    the smallest class.

    Args:
        df: DataFrame with a ``label`` column.

    Returns:
        A new DataFrame with ``n_min`` rows per label.
    """
    n_min = df['label'].value_counts().min()
    df_list = []
    for i_class, df_group in df.groupby('label'):
        print(i_class, len(df_group))
        # Sample WITHOUT replacement: every group has at least n_min rows, and
        # sampling with replacement would duplicate some rows while dropping
        # others (even inside the smallest class).
        df_list.append(df_group.sample(n_min, replace=False))
    df_new = pd.concat(df_list)
    return df_new

In [None]:
def make_data(df, vocab, n_seq):
    """Encode both sentence columns of ``df`` into fixed-length id arrays.

    Each sentence is encoded with ``vocab.encode_as_ids``, truncated to
    ``n_seq`` ids and right-padded with id 0 up to ``n_seq``. Every encoded
    row is also printed.

    Args:
        df: DataFrame with ``sentence1``, ``sentence2`` and ``label`` columns.
        vocab: tokenizer exposing ``encode_as_ids``.
        n_seq: fixed number of ids per sentence.

    Returns:
        Tuple ``(inputs_1, inputs_2, labels)`` of NumPy arrays.
    """
    def encode_fixed(text):
        # Truncate to n_seq ids, then right-pad with id 0 up to n_seq.
        ids = vocab.encode_as_ids(text)[:n_seq]
        return ids + [0] * (n_seq - len(ids))

    first_ids, second_ids, targets = [], [], []
    for _, record in tqdm(df.iterrows(), total=len(df)):
        target = record["label"]
        ids_a = encode_fixed(record["sentence1"])
        ids_b = encode_fixed(record["sentence2"])

        assert len(ids_a) == n_seq
        assert len(ids_b) == n_seq

        print(target, len(ids_a), ids_a, len(ids_b), ids_b)
        first_ids.append(ids_a)
        second_ids.append(ids_b)
        targets.append(target)

    return np.array(first_ids), np.array(second_ids), np.array(targets)

In [None]:
train_inputs_1, train_inputs_2, train_labels = make_data(df_train, vocab, 25)
train_inputs_1, train_inputs_2, train_labels

In [None]:
dev_inputs_1, dev_inputs_2, dev_labels = make_data(df_dev, vocab, 25)
dev_inputs_1, dev_inputs_2, dev_labels

In [None]:
test_inputs_1, test_inputs_2, test_labels = make_data(df_test, vocab, 25)
test_inputs_1, test_inputs_2, test_labels

## Modeling

In [None]:
args.n_vocab = len(vocab)
args.d_model = 256
args.n_out = 2
args

In [None]:
def build_model_type1(args):
    """Build the type-1 sentence-pair classifier.

    Both token-id inputs pass through one shared embedding, are max-pooled
    over the sequence axis, and the element-wise difference of the two pooled
    vectors is fed to a softmax dense layer.

    Args:
        args: namespace providing ``n_vocab``, ``d_model`` and ``n_out``.

    Returns:
        A ``tf.keras.Model`` taking ``(inputs_1, inputs_2)`` and returning
        softmax outputs of width ``args.n_out``.
    """
    sent_a = tf.keras.layers.Input((None,))
    sent_b = tf.keras.layers.Input((None,))

    # A single embedding table is shared by both sentences.
    shared_embedding = tf.keras.layers.Embedding(args.n_vocab, args.d_model)
    embedded_a = shared_embedding(sent_a)
    embedded_b = shared_embedding(sent_b)
    # (placeholder: an RNN / CNN / attention / linear encoder could go here)

    max_pool = tf.keras.layers.GlobalMaxPooling1D()
    pooled_a = max_pool(embedded_a)
    pooled_b = max_pool(embedded_b)

    classifier = tf.keras.layers.Dense(args.n_out, activation=tf.nn.softmax)
    probs = classifier(pooled_a - pooled_b)

    return tf.keras.Model(inputs=(sent_a, sent_b), outputs=probs)

In [None]:
model = build_model_type1(args)
tf.keras.utils.plot_model(model, 'model.png', show_shapes=True)

In [None]:
model.predict((train_inputs_1[:4], train_inputs_2[:4]))

In [None]:
model.predict((test_inputs_1[:4], test_inputs_2[:4]))

## Train

In [None]:
model = build_model_type1(args)
tf.keras.utils.plot_model(model, 'model.png', show_shapes=True)

In [None]:
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

In [None]:
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=20)
save_weights = tf.keras.callbacks.ModelCheckpoint(os.path.join(sts_dir, "type1.hdf5"),
                                                  monitor='val_accuracy',
                                                  verbose=1,
                                                  save_best_only=True,
                                                  mode="max",
                                                  save_freq="epoch",
                                                  save_weights_only=True)
csv_logger = tf.keras.callbacks.CSVLogger(os.path.join(sts_dir, "type1.csv"))

In [None]:
history = model.fit((train_inputs_1, train_inputs_2), train_labels,
                    epochs=100,
                    batch_size=64,
                    validation_data=((dev_inputs_1, dev_inputs_2), dev_labels),
                    callbacks=[early_stopping, save_weights, csv_logger])

In [None]:
plt.figure(figsize=(12, 4))

plt.subplot(1, 2, 1)
plt.plot(history.history['loss'], 'b-', label='loss')
plt.plot(history.history['val_loss'], 'r--', label='val_loss')

plt.subplot(1, 2, 2)
plt.plot(history.history['accuracy'], 'g-', label='accuracy')
plt.plot(history.history['val_accuracy'], 'k--', label='val_accuracy')

plt.show()

## 평가

In [None]:
model = build_model_type1(args)
model.load_weights(os.path.join(sts_dir, "type1.hdf5"))

In [None]:
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

In [None]:
model.evaluate((test_inputs_1, test_inputs_2), test_labels)

In [None]:
y_pred = model.predict((test_inputs_1, test_inputs_2))
y_pred

In [None]:
y_class = np.argmax(y_pred, axis=-1)
y_class

In [None]:
# 2x2 integer confusion matrix. `np.int` was removed in NumPy 1.24; the
# builtin `int` gives the same dtype.
cf_matrix = np.zeros((2, 2), dtype=int)
cf_matrix

In [None]:
for y_true, y_pred in zip(test_labels, y_class):
    cf_matrix[int(y_true), int(y_pred)] += 1
cf_matrix

In [None]:
# Confusion-matrix cells, indexed as cf_matrix[true label, predicted label]
tp = cf_matrix[1, 1]
tn = cf_matrix[0, 0]
fp = cf_matrix[0, 1]
fn = cf_matrix[1, 0]

# The max(..., 1) guards below only protect integer counts against a zero
# denominator.
accuracy = (tp + tn) / max((tp + tn + fp + fn), 1)
print(f'accuracy: {accuracy}')
precision = (tp) / max((tp + fp), 1)
print(f'precision: {precision}')
recall = (tp) / max((tp + fn), 1)
print(f'recall: {recall}')
# precision and recall are fractions in [0, 1]; clamping their sum to at least
# 1 (as was done before) deflated F1 whenever precision + recall < 1. Only a
# zero denominator needs guarding.
pr_sum = precision + recall
f1 = 2 * (precision * recall) / pr_sum if pr_sum > 0 else 0.0
print(f'f1: {f1}')

## 배포

In [None]:
model = build_model_type1(args)
model.load_weights(os.path.join(sts_dir, "type1.hdf5"))

In [None]:
string1 = "영화 너무 심심해"
string2 = "영화 너무 재밌어"

In [None]:
vocab.encode_as_pieces(string1), vocab.encode_as_pieces(string2)

In [None]:
infer_input_1 = vocab.encode_as_ids(string1)
infer_input_1 = np.array([infer_input_1])
infer_input_1

In [None]:
infer_input_2 = vocab.encode_as_ids(string2)
infer_input_2 = np.array([infer_input_2])
infer_input_2

In [None]:
y_pred = model.predict((infer_input_1, infer_input_2))
y_pred

In [None]:
y_class = np.argmax(y_pred, axis=-1)[0]
y_class

In [None]:
print(f"{string1} / {string2} : {id_to_label[y_class]}")

In [None]:
def do_predict(model, string1, string2):
    """Classify one sentence pair and return its human-readable label.

    Each string is encoded with the notebook-level ``vocab`` into a batch of
    size one (no padding or truncation is applied here), the model predicts
    on the pair, and the arg-max class is looked up in ``id_to_label``.

    Args:
        model: model accepting a ``(inputs_1, inputs_2)`` tuple in ``predict``.
        string1: first sentence.
        string2: second sentence.

    Returns:
        The ``id_to_label`` entry for the predicted class.
    """
    batch_1 = np.array([vocab.encode_as_ids(string1)])
    batch_2 = np.array([vocab.encode_as_ids(string2)])

    probs = model.predict((batch_1, batch_2))
    best_class = np.argmax(probs, axis=-1)[0]
    return id_to_label[best_class]

In [None]:
do_predict(model, string1, string2)

In [None]:
while True:
    print("input 1> ", end="")
    string1 = str(input())
    if len(string1) == 0:
        break
    print("input 2> ", end="")
    string2 = str(input())
    if len(string2) == 0:
        break
    result = do_predict(model, string1, string2)
    print(result)

# 실습
- 전체 데이터를 이용해 type1을 학습해보세요.

## Train, Test 데이터 생성

## Modeling

## Train

## 평가

## 배포

# Small Data Project (Type2)

## Train, Test 데이터 생성

In [None]:
# `error_bad_lines` was removed in pandas 2.0; `on_bad_lines='skip'` is the
# equivalent way to skip malformed rows.
df_train = pd.read_csv('sts-train.tsv', delimiter='\t', on_bad_lines='skip')
df_train = df_train.dropna()
df_train = make_binalry_data(df_train)
df_train

In [None]:
df_train = df_train.sample(10)
df_train

In [None]:
# `error_bad_lines` was removed in pandas 2.0; `on_bad_lines='skip'` is the
# equivalent way to skip malformed rows.
df_dev = pd.read_csv('sts-dev.tsv', delimiter='\t', on_bad_lines='skip')
df_dev = df_dev.dropna()
df_dev = make_binalry_data(df_dev)
df_dev

In [None]:
df_dev = df_dev.sample(10)
df_dev

In [None]:
# `error_bad_lines` was removed in pandas 2.0; `on_bad_lines='skip'` is the
# equivalent way to skip malformed rows.
df_test = pd.read_csv('sts-test.tsv', delimiter='\t', on_bad_lines='skip')
df_test = df_test.dropna()
df_test = make_binalry_data(df_test)
df_test

In [None]:
df_test = df_test.sample(10)
df_test

In [None]:
def oversample(df):
    """Balance classes by adding resampled rows to the smaller classes.

    For every ``label`` group, prints the label and its size, then draws
    (with replacement) as many extra rows as the group is short of the
    largest class. The extra rows are appended after the original frame.

    Args:
        df: DataFrame with a ``label`` column.

    Returns:
        A new DataFrame: the original rows followed by the sampled rows.
    """
    target_size = df['label'].value_counts().max()
    pieces = [df]
    for label_value, group in df.groupby('label'):
        group_size = len(group)
        print(label_value, group_size)
        shortfall = target_size - group_size
        pieces.append(group.sample(shortfall, replace=True))
    return pd.concat(pieces)

In [None]:
def undersample(df):
    """Balance classes by shrinking every class to the smallest class size.

    For every ``label`` group, prints the label and its size, then keeps a
    random subset of ``n_min`` distinct rows, where ``n_min`` is the size of
    the smallest class.

    Args:
        df: DataFrame with a ``label`` column.

    Returns:
        A new DataFrame with ``n_min`` rows per label.
    """
    n_min = df['label'].value_counts().min()
    df_list = []
    for i_class, df_group in df.groupby('label'):
        print(i_class, len(df_group))
        # Sample WITHOUT replacement: every group has at least n_min rows, and
        # sampling with replacement would duplicate some rows while dropping
        # others (even inside the smallest class).
        df_list.append(df_group.sample(n_min, replace=False))
    df_new = pd.concat(df_list)
    return df_new

In [None]:
def make_data(df, vocab, n_seq):
    """Encode both sentence columns of ``df`` into fixed-length id arrays.

    Each sentence is encoded with ``vocab.encode_as_ids``, truncated to
    ``n_seq`` ids and right-padded with id 0 up to ``n_seq``.

    Args:
        df: DataFrame with ``sentence1``, ``sentence2`` and ``label`` columns.
        vocab: tokenizer exposing ``encode_as_ids``.
        n_seq: fixed number of ids per sentence.

    Returns:
        Tuple ``(inputs_1, inputs_2, labels)`` of NumPy arrays.
    """
    def encode_fixed(text):
        # Truncate to n_seq ids, then right-pad with id 0 up to n_seq.
        ids = vocab.encode_as_ids(text)[:n_seq]
        return ids + [0] * (n_seq - len(ids))

    first_ids, second_ids, targets = [], [], []
    for _, record in tqdm(df.iterrows(), total=len(df)):
        target = record["label"]
        ids_a = encode_fixed(record["sentence1"])
        ids_b = encode_fixed(record["sentence2"])

        assert len(ids_a) == n_seq
        assert len(ids_b) == n_seq

        first_ids.append(ids_a)
        second_ids.append(ids_b)
        targets.append(target)

    return np.array(first_ids), np.array(second_ids), np.array(targets)

In [None]:
train_inputs_1, train_inputs_2, train_labels = make_data(df_train, vocab, 25)
train_inputs_1, train_inputs_2, train_labels

In [None]:
dev_inputs_1, dev_inputs_2, dev_labels = make_data(df_dev, vocab, 25)
dev_inputs_1, dev_inputs_2, dev_labels

In [None]:
test_inputs_1, test_inputs_2, test_labels = make_data(df_test, vocab, 25)
test_inputs_1, test_inputs_2, test_labels

## Modeling

In [None]:
args.n_vocab = len(vocab)
args.d_model = 256
args.n_out = 2
args

In [None]:
def build_model_type2(args):
    """Build the type-2 sentence-pair classifier.

    Both token-id inputs pass through one shared embedding and are max-pooled
    over the sequence axis; the two pooled vectors are concatenated and fed to
    a softmax dense layer.

    Args:
        args: namespace providing ``n_vocab``, ``d_model`` and ``n_out``.

    Returns:
        A ``tf.keras.Model`` taking ``(inputs_1, inputs_2)`` and returning
        softmax outputs of width ``args.n_out``.
    """
    inputs_1 = tf.keras.layers.Input((None,))
    inputs_2 = tf.keras.layers.Input((None,))

    # A single embedding table is shared by both sentences.
    embedding = tf.keras.layers.Embedding(args.n_vocab, args.d_model)
    hidden_1 = embedding(inputs_1)
    hidden_2 = embedding(inputs_2)
    #
    # RNN, CNN, Attention, Linear
    #
    pool = tf.keras.layers.GlobalMaxPooling1D()
    hidden_1 = pool(hidden_1)
    hidden_2 = pool(hidden_2)
    # Merge with a Keras layer instead of a raw tf.concat: the result is the
    # same, but calling TensorFlow functions directly on symbolic Keras
    # tensors is not supported by every Keras version.
    hidden = tf.keras.layers.Concatenate(axis=-1)([hidden_1, hidden_2])

    linear = tf.keras.layers.Dense(args.n_out, activation=tf.nn.softmax)
    y_pred = linear(hidden)

    model = tf.keras.Model(inputs=(inputs_1, inputs_2), outputs=y_pred)
    return model

In [None]:
model = build_model_type2(args)
tf.keras.utils.plot_model(model, 'model.png', show_shapes=True)

In [None]:
model.predict((train_inputs_1[:4], train_inputs_2[:4]))

In [None]:
model.predict((test_inputs_1[:4], test_inputs_2[:4]))

## Train

In [None]:
model = build_model_type2(args)
tf.keras.utils.plot_model(model, 'model.png', show_shapes=True)

In [None]:
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

In [None]:
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=20)
save_weights = tf.keras.callbacks.ModelCheckpoint(os.path.join(sts_dir, "type2.hdf5"),
                                                  monitor='val_accuracy',
                                                  verbose=1,
                                                  save_best_only=True,
                                                  mode="max",
                                                  save_freq="epoch",
                                                  save_weights_only=True)
csv_logger = tf.keras.callbacks.CSVLogger(os.path.join(sts_dir, "type2.csv"))

In [None]:
history = model.fit((train_inputs_1, train_inputs_2), train_labels,
                    epochs=100,
                    batch_size=64,
                    validation_data=((dev_inputs_1, dev_inputs_2), dev_labels),
                    callbacks=[early_stopping, save_weights, csv_logger])

In [None]:
plt.figure(figsize=(12, 4))

plt.subplot(1, 2, 1)
plt.plot(history.history['loss'], 'b-', label='loss')
plt.plot(history.history['val_loss'], 'r--', label='val_loss')

plt.subplot(1, 2, 2)
plt.plot(history.history['accuracy'], 'g-', label='accuracy')
plt.plot(history.history['val_accuracy'], 'k--', label='val_accuracy')

plt.show()

## 평가

In [None]:
model = build_model_type2(args)
model.load_weights(os.path.join(sts_dir, "type2.hdf5"))

In [None]:
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

In [None]:
model.evaluate((test_inputs_1, test_inputs_2), test_labels)

In [None]:
y_pred = model.predict((test_inputs_1, test_inputs_2))
y_pred

In [None]:
y_class = np.argmax(y_pred, axis=-1)
y_class

In [None]:
# 2x2 integer confusion matrix. `np.int` was removed in NumPy 1.24; the
# builtin `int` gives the same dtype.
cf_matrix = np.zeros((2, 2), dtype=int)
cf_matrix

In [None]:
for y_true, y_pred in zip(test_labels, y_class):
    cf_matrix[int(y_true), int(y_pred)] += 1
cf_matrix

In [None]:
# Confusion-matrix cells, indexed as cf_matrix[true label, predicted label]
tp = cf_matrix[1, 1]
tn = cf_matrix[0, 0]
fp = cf_matrix[0, 1]
fn = cf_matrix[1, 0]

# The max(..., 1) guards below only protect integer counts against a zero
# denominator.
accuracy = (tp + tn) / max((tp + tn + fp + fn), 1)
print(f'accuracy: {accuracy}')
precision = (tp) / max((tp + fp), 1)
print(f'precision: {precision}')
recall = (tp) / max((tp + fn), 1)
print(f'recall: {recall}')
# precision and recall are fractions in [0, 1]; clamping their sum to at least
# 1 (as was done before) deflated F1 whenever precision + recall < 1. Only a
# zero denominator needs guarding.
pr_sum = precision + recall
f1 = 2 * (precision * recall) / pr_sum if pr_sum > 0 else 0.0
print(f'f1: {f1}')

## 배포

In [None]:
model = build_model_type2(args)
model.load_weights(os.path.join(sts_dir, "type2.hdf5"))

In [None]:
def do_predict(model, string1, string2):
    """Classify one sentence pair and return its human-readable label.

    Each string is encoded with the notebook-level ``vocab`` into a batch of
    size one (no padding or truncation is applied here), the model predicts
    on the pair, and the arg-max class is looked up in ``id_to_label``.

    Args:
        model: model accepting a ``(inputs_1, inputs_2)`` tuple in ``predict``.
        string1: first sentence.
        string2: second sentence.

    Returns:
        The ``id_to_label`` entry for the predicted class.
    """
    batch_1 = np.array([vocab.encode_as_ids(string1)])
    batch_2 = np.array([vocab.encode_as_ids(string2)])

    probs = model.predict((batch_1, batch_2))
    best_class = np.argmax(probs, axis=-1)[0]
    return id_to_label[best_class]

In [None]:
while True:
    print("input 1> ", end="")
    string1 = str(input())
    if len(string1) == 0:
        break
    print("input 2> ", end="")
    string2 = str(input())
    if len(string2) == 0:
        break
    result = do_predict(model, string1, string2)
    print(result)

# 실습
- 전체 데이터를 이용해 type2를 학습해보세요.

## Train, Test 데이터 생성

## Modeling

## Train

## 평가

## 배포

# Small Data Project (Type3)

## Train, Test 데이터 생성

In [None]:
# `error_bad_lines` was removed in pandas 2.0; `on_bad_lines='skip'` is the
# equivalent way to skip malformed rows.
df_train = pd.read_csv('sts-train.tsv', delimiter='\t', on_bad_lines='skip')
df_train = df_train.dropna()
df_train = make_binalry_data(df_train)
df_train

In [None]:
df_train = df_train.sample(10)
df_train

In [None]:
# `error_bad_lines` was removed in pandas 2.0; `on_bad_lines='skip'` is the
# equivalent way to skip malformed rows.
df_dev = pd.read_csv('sts-dev.tsv', delimiter='\t', on_bad_lines='skip')
df_dev = df_dev.dropna()
df_dev = make_binalry_data(df_dev)
df_dev

In [None]:
df_dev = df_dev.sample(10)
df_dev

In [None]:
# `error_bad_lines` was removed in pandas 2.0; `on_bad_lines='skip'` is the
# equivalent way to skip malformed rows.
df_test = pd.read_csv('sts-test.tsv', delimiter='\t', on_bad_lines='skip')
df_test = df_test.dropna()
df_test = make_binalry_data(df_test)
df_test

In [None]:
df_test = df_test.sample(10)
df_test

In [None]:
def oversample(df):
    """Balance classes by adding resampled rows to the smaller classes.

    For every ``label`` group, prints the label and its size, then draws
    (with replacement) as many extra rows as the group is short of the
    largest class. The extra rows are appended after the original frame.

    Args:
        df: DataFrame with a ``label`` column.

    Returns:
        A new DataFrame: the original rows followed by the sampled rows.
    """
    target_size = df['label'].value_counts().max()
    pieces = [df]
    for label_value, group in df.groupby('label'):
        group_size = len(group)
        print(label_value, group_size)
        shortfall = target_size - group_size
        pieces.append(group.sample(shortfall, replace=True))
    return pd.concat(pieces)

In [None]:
def undersample(df):
    """Balance classes by shrinking every class to the smallest class size.

    For every ``label`` group, prints the label and its size, then keeps a
    random subset of ``n_min`` distinct rows, where ``n_min`` is the size of
    the smallest class.

    Args:
        df: DataFrame with a ``label`` column.

    Returns:
        A new DataFrame with ``n_min`` rows per label.
    """
    n_min = df['label'].value_counts().min()
    df_list = []
    for i_class, df_group in df.groupby('label'):
        print(i_class, len(df_group))
        # Sample WITHOUT replacement: every group has at least n_min rows, and
        # sampling with replacement would duplicate some rows while dropping
        # others (even inside the smallest class).
        df_list.append(df_group.sample(n_min, replace=False))
    df_new = pd.concat(df_list)
    return df_new

In [None]:
def make_data(df, vocab, n_seq):
    """Pack each sentence pair of ``df`` into one fixed-length id sequence.

    Every row becomes ``sentence1 ids + [SEP id] + sentence2 ids``, trimmed so
    the whole sequence fits in ``n_seq`` ids and right-padded with id 0. Each
    packed row is also printed.

    Args:
        df: DataFrame with ``sentence1``, ``sentence2`` and ``label`` columns.
        vocab: tokenizer exposing ``encode_as_ids`` and ``piece_to_id``.
        n_seq: fixed length of every packed sequence.

    Returns:
        Tuple ``(inputs, labels)`` of NumPy arrays.
    """
    separator = vocab.piece_to_id("[SEP]")
    budget = n_seq - 1  # ids left for the two sentences; one slot is for [SEP]

    packed_rows, targets = [], []
    for _, record in tqdm(df.iterrows(), total=len(df)):
        target = record["label"]
        ids_a = vocab.encode_as_ids(record["sentence1"])[:budget]
        ids_b = vocab.encode_as_ids(record["sentence2"])[:budget]

        # Drop one id at a time from the tail of the longer sentence (the
        # second one on ties) until the pair fits the budget.
        while len(ids_a) + len(ids_b) > budget:
            longer = ids_a if len(ids_a) > len(ids_b) else ids_b
            longer.pop()

        packed = (ids_a + [separator] + ids_b)[:n_seq]
        packed = packed + [0] * (n_seq - len(packed))

        assert len(packed) == n_seq

        print(target, len(packed), packed)
        packed_rows.append(packed)
        targets.append(target)

    return np.array(packed_rows), np.array(targets)

In [None]:
train_inputs, train_labels = make_data(df_train, vocab, 55)
train_inputs, train_labels

In [None]:
dev_inputs, dev_labels = make_data(df_dev, vocab, 55)
dev_inputs, dev_labels

In [None]:
test_inputs, test_labels = make_data(df_test, vocab, 55)
test_inputs, test_labels

## Modeling

In [None]:
args.n_vocab = len(vocab)
args.d_model = 256
args.n_out = 2
args

In [None]:
def build_model_type3(args):
    """Build the type-3 classifier over a single packed id sequence.

    The token ids are embedded, max-pooled over the sequence axis and fed to
    a softmax dense layer.

    Args:
        args: namespace providing ``n_vocab``, ``d_model`` and ``n_out``.

    Returns:
        A ``tf.keras.Model`` with one token-id input and softmax outputs of
        width ``args.n_out``.
    """
    token_ids = tf.keras.layers.Input((None,))

    embedded = tf.keras.layers.Embedding(args.n_vocab, args.d_model)(token_ids)
    # (placeholder: an RNN / CNN / attention / linear encoder could go here)
    pooled = tf.keras.layers.GlobalMaxPooling1D()(embedded)
    probs = tf.keras.layers.Dense(args.n_out, activation=tf.nn.softmax)(pooled)

    return tf.keras.Model(inputs=token_ids, outputs=probs)

In [None]:
model = build_model_type3(args)
tf.keras.utils.plot_model(model, 'model.png', show_shapes=True)

In [None]:
model.predict((train_inputs[:4]))

In [None]:
model.predict((test_inputs[:4]))

## Train

In [None]:
model = build_model_type3(args)
tf.keras.utils.plot_model(model, 'model.png', show_shapes=True)

In [None]:
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

In [None]:
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=20)
save_weights = tf.keras.callbacks.ModelCheckpoint(os.path.join(sts_dir, "type3.hdf5"),
                                                  monitor='val_accuracy',
                                                  verbose=1,
                                                  save_best_only=True,
                                                  mode="max",
                                                  save_freq="epoch",
                                                  save_weights_only=True)
csv_logger = tf.keras.callbacks.CSVLogger(os.path.join(sts_dir, "type3.csv"))

In [None]:
history = model.fit(train_inputs, train_labels,
                    epochs=100,
                    batch_size=64,
                    validation_data=(dev_inputs, dev_labels),
                    callbacks=[early_stopping, save_weights, csv_logger])

In [None]:
plt.figure(figsize=(12, 4))

plt.subplot(1, 2, 1)
plt.plot(history.history['loss'], 'b-', label='loss')
plt.plot(history.history['val_loss'], 'r--', label='val_loss')

plt.subplot(1, 2, 2)
plt.plot(history.history['accuracy'], 'g-', label='accuracy')
plt.plot(history.history['val_accuracy'], 'k--', label='val_accuracy')

plt.show()

## 평가

In [None]:
model = build_model_type3(args)
model.load_weights(os.path.join(sts_dir, "type3.hdf5"))

In [None]:
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

In [None]:
model.evaluate(test_inputs, test_labels)

In [None]:
y_pred = model.predict(test_inputs)
y_pred

In [None]:
y_class = np.argmax(y_pred, axis=-1)
y_class

In [None]:
# 2x2 integer confusion matrix. `np.int` was removed in NumPy 1.24; the
# builtin `int` gives the same dtype.
cf_matrix = np.zeros((2, 2), dtype=int)
cf_matrix

In [None]:
for y_true, y_pred in zip(test_labels, y_class):
    cf_matrix[int(y_true), int(y_pred)] += 1
cf_matrix

In [None]:
# Confusion-matrix cells, indexed as cf_matrix[true label, predicted label]
tp = cf_matrix[1, 1]
tn = cf_matrix[0, 0]
fp = cf_matrix[0, 1]
fn = cf_matrix[1, 0]

# The max(..., 1) guards below only protect integer counts against a zero
# denominator.
accuracy = (tp + tn) / max((tp + tn + fp + fn), 1)
print(f'accuracy: {accuracy}')
precision = (tp) / max((tp + fp), 1)
print(f'precision: {precision}')
recall = (tp) / max((tp + fn), 1)
print(f'recall: {recall}')
# precision and recall are fractions in [0, 1]; clamping their sum to at least
# 1 (as was done before) deflated F1 whenever precision + recall < 1. Only a
# zero denominator needs guarding.
pr_sum = precision + recall
f1 = 2 * (precision * recall) / pr_sum if pr_sum > 0 else 0.0
print(f'f1: {f1}')

## 배포

In [None]:
model = build_model_type3(args)
model.load_weights(os.path.join(sts_dir, "type3.hdf5"))

In [None]:
def do_predict(model, string1, string2):
    """Classify one sentence pair with the packed-input model.

    Both strings are encoded with the notebook-level ``vocab`` and joined as
    ``string1 ids + [SEP id] + string2 ids`` into a batch of size one (no
    padding or truncation is applied here). The arg-max class of the model
    prediction is looked up in ``id_to_label``.

    Args:
        model: model accepting a single id array in ``predict``.
        string1: first sentence.
        string2: second sentence.

    Returns:
        The ``id_to_label`` entry for the predicted class.
    """
    packed_ids = [
        *vocab.encode_as_ids(string1),
        vocab.piece_to_id("[SEP]"),
        *vocab.encode_as_ids(string2),
    ]
    batch = np.array([packed_ids])

    probs = model.predict(batch)
    best_class = np.argmax(probs, axis=-1)[0]
    return id_to_label[best_class]

In [None]:
while True:
    print("input 1> ", end="")
    string1 = str(input())
    if len(string1) == 0:
        break
    print("input 2> ", end="")
    string2 = str(input())
    if len(string2) == 0:
        break
    result = do_predict(model, string1, string2)
    print(result)

# 실습
- 전체 데이터를 이용해 type3을 학습해보세요.

## Train, Test 데이터 생성

## Modeling

## Train

## 평가

## 배포

# MaLSTM

## Train, Test 데이터 생성

In [None]:
# `error_bad_lines` was removed in pandas 2.0; `on_bad_lines='skip'` is the
# equivalent way to skip malformed rows.
df_train = pd.read_csv('sts-train.tsv', delimiter='\t', on_bad_lines='skip')
df_train = df_train.dropna()
df_train = make_binalry_data(df_train)
df_train

In [None]:
# `error_bad_lines` was removed in pandas 2.0; `on_bad_lines='skip'` is the
# equivalent way to skip malformed rows.
df_dev = pd.read_csv('sts-dev.tsv', delimiter='\t', on_bad_lines='skip')
df_dev = df_dev.dropna()
df_dev = make_binalry_data(df_dev)
df_dev

In [None]:
# `error_bad_lines` was removed in pandas 2.0; `on_bad_lines='skip'` is the
# equivalent way to skip malformed rows.
df_test = pd.read_csv('sts-test.tsv', delimiter='\t', on_bad_lines='skip')
df_test = df_test.dropna()
df_test = make_binalry_data(df_test)
df_test

In [None]:
def oversample(df):
    """Balance classes by adding resampled rows to the smaller classes.

    For every ``label`` group, prints the label and its size, then draws
    (with replacement) as many extra rows as the group is short of the
    largest class. The extra rows are appended after the original frame.

    Args:
        df: DataFrame with a ``label`` column.

    Returns:
        A new DataFrame: the original rows followed by the sampled rows.
    """
    target_size = df['label'].value_counts().max()
    pieces = [df]
    for label_value, group in df.groupby('label'):
        group_size = len(group)
        print(label_value, group_size)
        shortfall = target_size - group_size
        pieces.append(group.sample(shortfall, replace=True))
    return pd.concat(pieces)

In [None]:
def undersample(df):
    """Balance classes by shrinking every class to the smallest class size.

    For every ``label`` group, prints the label and its size, then keeps a
    random subset of ``n_min`` distinct rows, where ``n_min`` is the size of
    the smallest class.

    Args:
        df: DataFrame with a ``label`` column.

    Returns:
        A new DataFrame with ``n_min`` rows per label.
    """
    n_min = df['label'].value_counts().min()
    df_list = []
    for i_class, df_group in df.groupby('label'):
        print(i_class, len(df_group))
        # Sample WITHOUT replacement: every group has at least n_min rows, and
        # sampling with replacement would duplicate some rows while dropping
        # others (even inside the smallest class).
        df_list.append(df_group.sample(n_min, replace=False))
    df_new = pd.concat(df_list)
    return df_new

In [None]:
def make_data(df, vocab, n_seq):
    """Turn a sentence-pair DataFrame into fixed-length token-id arrays.

    Args:
        df: DataFrame with "label", "sentence1" and "sentence2" columns.
        vocab: tokenizer exposing `encode_as_ids(text)`.
        n_seq: length each encoded sentence is truncated / zero-padded to.

    Returns:
        Tuple `(inputs_1, inputs_2, labels)` of numpy arrays, one entry per
        row of `df`, in row order.
    """

    def encode_fixed(text):
        # Keep at most n_seq ids, then right-pad with 0 up to n_seq.
        ids = vocab.encode_as_ids(text)[:n_seq]
        return ids + [0] * (n_seq - len(ids))

    first_ids, second_ids, targets = [], [], []
    for _, record in tqdm(df.iterrows(), total=len(df)):
        target = record["label"]
        encoded_1 = encode_fixed(record["sentence1"])
        encoded_2 = encode_fixed(record["sentence2"])

        # Guard against a length mismatch before stacking into arrays.
        assert len(encoded_1) == n_seq
        assert len(encoded_2) == n_seq

        first_ids.append(encoded_1)
        second_ids.append(encoded_2)
        targets.append(target)

    return np.array(first_ids), np.array(second_ids), np.array(targets)

In [None]:
# Encode the training pairs; make_data truncates / zero-pads each sentence to 25 token ids.
train_inputs_1, train_inputs_2, train_labels = make_data(df_train, vocab, 25)
train_inputs_1, train_inputs_2, train_labels

In [None]:
# Encode the dev pairs with the same sequence length (25) as the training data.
dev_inputs_1, dev_inputs_2, dev_labels = make_data(df_dev, vocab, 25)
dev_inputs_1, dev_inputs_2, dev_labels

In [None]:
# Encode the test pairs with the same sequence length (25) as the training data.
test_inputs_1, test_inputs_2, test_labels = make_data(df_test, vocab, 25)
test_inputs_1, test_inputs_2, test_labels

## Modeling

In [None]:
# Hyperparameters read by the model builder.
args.n_vocab = len(vocab)  # embedding table size = vocabulary size
args.d_model = 256  # embedding width and number of LSTM units
# NOTE(review): n_out is not read by build_model_malstm — presumably kept for other model builders.
args.n_out = 2
args

In [None]:
def build_model_malstm(args):
    """Build a siamese LSTM that scores the similarity of two token-id sequences.

    Both inputs go through the same Embedding layer and the same LSTM layer
    (weights are shared). The score is exp(-d), where d is the sum of absolute
    differences (L1 distance) between the two LSTM outputs, so it lies in (0, 1].

    Args:
        args: namespace providing `n_vocab` (embedding input size) and
            `d_model` (embedding width and LSTM units).

    Returns:
        A `tf.keras.Model` taking `(inputs_1, inputs_2)` and returning one
        similarity score per pair.
    """
    sentence_a = tf.keras.layers.Input((None,))
    sentence_b = tf.keras.layers.Input((None,))

    # One embedding table shared by both sentences.
    shared_embedding = tf.keras.layers.Embedding(args.n_vocab, args.d_model)
    embedded_a = shared_embedding(sentence_a)
    embedded_b = shared_embedding(sentence_b)

    # One LSTM shared by both sentences; go_backwards=True feeds each sequence in reverse order.
    shared_lstm = tf.keras.layers.LSTM(units=args.d_model, go_backwards=True)
    encoded_a = shared_lstm(embedded_a)
    encoded_b = shared_lstm(embedded_b)

    # L1 distance ||a - b|| between the two encodings, mapped to a similarity with exp(-distance).
    l1_distance = K.sum(K.abs(encoded_a - encoded_b), axis=-1)
    similarity = K.exp(-l1_distance)

    return tf.keras.Model(inputs=(sentence_a, sentence_b), outputs=similarity)

In [None]:
# Build a fresh MaLSTM model and render its layer graph (with shapes) to model.png.
model = build_model_malstm(args)
tf.keras.utils.plot_model(model, 'model.png', show_shapes=True)

In [None]:
# Smoke test: run the freshly built model on the first four training pairs.
model.predict((train_inputs_1[:4], train_inputs_2[:4]))

In [None]:
# Smoke test: run the freshly built model on the first four test pairs.
model.predict((test_inputs_1[:4], test_inputs_2[:4]))

## Train

In [None]:
# Start training from a newly built model.
model = build_model_malstm(args)
# tf.keras.utils.plot_model(model, 'model.png', show_shapes=True)

In [None]:
# Train with mean squared error on the model output; binary_accuracy is tracked as the metric.
model.compile(loss="mean_squared_error", optimizer="adam", metrics=["binary_accuracy"])

In [None]:
# Stop training after 5 epochs without improvement in validation binary accuracy.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_binary_accuracy',
    patience=5,
)
# Checked every epoch: write weights only, and only when validation binary accuracy improves.
save_weights = tf.keras.callbacks.ModelCheckpoint(
    filepath=os.path.join(sts_dir, "malstm.hdf5"),
    monitor='val_binary_accuracy',
    mode="max",
    save_best_only=True,
    save_weights_only=True,
    save_freq="epoch",
    verbose=1,
)
# Per-epoch metrics are logged to a CSV file next to the checkpoint.
csv_logger = tf.keras.callbacks.CSVLogger(
    filename=os.path.join(sts_dir, "malstm.csv"),
)

In [None]:
# Fit on the training pairs and validate on the dev pairs each epoch.
# class_weight gives label 0 twice the weight of label 1 in the loss.
history = model.fit(
    x=(train_inputs_1, train_inputs_2),
    y=train_labels,
    batch_size=512,
    epochs=50,
    class_weight={0: 2, 1: 1},
    callbacks=[early_stopping, save_weights, csv_logger],
    validation_data=((dev_inputs_1, dev_inputs_2), dev_labels),
)

In [None]:
# Training curves: loss on the left panel, binary accuracy on the right panel.
plt.figure(figsize=(12, 4))

plt.subplot(1, 2, 1)
plt.plot(history.history['loss'], 'b-', label='loss')
plt.plot(history.history['val_loss'], 'r--', label='val_loss')
plt.legend()  # without a legend the label= arguments above are never displayed

plt.subplot(1, 2, 2)
plt.plot(history.history['binary_accuracy'], 'g-', label='accuracy')
plt.plot(history.history['val_binary_accuracy'], 'k--', label='val_accuracy')
plt.legend()  # without a legend the label= arguments above are never displayed

plt.show()

## 평가

In [None]:
# Rebuild the architecture and load the weights stored at sts_dir/malstm.hdf5 for evaluation.
model = build_model_malstm(args)
model.load_weights(os.path.join(sts_dir, "malstm.hdf5"))

In [None]:
# Compile the reloaded model with the same loss and metric as in training.
model.compile(loss="mean_squared_error", optimizer="adam", metrics=["binary_accuracy"])

In [None]:
# Report the compiled loss and metric on the test pairs.
model.evaluate((test_inputs_1, test_inputs_2), test_labels)

In [None]:
# Raw model outputs (similarity scores) for every test pair.
y_pred = model.predict((test_inputs_1, test_inputs_2))
y_pred

In [None]:
# Threshold the scores at 0.5 to get 0/1 class ids.
# The builtin `int` is used because the `np.int` alias was deprecated in
# NumPy 1.20 and removed in NumPy 1.24.
y_class = (y_pred > 0.5).astype(int)
y_class

In [None]:
# Empty 2x2 integer confusion matrix.
# `dtype=int` replaces `.astype(np.int)`: the `np.int` alias was deprecated in
# NumPy 1.20 and removed in NumPy 1.24.
cf_matrix = np.zeros((2, 2), dtype=int)
cf_matrix

In [None]:
# Fill the confusion matrix: row index = true label, column index = predicted class.
# The loop names are deliberately distinct from `y_pred` so the prediction
# array is not overwritten by a scalar when this cell runs.
for true_label, pred_label in zip(test_labels, y_class):
    cf_matrix[int(true_label), int(pred_label)] += 1
cf_matrix

In [None]:
# Confusion-matrix cells (row = true label, column = predicted class).
tp = cf_matrix[1, 1]
tn = cf_matrix[0, 0]
fp = cf_matrix[0, 1]
fn = cf_matrix[1, 0]

# max(..., 1) guards the count-based denominators against division by zero.
accuracy = (tp + tn) / max((tp + tn + fp + fn), 1)
print(f'accuracy: {accuracy}')
precision = (tp) / max((tp + fp), 1)
print(f'precision: {precision}')
recall = (tp) / max((tp + fn), 1)
print(f'recall: {recall}')
# precision + recall lies in [0, 2], so clamping the denominator to at least 1
# would understate F1 whenever the sum is below 1; only the zero case needs a guard.
pr_sum = precision + recall
f1 = 2 * (precision * recall) / pr_sum if pr_sum > 0 else 0.0
print(f'f1: {f1}')

## 배포

In [None]:
# Rebuild the architecture and load the weights stored at sts_dir/malstm.hdf5 for inference.
model = build_model_malstm(args)
model.load_weights(os.path.join(sts_dir, "malstm.hdf5"))

In [None]:
def do_predict(model, string1, string2):
    """Classify a sentence pair with a two-input model.

    Each string is tokenized with the module-level `vocab` and wrapped into a
    batch of one; the pair of arrays is passed to `model.predict`.

    Args:
        model: object exposing `predict`, called with a tuple of two arrays.
        string1: first sentence.
        string2: second sentence.

    Returns:
        `id_to_label[1]` when the first prediction exceeds 0.5, otherwise
        `id_to_label[0]`.
    """
    # The extra list level gives each array a leading batch axis of size 1.
    batch_1, batch_2 = (
        np.array([vocab.encode_as_ids(text)]) for text in (string1, string2)
    )

    score = model.predict((batch_1, batch_2))
    above_threshold = (score > 0.5)[0]
    return id_to_label[1 if above_threshold else 0]

In [None]:
# Interactive demo: read sentence pairs until either prompt gets an empty line.
while True:
    print("input 1> ", end="")
    string1 = input()
    if not string1:
        break

    print("input 2> ", end="")
    string2 = input()
    if not string2:
        break

    result = do_predict(model, string1, string2)
    print(result)