## Import Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Read Files

In [None]:
# Load the corpus CSV directly from GitHub into a DataFrame.
DATA_URL = 'https://github.com/ohgzone/file1/raw/main/aihub_coupus.csv'
final_data = pd.read_csv(DATA_URL)

# Preview the first rows, followed by one blank separator line.
print(final_data.head(), end='\n\n')
# Author's note: 51,630 rows in total.
final_data.info()

## Normalize text by removing English letters, digits, and other non-Korean characters

In [None]:
# Display the '문장' (sentence) column.
final_data['문장']

In [None]:
# Boolean mask: True for rows whose text contains any character other than
# Hangul syllables (가-힣) or a space.
final_data['문장'].str.contains('[^가-힣 ]')

In [None]:
# Inspect the first 10 rows whose '문장' text contains non-Hangul characters.
# Author's finding: English letters and special characters are present
# (periods, commas, exclamation marks, question marks, ...).
final_data[final_data['문장'].str.contains('[^가-힣 ]')].values[:10]

In [None]:
# Keep only Hangul syllables and spaces in the '문장' column; digits, Latin
# letters and punctuation are deleted.
non_korean = r'[^가-힣 ]'
final_data['문장'] = final_data['문장'].str.replace(pat=non_korean, repl=r'', regex=True)

# Sanity check: count the rows that still contain a disallowed character
# (expected 0). Summing the boolean mask counts matches; the previous check
# summed the matching strings themselves, which concatenates text instead.
final_data['문장'].str.contains(non_korean).sum()

In [None]:
# Preview the first rows after the character cleanup.
final_data.head()

In [None]:
# Preview the last rows after the character cleanup.
final_data.tail()

## Preprocessing: strip whitespace, check for nulls, remove duplicates

In [None]:
# Trim leading/trailing whitespace from every sentence.
stripped_sentences = final_data['문장'].str.strip()
final_data['문장'] = stripped_sentences

# Show the last rows to eyeball the result.
final_data.tail()

In [None]:
# Per-column count of missing values.
null_counts = final_data.isnull().sum()
# Number of rows whose '문장' text repeats an earlier row.
duplicate_count = final_data['문장'].duplicated().sum()

print(null_counts, end='\n\n')
print(duplicate_count)

In [None]:
# Drop rows with a repeated '문장' value; inplace=True modifies final_data
# itself rather than returning a new frame.
final_data.drop_duplicates(subset=['문장'], inplace=True)

# Author's note: 51,630 --> 51,574 rows (56 duplicates removed).
final_data.info() # 51,630 -->  51,574 (51,630 - 56)

## Check label distribution

In [None]:
# Count how many samples each '감정' (emotion) label has, then chart it.
emotion_counts = final_data['감정'].value_counts()
print(emotion_counts)

emotion_counts.plot(kind='bar')

## Encode label as numbers

In [None]:
# Emotion labels, in the order value_counts() reports them.
list1 = final_data['감정'].value_counts().index.values
print(list1)

# Two-way lookup tables between the label text and its integer class id.
class2label = dict(enumerate(list1))
label2class = {label: class_id for class_id, label in class2label.items()}

print(label2class)
print(class2label)

In [None]:
# Encode the '감정' column through label2class and store the result in a new
# 'label' column.
final_data['label'] = final_data['감정'].map(label2class)

# Show the last rows to confirm the new column.
final_data.tail()

## 7. X, Y 분리

In [None]:
# Separate inputs (sentences) and targets (encoded labels) as arrays.
features, labels = (final_data[column].values for column in ('문장', 'label'))

features.shape, labels.shape

In [None]:
# Print the first three sentences.
print(features[:3])

# Length statistics over all sentences (len() of each entry in features).
sample_lengths = [len(text) for text in features]
print(f'Max length of event word arrays : {max(sample_lengths):d}')
print(f'Avg length of event word arrays : {sum(sample_lengths)/len(features):.4f}')

In [None]:
# Histogram of sentence lengths across the whole dataset.
lengths = list(map(len, features))
plt.hist(lengths, bins=50)

plt.xlabel('length of samples')
plt.ylabel('number of samples')
plt.show()

## Split train set & test set

In [None]:
from sklearn.model_selection import train_test_split

# 80/20 split, stratified on the labels so both sets keep the same class
# proportions; the fixed seed makes the split reproducible.
split_options = dict(test_size=0.2, stratify=labels, random_state=41)
x_train, x_test, y_train, y_test = train_test_split(features, labels, **split_options)

x_train.shape, x_test.shape, y_train.shape, y_test.shape

In [None]:
# Check a couple of training samples together with their labels.
# Label mapping noted by the author:
# {0: '불안', 1: '분노', 2: '상처', 3: '슬픔', 4: '당황', 5: '기쁨'}

x_train[:2], y_train[:2]

## 9. 전체 문장에 대해 Tokenizing
+ 컴퓨터가 이해하기 위해 모든 단어를 숫자로 변환해야 함.
+ 단어 빈도수 따지지 않고 무조건 모든 단어 수용해서 진행

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Build the word vocabulary from the training sentences only (fit_on_texts).
# NOTE(review): no oov_token is configured, so words that occur only in
# x_test are presumably dropped by texts_to_sequences - confirm against the
# Keras Tokenizer documentation.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(x_train)

In [None]:
# Dump, in order: word -> index, index -> word, and per-word counts.
for lookup in (tokenizer.word_index, tokenizer.index_word, tokenizer.word_counts):
    print(lookup)

# Vocabulary size (author's note: 47,646 words).
print(len(tokenizer.index_word))

In [None]:
# Convert every sentence into its list of word indices.
x_train_seq, x_test_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (x_train, x_test)
)

print(len(x_train_seq), len(x_test_seq))

In [None]:
# Compare two raw training sentences with their integer-encoded sequences.
print(x_train[1:3])
print(x_train_seq[1:3])

## Padding Sequence

In [None]:
# Report the longest training sequence (author's note: 38 tokens).
longest_sequence = max(map(len, x_train_seq))
print(longest_sequence)

# Pad both splits to a fixed length of 38.
PAD_LEN = 38
x_train_pad, x_test_pad = (
    pad_sequences(sequences, maxlen=PAD_LEN) for sequences in (x_train_seq, x_test_seq)
)

In [None]:
# Look at the first padded sequence to confirm the padding was applied.
x_train_pad[:1]

In [None]:
# Check the shapes of the padded train and test arrays.
x_train_pad.shape, x_test_pad.shape

## LSTM Modeling

In [None]:
from tensorflow.keras.layers import Dense, Flatten, Conv1D, MaxPool2D
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, SimpleRNN, GRU
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Hyperparameters
# Sizes are derived from the fitted objects instead of being hard-coded, so the
# model stays consistent if the vocabulary, padding length or label set changes.
max_words = len(tokenizer.word_index) + 1  # vocabulary size + 1 for padding index 0
embedding_dim = 32                         # embedding dimension
max_len = x_train_pad.shape[1]             # padded sequence length
n_classes = len(class2label)               # number of emotion classes
n_epoches = 50
batch_size = 512
validation_rate = 0.2

# Embedding -> two stacked bidirectional LSTMs -> flatten -> dense classifier.
model = Sequential()
# An explicit Input layer fixes the sequence length (needed by Flatten) and
# replaces the deprecated `input_length` argument of Embedding.
model.add(Input(shape=(max_len,)))
model.add(Embedding(max_words, embedding_dim))
model.add(Bidirectional(LSTM(32, return_sequences=True)))
model.add(Bidirectional(LSTM(32, return_sequences=True)))
model.add(Flatten())
model.add(Dense(128, activation='swish'))
model.add(Dense(32, activation='swish'))
model.add(Dense(n_classes, activation='softmax'))
model.summary()

# Labels are integer class ids, hence the sparse variant of the loss.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Stop after 10 epochs without val_loss improvement and roll the model back to
# the best weights; without restore_best_weights the in-memory model would keep
# the weights of the last (worse) epoch for later evaluation.
es = EarlyStopping(
    monitor='val_loss',
    patience=10,
    verbose=1,
    restore_best_weights=True,
)

# Also persist the best model (lowest val_loss) to disk.
checkpoint_path = 'bilstm_checkpoint.keras'
cp = ModelCheckpoint(
    checkpoint_path,
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
)

# validation_split holds out a fraction of the training data for validation.
history = model.fit(
    x_train_pad,
    y_train,
    epochs=n_epoches,
    batch_size=batch_size,
    validation_split=validation_rate,
    verbose=1,
    callbacks=[es, cp],
)

In [None]:
# Plot training vs. validation accuracy for every completed epoch.
epochs = range(1, len(history.history['accuracy']) + 1)
plt.plot(epochs, history.history['accuracy'])
plt.plot(epochs, history.history['val_accuracy'])
plt.title('model accuracy')
# The curves are accuracy values, so label the y-axis accordingly
# (it was previously labelled 'loss').
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'valid'])
plt.show()

### Evaluation

In [None]:
# Evaluate the trained model on the held-out test set.
model.evaluate(x_test_pad, y_test)

### Prediction

In [None]:
# Take the first test sample: raw text, padded sequence and true class id.
sample_text, sample_seq, sample_label = x_test[0], x_test_pad[0], y_test[0]
print(f'문자열 : {sample_text}')
print(f'Sequence : {sample_seq}')

# Predict on a batch containing only that sample.
predict = model.predict(x_test_pad[:1])
predicted_class = np.argmax(predict)

print(f'True : {class2label[sample_label]}')
print(f'Predict : {class2label[predicted_class]}')