In [1]:
import pandas as pd
import tensorflow as tf
import numpy as np
from transformers import TFBertModel, BertConfig, BertTokenizer, BertModel, DistilBertModel
from tokenization_kobert import KoBertTokenizer

In [2]:
# Show which transformers release is installed in this kernel (the value is the cell output).
import transformers
transformers.__version__

'4.26.1'

In [3]:
# Step 1: load the training dataset.
# The unused '상황키워드' column is dropped, the sentence/emotion columns are
# renamed, and only those two columns are kept with a fresh 0..n-1 index.
train_data = (
    pd.read_csv("train_emotion.csv")
    .drop(columns=['상황키워드'])
    .rename(columns={'감정_대분류': 'label', '사람문장': 'document'})
    .loc[:, ["document", "label"]]
    .reset_index(drop=True)
)

# Labeling: emotion names are first translated to their class id, then every
# value is validated through label_dict (anything whose string form is not
# '0'..'4' raises KeyError).
emotion_to_id = {'불안': 0, '분노': 1, '상처': 2, '슬픔': 3, '기쁨': 4}
label_dict = {'0': 0, '1': 1, '2': 2, '3': 3, '4': 4}
train_data['label'] = train_data['label'].apply(
    lambda x: label_dict[str(emotion_to_id.get(x, x))]
)

# Prepare the tokenizer
tokenizer = KoBertTokenizer.from_pretrained('monologg/kobert')

# Input preprocessing function
def preprocess_sentence(sentence, MAX_LEN):
    """Encode one sentence with the module-level ``tokenizer``.

    The tokenizer is asked to add special tokens, to truncate, and to pad
    to ``max_length`` so the encoding is requested at ``MAX_LEN`` positions.

    Args:
        sentence: Text to encode.
        MAX_LEN: Value forwarded to the tokenizer as ``max_length``.

    Returns:
        Tuple ``(input_ids, attention_mask)`` read from the tokenizer output.
    """
    encode_options = dict(
        add_special_tokens=True,
        max_length=MAX_LEN,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
    )
    encoding = tokenizer.encode_plus(text=sentence, **encode_options)
    return encoding['input_ids'], encoding['attention_mask']

# Preprocess the input data: one (input_ids, attention_mask) pair per document.
MAX_LEN = 50
encoded_pairs = [
    preprocess_sentence(sentence, MAX_LEN)
    for sentence in train_data['document']
]

# Convert the encoded inputs and the labels to numpy arrays
input_ids = np.array([ids for ids, _mask in encoded_pairs])
attention_masks = np.array([mask for _ids, mask in encoded_pairs])
labels = np.array(train_data['label'])

# Build and train the model
bert_model = TFBertModel.from_pretrained('monologg/kobert', from_pt=True)
input_layer = tf.keras.layers.Input(shape=(MAX_LEN,), dtype=tf.int32, name='input_ids')
attention_layer = tf.keras.layers.Input(shape=(MAX_LEN,), dtype=tf.int32, name='attention_mask')

# Element [1] of the BERT output feeds a 5-unit softmax classification head.
bert_outputs = bert_model([input_layer, attention_layer])
output_layer = tf.keras.layers.Dense(5, activation='softmax')(bert_outputs[1])
model = tf.keras.models.Model(inputs=[input_layer, attention_layer], outputs=output_layer)

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
# Left as the last expression so the returned History object is the cell output.
model.fit(
    [input_ids, attention_masks],
    labels,
    epochs=30,
    batch_size=32,
    validation_split=0.2,
)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'KoBertTokenizer'.
All PyTorch model weights were used when initializing TFBertModel.

All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x1acf7772bb0>

In [5]:
# Print the layer table (layer type, output shape, parameter count, connections) of the model.
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 50)]         0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 50)]         0           []                               
                                                                                                  
 tf_bert_model (TFBertModel)    TFBaseModelOutputWi  92186880    ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'attention_mask[0][0]']         
                                tentions(last_hidde                                               
                                n_state=(None, 50,                                            

In [6]:
# Save the trained model.
# The file name matches the one the model-loading cell reads
# ('./model_save/kobert_emotion_30epoch.h5'); the previous name
# ('kobert_emotion_epoch.h5') left that cell pointing at a file this notebook
# never wrote, so a fresh Restart & Run All could not reload the model.
model.save('./model_save/kobert_emotion_30epoch.h5')

In [18]:
# Save the tokenizer.
# The file name matches the one the tokenizer-loading cell reads
# ('./model_save/tokenizer_emotion_30epoch.pickle'); the previous name
# ('tokenizer_emotion_epoch.pickle') was never read back by this notebook.
import pickle

with open('./model_save/tokenizer_emotion_30epoch.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [7]:
from tensorflow.keras.models import load_model

In [19]:
# Load the saved model; TFBertModel is passed as a custom object so Keras
# can resolve the BERT layer stored in the file.
model = load_model(
    './model_save/kobert_emotion_30epoch.h5',
    custom_objects={'TFBertModel': TFBertModel},
)


In [21]:
# Load the pickled tokenizer back into `tokenizer`
import pickle

with open('./model_save/tokenizer_emotion_30epoch.pickle', mode='rb') as pickle_file:
    tokenizer = pickle.load(pickle_file)

In [None]:
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from konlpy.tag import Okt
import re, pickle
import numpy as np

class model:
  """Two saved Keras classifiers behind one shared Hangul text pipeline.

  Both prediction methods clean, tokenize, integer-encode and pad the
  sentence in exactly the same way and differ only in which loaded
  classifier scores the result.

  NOTE(review): this class is named ``model``, the same name the notebook
  binds to the Keras model that the evaluation cell calls ``model.predict``
  on. Running this cell rebinds that name; the class name is kept here only
  to avoid changing its public interface.
  """

  def __init__(self):
    """Load both classifiers, the pickled tokenizer and the Okt analyzer."""
    self.max_len = 50
    self.stopwords = ['의','가','이','은','들','는','좀','잘','걍','과','도','를','으로','자','에','와','한','하다']
    self.first_model = load_model('MH-1.0.1.h5')
    self.second_model = load_model('MH-2.0.1.h5')
    # NOTE(review): pickle.load can execute arbitrary code; only load a
    # tokenizer.pickle that this project produced itself.
    with open('tokenizer.pickle', 'rb') as handle:
      self.tokenizer = pickle.load(handle)
    self.okt = Okt()

  def _encode(self, new_sentence):
    """Return the padded integer batch (one row) for a raw sentence."""
    # Keep only Hangul jamo/syllables and spaces.
    new_sentence = re.sub(r'[^ㄱ-ㅎㅏ-ㅣ가-힣 ]','', new_sentence)
    tokens = self.okt.morphs(new_sentence, stem=True) # tokenize
    tokens = [word for word in tokens if word not in self.stopwords] # remove stopwords
    encoded = self.tokenizer.texts_to_sequences([tokens]) # integer encoding
    return pad_sequences(encoded, maxlen = self.max_len) # padding

  def _predict_class(self, classifier, new_sentence):
    """Return the arg-max class index of ``classifier`` for the sentence, as a float."""
    probabilities = classifier.predict(self._encode(new_sentence)) # predict
    # argmax over the last axis yields a one-element array for the single
    # input row; .item() extracts the scalar explicitly instead of relying on
    # float() of a 1-D array, which NumPy 1.25+ deprecates.
    return float(np.argmax(probabilities, axis=-1).item())

  def sentiment_predict(self, new_sentence):
    """Classify ``new_sentence`` with ``first_model``.

    Args:
      new_sentence: Raw input text.

    Returns:
      The predicted class index as a float.
    """
    return self._predict_class(self.first_model, new_sentence)

  def circumstance_predict(self, new_sentence):
    """Classify ``new_sentence`` with ``second_model``.

    Args:
      new_sentence: Raw input text.

    Returns:
      The predicted class index as a float.
    """
    return self._predict_class(self.second_model, new_sentence)

In [22]:
# Step 1: load the test dataset.
# The unused '상황키워드' column is dropped, the sentence/emotion columns are
# renamed, and only those two columns are kept with a reset index.
test_data = (
    pd.read_csv("test_emotion.csv")
    .drop(columns=['상황키워드'])
    .rename(columns={'감정_대분류': 'label', '사람문장': 'document'})
    .loc[:, ["document", "label"]]
    .reset_index(drop=True)
)

# Labeling: emotion names are first translated to their class id, then every
# value is validated through label_dict (anything whose string form is not
# '0'..'4' raises KeyError).
emotion_to_id = {'불안': 0, '분노': 1, '상처': 2, '슬픔': 3, '기쁨': 4}
label_dict = {'0': 0, '1': 1, '2': 2, '3': 3, '4': 4}
test_data['label'] = test_data['label'].apply(
    lambda x: label_dict[str(emotion_to_id.get(x, x))]
)

# Prepare the tokenizer
tokenizer = KoBertTokenizer.from_pretrained('monologg/kobert')

# Input preprocessing function
def preprocess_sentence(sentence, MAX_LEN):
    """Encode one sentence with the module-level ``tokenizer``.

    Uses the same encoding options as the training-time version of this
    function so evaluation inputs are built the same way as training inputs.

    Args:
        sentence: Text to encode.
        MAX_LEN: Value forwarded to the tokenizer as ``max_length``.

    Returns:
        Tuple ``(input_ids, attention_mask)`` read from the tokenizer output.
    """
    encoded_dict = tokenizer.encode_plus(
        text=sentence,
        add_special_tokens=True,
        max_length=MAX_LEN,
        # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
        padding='max_length',
        return_attention_mask=True,
        # Explicit truncation: without it the tokenizer warns and falls back
        # to a default truncation strategy whenever `max_length` is given.
        truncation=True
    )
    input_id = encoded_dict['input_ids']
    attention_mask = encoded_dict['attention_mask']
    return input_id, attention_mask

# Preprocess the input data: one (input_ids, attention_mask) pair per document.
MAX_LEN = 50
encoded_pairs = [
    preprocess_sentence(sentence, MAX_LEN)
    for sentence in test_data['document']
]

# Convert the encoded inputs and the labels to numpy arrays
input_ids = np.array([ids for ids, _mask in encoded_pairs])
attention_masks = np.array([mask for _ids, mask in encoded_pairs])
labels = np.array(test_data['label'])

# Model prediction: take the highest-scoring class for each row
predictions = model.predict([input_ids, attention_masks])
predicted_labels = predictions.argmax(axis=1)

# Accuracy: fraction of rows whose predicted class equals the label
correct_count = (predicted_labels == labels).sum()
accuracy = correct_count / len(labels)
print("Accuracy:", accuracy)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'KoBertTokenizer'.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Accuracy: 0.9996424101555516
