In [10]:
import pandas as pd
import tensorflow as tf
from transformers import (
    TFBertModel,
    BertConfig,
    BertTokenizer
)


In [11]:
import pandas as pd
import numpy as np
import tensorflow as tf
from transformers import TFBertModel, BertTokenizer

In [None]:
# Load the dataset
# Step 1: read the CSV, rename the source columns, and keep only the text
# ('document') and emotion ('label') columns. Every other column is
# discarded by the selection, so no separate drop is needed.
train_data = (
    pd.read_csv("train.csv")
    .rename(columns={'감정_대분류': 'label', '사람문장': 'document'})
    .loc[:, ["document", "label"]]
    .reset_index(drop=True)  # reset the index
)

# Labeling
# Emotion name -> integer class id used as the training target.
emotion_to_id = {'불안': 0, '분노': 1, '상처': 2, '슬픔': 3, '기쁨': 4}
# Labels that are already stored as 0-4 are accepted as well.
label_dict = {'0': 0, '1': 1, '2': 2, '3': 3, '4': 4}

# Encode in a single vectorized pass instead of overwriting the column
# value-by-value and then re-mapping it through str().
label_keys = train_data['label'].astype(str)
encoded_labels = label_keys.map({**emotion_to_id, **label_dict})

# Fail loudly, naming the offending values, when a label is not one of the
# five expected classes (KeyError is kept as the exception type).
unmapped = sorted(set(label_keys[encoded_labels.isna()]))
if unmapped:
    raise KeyError(f"train.csv contains labels outside the expected classes: {unmapped}")

train_data['label'] = encoded_labels.astype(int)

# Prepare the tokenizer
# NOTE(review): confirm that BertTokenizer is the tokenizer class intended
# for this checkpoint.
kobert_checkpoint = 'monologg/kobert'
tokenizer = BertTokenizer.from_pretrained(kobert_checkpoint)

# Input preprocessing function
def preprocess_sentence(sentence, MAX_LEN):
    """Tokenize one sentence into fixed-length model inputs.

    Args:
        sentence: Raw text to encode.
        MAX_LEN: Target sequence length, special tokens included. Longer
            inputs are truncated and shorter ones are padded to this length.

    Returns:
        A tuple ``(input_id, attention_mask)`` holding the tokenizer's
        'input_ids' and 'attention_mask' entries for the sentence.
    """
    encoded_dict = tokenizer.encode_plus(
        text=sentence,
        add_special_tokens=True,
        max_length=MAX_LEN,
        # Replaces the deprecated pad_to_max_length=True.
        padding='max_length',
        # Request truncation explicitly rather than relying on the
        # tokenizer's implicit fallback (which emits a warning).
        truncation=True,
        return_attention_mask=True
    )
    return encoded_dict['input_ids'], encoded_dict['attention_mask']

# Encode every training sentence to a fixed length
MAX_LEN = 50
encoded_pairs = [preprocess_sentence(text, MAX_LEN) for text in train_data['document']]

# Convert the token ids, attention masks and labels to numpy arrays
input_ids = np.array([ids for ids, _ in encoded_pairs])
attention_masks = np.array([mask for _, mask in encoded_pairs])
labels = np.array(train_data['label'])

# Build and train the model
# Pretrained KoBERT encoder; from_pt=True loads it from the PyTorch checkpoint.
bert_model = TFBertModel.from_pretrained('monologg/kobert', from_pt=True)

# Two integer inputs of length MAX_LEN: token ids and the attention mask.
input_layer = tf.keras.layers.Input(shape=(MAX_LEN,), dtype=tf.int32, name='input_ids')
attention_layer = tf.keras.layers.Input(shape=(MAX_LEN,), dtype=tf.int32, name='attention_mask')

# Element 1 of the encoder output feeds the classification head
# (the pooled output in the transformers API).
output_layer = bert_model([input_layer, attention_layer])[1]
# 5-unit softmax head, one unit per label id.
output_layer = tf.keras.layers.Dense(5, activation='softmax')(output_layer)
model = tf.keras.models.Model(inputs=[input_layer, attention_layer], outputs=output_layer)

# `learning_rate` is the supported argument name; the old `lr` alias is
# deprecated and rejected by newer Keras optimizers.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
    loss='sparse_categorical_crossentropy',  # labels are integer class ids
    metrics=['accuracy'],
)
# NOTE(review): 100 epochs with no validation data or early stopping —
# confirm this is intentional.
model.fit([input_ids, attention_masks], labels, epochs=100, batch_size=32)


In [4]:
# Save the model
# Write into ./model_save, the location the notebook later passes to
# load_model(); saving to the working directory left the two paths out of
# sync. The directory is created if it does not exist yet.
tf.io.gfile.makedirs('./model_save')
model.save('./model_save/kobert_sentiment_classification.h5')

In [12]:
from tensorflow.keras.models import load_model

In [16]:
# Load the saved model
saved_model_path = './model_save/kobert_sentiment_classification.h5'
# TFBertModel is supplied through custom_objects for deserialization.
model = load_model(saved_model_path, custom_objects={'TFBertModel': TFBertModel})


In [15]:
# Step 1: read the test CSV, rename the source columns, and keep only the
# text ('document') and emotion ('label') columns. Every other column is
# discarded by the selection, so no separate drop is needed.
test_data = (
    pd.read_csv("test.csv")
    .rename(columns={'감정_대분류': 'label', '사람문장': 'document'})
    .loc[:, ["document", "label"]]
    .reset_index(drop=True)  # reset the index
)

# Labeling
# Emotion name -> integer class id; same encoding as the training data.
emotion_to_id = {'불안': 0, '분노': 1, '상처': 2, '슬픔': 3, '기쁨': 4}
# Labels that are already stored as 0-4 are accepted as well.
label_dict = {'0': 0, '1': 1, '2': 2, '3': 3, '4': 4}

# Encode in a single vectorized pass instead of overwriting the column
# value-by-value and then re-mapping it through str().
label_keys = test_data['label'].astype(str)
encoded_labels = label_keys.map({**emotion_to_id, **label_dict})

# Fail loudly, naming the offending values, when a label is not one of the
# five expected classes (KeyError is kept as the exception type).
unmapped = sorted(set(label_keys[encoded_labels.isna()]))
if unmapped:
    raise KeyError(f"test.csv contains labels outside the expected classes: {unmapped}")

test_data['label'] = encoded_labels.astype(int)

# Prepare the tokenizer
# NOTE(review): confirm that BertTokenizer is the tokenizer class intended
# for this checkpoint.
kobert_checkpoint = 'monologg/kobert'
tokenizer = BertTokenizer.from_pretrained(kobert_checkpoint)

# Input preprocessing function
def preprocess_sentence(sentence, MAX_LEN):
    """Tokenize one sentence into fixed-length model inputs.

    Args:
        sentence: Raw text to encode.
        MAX_LEN: Target sequence length, special tokens included. Longer
            inputs are truncated and shorter ones are padded to this length.

    Returns:
        A tuple ``(input_id, attention_mask)`` holding the tokenizer's
        'input_ids' and 'attention_mask' entries for the sentence.
    """
    encoded_dict = tokenizer.encode_plus(
        text=sentence,
        add_special_tokens=True,
        max_length=MAX_LEN,
        # Replaces the deprecated pad_to_max_length=True.
        padding='max_length',
        # Request truncation explicitly rather than relying on the
        # tokenizer's implicit fallback (which emits a warning).
        truncation=True,
        return_attention_mask=True
    )
    return encoded_dict['input_ids'], encoded_dict['attention_mask']

# Encode every test sentence to a fixed length
MAX_LEN = 50
encoded_pairs = [preprocess_sentence(text, MAX_LEN) for text in test_data['document']]

# Convert the token ids, attention masks and labels to numpy arrays
input_ids = np.array([ids for ids, _ in encoded_pairs])
attention_masks = np.array([mask for _, mask in encoded_pairs])
labels = np.array(test_data['label'])

# Run the model on the encoded test inputs
predictions = model.predict([input_ids, attention_masks])
# Index of the largest score in each row is the predicted class id
predicted_labels = predictions.argmax(axis=1)

# Accuracy: fraction of rows whose predicted class equals the true label
is_correct = predicted_labels == labels
accuracy = np.mean(is_correct)
print("Accuracy:", accuracy)


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Accuracy: 0.9930269980332559
