# GoEmotions: 다중 감정 분류 모델 학습

이 노트북은 GoEmotions 데이터셋을 사용하여 Reddit 댓글의 다중 감정(27가지 감정 + neutral, 총 28개 레이블)을 분류하는 다중 레이블 모델을 학습하고 평가합니다.

## 1. 환경 설정 및 라이브러리 임포트

In [None]:
# Install the libraries this notebook uses; -q keeps pip's output quiet.
!pip install -q datasets transformers[torch] scikit-learn pandas numpy matplotlib seaborn tensorboard

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from collections import Counter

from datasets import load_dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score, classification_report, multilabel_confusion_matrix
from huggingface_hub import notebook_login

In [None]:
# Hugging Face Hub 로그인 (필요시)
# notebook_login()

## 2. 데이터셋 준비

In [None]:
# Load the GoEmotions dataset.
# The "simplified" configuration is used because it exposes a multi-label
# `labels` feature (a sequence of class ids) together with train / validation /
# test splits, all of which later cells in this notebook read. The "raw"
# configuration only ships a single train split and stores every emotion in
# its own column, so it has no `labels` feature to read.
dataset = load_dataset("go_emotions", "simplified")

# Show the split layout and one example to confirm the schema.
print(dataset)
print(dataset['train'][0])

In [None]:
# Read the label vocabulary from the dataset features and build lookups in
# both directions (id -> name and name -> id).
labels_list = dataset["train"].features["labels"].feature.names
num_labels = len(labels_list)
id2label = dict(enumerate(labels_list))
label2id = {name: index for index, name in id2label.items()}

print(f"Total labels: {num_labels}")
print(f"Label names: {labels_list}")

### 2.0. 데이터셋 레이블 분포 시각화 (추가)

In [None]:
# Gather every label id from every split of the dataset into one flat list.
all_labels_flat = [
    label_id
    for split_name in dataset
    for example_labels in dataset[split_name]['labels']
    for label_id in example_labels
]
label_counts = Counter(all_labels_flat)

# One row per emotion name, ordered from most to least frequent.
count_rows = [(id2label[label_id], occurrences) for label_id, occurrences in label_counts.items()]
df_label_counts = pd.DataFrame(count_rows, columns=['Emotion', 'Count'])
df_label_counts = df_label_counts.sort_values(by='Count', ascending=False)

# Horizontal bar chart of label frequencies.
plt.figure(figsize=(12, 8))
sns.barplot(x='Count', y='Emotion', data=df_label_counts, palette='viridis')
plt.title('GoEmotions Dataset: Label Distribution (All Splits)')
plt.xlabel('Number of Occurrences')
plt.ylabel('Emotion')
plt.tight_layout()
plt.show()

### 2.1. 데이터 전처리 및 토큰화

In [None]:
# Name of the pretrained checkpoint; the same name is reused when the
# classification model is loaded.
MODEL_NAME = "distilbert-base-uncased"
# Tokenizer that belongs to the checkpoint above.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [None]:
def preprocess_data(examples):
    """Tokenize a batch of texts and attach multi-hot label vectors.

    Args:
        examples: A batch mapping with a ``"text"`` entry (the texts to
            tokenize) and a ``"labels"`` entry (one list of label ids per
            text).

    Returns:
        The tokenizer output for the batch, with ``"labels"`` set to one
        float vector of length ``num_labels`` per example. Positions named
        by the example's label ids hold 1.0, all others 0.0. Label ids
        outside ``[0, num_labels)`` are ignored.
    """
    # Every text is truncated/padded to exactly 128 tokens.
    encoded = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)

    def to_multi_hot(active_ids):
        # Floats rather than ints: the vectors are built as 0.0 / 1.0 values.
        vector = [0.0] * num_labels
        for position in active_ids:
            if 0 <= position < num_labels:
                vector[position] = 1.0
        return vector

    encoded["labels"] = [to_multi_hot(ids) for ids in examples["labels"]]
    return encoded

In [None]:
# Tokenize every split and build the multi-hot label vectors.
# The columns to drop are taken from the dataset itself instead of a
# hard-coded list: asking `map` to remove a column that does not exist raises
# an error. Dropping all original columns also removes the original integer
# `labels` column, so the float multi-hot `labels` returned by
# preprocess_data is written as a fresh column instead of over the old one.
tokenized_datasets = dataset.map(
    preprocess_data,
    batched=True,
    remove_columns=dataset["train"].column_names,
)
tokenized_datasets.set_format("torch")

# Sanity check: one processed example and the shapes of its tensors.
print(tokenized_datasets['train'][0])
print(tokenized_datasets['train'][0]['labels'].shape)
print(tokenized_datasets['train'][0]['input_ids'].shape)

In [None]:
# 데이터셋 크기 줄이기 (빠른 테스트용, 필요시 주석 해제)
# print("Original dataset sizes:")
# print({split: len(tokenized_datasets[split]) for split in tokenized_datasets})

# train_subset_size = 1000 
# val_subset_size = 200
# test_subset_size = 200

# tokenized_datasets_subset = DatasetDict({
#     'train': tokenized_datasets['train'].shuffle(seed=42).select(range(train_subset_size)),
#     'validation': tokenized_datasets['validation'].shuffle(seed=42).select(range(val_subset_size)),
#     'test': tokenized_datasets['test'].shuffle(seed=42).select(range(test_subset_size))
# })
# print("\nSubset dataset sizes:")
# print({split: len(tokenized_datasets_subset[split]) for split in tokenized_datasets_subset})
# tokenized_datasets = tokenized_datasets_subset # 실제 학습에 사용할 데이터셋을 부분집합으로 교체

## 3. 모델 선택 및 학습

In [None]:
# Choose the compute device: CUDA when available, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the pretrained checkpoint with a classification head sized for our
# label set; problem_type marks this as multi-label classification.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    problem_type="multi_label_classification",
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label,
)

# Move the model to the chosen device and report which one is used.
model.to(device)
print(f"Using device: {device}")

### 3.1. 학습 설정 및 메트릭 정의

In [None]:
# Metric function for multi-label classification.
def compute_metrics_multilabel(eval_pred):
    """Compute F1 scores and exact-match accuracy from logits and labels.

    Args:
        eval_pred: A pair that unpacks into ``(logits, true_labels)``.

    Returns:
        dict with the keys ``f1_macro``, ``f1_micro``, ``f1_weighted`` and
        ``subset_accuracy``.
    """
    raw_logits, targets = eval_pred
    targets = targets.astype(int)

    # Sigmoid maps each logit to a per-label probability; a label counts as
    # predicted when its probability is above the 0.5 threshold.
    probabilities = torch.sigmoid(torch.Tensor(raw_logits)).numpy()
    predicted = (probabilities > 0.5).astype(int)

    metrics = {
        f'f1_{averaging}': f1_score(targets, predicted, average=averaging, zero_division=0)
        for averaging in ('macro', 'micro', 'weighted')
    }
    metrics['subset_accuracy'] = accuracy_score(targets, predicted)
    return metrics

In [None]:
import inspect

# Core hyperparameters.
BATCH_SIZE = 16
LEARNING_RATE = 2e-5
NUM_EPOCHS = 3

# The argument that controls when evaluation runs is called `eval_strategy`
# in newer transformers releases and `evaluation_strategy` in older ones.
# The notebook installs an unpinned transformers, so use whichever name the
# installed version accepts instead of hard-coding one.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_arg = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./results_goemotions",
    save_strategy="epoch",
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=NUM_EPOCHS,
    learning_rate=LEARNING_RATE,
    logging_dir="./logs_goemotions",
    logging_strategy="epoch",  # log once per epoch
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    greater_is_better=True,
    save_total_limit=2,
    fp16=torch.cuda.is_available(),  # mixed precision only when CUDA is present
    report_to="tensorboard",
    push_to_hub=False,
    # Evaluate once per epoch, matching save_strategy above.
    **{_eval_strategy_arg: "epoch"},
)

### 3.2. Trainer 정의 및 모델 학습

In [None]:
import inspect

# Newer transformers releases receive the tokenizer through the
# `processing_class` argument, while older ones only know `tokenizer`.
# Pass it under whichever name the installed Trainer declares.
_trainer_arg_names = inspect.signature(Trainer.__init__).parameters
_tokenizer_arg = "processing_class" if "processing_class" in _trainer_arg_names else "tokenizer"

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    compute_metrics=compute_metrics_multilabel,
    # Early stopping with a patience of 2 evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
    **{_tokenizer_arg: tokenizer},
)

In [None]:
# TensorBoard 실행 (Colab 또는 로컬 터미널에서)
# %load_ext tensorboard
# %tensorboard --logdir logs_goemotions

In [None]:
# Run training; the value returned by trainer.train() is kept in train_result.
train_result = trainer.train()

### 3.3. 학습 과정 시각화 (추가)

In [None]:
# Turn the Trainer's log history into a DataFrame for plotting.
log_history = trainer.state.log_history
df_log = pd.DataFrame(log_history)


def _logged_rows(frame, metric, columns=None):
    """Return the rows of ``frame`` in which ``metric`` has a value.

    Returns an empty frame instead of raising ``KeyError`` when the metric
    column is missing, so the ``.empty`` checks below can skip the plot.
    """
    if metric not in frame.columns:
        return pd.DataFrame(columns=columns if columns is not None else [metric])
    rows = frame[frame[metric].notna()]
    return rows if columns is None else rows[columns]


# Separate training-loss rows from evaluation rows.
train_loss = _logged_rows(df_log, 'loss', ['epoch', 'loss'])
eval_loss = _logged_rows(df_log, 'eval_loss', ['epoch', 'eval_loss'])
eval_metrics = _logged_rows(df_log, 'eval_f1_macro')

plt.figure(figsize=(15, 5))

# Left panel: training and validation loss.
loss_ax = plt.subplot(1, 2, 1)
if not train_loss.empty:
    plt.plot(train_loss['epoch'], train_loss['loss'], label='Training Loss', marker='o')
if not eval_loss.empty:
    plt.plot(eval_loss['epoch'], eval_loss['eval_loss'], label='Validation Loss', marker='o')
plt.title('Training and Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
# Only draw a legend when at least one labelled line was plotted.
if loss_ax.get_legend_handles_labels()[0]:
    plt.legend()
plt.grid(True)

# Right panel: validation F1 macro, plus subset accuracy when it was logged.
metric_ax = plt.subplot(1, 2, 2)
if not eval_metrics.empty:
    plt.plot(eval_metrics['epoch'], eval_metrics['eval_f1_macro'], label='Validation F1 Macro', marker='o', color='green')
    if 'eval_subset_accuracy' in eval_metrics.columns:
        plt.plot(eval_metrics['epoch'], eval_metrics['eval_subset_accuracy'], label='Validation Subset Accuracy', marker='x', color='purple', linestyle='--')
plt.title('Validation Metrics')
plt.xlabel('Epoch')
plt.ylabel('Score')
if metric_ax.get_legend_handles_labels()[0]:
    plt.legend()
plt.grid(True)

plt.tight_layout()
plt.show()

## 4. 모델 평가 및 분석

In [None]:
# Evaluate on the test split and print every returned metric to 4 decimals.
eval_results = trainer.evaluate(tokenized_datasets["test"])

print("\nTest Set Evaluation Results:")
for metric_name in eval_results:
    print(f"{metric_name}: {eval_results[metric_name]:.4f}")

In [None]:
# Run prediction on the test split.
predictions_output = trainer.predict(tokenized_datasets["test"])

# Raw model outputs and the reference labels (cast to int for the metrics).
logits = predictions_output.predictions
true_labels_test = predictions_output.label_ids.astype(int)

# Sigmoid gives per-label probabilities; threshold at 0.5 for 0/1 predictions.
probs = torch.sigmoid(torch.Tensor(logits)).numpy()
predicted_labels_test = (probs > 0.5).astype(int)

### 4.1. 분류 보고서

In [None]:
# Per-label precision / recall / F1 on the test split.
print("\nGoEmotions Multi-label Classification Report (Test Set):")
test_report = classification_report(
    true_labels_test,
    predicted_labels_test,
    target_names=labels_list,
    zero_division=0,
)
print(test_report)

### 4.2. 혼동 행렬 (각 레이블별)

In [None]:
# One 2x2 confusion matrix per label.
mcm = multilabel_confusion_matrix(true_labels_test, predicted_labels_test)


def plot_confusion_matrix_multilabel(mcm_data, current_labels_list, cols=3):
    """Draw one confusion-matrix heatmap per label on a grid of subplots.

    Args:
        mcm_data: Sequence of per-label confusion matrices, indexed in the
            same order as ``current_labels_list``.
        current_labels_list: Label names used for titles and tick labels.
        cols: Number of subplot columns in the grid.

    Grid cells without a matrix to show (labels beyond ``len(mcm_data)`` and
    leftover cells in the last row) are hidden. Nothing is drawn when
    ``current_labels_list`` is empty.
    """
    if not current_labels_list:
        return

    rows = (len(current_labels_list) + cols - 1) // cols
    # squeeze=False always returns a 2-D array of axes, so flatten() also
    # works when the grid is a single cell.
    _, axes = plt.subplots(rows, cols, figsize=(cols * 5, rows * 5), squeeze=False)
    drawable = min(len(current_labels_list), len(mcm_data))

    # Walk over every grid cell (not just every label) so unused cells in the
    # last row get hidden instead of showing an empty frame.
    for i, ax in enumerate(axes.flatten()):
        if i >= drawable:
            ax.axis('off')
            continue
        label_name = current_labels_list[i]
        tick_names = ['Not ' + label_name, label_name]
        sns.heatmap(mcm_data[i], annot=True, fmt='d', cmap='Blues', ax=ax,
                    xticklabels=tick_names,
                    yticklabels=tick_names)
        ax.set_title(f'CM for: {label_name}')
        ax.set_xlabel('Predicted')
        ax.set_ylabel('True')
    plt.tight_layout()
    plt.show()


plot_confusion_matrix_multilabel(mcm, labels_list, cols=4)

## 5. 샘플 예측

In [None]:
def predict_emotion(text, model_to_predict, tokenizer_to_use, current_id2label, threshold=0.5):
    """Predict the emotions expressed in a single text.

    Args:
        text: The text to classify (one string).
        model_to_predict: Model that is called with the tokenizer output and
            returns an object exposing ``.logits``. It is switched to eval
            mode and moved to CUDA when available, otherwise CPU.
        tokenizer_to_use: Callable that turns ``text`` into a mapping of
            tensors.
        current_id2label: Mapping from label index to label name.
        threshold: A label is reported when its probability is strictly
            greater than this value.

    Returns:
        ``(predicted_emotions_list, probs_output)``: the names of all labels
        above the threshold (in index order) and a 1-D NumPy array holding
        the probability of every label.
    """
    model_to_predict.eval()
    current_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model_to_predict.to(current_device)

    inputs = tokenizer_to_use(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    inputs = {k: v.to(current_device) for k, v in inputs.items()}

    # Inference only: no gradients needed.
    with torch.no_grad():
        logits_output = model_to_predict(**inputs).logits

    # Select the first (only) row explicitly. A bare squeeze() would also
    # collapse the label axis when the model has a single label, leaving a
    # 0-d array that cannot be indexed by label.
    probs_output = torch.sigmoid(logits_output)[0].cpu().numpy()

    predicted_label_ids = np.flatnonzero(probs_output > threshold)
    predicted_emotions_list = [current_id2label[int(idx)] for idx in predicted_label_ids]

    return predicted_emotions_list, probs_output

In [None]:
# A few hand-written sentences to try the trained model on.
sample_texts = [
    "I am so happy and excited about the new project!",
    "This is really frustrating and annoying.",
    "I'm not sure how I feel about this, a bit confused and curious.",
    "Thank you so much, I really appreciate your help.",
    "The movie was incredibly sad, but also very beautiful."
]

# Predict with a threshold of 0.3 and print the result for each sentence.
for sample in sample_texts:
    emotions_found, _ = predict_emotion(sample, model, tokenizer, id2label, threshold=0.3)
    print(f"\nSample Text: '{sample}'")
    summary = (
        f"Predicted Emotions: {', '.join(emotions_found)}"
        if emotions_found
        else "Predicted Emotions: No emotion above threshold (or 'neutral' if available and predicted)"
    )
    print(summary)

## 6. 모델 저장 (선택 사항)

In [None]:
# Save the trained model and its tokenizer side by side in one directory.
output_model_dir = "./saved_model_goemotions"
os.makedirs(output_model_dir, exist_ok=True)

# Guard: only save when the trainer actually exposes a model attribute.
if not hasattr(trainer, 'model'):
    print("Trainer does not have a model to save. Was training completed?")
else:
    trainer.save_model(output_model_dir)
    tokenizer.save_pretrained(output_model_dir)
    print(f"Model and tokenizer saved to {output_model_dir}")

### 모델 로드 및 사용 예시 (저장된 모델)

In [None]:
# Weight file names a saved model directory may contain. Recent transformers
# releases write `model.safetensors` by default, older ones wrote
# `pytorch_model.bin`; the `.index.json` variants appear for sharded saves.
# Checking only `pytorch_model.bin` would skip this example for models saved
# in the safetensors format.
_WEIGHT_FILE_NAMES = (
    "model.safetensors",
    "pytorch_model.bin",
    "model.safetensors.index.json",
    "pytorch_model.bin.index.json",
)
saved_model_found = any(
    os.path.exists(os.path.join(output_model_dir, file_name))
    for file_name in _WEIGHT_FILE_NAMES
)

if saved_model_found:
    # Reload model and tokenizer from disk and run one prediction with them.
    loaded_model = AutoModelForSequenceClassification.from_pretrained(output_model_dir)
    loaded_tokenizer = AutoTokenizer.from_pretrained(output_model_dir)

    sample_text_for_loading_test = "I feel a bit nervous but also excited."
    predicted_emotions_loaded, _ = predict_emotion(sample_text_for_loading_test, loaded_model, loaded_tokenizer, id2label, threshold=0.3)

    print(f"\nSample Text (loaded model): '{sample_text_for_loading_test}'")
    if predicted_emotions_loaded:
        print(f"Predicted Emotions: {', '.join(predicted_emotions_loaded)}")
    else:
        print("Predicted Emotions: No emotion above threshold")
else:
    print(f"Skipping loading example: Model not found at {output_model_dir}")