In [24]:
# 코드 -> 텍스트 매핑 생성
map_age_band = {'A01': '청소년', 'A02': '청년', 'A03': '중년', 'A04': '노년'}
map_gender = {'G01': '남성', 'G02': '여성'}

# 상황 매핑
map_situation = {
    "S01": "가족관계",
    "S02": "학업 및 진로",
    "S03": "학교폭력/따돌림",
    "S04": "대인관계",
    "S05": "연애,결혼,출산",
    "S06": "진로,취업,직장",
    "S07": "대인관계(부부, 자녀)",
    "S08": "재정,은퇴,노후준비",
    "S09": "건강",
    "S10": "직장, 업무 스트레스",
    "S11": "건강,죽음",
    "S12": "대인관계(노년)",
    "S13": "재정",
    "S14": "기타"
}

# 질병 매핑
map_disease = {
    "D01": "만성질환 유",
    "D02": "만성질환 무"
}

# 감정 매핑
map_emotion = {
    "E10": "분노",
    "E20": "슬픔",
    "E30": "불안",
    # "E40": "상처",
    # "E50": "당황",
    "E60": "기쁨",
    # "E10": "분노",
    # "E11": "툴툴대는",
    # "E12": "좌절한",
    # "E13": "짜증내는",
    # "E14": "방어적인",
    # "E15": "악의적인",
    # "E16": "안달하는",
    # "E17": "구역질 나는",
    # "E18": "노여워하는",
    # "E19": "성가신",
    # "E20": "슬픔",
    # "E21": "실망한",
    # "E22": "비통한",
    # "E23": "후회되는",
    # "E24": "우울한",
    # "E25": "마비된",
    # "E26": "염세적인",
    # "E27": "눈물이 나는",
    # "E28": "낙담한",
    # "E29": "환멸을 느끼는",
    # "E30": "불안",
    # "E31": "두려운",
    # "E32": "스트레스 받는",
    # "E33": "취약한",
    # "E34": "혼란스러운",
    # "E35": "당혹스러운",
    # "E36": "회의적인",
    # "E37": "걱정스러운",
    # "E38": "조심스러운",
    # "E39": "초조한",
    # "E40": "상처",
    # "E41": "질투하는",
    # "E42": "배신당한",
    # "E43": "고립된",
    # "E44": "충격 받은",
    # "E45": "가난한, 불우한",
    # "E46": "희생된",
    # "E47": "억울한",
    # "E48": "괴로워하는",
    # "E49": "버려진",
    # "E50": "당황",
    # "E51": "고립된(당황한)",
    # "E52": "남의 시선을 의식하는",
    # "E53": "외로운",
    # "E54": "열등감",
    # "E55": "죄책감의",
    # "E56": "부끄러운",
    # "E57": "혐오스러운",
    # "E58": "한심한",
    # "E59": "혼란스러운(당황한)",
    # "E60": "기쁨",
    # "E61": "감사하는",
    # "E62": "신뢰하는",
    # "E63": "편안한",
    # "E64": "만족스러운",
    # "E65": "흥분",
    # "E66": "느긋",
    # "E67": "안도",
    # "E68": "신이 난",
    # "E69": "자신하는"
}

map_code = {
    'age_band': map_age_band,
    'gender': map_gender,
    'situation': map_situation,
    'disease': map_disease,
    'emotion': map_emotion
}

def decode(code_type, code):
     return map_code[code_type][code]

import json
import pandas as pd

dataset = pd.read_json('./dataset.json')
validation_dataset = pd.read_json('./filtered_validation_dataset.json')

In [25]:
########################################
# 데이터 전처리
########################################
# 연령(profile.persona.human[0]) - [A01: 청소년, A02: 청년, A03: 중년, A04: 노년] 
# 성별(profile.persona.human[1]) - [G01: 남성, G02: 여성]

# 'human' 리스트에 'A01'이 포함된 레코드 필터링
# filtered_dataset = dataset[
#     dataset['profile'].apply(lambda x: 'A01' in x['persona']['human'])
# ]

# 필터링 생략
filtered_dataset = dataset

flattened_profile = pd.json_normalize(filtered_dataset['profile'])
flattened_talk = pd.json_normalize(filtered_dataset['talk'])

combine = flattened_profile.join(flattened_talk)
df = pd.DataFrame(combine)

# 감정 레이블 간소화
df["emotion_type"] = df["emotion.type"].str[:2] + "0"

# 6가지 카테고리로 간소화
df["emotion_type"] = df["emotion_type"].apply(lambda x: 'E20' if (x.strip() == 'E40') else x) # 정확도를 위해 4가지로 축소(상처 -> 슬픔)
df["emotion_type"] = df["emotion_type"].apply(lambda x: 'E30' if (x.strip() == 'E50') else x) # 정확도를 위해 4가지로 축소(당황 -> 불안)
df['emotion_label'] = df['emotion_type'].astype('category').cat.codes
label_mapping = dict(enumerate(df['emotion_type'].astype('category').cat.categories))
label_to_code_map = {value: key for key, value in label_mapping.items()}

# print(label_to_code_map)

val_df = pd.DataFrame(validation_dataset)
val_df["emotion_type"] = val_df["emotion.type"].apply(lambda x: 'E20' if (x.strip() == 'E40') else x) # 정확도를 위해 4가지로 축소(상처 -> 슬픔)
val_df["emotion_type"] = val_df["emotion_type"].apply(lambda x: 'E30' if (x.strip() == 'E50') else x) # 정확도를 위해 4가지로 축소(당황 -> 불안)
val_df['emotion_label'] = val_df['emotion_type'].apply(lambda x: label_to_code_map[x])

# 긍정 0, 부정 1로 간소화
# df["emotion_label"] = df["emotion_type"].apply(lambda x: 0 if x == 'E60' else 1) # 긍/부정 두가지 감정으로 범주화
# label_mapping = { 0: '긍정', 1: '부정' }
# label_to_code_map = { '긍정': 0, '부정': 1}

In [26]:
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

# 1. 데이터 로드 및 전처리
class Dataset(Dataset):
    def __init__(self, data, tokenizer, max_len):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        row = self.data.iloc[index]
        text = f"[CLS]{decode('situation', row['emotion.situation'][0])}[SEP]{decode('age_band', row['persona.human'][0])}[SEP]{row['content.HS01']} {row['content.HS02']} {row['content.HS03']}".strip()
        label = row['emotion_label']

        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )

        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': torch.tensor(label, dtype=torch.long)
        }

# 2. 모델 및 토크나이저 로드
from transformers import AutoTokenizer, AutoModelForSequenceClassification

MODEL_NAME = "klue/roberta-base"
# MODEL_NAME = "klue/roberta-small"
# MODEL_NAME = "klue/roberta-large"
# MODEL_NAME = "beomi/kcbert-base"
# MODEL_NAME = "skt/kobert-base-v1"
# MODEL_NAME = "beomi/KcELECTRA-base"
# MODEL_NAME = "mental/mental-roberta-base"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# 데이터셋 생성
MAX_LEN = 128

### 별도의 데이터셋 사용
train_dataset = Dataset(df, tokenizer, MAX_LEN)
val_dataset = Dataset(val_df, tokenizer, MAX_LEN)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)
# 별도의 데이터셋 사용

### 학습 셋을 나눠서 사용
# data_train, data_val = train_test_split(df, test_size=0.2, random_state=42)
# train_dataset = Dataset(data_train, tokenizer, MAX_LEN)
# val_dataset = Dataset(data_val, tokenizer, MAX_LEN)

# train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
# val_loader = DataLoader(val_dataset, batch_size=32)
# 학습 셋을 나눠서 사용

# 3. 모델 초기화
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=df['emotion_label'].nunique(),
    hidden_dropout_prob=0.3,  # Dropout 비율 증가
)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# 4. 손실 함수 및 옵티마이저 설정
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, weight_decay=0.01)

from sklearn.utils.class_weight import compute_class_weight

class_weights = compute_class_weight(
    'balanced', 
    classes=np.unique(df['emotion_label']), 
    y=df['emotion_label']
)
weights = torch.tensor(class_weights, dtype=torch.float).to(device)
criterion = torch.nn.CrossEntropyLoss(weight=weights)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
# 5. 학습 루프
from transformers import get_scheduler

EPOCHS = 5
num_training_steps = len(train_loader) * EPOCHS
lr_scheduler = get_scheduler(
    "linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

def train_epoch(model, data_loader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    for batch in data_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        if lr_scheduler:
            lr_scheduler.step()

        total_loss += loss.item()

    return total_loss / len(data_loader)

def eval_model(model, data_loader, criterion, device):
    model.eval()
    total_loss = 0
    predictions = []
    true_labels = []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()

            preds = torch.argmax(outputs.logits, dim=1)
            predictions.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    return total_loss / len(data_loader), classification_report(true_labels, predictions)


In [28]:
train_losses = []
val_losses = []

# 6. 학습 및 평가 실행
for epoch in range(EPOCHS):
    print(f"Epoch {epoch + 1}/{EPOCHS}")
    train_loss = train_epoch(model, train_loader, optimizer, criterion, device)
    val_loss, val_report = eval_model(model, val_loader, criterion, device)

    train_losses.append(train_loss)
    val_losses.append(val_loss)

    print(f"Train Loss: {train_loss:.4f}")
    print(f"Validation Loss: {val_loss:.4f}")
    print(val_report)



Epoch 1/5
Train Loss: 0.7729
Validation Loss: 0.8890
              precision    recall  f1-score   support

           0       0.71      0.44      0.54       516
           1       0.59      0.74      0.65       553
           2       0.64      0.68      0.66       493
           3       0.65      0.97      0.78        62

    accuracy                           0.63      1624
   macro avg       0.65      0.71      0.66      1624
weighted avg       0.65      0.63      0.62      1624

Epoch 2/5
Train Loss: 0.6310
Validation Loss: 0.9054
              precision    recall  f1-score   support

           0       0.67      0.54      0.60       516
           1       0.63      0.71      0.66       553
           2       0.68      0.67      0.67       493
           3       0.61      0.97      0.75        62

    accuracy                           0.65      1624
   macro avg       0.64      0.72      0.67      1624
weighted avg       0.65      0.65      0.65      1624

Epoch 3/5
Train Loss: 0.

In [18]:
# 7. 모델 저장
torch.save(model.state_dict(), 'EXP1_RoBERTa.pth')
