In [3]:
# 코드 -> 텍스트 매핑 생성
map_age_band = {'A01': '청소년', 'A02': '청년', 'A03': '중년', 'A04': '노년'}
map_gender = {'G01': '남성', 'G02': '여성'}

# 상황 매핑
map_situation = {
    "S01": "가족관계",
    "S02": "학업 및 진로",
    "S03": "학교폭력/따돌림",
    "S04": "대인관계",
    "S05": "연애,결혼,출산",
    "S06": "진로,취업,직장",
    "S07": "대인관계(부부, 자녀)",
    "S08": "재정,은퇴,노후준비",
    "S09": "건강",
    "S10": "직장, 업무 스트레스",
    "S11": "건강,죽음",
    "S12": "대인관계(노년)",
    "S13": "재정",
    "S14": "기타"
}

# 질병 매핑
map_disease = {
    "D01": "만성질환 유",
    "D02": "만성질환 무"
}

# 감정 매핑
map_emotion = {
    "E10": "분노",
    "E20": "슬픔",
    "E30": "불안",
    "E60": "기쁨"
}

map_code = {
    'age_band': map_age_band,
    'gender': map_gender,
    'situation': map_situation,
    'disease': map_disease,
    'emotion': map_emotion
}

def decode(code_type, code):
     return map_code[code_type][code]

import json
import pandas as pd

dataset = pd.read_json('../dataset.json')
validation_dataset = pd.read_json('../filtered_validation_dataset.json')

filtered_dataset = dataset # 필터링 생략

flattened_profile = pd.json_normalize(filtered_dataset['profile'])
flattened_talk = pd.json_normalize(filtered_dataset['talk'])

combine = flattened_profile.join(flattened_talk)
df = pd.DataFrame(combine)

# 감정 레이블 간소화
df["emotion_type"] = df["emotion.type"].str[:2] + "0"

# 6가지 카테고리로 간소화
df["emotion_type"] = df["emotion_type"].apply(lambda x: 'E20' if (x.strip() == 'E40') else x) # 정확도를 위해 4가지로 축소(상처 -> 슬픔)
df["emotion_type"] = df["emotion_type"].apply(lambda x: 'E30' if (x.strip() == 'E50') else x) # 정확도를 위해 4가지로 축소(당황 -> 불안)
df['emotion_label'] = df['emotion_type'].astype('category').cat.codes
label_mapping = dict(enumerate(df['emotion_type'].astype('category').cat.categories))
label_to_code_map = {value: key for key, value in label_mapping.items()}

# print(label_to_code_map)

val_df = pd.DataFrame(validation_dataset)
val_df["emotion_type"] = val_df["emotion.type"].apply(lambda x: 'E20' if (x.strip() == 'E40') else x) # 정확도를 위해 4가지로 축소(상처 -> 슬픔)
val_df["emotion_type"] = val_df["emotion_type"].apply(lambda x: 'E30' if (x.strip() == 'E50') else x) # 정확도를 위해 4가지로 축소(당황 -> 불안)
val_df['emotion_label'] = val_df['emotion_type'].apply(lambda x: label_to_code_map[x])

df = df.drop(columns=[
    'persona-id',
    'persona.persona-id',
    'persona.computer',
    'emotion.emotion-id',
    'id.profile-id',
    'id.talk-id',
    'content.SS01',
    'content.SS02',
    'content.SS03'
    ], axis=1)

In [4]:
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import AdamW, get_scheduler

# 사전 학습된 모델과 토크나이저 로드
model_name = "klue/roberta-base"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)

# 데이터셋 토크나이징
tokenized_datasets = df.map(tokenize_function, batched=True)

# DataLoader 생성
train_dataloader = DataLoader(tokenized_datasets["train"], batch_size=8, shuffle=True)
eval_dataloader = DataLoader(tokenized_datasets["test"], batch_size=8)

# 옵티마이저 설정
optimizer = AdamW(model.parameters(), lr=5e-5)

# 학습률 스케줄러 설정
num_training_steps = len(train_dataloader) * 3  # Epoch 수
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

from torch import nn
from torch.optim import AdamW
from tqdm import tqdm

# 모델 학습 모드
model.train()

# 손실 함수
loss_fn = nn.CrossEntropyLoss()

# 학습 루프
progress_bar = tqdm(range(num_training_steps))
for epoch in range(3):  # Epoch 수
    for batch in train_dataloader:
        # 입력 데이터 준비
        inputs = {key: val.to(device) for key, val in batch.items() if key in tokenizer.model_input_names}
        
        # Forward Pass
        outputs = model(**inputs)
        loss = outputs.loss

        # Backward Pass 및 옵티마이저 스텝
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        
        # 진행 상태 업데이트
        progress_bar.update(1)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


TypeError: tokenize_function() got an unexpected keyword argument 'batched'

In [None]:
model.eval()
correct = 0
total = 0

with torch.no_grad():
    for batch in eval_dataloader:
        inputs = {key: val.to(device) for key, val in batch.items() if key in tokenizer.model_input_names}
        outputs = model(**inputs)
        predictions = outputs.logits.argmax(dim=-1)
        correct += (predictions == batch["labels"]).sum().item()
        total += len(batch["labels"])

accuracy = correct / total
print(f"Accuracy: {accuracy:.2f}")