# In [None]:
# src/classifier_{조}.ipynb

# [1] 필요한 라이브러리
import json
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer
from model.classifier_model import KoBertClassifier
from sklearn.metrics import f1_score
import os

# [2] Configuration
# Pretrained checkpoint identifier; passed to both the tokenizer and the classifier.
MODEL_NAME = 'monologg/kobert'
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
# Token length every question is padded / truncated to by the tokenizer.
MAX_LEN = 64
# Mini-batch size of the training DataLoader.
BATCH_SIZE = 16
# Second argument to KoBertClassifier — presumably the number of output
# classes; confirm against model/classifier_model.py.
NUM_LABELS = 5

# [3] 데이터셋 클래스
class QuestionDataset(Dataset):
    """Map-style dataset that tokenizes one question per item on access.

    Every record in ``data`` is read as a mapping with a ``'question'`` entry
    and an optional ``'label'`` entry.

    Args:
        data: Indexable, sized collection of records.
        tokenizer: Callable invoked as ``tokenizer(text, padding=...,
            truncation=..., max_length=..., return_tensors="pt")`` whose
            result provides ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_len: Length each question is padded / truncated to. When left as
            ``None`` the module-level ``MAX_LEN`` is used.
    """

    def __init__(self, data, tokenizer, max_len=None):
        self.tokenizer = tokenizer
        self.data = data
        self.max_len = max_len

    def __getitem__(self, idx):
        """Return the tokenized question at ``idx`` together with its label.

        Returns:
            A dict with ``'input_ids'`` and ``'attention_mask'`` (the
            tokenizer's tensors with the leading axis removed) and
            ``'label'`` (the record's label, or ``-1`` when it has none).
        """
        item = self.data[idx]
        # Resolve the fallback lazily so a later change to MAX_LEN is honoured.
        max_length = MAX_LEN if self.max_len is None else self.max_len
        encoding = self.tokenizer(
            item['question'],
            padding='max_length',
            truncation=True,
            max_length=max_length,
            return_tensors="pt",
        )
        return {
            # The tokenizer is expected to return a leading batch axis of
            # size 1. squeeze(0) removes only that axis; a bare squeeze()
            # would also drop the sequence axis when max_length is 1.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': item.get('label', -1),
        }

    def __len__(self):
        """Return the number of records."""
        return len(self.data)

# [4] Data loading
# Decode the JSON explicitly as UTF-8: without `encoding`, open() uses the
# platform's locale encoding, which can fail on or garble non-ASCII text.
with open("../data/train.json", "r", encoding="utf-8") as f:
    train_data = json.load(f)
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
train_dataset = QuestionDataset(train_data, tokenizer)
# Reshuffle the training records every epoch.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

# [5] Model, loss function and optimizer
model = KoBertClassifier(MODEL_NAME, NUM_LABELS)
model = model.to(DEVICE)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

# [6] Training loop
for epoch in range(3):
    model.train()
    # Sum (not mean) of the per-batch losses seen during this epoch.
    total_loss = 0
    for batch in train_loader:
        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        # Move the batch tensors onto the device the model lives on.
        input_ids, attention_mask, labels = (
            batch[key].to(DEVICE)
            for key in ('input_ids', 'attention_mask', 'label')
        )

        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    print(f"[Epoch {epoch+1}] Loss: {total_loss:.4f}")

# [7] Test-set loading and prediction
# Decode the JSON explicitly as UTF-8 rather than with the platform default.
with open("../data/test_cls.json", "r", encoding="utf-8") as f:
    test_data = json.load(f)
test_dataset = QuestionDataset(test_data, tokenizer)
# batch_size=1 with shuffle=False yields exactly one batch per record, in
# file order, so batches can be paired with `test_data` positionally.
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

model.eval()
results = []
# Inference only: no gradients are needed anywhere in this loop.
with torch.no_grad():
    for item, batch in zip(test_data, test_loader):
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)

        outputs = model(input_ids, attention_mask)
        pred = torch.argmax(outputs, dim=1).item()

        results.append({
            # Emit the original question text. Decoding the token ids back
            # into a string is lossy (truncation to the maximum length,
            # tokenizer normalisation, skipped special tokens).
            "question": item['question'],
            "label": pred
        })

# [8] Save the predictions as JSON
os.makedirs("../outputs", exist_ok=True)
with open("../outputs/cls_output.json", "w", encoding="utf-8") as f:
    # ensure_ascii=False keeps non-ASCII text readable instead of \u escapes.
    f.write(json.dumps(results, ensure_ascii=False, indent=2))


# :