# 질문 분류기 학습 및 예측

## 필요한 데이터
1. `data/train.json`: 학습 데이터
   ```json
   [
       {
           "question": "질문 내용",
           "label": 0  // 0-4 사이의 정수
       },
       ...
   ]
   ```
2. `data/test_cls.json`: 테스트 데이터
   ```json
   [
       {
           "question": "질문 내용"
       },
       ...
   ]
   ```

## 환경 설정
필요한 패키지 설치:
```bash
pip install transformers torch scikit-learn sentencepiece
```

In [None]:
# [1] 필요한 라이브러리
import json
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer
import sys
import os
from pathlib import Path

# Put the parent of the working directory on the import path so the
# project-local `model` package imported just below can be resolved.
current_dir = Path.cwd()
project_root = current_dir.parent
project_root_str = str(project_root)
sys.path.append(project_root_str)
print(f"Added {project_root} to Python path")

from model.classifier_model import KoBertClassifier
from sklearn.metrics import f1_score

# [2] Configuration
MODEL_NAME = 'monologg/kobert'  # pretrained checkpoint identifier
NUM_LABELS = 5                  # number of target classes
MAX_LEN = 64                    # token length every question is padded/truncated to
BATCH_SIZE = 16                 # mini-batch size for the training DataLoader

# Use the GPU when one is visible to torch, otherwise fall back to CPU.
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'

print(f"Using device: {DEVICE}")

# [3] 데이터셋 클래스
class QuestionDataset(Dataset):
    """Map-style dataset that tokenizes one question record per access.

    Every record is a mapping with a ``'question'`` entry and, optionally,
    a ``'label'`` entry. Records without a label yield ``-1`` as the label.
    """

    def __init__(self, data, tokenizer, max_len=None):
        """Store the records and the tokenizer.

        Args:
            data: Indexable, sized collection of record mappings.
            tokenizer: Callable invoked as
                ``tokenizer(text, padding=..., truncation=..., max_length=...,
                return_tensors="pt")`` whose result exposes ``'input_ids'``
                and ``'attention_mask'`` tensors with a leading batch
                dimension of size 1.
            max_len: Length each question is padded/truncated to. When
                ``None`` the module-level ``MAX_LEN`` is used.
        """
        self.tokenizer = tokenizer
        self.data = data
        # Resolved here (not as a default argument) so the global is only
        # required when the caller does not pass an explicit length.
        self.max_len = MAX_LEN if max_len is None else max_len

    def __getitem__(self, idx):
        """Return the tokenized record at ``idx``.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` (the tokenizer
            output with its leading batch dimension removed) and ``'label'``
            (the record's label, or ``-1`` when the record has none).
        """
        item = self.data[idx]
        encoding = self.tokenizer(
            item['question'],
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt",
        )
        return {
            # squeeze(0) drops only the batch dimension; a bare squeeze()
            # would also collapse the sequence dimension when it has size 1.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': item.get('label', -1),
        }

    def __len__(self):
        """Return the number of records."""
        return len(self.data)

# [4] Load the training data, train the classifier, then predict on the test set.
# Any failure is reported with a printed message instead of a traceback.
try:
    with open("../data/train.json", "r", encoding='utf-8') as f:
        train_data = json.load(f)

    if not train_data:
        raise ValueError("train.json is empty")

    print(f"Loaded {len(train_data)} training examples")

    # NOTE(review): the KoBERT checkpoint is presumably SentencePiece-based
    # (the README installs `sentencepiece`); confirm that BertTokenizer
    # tokenizes it as intended rather than mapping most pieces to [UNK].
    tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
    train_dataset = QuestionDataset(train_data, tokenizer)
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

    # [5] Model, optimizer and loss
    model = KoBertClassifier(MODEL_NAME, NUM_LABELS).to(DEVICE)
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
    criterion = nn.CrossEntropyLoss()

    # [6] Training loop
    for epoch in range(3):
        model.train()
        total_loss = 0
        for batch in train_loader:
            input_ids = batch['input_ids'].to(DEVICE)
            attention_mask = batch['attention_mask'].to(DEVICE)
            labels = batch['label'].to(DEVICE)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        # The reported value is the sum of the per-batch losses, not a mean.
        print(f"[Epoch {epoch+1}] Loss: {total_loss:.4f}")

    # [7] Load the test set and predict
    try:
        with open("../data/test_cls.json", "r", encoding='utf-8') as f:
            test_data = json.load(f)

        if not test_data:
            raise ValueError("test_cls.json is empty")

        print(f"Loaded {len(test_data)} test examples")

        test_dataset = QuestionDataset(test_data, tokenizer)
        # shuffle=False keeps predictions in the same order as test_data,
        # which the pairing below relies on.
        test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

        model.eval()
        predictions = []
        with torch.no_grad():
            for batch in test_loader:
                input_ids = batch['input_ids'].to(DEVICE)
                attention_mask = batch['attention_mask'].to(DEVICE)

                # assumes the model returns (batch, NUM_LABELS) scores — TODO confirm
                outputs = model(input_ids, attention_mask)
                predictions.extend(torch.argmax(outputs, dim=1).tolist())

        # Emit the original question text. Decoding the token ids back into a
        # string is lossy (truncation to the max length, normalized spacing,
        # unknown tokens), so the output would not match the input questions.
        results = [
            {"question": item['question'], "label": pred}
            for item, pred in zip(test_data, predictions)
        ]

        # [8] Save the results as JSON
        os.makedirs("../outputs", exist_ok=True)
        with open("../outputs/cls_output.json", "w", encoding="utf-8") as f:
            json.dump(results, f, ensure_ascii=False, indent=2)

        print(f"Saved predictions for {len(results)} examples to cls_output.json")

    except FileNotFoundError:
        print("Error: test_cls.json file not found")
    except json.JSONDecodeError:
        print("Error: test_cls.json is not a valid JSON file")
    except ValueError as e:
        print(f"Error: {str(e)}")
    except Exception as e:
        print(f"Unexpected error during testing: {str(e)}")

except FileNotFoundError:
    print("Error: train.json file not found")
except json.JSONDecodeError:
    print("Error: train.json is not a valid JSON file")
except ValueError as e:
    print(f"Error: {str(e)}")
except Exception as e:
    print(f"Unexpected error during training: {str(e)}")

NameError: name '__file__' is not defined