# Phase 5.1: Evaluate Korean Medical Capabilities

Evaluate the model on Korean medical benchmarks.

## Contents
0. Setup
1. Load Model
2. Load Evaluation Data
3. Evaluate on KorMedMCQA
4. Qualitative Evaluation
5. Save Results

In [None]:
# Setup
import sys
import os
sys.path.append("..")

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
from datasets import load_from_disk, load_dataset
from tqdm import tqdm
import json
import re

# GPU setup
# NOTE: this import relies on the sys.path.append("..") above so that the
# project-local `config` package (one directory up) can be found.
from config.gpu_utils import setup_gpu, print_memory_usage
# NOTE(review): setup_gpu() is a project helper; presumably it configures and
# returns the device to use -- confirm. `device` is not referenced again in the
# visible cells (later code uses model.device instead).
device = setup_gpu()

# Call the project's memory-report helper once before the model is loaded.
print_memory_usage()

In [None]:
# Paths used throughout the notebook (relative to the notebook's directory).

# Checkpoint under evaluation. Default: the instruction-tuned model.
MODEL_DIR = "../models/instruction_tuned"
# To evaluate a different checkpoint, swap in one of these instead:
#   MODEL_DIR = "../models/final/korean_medgemma_expanded"  # expanded, before instruction tuning
#   MODEL_DIR = "../models/final/korean_medgemma"           # legacy, non-expanded

RESULTS_DIR = "../results"
EVAL_DATA_DIR = "../data/processed/kormedmcqa_eval"

# Make sure the output directory exists before anything is written to it.
os.makedirs(RESULTS_DIR, exist_ok=True)

for label, path in (("Model", MODEL_DIR), ("Results", RESULTS_DIR)):
    print(f"{label}: {path}")

---
## 1. Load Model

In [None]:
# Load the quantized model and its tokenizer for inference
print("Loading model...")

# 4-bit NF4 weights with bfloat16 compute, used for inference only.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

# NOTE: trust_remote_code=True lets code shipped with the checkpoint run;
# only point MODEL_DIR at checkpoints you trust.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_DIR,
    device_map="auto",
    quantization_config=bnb_config,
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)

# generate() is given pad_token_id later, so fall back to EOS when the
# tokenizer defines no pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Switch to evaluation mode (no dropout) before generating.
model.eval()

print("Model loaded!")
print_memory_usage()

---
## 2. Load Evaluation Data

In [None]:
# Load the KorMedMCQA test set.
#
# Preferred source is the pre-processed copy on disk; if it is missing, fall
# back to downloading from the Hugging Face Hub.
#
# The Hub dataset is published as separate per-profession configs
# ("dentist", "doctor", "nurse", "pharm"), so load_dataset() must be told
# which one to use -- calling it without a config name raises an error.
# Change KORMEDMCQA_SUBSET to evaluate a different profession.
KORMEDMCQA_SUBSET = "doctor"

if os.path.exists(EVAL_DATA_DIR):
    eval_dataset = load_from_disk(EVAL_DATA_DIR)
    print(f"Loaded evaluation dataset: {len(eval_dataset)} examples")
else:
    # Load directly from HuggingFace
    print("Loading KorMedMCQA from HuggingFace...")
    eval_dataset = load_dataset(
        "sean0042/KorMedMCQA",
        name=KORMEDMCQA_SUBSET,
        split="test",
    )
    print(f"Loaded: {len(eval_dataset)} examples")

In [None]:
# Show the first evaluation example as a quick sanity check of the schema
sample = eval_dataset[0]

print("Sample evaluation example:")
print(f"Question: {sample['question']}")
choice_summary = ", ".join(f"{letter}={sample[letter]}" for letter in "ABCDE")
print(f"Choices: {choice_summary}")
print(f"Answer: {sample['answer']}")

---
## 3. Evaluate on KorMedMCQA

In [None]:
def create_mcqa_prompt(example):
    """Build the multiple-choice evaluation prompt for one example.

    Args:
        example: Mapping with a "question" entry and option texts stored
            under the keys "A" through "E". Options that are missing or
            empty are left out of the prompt.

    Returns:
        The prompt string: a Korean system message, the user turn with the
        question and the lettered options, and an opened assistant turn,
        delimited by ``<|im_start|>`` / ``<|im_end|>`` markers.
    """
    question = example["question"]

    # One "X. text" line per available option, in A-E order.
    formatted_choices = "\n".join(
        f"{letter}. {example[letter]}"
        for letter in "ABCDE"
        if letter in example and example[letter]
    )

    return f"""<|im_start|>system
당신은 한국어 의료 전문 AI 어시스턴트입니다. 정확하고 도움이 되는 의료 정보를 제공하세요.
<|im_end|>
<|im_start|>user
다음 의료 관련 질문에 답하세요. 정답 알파벳(A, B, C, D, E 중 하나)만 답하세요.

질문: {question}

선택지:
{formatted_choices}

정답:
<|im_end|>
<|im_start|>assistant
"""

In [None]:
# A "standalone" option token is one that is not glued to neighbouring ASCII
# letters (for A-E) or digits (for 1-5). This keeps the "A" in "ANSWER" or the
# "1" in "12" from being mistaken for an answer, while still accepting forms
# such as "C.", "(C)" or "C입니다".
_OPTION_LETTER = re.compile(r"(?<![A-Z])[A-E](?![A-Z])")
_OPTION_DIGIT = re.compile(r"(?<!\d)[1-5](?!\d)")
_DIGIT_TO_LETTER = {"1": "A", "2": "B", "3": "C", "4": "D", "5": "E"}


def extract_answer(response):
    """Extract the chosen option letter from a model response.

    The response is stripped and upper-cased, then checked in this order:

    1. a standalone option letter at the very start ("B", "b. ...");
    2. a standalone digit 1-5 starting within the first 5 characters,
       mapped to A-E ("3번" -> "C");
    3. a standalone option letter starting within the first 20 characters
       ("Answer: C", "정답은 C입니다").

    Args:
        response: Decoded model output. May be empty or None.

    Returns:
        One of "A".."E", or None when no answer can be identified.
    """
    if not response:
        return None

    text = response.strip().upper()

    # 1. Direct answer: the response begins with the option letter itself.
    leading = _OPTION_LETTER.match(text)
    if leading:
        return leading.group()

    # 2. Numeric answer near the start of the response.
    digit = _OPTION_DIGIT.search(text)
    if digit and digit.start() < 5:
        return _DIGIT_TO_LETTER[digit.group()]

    # 3. Option letter a little further in. Matching on the full text and
    #    checking the start offset (rather than slicing first) keeps the
    #    look-ahead from seeing a truncated word.
    letter = _OPTION_LETTER.search(text)
    if letter and letter.start() < 20:
        return letter.group()

    return None

In [None]:
# Convert a gold answer label to its option letter
def answer_idx_to_letter(idx):
    """Convert a gold answer label to its option letter.

    Accepted forms:
        * a 1-indexed number (1 -> "A", ..., 5 -> "E");
        * the same number as a string ("3" -> "C");
        * an option letter, in either case ("b" -> "B").

    Args:
        idx: The answer label as stored in the dataset.

    Returns:
        One of "A".."E".

    Raises:
        ValueError: If the label is not recognised. Unknown labels used to be
            mapped silently to "A", which would distort the reported accuracy,
            so they are now rejected explicitly.
    """
    mapping = {1: 'A', 2: 'B', 3: 'C', 4: 'D', 5: 'E'}

    if isinstance(idx, str):
        token = idx.strip().upper()
        if len(token) == 1 and token in "ABCDE":
            return token
        if token in ("1", "2", "3", "4", "5"):
            return mapping[int(token)]
        raise ValueError(f"Unrecognised answer label: {idx!r}")

    try:
        letter = mapping.get(idx)
    except TypeError:
        # Unhashable labels (e.g. a list) cannot be a valid index.
        letter = None

    if letter is None:
        raise ValueError(f"Unrecognised answer label: {idx!r}")
    return letter

In [None]:
# Run the KorMedMCQA evaluation: prompt the model on each example, parse the
# predicted option letter, and compare it with the gold answer.
print("\nRunning KorMedMCQA evaluation...")
print("=" * 60)

correct = 0
total = 0
results = []

# Limit for testing (raise or remove the cap for a full evaluation)
max_samples = min(len(eval_dataset), 100)  # Evaluate first 100 for speed

for example in tqdm(eval_dataset.select(range(max_samples)), total=max_samples):
    # Create prompt
    prompt = create_mcqa_prompt(example)

    # Generate
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[1]

    with torch.no_grad():
        # Greedy decoding for reproducible scoring. No temperature is passed:
        # it has no effect when do_sample=False and only triggers a warning.
        outputs = model.generate(
            **inputs,
            max_new_tokens=10,
            do_sample=False,
            pad_token_id=tokenizer.pad_token_id,
        )

    # Decode only the newly generated tokens (skip the echoed prompt)
    response = tokenizer.decode(
        outputs[0][prompt_length:],
        skip_special_tokens=True,
    )

    # Extract predicted answer (None when no option could be parsed)
    predicted = extract_answer(response)

    # Get correct answer
    correct_answer = answer_idx_to_letter(example["answer"])

    # An unparseable response (predicted is None) counts as incorrect.
    is_correct = predicted == correct_answer
    if is_correct:
        correct += 1
    total += 1

    # Save result
    results.append({
        "question": example["question"],
        "predicted": predicted,
        "correct_answer": correct_answer,
        "is_correct": is_correct,
        "response": response,
    })

# Calculate accuracy; guard against an empty evaluation set
accuracy = correct / total * 100 if total else 0.0

print("\n" + "=" * 60)
print("KorMedMCQA Results")
print("=" * 60)
print(f"Accuracy: {accuracy:.2f}% ({correct}/{total})")

In [None]:
# Print the first few predictions next to the gold answers
print("\nSample predictions:")
print("-" * 60)

for entry in results[:5]:
    # Check mark for a correct prediction, cross otherwise.
    status = ("✗", "✓")[bool(entry["is_correct"])]
    question_preview = entry['question'][:80]
    response_preview = entry['response'][:50]
    print(f"\n{status} Q: {question_preview}...")
    print(f"   Predicted: {entry['predicted']}, Correct: {entry['correct_answer']}")
    print(f"   Response: {response_preview}")

---
## 4. Qualitative Evaluation

In [None]:
# Open-ended questions for a manual look at the answer quality
test_questions = [
    "고혈압의 주요 증상과 위험 요인은 무엇인가요?",
    "당뇨병 환자가 일상에서 주의해야 할 점은 무엇인가요?",
    "감기와 독감의 차이점을 설명해주세요.",
    "두통이 자주 발생할 때 어떻게 대처해야 하나요?",
]

print("Qualitative evaluation (open-ended questions):")
print("=" * 60)

# Sampling settings shared by every question (nucleus sampling, so the
# answers vary from run to run).
sampling_options = dict(
    max_new_tokens=256,
    temperature=0.7,
    do_sample=True,
    top_p=0.9,
    pad_token_id=tokenizer.pad_token_id,
)

for question in test_questions:
    prompt = f"""<|im_start|>system
당신은 한국어 의료 전문 AI 어시스턴트입니다.
<|im_end|>
<|im_start|>user
{question}
<|im_end|>
<|im_start|>assistant
"""

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[1]

    with torch.no_grad():
        outputs = model.generate(**inputs, **sampling_options)

    # Keep only the generated continuation, not the echoed prompt.
    generated_tokens = outputs[0][prompt_length:]
    response = tokenizer.decode(generated_tokens, skip_special_tokens=True)

    print(f"\nQ: {question}")
    print(f"A: {response[:500]}...")
    print("-" * 40)

---
## 5. Save Results

In [None]:
# Write the benchmark summary and the per-example records to a JSON file
results_path = f"{RESULTS_DIR}/kormedmcqa_eval.json"

eval_results = dict(
    model=MODEL_DIR,
    benchmark="KorMedMCQA",
    accuracy=accuracy,
    correct=correct,
    total=total,
    results=results,
)

# ensure_ascii=False keeps the Korean text readable in the output file.
with open(results_path, "w", encoding="utf-8") as out_file:
    json.dump(eval_results, out_file, ensure_ascii=False, indent=2)

print(f"\nResults saved to {results_path}")

In [None]:
# Closing summary: headline accuracy, where the results went, and what to run next
banner = "=" * 60
summary_lines = [
    "\n" + banner,
    "Korean Medical Evaluation Complete!",
    banner,
    f"\nKorMedMCQA Accuracy: {accuracy:.2f}%",
    f"\nResults saved to: {results_path}",
    "\nNext steps:",
    "  Run 02_evaluate_english.ipynb to check English retention",
]
print("\n".join(summary_lines))