# Phase 0.4: Preprocess and Format Training Data

Prepare and format all collected data for training stages.

## Contents
1. Load Collected Data
2. Format for Language Modeling (Stage 1-5)
3. Format for Instruction Tuning (Stage 6-7)
4. Create Mixed Dataset (Korean 90% + English 10%)
5. Create Train/Validation Splits
6. Save Processed Data

In [None]:
# Setup
import sys
import os
import json
import re
from tqdm import tqdm
sys.path.append("..")

from datasets import load_from_disk, Dataset, DatasetDict, concatenate_datasets
import random

# Data locations, relative to the notebook's working directory
DATA_DIR = "../data"
RAW_DIR, PROCESSED_DIR = (f"{DATA_DIR}/{sub}" for sub in ("raw", "processed"))

# Only the output directory is created here; the raw directory is not
os.makedirs(PROCESSED_DIR, exist_ok=True)

print(f"Raw data: {RAW_DIR}\nProcessed data: {PROCESSED_DIR}")

---
## 1. Load Collected Data

In [None]:
# Check what data is available.
# Only the processed directory is created by the setup cell, so report a
# missing raw directory instead of letting os.listdir() raise
# FileNotFoundError.
print("Available raw data:")
if not os.path.isdir(RAW_DIR):
    print(f"  (raw data directory not found: {RAW_DIR})")
else:
    # Sort so the listing is stable across runs; os.listdir() order is arbitrary
    for item in sorted(os.listdir(RAW_DIR)):
        path = os.path.join(RAW_DIR, item)
        if os.path.isdir(path):
            print(f"  [DIR] {item}")
        else:
            size_mb = os.path.getsize(path) / (1024 * 1024)
            print(f"  [FILE] {item} ({size_mb:.1f} MB)")

In [None]:
# Load KorMedMCQA from disk; kormedmcqa is None when the directory is absent
kormedmcqa_path = f"{RAW_DIR}/kormedmcqa"
if not os.path.exists(kormedmcqa_path):
    print("KorMedMCQA not found, run 02_collect_korean_medical.ipynb first")
    kormedmcqa = None
else:
    kormedmcqa = load_from_disk(kormedmcqa_path)
    print(f"Loaded KorMedMCQA: {kormedmcqa}")

In [None]:
# Load the Korean medical Wikipedia dump; wiki_medical is None when absent
wiki_medical_path = f"{RAW_DIR}/wiki_medical_ko"
if not os.path.exists(wiki_medical_path):
    print("Medical Wikipedia not found")
    wiki_medical = None
else:
    wiki_medical = load_from_disk(wiki_medical_path)
    # NOTE(review): the count assumes a single Dataset, not a DatasetDict
    # (whose len() is its number of splits) — confirm against the collection notebook
    print(f"Loaded Medical Wikipedia: {len(wiki_medical)} articles")

In [None]:
# Load the optional medical reasoning data; med_reasoning is None when absent
med_reasoning_path = f"{RAW_DIR}/medical_reasoning_kormedmcqa"
if not os.path.exists(med_reasoning_path):
    print("Medical Reasoning not found")
    med_reasoning = None
else:
    med_reasoning = load_from_disk(med_reasoning_path)
    print(f"Loaded Medical Reasoning: {med_reasoning}")

---
## 2. Format for Language Modeling (Stage 1-5)

Plain text format for embedding training stages.

In [None]:
def clean_text(text):
    """Collapse whitespace runs into single spaces and trim both ends.

    Falsy input (``None`` or an empty string) yields ``""``. Every run of
    whitespace, newlines included, becomes one space, so the result is always
    a single line. No length filtering happens here.
    """
    if text:
        collapsed = re.sub(r'\s+', ' ', text)
        return collapsed.strip()
    return ""

def format_for_lm(examples):
    """Clean a batch of texts and keep those longer than 100 characters.

    Args:
        examples: Mapping whose "text" entry is an iterable of strings.

    Returns:
        A dict ``{"text": [...]}`` with the cleaned texts whose length
        exceeds 100 characters; the list may be shorter than the input.
    """
    cleaned_texts = (clean_text(raw) for raw in examples["text"])
    return {"text": [doc for doc in cleaned_texts if len(doc) > 100]}

In [None]:
# Build the LM-ready version of Medical Wikipedia
if wiki_medical:
    # Clean each article lazily, then keep those longer than 100 characters
    cleaned_articles = (
        clean_text(article["text"])
        for article in tqdm(wiki_medical, desc="Processing Wikipedia")
    )
    lm_texts = [{"text": body} for body in cleaned_articles if len(body) > 100]

    wiki_lm = Dataset.from_list(lm_texts)
    print(f"Medical Wikipedia for LM: {len(wiki_lm)} documents")

In [None]:
# Load tokenizer corpus and create LM dataset
tokenizer_corpus_path = f"{RAW_DIR}/korean_corpus_for_tokenizer.txt"

# Cap on kept lines so the LM corpus stays a manageable size, and the length
# a cleaned line has to exceed in order to be kept.
MAX_GENERAL_LM_DOCS = 1_000_000
MIN_LM_TEXT_CHARS = 100

if not os.path.exists(tokenizer_corpus_path):
    print("Tokenizer corpus not found")
    general_lm = None
else:
    print("Loading tokenizer corpus for LM training...")

    lm_general_texts = []
    with open(tokenizer_corpus_path, "r", encoding="utf-8") as f:
        # Each line of the corpus file is treated as one document
        for line in tqdm(f, desc="Reading corpus"):
            text = clean_text(line)
            if len(text) <= MIN_LM_TEXT_CHARS:
                continue
            lm_general_texts.append({"text": text})

            # The count only grows on append, so the cap is checked here
            if len(lm_general_texts) >= MAX_GENERAL_LM_DOCS:
                break

    general_lm = Dataset.from_list(lm_general_texts)
    print(f"General Korean corpus for LM: {len(general_lm)} documents")

In [None]:
# Combine LM datasets
lm_datasets = []

if wiki_medical:
    lm_datasets.append(wiki_lm)

if general_lm:
    lm_datasets.append(general_lm)

# Later cells test `if combined_lm:`, so the name must exist even when no LM
# source was loaded; None matches the sentinel used by the loading cells.
combined_lm = None

if lm_datasets:
    # Shuffle with a fixed seed so the mixed order is reproducible
    combined_lm = concatenate_datasets(lm_datasets).shuffle(seed=42)

    print(f"\nCombined LM dataset: {len(combined_lm)} documents")
    print(f"Sample: {combined_lm[0]['text'][:200]}...")
else:
    print("No LM data available; skipping LM dataset creation")

---
## 3. Format for Instruction Tuning (Stage 6-7)

ChatML format for medical QA instruction tuning.

In [None]:
# Instruction template
SYSTEM_PROMPT_KO = """당신은 한국어 의료 전문 AI 어시스턴트입니다. 정확하고 도움이 되는 의료 정보를 제공하세요. 의료 질문에 대해 전문적이고 이해하기 쉬운 답변을 제공합니다."""

# Choice columns in answer order: "A" is option 1, ..., "E" is option 5
_CHOICE_LETTERS = ("A", "B", "C", "D", "E")


def format_kormedmcqa_for_instruction(example):
    """Format one KorMedMCQA example as a ChatML instruction-tuning sample.

    Each option keeps the number of its column (A=1 ... E=5) even when an
    earlier column is missing or empty, so the 1-indexed ``answer`` always
    refers to the option it was written for.

    Args:
        example: Mapping with "question", the choice columns "A".."E"
            (missing or empty ones are skipped), a 1-indexed "answer", and
            optionally "cot" (chain-of-thought text used as the explanation).

    Returns:
        ``{"text": ...}`` holding the system, user and assistant turns.

    Raises:
        IndexError: If the answer is invalid and the example has no
            non-empty choice to fall back to.
    """
    question = example["question"]

    # Pair every available choice with its column position
    numbered_choices = [
        (number, example[letter])
        for number, letter in enumerate(_CHOICE_LETTERS, start=1)
        if letter in example and example[letter]
    ]

    # Format choices
    formatted_choices = "\n".join(
        f"{number}. {choice}" for number, choice in numbered_choices
    )

    # Create user message
    user_message = f"{question}\n\n{formatted_choices}\n\n위 질문에 대한 정답을 선택하고 설명해주세요."

    # Resolve the answer by option number rather than by list position
    answer_number = example["answer"]  # 1-indexed in the dataset
    choice_by_number = dict(numbered_choices)
    if answer_number in choice_by_number:
        correct_answer = choice_by_number[answer_number]
    else:
        # The answer points at a skipped or nonexistent option. As before,
        # fall back to the first available option, but report that option's
        # number so the stated number and the quoted text agree.
        # NOTE(review): such examples are probably better dropped by the caller.
        answer_number, correct_answer = numbered_choices[0]

    # Create assistant response; a Chain-of-Thought replaces the stock explanation
    if example.get("cot"):
        assistant_message = f"{example['cot']}\n\n따라서 정답은 {answer_number}번 '{correct_answer}'입니다."
    else:
        assistant_message = f"정답은 {answer_number}번입니다.\n\n{correct_answer}\n\n이 답이 정답인 이유는 해당 의학적 지식에 기반하여 가장 적절한 선택이기 때문입니다."

    # Format as ChatML
    text = f"""<|im_start|>system
{SYSTEM_PROMPT_KO}
<|im_end|>
<|im_start|>user
{user_message}
<|im_end|>
<|im_start|>assistant
{assistant_message}
<|im_end|>"""

    return {"text": text}

# Test: preview the formatter on the first training example
if kormedmcqa:
    test_example = kormedmcqa["train"][0]
    print("Test example keys:", test_example.keys())
    formatted = format_kormedmcqa_for_instruction(test_example)
    preview = formatted["text"][:1000]
    print(f"\nSample formatted instruction:\n{preview}")

In [None]:
# Format KorMedMCQA data for instruction tuning.
# The "test" split is saved separately as the evaluation set, so it must not
# end up in the instruction-tuning data (that would leak the benchmark).
EVAL_SPLIT = "test"

# Defined up front so later cells can test `if instruction_dataset:` even
# when KorMedMCQA was not loaded.
instruction_dataset = None

if kormedmcqa:
    instruction_data = []

    for split in kormedmcqa.keys():
        if split == EVAL_SPLIT:
            print(f"Skipping '{split}' split (held out for evaluation)")
            continue
        for example in tqdm(kormedmcqa[split], desc=f"Formatting {split}"):
            instruction_data.append(format_kormedmcqa_for_instruction(example))

    instruction_dataset = Dataset.from_list(instruction_data)
    print(f"\nInstruction dataset: {len(instruction_dataset)} examples")

In [None]:
# Add Chain-of-Thought examples if available
# NOTE(review): if the reasoning data was derived from KorMedMCQA test
# questions, adding it here would leak the evaluation set — confirm its origin.
if med_reasoning:
    print("Adding Chain-of-Thought examples...")

    cot_data = []
    # Only the first split of the reasoning dataset is used
    split_name = list(med_reasoning.keys())[0]

    for example in tqdm(med_reasoning[split_name], desc="Formatting CoT"):
        # Examples without a "reasoning" field cannot serve as CoT samples
        if "reasoning" not in example:
            continue

        question = example.get("question", "")
        reasoning = example["reasoning"]
        answer = example.get("answer", "")

        user_message = f"{question}\n\n단계별로 생각하며 답변해주세요."
        assistant_message = f"{reasoning}\n\n따라서 정답은 {answer}입니다."

        text = f"""<|im_start|>system
{SYSTEM_PROMPT_KO}
<|im_end|>
<|im_start|>user
{user_message}
<|im_end|>
<|im_start|>assistant
{assistant_message}
<|im_end|>"""

        cot_data.append({"text": text})

    if cot_data:
        cot_dataset = Dataset.from_list(cot_data)
        print(f"CoT dataset: {len(cot_dataset)} examples")

        # instruction_dataset is missing (or empty/None) when KorMedMCQA was
        # not loaded; in that case the CoT data becomes the whole
        # instruction dataset instead of raising NameError.
        base_instructions = globals().get("instruction_dataset")
        if base_instructions:
            instruction_dataset = concatenate_datasets([base_instructions, cot_dataset])
        else:
            instruction_dataset = cot_dataset
        print(f"Combined instruction dataset: {len(instruction_dataset)} examples")

---
## 4. Create Mixed Dataset (Korean 90% + English 10%)

Mixes a small share of English data into the Korean data for Stage 6 training, to prevent catastrophic forgetting.

In [None]:
# Note: In practice, you would load English medical data here
# For now, we'll create a placeholder structure

def create_mixed_dataset(korean_data, english_ratio=0.1):
    """
    Create mixed Korean/English dataset (placeholder).

    Currently this only reports how much English data would be needed and
    returns the Korean data unchanged. In practice, load English medical
    data from:
    - MedQA
    - PubMed abstracts
    - Medical textbooks

    Args:
        korean_data: Sized collection of Korean examples.
        english_ratio: Desired share of English examples in the final mix;
            must satisfy 0 <= english_ratio < 1.

    Returns:
        ``korean_data`` itself, unmodified.

    Raises:
        ValueError: If ``english_ratio`` is outside [0, 1). A ratio of 1
            would divide by zero and larger ones give a negative target.
    """
    if not 0 <= english_ratio < 1:
        raise ValueError(f"english_ratio must be in [0, 1), got {english_ratio!r}")

    korean_size = len(korean_data)
    # english / (korean + english) == ratio  =>  english = korean * ratio / (1 - ratio)
    target_english_size = int(korean_size * english_ratio / (1 - english_ratio))

    print(f"Korean data: {korean_size}")
    print(f"Target English data: {target_english_size}")
    print("\nNote: Load actual English medical data for production use.")

    # For now, return Korean-only
    return korean_data

# mixed_lm is always assigned (None when there is no LM data), matching the
# None sentinel used for the other optional datasets.
mixed_lm = create_mixed_dataset(combined_lm) if combined_lm else None

---
## 5. Create Train/Validation Splits

In [None]:
def create_splits(dataset, test_size=0.1, seed=42):
    """Split ``dataset`` into seeded train and validation subsets.

    Args:
        dataset: Object exposing ``train_test_split(test_size=..., seed=...)``.
        test_size: Passed through as the size of the held-out part.
        seed: Seed passed through to the split.

    Returns:
        DatasetDict with "train" and "validation" keys; the held-out "test"
        part of the split is stored under "validation".
    """
    parts = dataset.train_test_split(test_size=test_size, seed=seed)
    train_part, held_out = parts["train"], parts["test"]
    return DatasetDict({"train": train_part, "validation": held_out})

In [None]:
# Create splits for LM dataset.
# lm_splits is set to None first so the save cell's `if lm_splits:` check
# works even when there is no LM data to split.
lm_splits = None
if combined_lm:
    lm_splits = create_splits(combined_lm, test_size=0.05)
    print("LM dataset splits:")
    print(f"  Train: {len(lm_splits['train'])}")
    print(f"  Validation: {len(lm_splits['validation'])}")

In [None]:
# Create splits for instruction dataset.
# instruction_splits is set to None first so the save cell's
# `if instruction_splits:` check works even when there is nothing to split.
instruction_splits = None
if instruction_dataset:
    instruction_splits = create_splits(instruction_dataset, test_size=0.1)
    print("Instruction dataset splits:")
    print(f"  Train: {len(instruction_splits['train'])}")
    print(f"  Validation: {len(instruction_splits['validation'])}")

---
## 6. Save Processed Data

In [None]:
# Write the LM train/validation splits to the processed-data directory
if lm_splits:
    lm_path = f"{PROCESSED_DIR}/korean_medical_lm"
    lm_splits.save_to_disk(lm_path)
    print("Saved LM dataset to", lm_path)

In [None]:
# Write the instruction train/validation splits to the processed-data directory
if instruction_splits:
    instruction_path = f"{PROCESSED_DIR}/korean_medical_instruction"
    instruction_splits.save_to_disk(instruction_path)
    print("Saved instruction dataset to", instruction_path)

In [None]:
# Keep the KorMedMCQA test split on disk, untouched, as the evaluation set
if kormedmcqa and "test" in kormedmcqa:
    eval_path = f"{PROCESSED_DIR}/kormedmcqa_eval"
    eval_split = kormedmcqa["test"]
    eval_split.save_to_disk(eval_path)
    print("Saved evaluation dataset to", eval_path)

In [None]:
# Create processing summary
import datetime


def _split_summary(name, splits, use):
    """Describe a saved train/validation DatasetDict for the summary file."""
    return {
        "path": f"{PROCESSED_DIR}/{name}",
        "train_size": len(splits["train"]),
        "validation_size": len(splits["validation"]),
        "use": use,
    }


summary = {
    "datasets": {},
    "processing_date": str(datetime.datetime.now()),
}
dataset_entries = summary["datasets"]

# The splits variables may never have been assigned, hence the dir() checks
if 'lm_splits' in dir() and lm_splits:
    dataset_entries["korean_medical_lm"] = _split_summary(
        "korean_medical_lm", lm_splits, "Stage 1-5 (Embedding training)"
    )

if 'instruction_splits' in dir() and instruction_splits:
    dataset_entries["korean_medical_instruction"] = _split_summary(
        "korean_medical_instruction", instruction_splits, "Stage 6-7 (Instruction tuning)"
    )

if kormedmcqa and "test" in kormedmcqa:
    dataset_entries["kormedmcqa_eval"] = {
        "path": f"{PROCESSED_DIR}/kormedmcqa_eval",
        "size": len(kormedmcqa["test"]),
        "use": "Evaluation",
    }

# Save summary: serialize once, then reuse the text for the file and the printout
summary_path = f"{PROCESSED_DIR}/processing_summary.json"
summary_json = json.dumps(summary, ensure_ascii=False, indent=2)
with open(summary_path, "w", encoding="utf-8") as f:
    f.write(summary_json)

rule = "=" * 60
print("\n" + rule)
print("Data Processing Summary")
print(rule)
print(summary_json)

In [None]:
# Closing banner and pointers to the next phase
divider = "=" * 60
closing_lines = [
    "",
    divider,
    "Phase 0: Data Preparation Complete!",
    divider,
    f"\nProcessed data saved to: {PROCESSED_DIR}",
    "\nNext steps:",
    "  1. Move to Phase 1: Tokenizer Training",
    "  2. Run phase1_tokenizer/01_train_korean_tokenizer.ipynb",
]
print("\n".join(closing_lines))