# Phase 0.4: Preprocess and Format Training Data

Prepare and format all collected data for training stages.

## Contents
1. Load Collected Data
2. Format for Language Modeling (Stage 1-5)
3. Format for Instruction Tuning (Stage 6-7)
4. Create Mixed Dataset (Korean + English)
5. Create Evaluation Splits
6. Save All Processed Data

In [1]:
# Setup
import sys
import os
import json
import re
from tqdm import tqdm
sys.path.append("..")

from datasets import load_from_disk, Dataset, DatasetDict, concatenate_datasets
import random

# Directories
DATA_DIR = "../data"
RAW_DIR = f"{DATA_DIR}/raw"
PROCESSED_DIR = f"{DATA_DIR}/processed"

os.makedirs(PROCESSED_DIR, exist_ok=True)

print(f"Raw data: {RAW_DIR}")
print(f"Processed data: {PROCESSED_DIR}")

  from .autonotebook import tqdm as notebook_tqdm


Raw data: ../data/raw
Processed data: ../data/processed


---
## 1. Load Collected Data

In [2]:
# Check what data is available
print("Available raw data:")
for item in os.listdir(RAW_DIR):
    path = os.path.join(RAW_DIR, item)
    if os.path.isdir(path):
        print(f"  [DIR] {item}")
    else:
        size_mb = os.path.getsize(path) / (1024 * 1024)
        print(f"  [FILE] {item} ({size_mb:.1f} MB)")

Available raw data:
  [DIR] kormedmcqa
  [DIR] medical_reasoning_kormedmcqa
  [FILE] korean_corpus_for_tokenizer.txt (221.6 MB)


In [3]:
# Load KorMedMCQA
kormedmcqa_path = f"{RAW_DIR}/kormedmcqa"
if os.path.exists(kormedmcqa_path):
    kormedmcqa = load_from_disk(kormedmcqa_path)
    print(f"Loaded KorMedMCQA: {kormedmcqa}")
else:
    print("KorMedMCQA not found, run 02_collect_korean_medical.ipynb first")
    kormedmcqa = None

Loaded KorMedMCQA: DatasetDict({
    train: Dataset({
        features: ['subject', 'year', 'period', 'q_number', 'question', 'A', 'B', 'C', 'D', 'E', 'answer', 'cot', 'exam_type'],
        num_rows: 1890
    })
    test: Dataset({
        features: ['subject', 'year', 'period', 'q_number', 'question', 'A', 'B', 'C', 'D', 'E', 'answer', 'cot', 'exam_type'],
        num_rows: 604
    })
})


In [4]:
# Load Medical Wikipedia
wiki_medical_path = f"{RAW_DIR}/wiki_medical_ko"
if os.path.exists(wiki_medical_path):
    wiki_medical = load_from_disk(wiki_medical_path)
    print(f"Loaded Medical Wikipedia: {len(wiki_medical)} articles")
else:
    print("Medical Wikipedia not found")
    wiki_medical = None

Medical Wikipedia not found


In [5]:
# Load Medical Reasoning (optional)
med_reasoning_path = f"{RAW_DIR}/medical_reasoning_kormedmcqa"
if os.path.exists(med_reasoning_path):
    med_reasoning = load_from_disk(med_reasoning_path)
    print(f"Loaded Medical Reasoning: {med_reasoning}")
else:
    print("Medical Reasoning not found")
    med_reasoning = None

Loaded Medical Reasoning: DatasetDict({
    train: Dataset({
        features: ['subject', 'year', 'period', 'q_number', 'question', 'A', 'B', 'C', 'D', 'E', 'answer', 'thinking', 'response', '__index_level_0__'],
        num_rows: 8751
    })
})


---
## 2. Format for Language Modeling (Stage 1-5)

Plain text format for embedding training stages.

In [6]:
def clean_text(text):
    """Clean and normalize text"""
    if not text:
        return ""
    
    # Remove excessive whitespace
    text = re.sub(r'\s+', ' ', text)
    
    # Remove very short lines
    text = text.strip()
    
    return text

def format_for_lm(examples):
    """Format examples for language modeling"""
    texts = []
    
    for text in examples["text"]:
        cleaned = clean_text(text)
        if len(cleaned) > 100:  # Minimum length
            texts.append(cleaned)
    
    return {"text": texts}

In [7]:
# Process Medical Wikipedia for LM
if wiki_medical:
    lm_texts = []
    
    for article in tqdm(wiki_medical, desc="Processing Wikipedia"):
        text = clean_text(article["text"])
        if len(text) > 100:
            lm_texts.append({"text": text})
    
    wiki_lm = Dataset.from_list(lm_texts)
    print(f"Medical Wikipedia for LM: {len(wiki_lm)} documents")

In [8]:
# Load tokenizer corpus and create LM dataset
tokenizer_corpus_path = f"{RAW_DIR}/korean_corpus_for_tokenizer.txt"

if os.path.exists(tokenizer_corpus_path):
    print("Loading tokenizer corpus for LM training...")
    
    lm_general_texts = []
    with open(tokenizer_corpus_path, "r", encoding="utf-8") as f:
        for i, line in enumerate(tqdm(f, desc="Reading corpus")):
            text = clean_text(line)
            if len(text) > 100:
                lm_general_texts.append({"text": text})
            
            # Limit to manageable size for LM training
            if len(lm_general_texts) >= 1000000:  # 1M documents
                break
    
    general_lm = Dataset.from_list(lm_general_texts)
    print(f"General Korean corpus for LM: {len(general_lm)} documents")
else:
    print("Tokenizer corpus not found")
    general_lm = None

Loading tokenizer corpus for LM training...


Reading corpus: 0it [00:00, ?it/s]

Reading corpus: 285it [00:00, 2839.71it/s]

Reading corpus: 693it [00:00, 3567.07it/s]

Reading corpus: 1050it [00:00, 3500.95it/s]

Reading corpus: 1491it [00:00, 3855.18it/s]

Reading corpus: 2310it [00:00, 5405.73it/s]

Reading corpus: 2986it [00:00, 5861.61it/s]

Reading corpus: 3574it [00:00, 4988.04it/s]

Reading corpus: 4095it [00:00, 4903.22it/s]

Reading corpus: 4600it [00:00, 4729.88it/s]

Reading corpus: 5083it [00:01, 4472.15it/s]

Reading corpus: 5542it [00:01, 4503.81it/s]

Reading corpus: 6023it [00:01, 4586.22it/s]

Reading corpus: 6619it [00:01, 4977.65it/s]

Reading corpus: 8412it [00:01, 8727.74it/s]

Reading corpus: 9332it [00:01, 8857.00it/s]

Reading corpus: 10232it [00:01, 8008.32it/s]

Reading corpus: 11058it [00:01, 6962.71it/s]

Reading corpus: 11793it [00:02, 6121.42it/s]

Reading corpus: 12445it [00:02, 5839.92it/s]

Reading corpus: 13118it [00:02, 6055.54it/s]

Reading corpus: 13747it [00:02, 5989.33it/s]

Reading corpus: 14436it [00:02, 6226.73it/s]

Reading corpus: 15073it [00:02, 6050.97it/s]

Reading corpus: 15915it [00:02, 6700.20it/s]

Reading corpus: 16687it [00:02, 6986.89it/s]

Reading corpus: 17408it [00:02, 7048.03it/s]

Reading corpus: 18251it [00:03, 7445.21it/s]

Reading corpus: 19296it [00:03, 8318.62it/s]

Reading corpus: 20247it [00:03, 8664.38it/s]

Reading corpus: 21119it [00:03, 7909.70it/s]

Reading corpus: 21926it [00:03, 7706.82it/s]

Reading corpus: 22766it [00:03, 7899.30it/s]

Reading corpus: 23566it [00:03, 7384.16it/s]

Reading corpus: 24317it [00:03, 7267.62it/s]

Reading corpus: 25129it [00:03, 7502.83it/s]

Reading corpus: 25928it [00:03, 7639.23it/s]

Reading corpus: 26715it [00:04, 7705.20it/s]

Reading corpus: 27490it [00:04, 7660.27it/s]

Reading corpus: 28260it [00:04, 7663.72it/s]

Reading corpus: 29029it [00:04, 7052.48it/s]

Reading corpus: 30277it [00:04, 8562.47it/s]

Reading corpus: 31152it [00:04, 8443.81it/s]

Reading corpus: 32088it [00:04, 8694.03it/s]

Reading corpus: 33037it [00:04, 8923.21it/s]

Reading corpus: 33967it [00:04, 9030.43it/s]

Reading corpus: 34876it [00:05, 8732.88it/s]

Reading corpus: 35756it [00:05, 8544.60it/s]

Reading corpus: 36615it [00:05, 7766.88it/s]

Reading corpus: 37407it [00:05, 7641.24it/s]

Reading corpus: 38308it [00:05, 8016.47it/s]

Reading corpus: 39120it [00:05, 7806.84it/s]

Reading corpus: 39968it [00:05, 7995.20it/s]

Reading corpus: 40787it [00:05, 8046.99it/s]

Reading corpus: 41748it [00:05, 8499.10it/s]

Reading corpus: 42692it [00:06, 8773.98it/s]

Reading corpus: 43566it [00:06, 7155.13it/s]




General Korean corpus for LM: 43365 documents


In [9]:
# Combine LM datasets
lm_datasets = []

if wiki_medical:
    lm_datasets.append(wiki_lm)
    
if general_lm:
    lm_datasets.append(general_lm)

if lm_datasets:
    combined_lm = concatenate_datasets(lm_datasets)
    
    # Shuffle
    combined_lm = combined_lm.shuffle(seed=42)
    
    print(f"\nCombined LM dataset: {len(combined_lm)} documents")
    print(f"Sample: {combined_lm[0]['text'][:200]}...")


Combined LM dataset: 43365 documents
Sample: 555(오백오십오)는 554보다 크고 556보다 작은 자연수이다. 수학 합성수로, 그 약수는 1, 3, 5, 15, 37, 111, 185, 555이다. 기타 미국에서는 영화, 드라마, 만화, 노래 등에서 가상의 전화번호를 사용할 경우 반드시 555로 시작하는 번호를 사용해야 한다는 규정이 있다. 5e02 555...


---
## 3. Format for Instruction Tuning

ChatML format for medical QA instruction tuning.

In [10]:
# Instruction template
SYSTEM_PROMPT_KO = """당신은 한국어 의료 전문 AI 어시스턴트입니다. 정확하고 도움이 되는 의료 정보를 제공하세요. 의료 질문에 대해 전문적이고 이해하기 쉬운 답변을 제공합니다."""

def format_kormedmcqa_for_instruction(example):
    """Format KorMedMCQA as instruction-following data"""
    
    question = example["question"]
    
    # KorMedMCQA has choices in separate columns A, B, C, D, E
    choices = []
    for letter in ['A', 'B', 'C', 'D', 'E']:
        if letter in example and example[letter]:
            choices.append(example[letter])
    
    answer_idx = example["answer"]  # 1-indexed in the dataset
    
    # Format choices
    formatted_choices = "\n".join([f"{i+1}. {c}" for i, c in enumerate(choices)])
    
    # Create user message
    user_message = f"{question}\n\n{formatted_choices}\n\n위 질문에 대한 정답을 선택하고 설명해주세요."
    
    # Create assistant response
    correct_answer = choices[answer_idx - 1] if answer_idx > 0 and answer_idx <= len(choices) else choices[0]
    assistant_message = f"정답은 {answer_idx}번입니다.\n\n{correct_answer}\n\n이 답이 정답인 이유는 해당 의학적 지식에 기반하여 가장 적절한 선택이기 때문입니다."
    
    # Add Chain-of-Thought if available
    if example.get("cot"):
        assistant_message = f"{example['cot']}\n\n따라서 정답은 {answer_idx}번 '{correct_answer}'입니다."
    
    # Format as ChatML
    text = f"""<|im_start|>system
{SYSTEM_PROMPT_KO}
<|im_end|>
<|im_start|>user
{user_message}
<|im_end|>
<|im_start|>assistant
{assistant_message}
<|im_end|>"""
    
    return {"text": text}

# Test
if kormedmcqa:
    test_example = kormedmcqa["train"][0]
    print("Test example keys:", test_example.keys())
    formatted = format_kormedmcqa_for_instruction(test_example)
    print("\nSample formatted instruction:")
    print(formatted["text"][:1000])

Test example keys: dict_keys(['subject', 'year', 'period', 'q_number', 'question', 'A', 'B', 'C', 'D', 'E', 'answer', 'cot', 'exam_type'])

Sample formatted instruction:
<|im_start|>system
당신은 한국어 의료 전문 AI 어시스턴트입니다. 정확하고 도움이 되는 의료 정보를 제공하세요. 의료 질문에 대해 전문적이고 이해하기 쉬운 답변을 제공합니다.
<|im_end|>
<|im_start|>user
항문압 측정 검사에서 항문 압력이 증가하는 경우는?

1. 직장질루(rectovaginal fistula)
2. 항문열창(anal fissure)
3. 대변실금(fecal incontinence)
4. 대변메막힘(fecal impaction)
5. 직장탈출증(rectal prolapse)

위 질문에 대한 정답을 선택하고 설명해주세요.
<|im_end|>
<|im_start|>assistant
정답은 2번입니다.

항문열창(anal fissure)

이 답이 정답인 이유는 해당 의학적 지식에 기반하여 가장 적절한 선택이기 때문입니다.
<|im_end|>


In [11]:
# Format all KorMedMCQA data for instruction tuning
if kormedmcqa:
    instruction_data = []
    
    for split in kormedmcqa.keys():
        for example in tqdm(kormedmcqa[split], desc=f"Formatting {split}"):
            formatted = format_kormedmcqa_for_instruction(example)
            instruction_data.append(formatted)
    
    instruction_dataset = Dataset.from_list(instruction_data)
    print(f"\nInstruction dataset: {len(instruction_dataset)} examples")

Formatting train:   0%|          | 0/1890 [00:00<?, ?it/s]

Formatting train:  45%|████▍     | 849/1890 [00:00<00:00, 8489.62it/s]

Formatting train:  98%|█████████▊| 1849/1890 [00:00<00:00, 9374.55it/s]

Formatting train: 100%|██████████| 1890/1890 [00:00<00:00, 9202.23it/s]




Formatting test:   0%|          | 0/604 [00:00<?, ?it/s]

Formatting test: 100%|██████████| 604/604 [00:00<00:00, 9960.25it/s]


Instruction dataset: 2494 examples





In [12]:
# Add Chain-of-Thought examples if available
if med_reasoning:
    print("Adding Chain-of-Thought examples...")
    
    cot_data = []
    split_name = list(med_reasoning.keys())[0]
    
    for example in tqdm(med_reasoning[split_name], desc="Formatting CoT"):
        # Check what fields are available
        if "reasoning" in example:
            question = example.get("question", "")
            reasoning = example.get("reasoning", "")
            answer = example.get("answer", "")
            
            user_message = f"{question}\n\n단계별로 생각하며 답변해주세요."
            assistant_message = f"{reasoning}\n\n따라서 정답은 {answer}입니다."
            
            text = f"""<|im_start|>system
{SYSTEM_PROMPT_KO}
<|im_end|>
<|im_start|>user
{user_message}
<|im_end|>
<|im_start|>assistant
{assistant_message}
<|im_end|>"""
            
            cot_data.append({"text": text})
    
    if cot_data:
        cot_dataset = Dataset.from_list(cot_data)
        print(f"CoT dataset: {len(cot_dataset)} examples")
        
        # Combine with instruction dataset
        instruction_dataset = concatenate_datasets([instruction_dataset, cot_dataset])
        print(f"Combined instruction dataset: {len(instruction_dataset)} examples")

Adding Chain-of-Thought examples...


Formatting CoT:   0%|          | 0/8751 [00:00<?, ?it/s]

Formatting CoT:  10%|█         | 907/8751 [00:00<00:00, 9062.19it/s]

Formatting CoT:  21%|██        | 1814/8751 [00:00<00:00, 9065.17it/s]

Formatting CoT:  31%|███       | 2721/8751 [00:00<00:00, 9002.88it/s]

Formatting CoT:  41%|████▏     | 3622/8751 [00:00<00:00, 8965.09it/s]

Formatting CoT:  52%|█████▏    | 4519/8751 [00:00<00:00, 8952.52it/s]

Formatting CoT:  62%|██████▏   | 5422/8751 [00:00<00:00, 8976.72it/s]

Formatting CoT:  72%|███████▏  | 6320/8751 [00:00<00:00, 8976.22it/s]

Formatting CoT:  82%|████████▏ | 7218/8751 [00:00<00:00, 8904.64it/s]

Formatting CoT:  93%|█████████▎| 8109/8751 [00:00<00:00, 8902.84it/s]

Formatting CoT: 100%|██████████| 8751/8751 [00:00<00:00, 8956.20it/s]




---
## 4. Create Mixed Dataset (Korean 90% + English 10%)

For Stage 6 training to prevent catastrophic forgetting.

In [13]:
# Note: In practice, you would load English medical data here
# For now, we'll create a placeholder structure

def create_mixed_dataset(korean_data, english_ratio=0.1):
    """
    Create mixed Korean/English dataset.
    
    In practice, load English medical data from:
    - MedQA
    - PubMed abstracts
    - Medical textbooks
    """
    
    korean_size = len(korean_data)
    target_english_size = int(korean_size * english_ratio / (1 - english_ratio))
    
    print(f"Korean data: {korean_size}")
    print(f"Target English data: {target_english_size}")
    print("\nNote: Load actual English medical data for production use.")
    
    # For now, return Korean-only
    return korean_data

if combined_lm:
    mixed_lm = create_mixed_dataset(combined_lm)

Korean data: 43365
Target English data: 4818

Note: Load actual English medical data for production use.


---
## 5. Create Train/Validation Splits

In [14]:
def create_splits(dataset, test_size=0.1, seed=42):
    """Split dataset into train and validation"""
    
    split_data = dataset.train_test_split(test_size=test_size, seed=seed)
    
    return DatasetDict({
        "train": split_data["train"],
        "validation": split_data["test"],
    })

In [15]:
# Create splits for LM dataset
if combined_lm:
    lm_splits = create_splits(combined_lm, test_size=0.05)
    print(f"LM dataset splits:")
    print(f"  Train: {len(lm_splits['train'])}")
    print(f"  Validation: {len(lm_splits['validation'])}")

LM dataset splits:
  Train: 41196
  Validation: 2169


In [16]:
# Create splits for instruction dataset
if instruction_dataset:
    instruction_splits = create_splits(instruction_dataset, test_size=0.1)
    print(f"Instruction dataset splits:")
    print(f"  Train: {len(instruction_splits['train'])}")
    print(f"  Validation: {len(instruction_splits['validation'])}")

Instruction dataset splits:
  Train: 2244
  Validation: 250


---
## 6. Save Processed Data

In [17]:
# Save LM dataset
if lm_splits:
    lm_path = f"{PROCESSED_DIR}/korean_medical_lm"
    lm_splits.save_to_disk(lm_path)
    print(f"Saved LM dataset to {lm_path}")

Saving the dataset (0/1 shards):   0%|          | 0/41196 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):  29%|██▉       | 12000/41196 [00:00<00:00, 111155.48 examples/s]

Saving the dataset (0/1 shards):  58%|█████▊    | 24000/41196 [00:00<00:00, 113640.99 examples/s]

Saving the dataset (0/1 shards):  87%|████████▋ | 36000/41196 [00:00<00:00, 114154.18 examples/s]

Saving the dataset (1/1 shards): 100%|██████████| 41196/41196 [00:00<00:00, 114154.18 examples/s]

Saving the dataset (1/1 shards): 100%|██████████| 41196/41196 [00:00<00:00, 113110.23 examples/s]




Saving the dataset (0/1 shards):   0%|          | 0/2169 [00:00<?, ? examples/s]

Saving the dataset (1/1 shards): 100%|██████████| 2169/2169 [00:00<00:00, 100802.72 examples/s]

Saving the dataset (1/1 shards): 100%|██████████| 2169/2169 [00:00<00:00, 97244.80 examples/s] 

Saved LM dataset to ../data/processed/korean_medical_lm





In [18]:
# Save instruction dataset
if instruction_splits:
    instruction_path = f"{PROCESSED_DIR}/korean_medical_instruction"
    instruction_splits.save_to_disk(instruction_path)
    print(f"Saved instruction dataset to {instruction_path}")

Saving the dataset (0/1 shards):   0%|          | 0/2244 [00:00<?, ? examples/s]

Saving the dataset (1/1 shards): 100%|██████████| 2244/2244 [00:00<00:00, 168764.89 examples/s]

Saving the dataset (1/1 shards): 100%|██████████| 2244/2244 [00:00<00:00, 159369.06 examples/s]




Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]

Saving the dataset (1/1 shards): 100%|██████████| 250/250 [00:00<00:00, 75021.54 examples/s]

Saving the dataset (1/1 shards): 100%|██████████| 250/250 [00:00<00:00, 59792.21 examples/s]

Saved instruction dataset to ../data/processed/korean_medical_instruction





In [19]:
# Save evaluation dataset (KorMedMCQA test set)
if kormedmcqa and "test" in kormedmcqa:
    eval_path = f"{PROCESSED_DIR}/kormedmcqa_eval"
    kormedmcqa["test"].save_to_disk(eval_path)
    print(f"Saved evaluation dataset to {eval_path}")

Saving the dataset (0/1 shards):   0%|          | 0/604 [00:00<?, ? examples/s]

Saving the dataset (1/1 shards): 100%|██████████| 604/604 [00:00<00:00, 102681.57 examples/s]

Saving the dataset (1/1 shards): 100%|██████████| 604/604 [00:00<00:00, 89870.50 examples/s] 

Saved evaluation dataset to ../data/processed/kormedmcqa_eval





In [20]:
# Create processing summary
import datetime

summary = {
    "datasets": {},
    "processing_date": str(datetime.datetime.now()),
}

if 'lm_splits' in dir() and lm_splits:
    summary["datasets"]["korean_medical_lm"] = {
        "path": f"{PROCESSED_DIR}/korean_medical_lm",
        "train_size": len(lm_splits["train"]),
        "validation_size": len(lm_splits["validation"]),
        "use": "Stage 1-5 (Embedding training)",
    }

if 'instruction_splits' in dir() and instruction_splits:
    summary["datasets"]["korean_medical_instruction"] = {
        "path": f"{PROCESSED_DIR}/korean_medical_instruction",
        "train_size": len(instruction_splits["train"]),
        "validation_size": len(instruction_splits["validation"]),
        "use": "Stage 6-7 (Instruction tuning)",
    }

if kormedmcqa and "test" in kormedmcqa:
    summary["datasets"]["kormedmcqa_eval"] = {
        "path": f"{PROCESSED_DIR}/kormedmcqa_eval",
        "size": len(kormedmcqa["test"]),
        "use": "Evaluation",
    }

# Save summary
summary_path = f"{PROCESSED_DIR}/processing_summary.json"
with open(summary_path, "w", encoding="utf-8") as f:
    json.dump(summary, f, ensure_ascii=False, indent=2)

print("\n" + "=" * 60)
print("Data Processing Summary")
print("=" * 60)
print(json.dumps(summary, indent=2, ensure_ascii=False))


Data Processing Summary
{
  "datasets": {
    "korean_medical_lm": {
      "path": "../data/processed/korean_medical_lm",
      "train_size": 41196,
      "validation_size": 2169,
      "use": "Stage 1-5 (Embedding training)"
    },
    "korean_medical_instruction": {
      "path": "../data/processed/korean_medical_instruction",
      "train_size": 2244,
      "validation_size": 250,
      "use": "Stage 6-7 (Instruction tuning)"
    },
    "kormedmcqa_eval": {
      "path": "../data/processed/kormedmcqa_eval",
      "size": 604,
      "use": "Evaluation"
    }
  },
  "processing_date": "2025-12-05 09:29:18.853930"
}


In [21]:
print("\n" + "=" * 60)
print("Phase 0: Data Preparation Complete!")
print("=" * 60)
print(f"\nProcessed data saved to: {PROCESSED_DIR}")
print("\nNext steps:")
print("  1. Move to Phase 1: Tokenizer Training")
print("  2. Run phase1_tokenizer/01_train_korean_tokenizer.ipynb")


Phase 0: Data Preparation Complete!

Processed data saved to: ../data/processed

Next steps:
  1. Move to Phase 1: Tokenizer Training
  2. Run phase1_tokenizer/01_train_korean_tokenizer.ipynb
