# Phase 0.2: Collect Korean Medical Datasets

Download and prepare Korean medical datasets for training.

## Contents
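0. Setup and GPU Configuration
1. Download KorMedMCQA (Primary Medical Dataset)
2. Download Medical Reasoning Dataset (Chain-of-Thought) — skipped in this run
3. Download KorMedLawQA — skipped in this run
4. Filter Medical Content from Korean Wikipedia — keyword filter defined, full scan skipped
5. Download General Korean Corpus (for Tokenizer)
6. Summary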

In [1]:
# Setup
import sys
import os

# Put the parent directory on the import path so the `config` package imported
# below can be found (presumably the repository root -- confirm the notebook is
# launched from its own directory). The membership check keeps the cell
# idempotent: re-running it no longer appends another ".." entry each time.
if ".." not in sys.path:
    sys.path.append("..")

# GPU setup
from config.gpu_utils import setup_gpu, print_memory_usage
device = setup_gpu()

# Imports
from datasets import load_dataset, Dataset, DatasetDict, concatenate_datasets
from huggingface_hub import HfApi
import json
from tqdm import tqdm
import re

# Data layout: everything lives under DATA_DIR, split into a directory for raw
# downloads and one for processed outputs.
DATA_DIR = "../data"
RAW_DIR, PROCESSED_DIR = (f"{DATA_DIR}/{subdir}" for subdir in ("raw", "processed"))

# Create both directories up front; exist_ok makes the cell safe to re-run.
for data_subdir in (RAW_DIR, PROCESSED_DIR):
    os.makedirs(data_subdir, exist_ok=True)

print(f"Data directory: {DATA_DIR}")

Using GPU: NVIDIA RTX A6000
Memory: 47.4 GB


  from .autonotebook import tqdm as notebook_tqdm


Data directory: ../data


---
## 1. Download KorMedMCQA (Primary Medical Dataset)

In [2]:
print("Downloading KorMedMCQA (doctor config only to save memory)...")

# Only the "doctor" configuration is fetched. Any failure while loading or
# previewing is reported and leaves kormedmcqa_all empty, so the cells that
# follow can still execute.
try:
    kormedmcqa_doctor = load_dataset("sean0042/KorMedMCQA", "doctor")
    print(f"Loaded doctor config: {kormedmcqa_doctor}")

    # Preview one record: prefer the train split, otherwise the first split listed
    print("\nSample entry:")
    if 'train' in kormedmcqa_doctor:
        split = 'train'
    else:
        split = list(kormedmcqa_doctor.keys())[0]
    sample = kormedmcqa_doctor[split][0]
    for field_name in sample:
        print(f"  {field_name}: {sample[field_name]}")
except Exception as e:
    print(f"Error loading KorMedMCQA: {e}")
    kormedmcqa_all = {}
else:
    # Keyed by config name so downstream cells can iterate over configs
    kormedmcqa_all = {'doctor': kormedmcqa_doctor}

Downloading KorMedMCQA (doctor config only to save memory)...


Loaded doctor config: DatasetDict({
    train: Dataset({
        features: ['subject', 'year', 'period', 'q_number', 'question', 'A', 'B', 'C', 'D', 'E', 'answer', 'cot'],
        num_rows: 1890
    })
    dev: Dataset({
        features: ['subject', 'year', 'period', 'q_number', 'question', 'A', 'B', 'C', 'D', 'E', 'answer', 'cot'],
        num_rows: 164
    })
    test: Dataset({
        features: ['subject', 'year', 'period', 'q_number', 'question', 'A', 'B', 'C', 'D', 'E', 'answer', 'cot'],
        num_rows: 435
    })
    fewshot: Dataset({
        features: ['subject', 'year', 'period', 'q_number', 'question', 'A', 'B', 'C', 'D', 'E', 'answer', 'cot'],
        num_rows: 5
    })
})

Sample entry:
  subject: doctor
  year: 2012
  period: 1
  q_number: 1
  question: 항문압 측정 검사에서 항문 압력이 증가하는 경우는?
  A: 직장질루(rectovaginal fistula)
  B: 항문열창(anal fissure)
  C: 대변실금(fecal incontinence)
  D: 대변메막힘(fecal impaction)
  E: 직장탈출증(rectal prolapse)
  answer: 2
  cot: 


In [3]:
# Quick statistics: one line per (config, split) pair with its row count
print("Dataset Statistics:")
if kormedmcqa_all:
    split_sizes = (
        (config_name, split_name, len(config_ds[split_name]))
        for config_name, config_ds in kormedmcqa_all.items()
        for split_name in config_ds.keys()
    )
    for config_name, split_name, n_examples in split_sizes:
        print(f"  {config_name}/{split_name}: {n_examples} examples")

    # Clean up to save memory
    import gc
    gc.collect()

Dataset Statistics:
  doctor/train: 1890 examples
  doctor/dev: 164 examples
  doctor/test: 435 examples
  doctor/fewshot: 5 examples


In [4]:
# Save KorMedMCQA (all configs combined)
kormedmcqa_path = f"{RAW_DIR}/kormedmcqa"

# Flatten every (config, split) pair into two buckets: rows from a split whose
# name contains "train" go to 'train'; rows from every other split (for the
# doctor config: dev, test and fewshot) go to 'test'.
combined_train = []
combined_test = []

for config, ds in kormedmcqa_all.items():
    for split in ds.keys():
        bucket = combined_train if 'train' in split.lower() else combined_test
        for item in ds[split]:
            item_dict = dict(item)
            item_dict['exam_type'] = config  # Add exam type as column
            # Record where the row came from. Without this, dev / fewshot rows
            # become indistinguishable from real test rows once merged.
            item_dict['source_split'] = split
            bucket.append(item_dict)

# Create combined dataset (an empty list simply yields an empty Dataset)
combined_ds = DatasetDict({
    'train': Dataset.from_list(combined_train),
    'test': Dataset.from_list(combined_test),
})

if combined_train or combined_test:
    combined_ds.save_to_disk(kormedmcqa_path)
    print(f"Saved combined KorMedMCQA to {kormedmcqa_path}")
else:
    # The download cell falls back to an empty mapping on failure; do not
    # overwrite a previously saved copy on disk with an empty dataset.
    print(f"No KorMedMCQA rows available; nothing saved to {kormedmcqa_path}")
print(f"  Train: {len(combined_ds['train'])}, Test: {len(combined_ds['test'])}")

Saving the dataset (0/1 shards):   0%|          | 0/1890 [00:00<?, ? examples/s]

Saving the dataset (1/1 shards): 100%|██████████| 1890/1890 [00:00<00:00, 231134.93 examples/s]

Saving the dataset (1/1 shards): 100%|██████████| 1890/1890 [00:00<00:00, 210243.59 examples/s]




Saving the dataset (0/1 shards):   0%|          | 0/604 [00:00<?, ? examples/s]

Saving the dataset (1/1 shards): 100%|██████████| 604/604 [00:00<00:00, 97264.82 examples/s]

Saving the dataset (1/1 shards): 100%|██████████| 604/604 [00:00<00:00, 84645.65 examples/s]

Saved combined KorMedMCQA to ../data/raw/kormedmcqa
  Train: 1890, Test: 604





---
## 2. Download Medical Reasoning Dataset (Chain-of-Thought)

In [5]:
# The chain-of-thought dataset is not downloaded in this run: announce the skip
# and leave a None placeholder that the summary cell checks for.
med_reasoning = None
for notice_line in (
    "Medical Reasoning KorMedMCQA:",
    "  Source: ChuGyouk/medical-reasoning-train-kormedmcqa",
    "  Skipping to save memory - can be loaded later",
):
    print(notice_line)

Medical Reasoning KorMedMCQA:
  Source: ChuGyouk/medical-reasoning-train-kormedmcqa
  Skipping to save memory - can be loaded later


---
## 3. Download KorMedLawQA

In [6]:
# KorMedLawQA is not downloaded in this run: announce the skip and leave a
# None placeholder that the summary cell checks for.
kormedlawqa = None
for notice_line in (
    "KorMedLawQA:",
    "  Source: snuh/KorMedLawQA",
    "  Skipping to save memory - can be loaded later",
):
    print(notice_line)

KorMedLawQA:
  Source: snuh/KorMedLawQA
  Skipping to save memory - can be loaded later


---
## 4. Filter Medical Content from Korean Wikipedia

In [7]:
# Medical keywords for filtering.
# The list is grouped by theme for readability, so a few terms ("심장", "신장")
# appear under more than one heading; is_medical_article() de-duplicates them
# before counting.
MEDICAL_KEYWORDS_KO = [
    # Diseases
    "질병", "질환", "증후군", "암", "종양", "감염", "바이러스", "세균",
    "당뇨", "고혈압", "뇌졸중", "심장", "폐렴", "간염", "신장",
    
    # Medical practice
    "의학", "의료", "치료", "진단", "수술", "처방", "투약", "주사",
    "병원", "의사", "간호사", "환자", "약사", "약물", "약품",
    
    # Body parts
    "심장", "폐", "간", "신장", "위장", "뇌", "혈관", "뼈", "근육",
    "피부", "눈", "귀", "코", "목", "장기",
    
    # Symptoms
    "증상", "통증", "발열", "기침", "두통", "피로", "구토", "설사",
    "염증", "부종", "출혈",
    
    # Medical specialties
    "내과", "외과", "소아과", "산부인과", "정신과", "피부과", "안과",
    "이비인후과", "치과", "응급의학", "마취과",
    
    # Health
    "건강", "면역", "예방", "백신", "검진", "혈액", "호르몬",
]

def is_medical_article(text, title="", min_matches=2):
    """Check if an article is medical-related by counting keyword hits.

    Args:
        text: Article body. ``None`` is treated as an empty string.
        title: Optional article title, searched together with the body.
            ``None`` is treated as an empty string.
        min_matches: Minimum number of *distinct* keywords from
            ``MEDICAL_KEYWORDS_KO`` that must occur (as substrings) for the
            article to count as medical. Defaults to 2.

    Returns:
        bool: True when at least ``min_matches`` distinct keywords are found.
    """
    combined = f"{title or ''} {text or ''}".lower()

    # Count each distinct keyword once. The keyword list repeats some terms, and
    # counting the raw list let a single word such as "심장" score two matches
    # and clear the threshold on its own.
    # NOTE(review): matching is plain substring search, so one-syllable keywords
    # ("간", "목", "코", ...) also hit inside unrelated words such as "시간" or
    # "목적"; consider stricter matching before running the full Wikipedia scan.
    matches = sum(1 for kw in set(MEDICAL_KEYWORDS_KO) if kw in combined)

    return matches >= min_matches

print(f"Medical keywords: {len(MEDICAL_KEYWORDS_KO)}")

Medical keywords: 74


In [8]:
# The full Wikipedia scan is skipped in this run; only an empty placeholder
# list is created so later cells can reference medical_articles.
medical_articles = []

for notice_line in (
    "Medical Wikipedia filtering:",
    "  Skipping full Wikipedia scan to save memory",
    "  Will use smaller sample for demo",
):
    print(notice_line)

print(f"\nMedical articles placeholder: {len(medical_articles)}")

Medical Wikipedia filtering:
  Skipping full Wikipedia scan to save memory
  Will use smaller sample for demo

Medical articles placeholder: 0


In [9]:
# Nothing to preview: medical article collection is skipped in this run
for notice_line in (
    "Sample medical articles: (skipped)",
    "  Will be collected in full training run",
):
    print(notice_line)

Sample medical articles: (skipped)
  Will be collected in full training run


In [10]:
# No medical Wikipedia dataset is written in this run; keep a None placeholder
wiki_medical = None
print("Medical Wikipedia saving: skipped")

Medical Wikipedia saving: skipped


---
## 5. Download General Korean Corpus (for Tokenizer)

In [11]:
# OSCAR is gated behind HuggingFace authentication, so it is not downloaded
# here; print access instructions instead.
for notice_line in (
    "OSCAR Korean corpus:",
    "  Note: OSCAR-2301 is a GATED dataset requiring HuggingFace authentication",
    "  To access:",
    "    1) Accept terms at https://huggingface.co/datasets/oscar-corpus/OSCAR-2301",
    "    2) Run: huggingface-cli login",
    "\n  Skipping OSCAR download for now.",
    "  Alternative: Use Korean Wikipedia for tokenizer training corpus",
):
    print(notice_line)

# Empty placeholders so later cells can rely on these names existing
tokenizer_corpus, total_chars = [], 0

OSCAR Korean corpus:
  Note: OSCAR-2301 is a GATED dataset requiring HuggingFace authentication
  To access:
    1) Accept terms at https://huggingface.co/datasets/oscar-corpus/OSCAR-2301
    2) Run: huggingface-cli login

  Skipping OSCAR download for now.
  Alternative: Use Korean Wikipedia for tokenizer training corpus


In [12]:
# Create small sample corpus for tokenizer testing
print("Creating small sample corpus for tokenizer...")

wiki_corpus_path = f"{RAW_DIR}/korean_corpus_for_tokenizer.txt"

# Collect a small sample for testing. The budget below is counted in
# *characters* (len(text)), not bytes: Korean text takes several bytes per
# character in UTF-8, so the file on disk is noticeably larger than the
# character count suggests.
try:
    wiki_ko_stream = load_dataset("wikimedia/wikipedia", "20231101.ko", split="train", streaming=True)
    
    tokenizer_corpus = []
    total_chars = 0
    target_chars = 100 * 1024 * 1024  # ~105M characters for demo
    
    for i, article in enumerate(wiki_ko_stream):
        # Report progress on the stream index *before* filtering. Previously the
        # check sat after the short-article `continue`, so a milestone was
        # silently dropped whenever article i happened to be too short.
        if i % 10000 == 0 and i > 0:
            print(f"  Processed {i} articles, {total_chars / 1e6:.1f}M chars")
        
        # Treat a missing or None text field as empty so len() cannot fail
        text = article.get("text") or ""
        if len(text) < 100:
            continue
        
        tokenizer_corpus.append(text)
        total_chars += len(text)
        
        if total_chars >= target_chars:
            break
    
    print(f"\nCollected {len(tokenizer_corpus)} articles, {total_chars / 1e6:.1f}M chars")
    
    # Save one article per line, with all whitespace runs collapsed to one space
    with open(wiki_corpus_path, "w", encoding="utf-8") as f:
        for text in tokenizer_corpus:
            text = re.sub(r'\s+', ' ', text).strip()
            if text:
                f.write(text + "\n")
    
    # Actual on-disk size in GiB (bytes, unlike the character counts above)
    file_size = os.path.getsize(wiki_corpus_path) / (1024**3)
    print(f"Saved! File size: {file_size:.3f}GB")

except Exception as e:
    # Best effort: report the problem and reset the outputs so the summary cell
    # skips the tokenizer corpus entry.
    print(f"Error collecting corpus: {e}")
    tokenizer_corpus = []
    total_chars = 0
    file_size = 0

Creating small sample corpus for tokenizer...


  Processed 10000 articles, 28.3MB


  Processed 30000 articles, 75.2MB


  Processed 40000 articles, 95.0MB



Collected 43566 articles, 104.9MB


Saved! File size: 0.216GB


---
## 6. Summary

In [13]:
# Create summary of collected data
kormedmcqa_rows = len(combined_ds.get('train', [])) + len(combined_ds.get('test', []))

medical_section = {
    "kormedmcqa": {
        "path": f"{RAW_DIR}/kormedmcqa",
        "size": kormedmcqa_rows,
        "type": "QA",
    },
}
general_section = {}
collected_datasets = {
    "medical": medical_section,
    "general_corpus": general_section,
}

# Tokenizer corpus is listed only when the collection cell produced texts
if tokenizer_corpus:
    general_section["tokenizer_corpus"] = {
        "path": wiki_corpus_path,
        "size_gb": file_size,
        "num_texts": len(tokenizer_corpus),
    }

# Optional datasets: each entry is added only when its variable exists in the
# notebook namespace and holds a truthy value (the skipped ones are None / empty).
if globals().get('med_reasoning'):
    medical_section["medical_reasoning"] = {
        "path": f"{RAW_DIR}/medical_reasoning_kormedmcqa",
        "type": "QA + CoT",
    }

if globals().get('kormedlawqa'):
    medical_section["kormedlawqa"] = {
        "path": f"{RAW_DIR}/kormedlawqa",
        "type": "Medical Law QA",
    }

if globals().get('medical_articles'):
    medical_section["wiki_medical_ko"] = {
        "path": f"{RAW_DIR}/wiki_medical_ko",
        "size": len(medical_articles),
        "type": "Medical Wikipedia",
    }

# Serialize once; the same text is written to disk and echoed below
summary_json = json.dumps(collected_datasets, ensure_ascii=False, indent=2)
with open(f"{DATA_DIR}/collected_datasets.json", "w", encoding="utf-8") as summary_file:
    summary_file.write(summary_json)

divider = "=" * 60
print(divider)
print("Data Collection Summary")
print(divider)
print(summary_json)

Data Collection Summary
{
  "medical": {
    "kormedmcqa": {
      "path": "../data/raw/kormedmcqa",
      "size": 2494,
      "type": "QA"
    }
  },
  "general_corpus": {
    "tokenizer_corpus": {
      "path": "../data/raw/korean_corpus_for_tokenizer.txt",
      "size_gb": 0.2163737677037716,
      "num_texts": 43566
    }
  }
}


In [14]:
# Closing banner with the output location and pointers to the follow-up notebooks
divider = "=" * 60
print("\n" + divider)
print("Korean Medical Data Collection Complete!")
print(divider)
print(f"\nData saved to: {RAW_DIR}")
print("\nNext steps:")
for next_step in (
    "  1. Run 03_collect_bilingual_dict.ipynb to create bilingual dictionary",
    "  2. Run 04_preprocess_data.ipynb to prepare training data",
):
    print(next_step)


Korean Medical Data Collection Complete!

Data saved to: ../data/raw

Next steps:
  1. Run 03_collect_bilingual_dict.ipynb to create bilingual dictionary
  2. Run 04_preprocess_data.ipynb to prepare training data
