# Phase 0.3: Create Bilingual Medical Dictionary

Build an English-Korean bilingual medical dictionary for WECHSEL-style embedding alignment.

## Contents
1. Core Medical Terminology
2. Body Parts and Anatomy
3. Diseases and Conditions
4. Symptoms and Signs
5. Treatments and Procedures
6. Medical Professionals and Settings
7. Lab Values and Measurements
8. Combine and Validate Dictionary
9. Save Dictionary

In [None]:
# Setup
import sys
import os
import json
# Put the parent directory on the import path.
sys.path.append("..")

# All dictionary files produced by this notebook are written under DICT_DIR.
DATA_DIR = "../data"
DICT_DIR = DATA_DIR + "/bilingual_dict"
os.makedirs(DICT_DIR, exist_ok=True)

print("Dictionary directory: " + DICT_DIR)

---
## 1. Core Medical Terminology

In [None]:
# Core medical vocabulary (English -> Korean).
# The table is assembled section by section; sections are applied in order, so
# the resulting entry order matches the order in which terms are listed here.
core_medical = {}
for _section in (
    {  # Medicine and clinical practice
        "medicine": "의학",
        "medical": "의료",
        "health": "건강",
        "healthcare": "의료",
        "clinical": "임상",
        "diagnosis": "진단",
        "treatment": "치료",
        "therapy": "치료법",
        "prognosis": "예후",
    },
    {  # Kinds of health problems
        "symptom": "증상",
        "syndrome": "증후군",
        "disease": "질병",
        "disorder": "장애",
        "condition": "상태",
        "illness": "질환",
        "infection": "감염",
        "inflammation": "염증",
    },
    {  # Qualifiers
        "chronic": "만성",
        "acute": "급성",
        "benign": "양성",
        "malignant": "악성",
        "contagious": "전염성",
        "hereditary": "유전성",
        "genetic": "유전적",
        "congenital": "선천성",
        "acquired": "후천성",
        "primary": "일차",
        "secondary": "이차",
    },
    {  # Unwanted outcomes
        "complication": "합병증",
        "side effect": "부작용",
        "adverse effect": "이상반응",
    },
):
    core_medical.update(_section)

print(f"Core medical terms: {len(core_medical)}")

---
## 2. Body Parts and Anatomy

In [None]:
# Anatomy vocabulary (English -> Korean), grouped by body system.
# Sections are merged in order, so entry order follows the listing below.
body_parts = {}
for _section in (
    {  # Major organs
        "heart": "심장",
        "lung": "폐",
        "liver": "간",
        "kidney": "신장",
        "brain": "뇌",
        "stomach": "위",
        "intestine": "장",
        "small intestine": "소장",
        "large intestine": "대장",
        "colon": "결장",
        "rectum": "직장",
        "pancreas": "췌장",
        "spleen": "비장",
        "gallbladder": "담낭",
        "bladder": "방광",
        "uterus": "자궁",
        "ovary": "난소",
        "prostate": "전립선",
        "thyroid": "갑상선",
    },
    {  # Cardiovascular
        "blood": "혈액",
        "blood vessel": "혈관",
        "artery": "동맥",
        "vein": "정맥",
        "capillary": "모세혈관",
        "aorta": "대동맥",
    },
    {  # Musculoskeletal
        "bone": "뼈",
        "muscle": "근육",
        "joint": "관절",
        "cartilage": "연골",
        "tendon": "힘줄",
        "ligament": "인대",
        "spine": "척추",
        "vertebra": "척추뼈",
        "skull": "두개골",
        "rib": "갈비뼈",
    },
    {  # Nervous system
        "nerve": "신경",
        "spinal cord": "척수",
        "neuron": "뉴런",
    },
    {  # Skin and external
        "skin": "피부",
        "hair": "모발",
        "nail": "손톱",
    },
    {  # Sensory organs
        "eye": "눈",
        "ear": "귀",
        "nose": "코",
        "tongue": "혀",
        "throat": "목",
        "larynx": "후두",
        "pharynx": "인두",
        "trachea": "기관",
        "esophagus": "식도",
    },
    {  # Other
        "cell": "세포",
        "tissue": "조직",
        "organ": "장기",
        "gland": "선",
        "lymph node": "림프절",
    },
):
    body_parts.update(_section)

print(f"Body parts: {len(body_parts)}")

---
## 3. Diseases and Conditions

In [None]:
# Disease and condition vocabulary (English -> Korean), grouped by specialty.
# Sections are merged in order, so entry order follows the listing below.
diseases = {}
for _section in (
    {  # Cancer
        "cancer": "암",
        "tumor": "종양",
        "carcinoma": "암종",
        "leukemia": "백혈병",
        "lymphoma": "림프종",
        "melanoma": "흑색종",
        "breast cancer": "유방암",
        "lung cancer": "폐암",
        "colon cancer": "대장암",
        "stomach cancer": "위암",
        "liver cancer": "간암",
        "pancreatic cancer": "췌장암",
    },
    {  # Cardiovascular
        "heart disease": "심장병",
        "heart attack": "심장마비",
        "myocardial infarction": "심근경색",
        "stroke": "뇌졸중",
        "hypertension": "고혈압",
        "hypotension": "저혈압",
        "arrhythmia": "부정맥",
        "atherosclerosis": "동맥경화",
        "aneurysm": "동맥류",
        "heart failure": "심부전",
    },
    {  # Metabolic
        "diabetes": "당뇨병",
        "type 1 diabetes": "제1형 당뇨병",
        "type 2 diabetes": "제2형 당뇨병",
        "obesity": "비만",
        "hyperthyroidism": "갑상선기능항진증",
        "hypothyroidism": "갑상선기능저하증",
        "gout": "통풍",
    },
    {  # Respiratory
        "pneumonia": "폐렴",
        "bronchitis": "기관지염",
        "asthma": "천식",
        "tuberculosis": "결핵",
        "COPD": "만성폐쇄성폐질환",
        "emphysema": "폐기종",
    },
    {  # Infectious ("influenza" and "flu" intentionally share one translation)
        "influenza": "독감",
        "flu": "독감",
        "cold": "감기",
        "COVID-19": "코로나19",
        "hepatitis": "간염",
        "hepatitis A": "A형 간염",
        "hepatitis B": "B형 간염",
        "hepatitis C": "C형 간염",
        "HIV": "HIV",
        "AIDS": "에이즈",
        "meningitis": "뇌수막염",
        "sepsis": "패혈증",
    },
    {  # Gastrointestinal
        "gastritis": "위염",
        "ulcer": "궤양",
        "gastric ulcer": "위궤양",
        "appendicitis": "맹장염",
        "pancreatitis": "췌장염",
        "cirrhosis": "간경변",
        "gallstones": "담석",
    },
    {  # Neurological
        "Alzheimer's disease": "알츠하이머병",
        "Parkinson's disease": "파킨슨병",
        # NOTE(review): "간질" appears to be the older term; "뇌전증" is, as far
        # as I know, the current official name - confirm which one the target
        # corpus uses before changing.
        "epilepsy": "간질",
        "seizure": "발작",
        "migraine": "편두통",
        "multiple sclerosis": "다발성경화증",
    },
    {  # Musculoskeletal
        "arthritis": "관절염",
        "osteoarthritis": "골관절염",
        "rheumatoid arthritis": "류마티스 관절염",
        "osteoporosis": "골다공증",
        "fracture": "골절",
        "herniated disc": "디스크",
    },
    {  # Mental health
        "depression": "우울증",
        "anxiety": "불안",
        "schizophrenia": "조현병",
        "bipolar disorder": "양극성 장애",
        "PTSD": "외상 후 스트레스 장애",
    },
    {  # Autoimmune
        "lupus": "루푸스",
        "autoimmune disease": "자가면역질환",
    },
    {  # Allergies
        "allergy": "알레르기",
        "anaphylaxis": "아나필락시스",
    },
):
    diseases.update(_section)

print(f"Diseases: {len(diseases)}")

---
## 4. Symptoms and Signs

In [None]:
# Symptom and sign vocabulary (English -> Korean), grouped by body system.
# Sections are merged in order, so entry order follows the listing below.
symptoms = {}
for _section in (
    {  # General symptoms
        "pain": "통증",
        "fever": "발열",
        "fatigue": "피로",
        "weakness": "쇠약",
        "dizziness": "어지러움",
        "nausea": "메스꺼움",
        "vomiting": "구토",
        "diarrhea": "설사",
        "constipation": "변비",
        "swelling": "부종",
        "bleeding": "출혈",
        "bruise": "멍",
        "rash": "발진",
        "itching": "가려움",
    },
    {  # Head/Neurological
        "headache": "두통",
        # "migraine" is also a key in `diseases`, with the same translation.
        "migraine": "편두통",
        "confusion": "혼란",
        "memory loss": "기억 상실",
        "numbness": "저림",
        "tingling": "따끔거림",
        "paralysis": "마비",
    },
    {  # Respiratory
        "cough": "기침",
        "shortness of breath": "호흡곤란",
        "wheezing": "천명",
        "sore throat": "인후통",
        "runny nose": "콧물",
        "congestion": "코막힘",
        "sneezing": "재채기",
    },
    {  # Cardiovascular
        "chest pain": "흉통",
        "palpitation": "두근거림",
        "rapid heartbeat": "빠른 심장박동",
    },
    {  # Gastrointestinal
        "abdominal pain": "복통",
        "bloating": "복부팽만",
        "indigestion": "소화불량",
        "heartburn": "속쓰림",
        "loss of appetite": "식욕부진",
    },
    {  # Urinary
        "frequent urination": "빈뇨",
        "painful urination": "배뇨통",
        "blood in urine": "혈뇨",
    },
    {  # Other
        "weight loss": "체중감소",
        "weight gain": "체중증가",
        "insomnia": "불면증",
        "sweating": "발한",
        "chills": "오한",
    },
):
    symptoms.update(_section)

print(f"Symptoms: {len(symptoms)}")

---
## 5. Treatments and Procedures

In [None]:
# Treatment and procedure vocabulary (English -> Korean).
# Sections are merged in order, so entry order follows the listing below.
treatments = {}
for _section in (
    {  # General treatments
        "surgery": "수술",
        "operation": "수술",
        "procedure": "시술",
        "transplant": "이식",
        "transfusion": "수혈",
        "biopsy": "생검",
    },
    {  # Medications
        "medication": "약물",
        "drug": "약",
        # "medicine" is also a key in `core_medical` ("의학"); the combine cell
        # merges `treatments` later, so this value is the one kept there.
        "medicine": "약",
        "prescription": "처방",
        "dosage": "용량",
        "antibiotic": "항생제",
        "antiviral": "항바이러스제",
        "painkiller": "진통제",
        "analgesic": "진통제",
        "anti-inflammatory": "항염증제",
        "steroid": "스테로이드",
        "vaccine": "백신",
        "vaccination": "예방접종",
        # NOTE(review): "면역" reads as "immunity" rather than "immunization" -
        # confirm the intended translation.
        "immunization": "면역",
        "chemotherapy": "화학요법",
        "radiation therapy": "방사선치료",
        "hormone therapy": "호르몬 치료",
    },
    {  # Procedures
        "injection": "주사",
        "infusion": "주입",
        "IV": "정맥주사",
        "dialysis": "투석",
        "endoscopy": "내시경",
        "colonoscopy": "대장내시경",
        "gastroscopy": "위내시경",
        "MRI": "MRI",
        "CT scan": "CT",
        "X-ray": "엑스레이",
        "ultrasound": "초음파",
        "ECG": "심전도",
        "EKG": "심전도",
    },
    {  # Surgical procedures
        "appendectomy": "맹장수술",
        "bypass surgery": "우회술",
        "cesarean section": "제왕절개",
        "laparoscopy": "복강경수술",
    },
    {  # Other
        "physical therapy": "물리치료",
        "rehabilitation": "재활",
        "occupational therapy": "작업치료",
        "psychotherapy": "심리치료",
    },
):
    treatments.update(_section)

print(f"Treatments: {len(treatments)}")

---
## 6. Medical Professionals and Settings

In [None]:
# Vocabulary for medical professionals, roles, care settings and specialties
# (English -> Korean). Sections are merged in order, so entry order follows
# the listing below.
professionals = {}
for _section in (
    {  # Healthcare providers
        "doctor": "의사",
        "physician": "의사",
        "nurse": "간호사",
        "surgeon": "외과의사",
        "pharmacist": "약사",
        "dentist": "치과의사",
        "psychiatrist": "정신과의사",
        "psychologist": "심리학자",
        "therapist": "치료사",
        "radiologist": "방사선과의사",
        "anesthesiologist": "마취과의사",
        "pediatrician": "소아과의사",
        "cardiologist": "심장전문의",
        "neurologist": "신경과전문의",
        "dermatologist": "피부과전문의",
        "ophthalmologist": "안과의사",
        "oncologist": "종양전문의",
    },
    {  # Roles
        "patient": "환자",
        "caregiver": "간병인",
        "specialist": "전문의",
    },
    {  # Settings
        "hospital": "병원",
        "clinic": "의원",
        "emergency room": "응급실",
        "ICU": "중환자실",
        "operating room": "수술실",
        "pharmacy": "약국",
        "laboratory": "검사실",
        "ward": "병동",
    },
    {  # Specialties
        "internal medicine": "내과",
        # "surgery" is also a key in `treatments` ("수술"); the combine cell
        # merges `professionals` later, so this value is the one kept there.
        "surgery": "외과",
        "pediatrics": "소아과",
        "obstetrics": "산과",
        "gynecology": "부인과",
        "orthopedics": "정형외과",
        "neurology": "신경과",
        "cardiology": "심장내과",
        "dermatology": "피부과",
        "ophthalmology": "안과",
        "otolaryngology": "이비인후과",
        "urology": "비뇨기과",
        "psychiatry": "정신과",
        "radiology": "방사선과",
        "emergency medicine": "응급의학과",
        "anesthesiology": "마취과",
        "oncology": "종양학",
    },
):
    professionals.update(_section)

print(f"Professionals and settings: {len(professionals)}")

---
## 7. Lab Values and Measurements

In [None]:
# Lab test and measurement vocabulary (English -> Korean).
# Sections are merged in order, so entry order follows the listing below.
lab_values = {}
for _section in (
    {  # Blood tests
        "blood test": "혈액검사",
        "blood pressure": "혈압",
        "blood sugar": "혈당",
        "blood count": "혈구수",
        "hemoglobin": "헤모글로빈",
        "red blood cell": "적혈구",
        "white blood cell": "백혈구",
        "platelet": "혈소판",
        "cholesterol": "콜레스테롤",
        "triglyceride": "중성지방",
    },
    {  # Measurements
        "temperature": "체온",
        "pulse": "맥박",
        "heart rate": "심박수",
        "respiratory rate": "호흡수",
        "oxygen saturation": "산소포화도",
        "BMI": "체질량지수",
        "weight": "체중",
        "height": "키",
    },
    {  # Other labs
        "urine test": "소변검사",
        "stool test": "대변검사",
        # "biopsy" is also a key in `treatments` ("생검"); the combine cell
        # merges `lab_values` last, so this value is the one kept there.
        "biopsy": "조직검사",
        "culture": "배양검사",
    },
):
    lab_values.update(_section)

print(f"Lab values: {len(lab_values)}")

---
## 8. Combine and Validate Dictionary

In [None]:
# Combine all dictionaries
from collections import defaultdict

# Merge order matters: when an English term is defined in more than one
# category, the category merged later overwrites the earlier translation.
category_sources = [
    ("core_medical", core_medical),
    ("body_parts", body_parts),
    ("diseases", diseases),
    ("symptoms", symptoms),
    ("treatments", treatments),
    ("professionals", professionals),
    ("lab_values", lab_values),
]

bilingual_dict = {}
# English term -> [(category, Korean), ...] in merge order; used only to
# report terms that appear in several categories.
definitions_by_term = defaultdict(list)
for category_name, category_terms in category_sources:
    bilingual_dict.update(category_terms)
    for en_term, ko_term in category_terms.items():
        definitions_by_term[en_term].append((category_name, ko_term))

print(f"Total bilingual entries: {len(bilingual_dict)}")

# Surface English terms that were overwritten during the merge. Without this
# report the total above is simply smaller than the sum of the category sizes
# and the replaced translations disappear unnoticed.
repeated_terms = {
    en_term: definitions
    for en_term, definitions in definitions_by_term.items()
    if len(definitions) > 1
}
if repeated_terms:
    print("\nEnglish terms defined in more than one category (last category wins):")
    for en_term, definitions in repeated_terms.items():
        variants = ", ".join(f"{name}={ko}" for name, ko in definitions)
        print(f"  {en_term}: {variants} -> kept {bilingual_dict[en_term]}")

# Check for duplicates
korean_values = list(bilingual_dict.values())
unique_korean = set(korean_values)
print(f"Unique Korean translations: {len(unique_korean)}")

# Group English terms by Korean translation in a single pass instead of
# rescanning the whole dictionary once per duplicated value.
english_by_korean = defaultdict(list)
for en_term, ko_term in bilingual_dict.items():
    english_by_korean[ko_term].append(en_term)
duplicates = [ko for ko, en_terms in english_by_korean.items() if len(en_terms) > 1]

if duplicates:
    print("\nDuplicate Korean values found (same translation for different English terms):")
    max_shown = 10
    for dup in duplicates[:max_shown]:
        print(f"  {dup}: {english_by_korean[dup]}")
    # Make the truncation explicit so a long list is not mistaken for complete.
    if len(duplicates) > max_shown:
        print(f"  ... and {len(duplicates) - max_shown} more")

In [None]:
# Per-category view of the same tables; each key is the category name and each
# value is the corresponding English -> Korean dictionary.
categorized_dict = dict(
    core_medical=core_medical,
    body_parts=body_parts,
    diseases=diseases,
    symptoms=symptoms,
    treatments=treatments,
    professionals=professionals,
    lab_values=lab_values,
)

# Report how many terms each category contributes.
print("\nDictionary by category:")
for category_name, category_terms in categorized_dict.items():
    term_count = len(category_terms)
    print(f"  {category_name}: {term_count} terms")

---
## 9. Save Dictionary

In [None]:
# Output paths (the summary cell below records these same paths).
flat_dict_path = f"{DICT_DIR}/bilingual_medical_dict.json"
categorized_path = f"{DICT_DIR}/bilingual_medical_dict_categorized.json"
reverse_path = f"{DICT_DIR}/bilingual_medical_dict_ko_en.json"

# Reverse dictionary (Korean -> English). A Korean term maps to exactly one
# English term, so when several English terms share a translation only the
# last one in bilingual_dict order is kept.
reverse_dict = {v: k for k, v in bilingual_dict.items()}

# Write the flat dictionary (for embedding alignment), the categorized
# dictionary and the reverse dictionary. ensure_ascii=False keeps the Korean
# text readable in the files instead of \uXXXX escapes.
for out_label, out_path, out_payload in (
    ("flat", flat_dict_path, bilingual_dict),
    ("categorized", categorized_path, categorized_dict),
    ("reverse", reverse_path, reverse_dict),
):
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(out_payload, f, ensure_ascii=False, indent=2)
    print(f"Saved {out_label} dictionary to {out_path}")

# The reverse mapping is lossy whenever translations are shared; say so
# explicitly instead of dropping the synonyms silently.
collapsed_terms = len(bilingual_dict) - len(reverse_dict)
if collapsed_terms:
    print(
        f"Note: {collapsed_terms} English terms share a Korean translation with "
        "another term and are not present in the reverse dictionary"
    )

In [None]:
# Build a small manifest: overall size, size per category, and output paths.
category_sizes = {}
for category_name, category_terms in categorized_dict.items():
    category_sizes[category_name] = len(category_terms)

summary = dict(
    total_entries=len(bilingual_dict),
    categories=category_sizes,
    files=dict(
        flat=flat_dict_path,
        categorized=categorized_path,
        reverse=reverse_path,
    ),
)

# Persist the manifest next to the dictionaries.
summary_path = f"{DICT_DIR}/dictionary_summary.json"
with open(summary_path, "w", encoding="utf-8") as f:
    json.dump(summary, f, ensure_ascii=False, indent=2)

# Echo the same manifest to the notebook output.
banner = "=" * 60
print("\n" + banner)
print("Bilingual Dictionary Summary")
print(banner)
print(json.dumps(summary, indent=2, ensure_ascii=False))

In [None]:
# Closing report: where the files went, how many entries, and what to run next.
banner = "=" * 60
closing_lines = [
    "\n" + banner,
    "Bilingual Dictionary Creation Complete!",
    banner,
    f"\nDictionary saved to: {DICT_DIR}",
    f"Total entries: {len(bilingual_dict)}",
    "\nNext steps:",
    "  1. Run 04_preprocess_data.ipynb to prepare training data",
    "  2. This dictionary will be used for WECHSEL-style embedding initialization",
]
print("\n".join(closing_lines))