In [1]:
import pandas as pd
import numpy as np
import os
import warnings
import logging
from pathlib import Path
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel, AutoTokenizer, AutoModel
from transformers import AdamW, get_linear_schedule_with_warmup
import torch.nn as nn
import xgboost as xgb
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, f1_score, precision_score, recall_score
from tqdm import tqdm
import json
from datetime import datetime
import gc
from torch.cuda.amp import autocast, GradScaler
from torch.utils.checkpoint import checkpoint
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel
import gc
from torch.cuda import Event

In [2]:
# Read the raw medical-claims table (UTF-8 CSV stored on the mounted Google Drive)
# and display its (rows, columns) shape as the cell output.
df = pd.read_csv(
    "/content/drive/MyDrive/University/4-2/정보기술학회/data/medical_data.csv",
    encoding="utf-8",
)
df.shape

(2891197, 11)

In [3]:
# Step 1: count how many rows each department code (진료과목코드) has.
code_counts = df["진료과목코드"].value_counts()

# Step 2: keep only the codes with at least 1000 rows (codes with fewer than 1000 are dropped).
valid_codes = code_counts[code_counts >= 1000].index

# Step 3: draw exactly 1000 rows per remaining code (fixed seed for reproducibility).
# The per-code samples are collected in a list and concatenated once; growing a
# DataFrame with pd.concat inside the loop re-copies all previous rows on every
# iteration and relies on concatenating with an empty frame.
sampled_parts = [
    df[df["진료과목코드"] == code].sample(n=1000, random_state=42)
    for code in valid_codes
]
# pd.concat raises on an empty list, so fall back to an empty slice that keeps the columns.
filtered_df = pd.concat(sampled_parts) if sampled_parts else df.iloc[0:0]

# Step 4: replace df with the balanced sample, using a fresh 0..n-1 index.
df = filtered_df.reset_index(drop=True)

In [4]:
# Hold out 20% of the rows as the test set, then split the remaining 80% into
# train/validation at 75/25, which yields a 60/20/20 split overall.
train_val_df, test_df = train_test_split(df, random_state=42, test_size=0.2)
train_df, val_df = train_test_split(train_val_df, random_state=42, test_size=0.25)

# Report the size of each split.
print(
    f"훈련 세트 크기: {len(train_df)}",
    f"검증 세트 크기: {len(val_df)}",
    f"테스트 세트 크기: {len(test_df)}",
    sep="\n",
)

# Class distribution of both targets inside the training split.
print("\n진료과목코드 분포:", train_df['진료과목코드'].value_counts(), sep="\n")
print("\n주상병코드 분포:", train_df['주상병코드'].value_counts(), sep="\n")

훈련 세트 크기: 10200
검증 세트 크기: 3400
테스트 세트 크기: 3400

진료과목코드 분포:
진료과목코드
1     625
23    620
24    611
14    608
5     606
10    605
7     603
9     602
13    602
12    602
0     602
2     599
21    594
16    587
11    583
6     576
4     575
Name: count, dtype: int64

주상병코드 분포:
주상병코드
J20    1715
I63    1260
K29     920
J00     782
J06     666
J30     548
A09     506
J02     392
K21     376
J03     326
H10     244
K64     214
K30     191
J01     184
K59     182
J04     165
J18     153
H04     114
K58     112
I20     110
J45      89
H52      87
J32      87
H66      86
H00      85
J21      81
H01      72
H25      65
H60      58
K25      58
H35      56
H16      50
J36      42
I48      30
H11      26
J31      24
H65      22
J37      13
A04       9
Name: count, dtype: int64


In [None]:
# def create_balanced_dataset(df, samples_per_class=1000, min_samples=1000):
#     """
#     '진료과목코드'와 '주상병코드'의 조합별로 샘플 수를 제한하고,
#     min_samples 개수 미만인 샘플은 드랍하여 균형 잡힌 데이터셋을 생성합니다.

#     Args:
#         df (pd.DataFrame): 원본 데이터프레임.
#         samples_per_class (int): 각 조합별로 가져올 최대 샘플 수. 기본값은 1000입니다.
#         min_samples (int): 각 조합별로 가져올 최소 샘플 수. 기본값은 1000입니다.

#     Returns:
#         pd.DataFrame: 균형 잡힌 데이터프레임.
#     """
#     balanced_data = []

#     # '진료과목코드'와 '주상병코드'의 고유한 조합을 가져옵니다.
#     unique_combinations = df[['진료과목코드', '주상병코드']].drop_duplicates()

#     # 각 조합별로 샘플을 추출합니다.
#     for _, row in unique_combinations.iterrows():
#         dept_code = row['진료과목코드']
#         disease_code = row['주상병코드']

#         # 현재 조합에 해당하는 데이터를 필터링합니다.
#         filtered_df = df[(df['진료과목코드'] == dept_code) & (df['주상병코드'] == disease_code)]

#         # min_samples 개수 이상인 경우에만 샘플링합니다.
#         if len(filtered_df) >= min_samples:
#             # samples_per_class만큼 샘플을 추출합니다. replace=False로 설정하여 중복 샘플링을 방지합니다.
#             sampled_df = filtered_df.sample(n=samples_per_class, replace=False)
#             balanced_data.append(sampled_df)

#     # 추출된 샘플들을 하나의 데이터프레임으로 합칩니다.
#     balanced_df = pd.concat(balanced_data, ignore_index=True)

#     return balanced_df

# # 균형 잡힌 데이터셋을 생성합니다.
# balanced_df = create_balanced_dataset(df, samples_per_class=1000, min_samples=1000)

# # 6. 훈련/검증/테스트 세트 분리 (60/20/20)
# # 먼저 테스트 세트 분리
# train_val_df, test_df = train_test_split(balanced_df, test_size=0.2, random_state=42,
#                                         stratify=balanced_df[['진료과목코드', '주상병코드']])

# # 훈련 세트와 검증 세트 분리
# train_df, val_df = train_test_split(train_val_df, test_size=0.25, random_state=42,  # 0.25 = 20/80
#                                    stratify=train_val_df[['진료과목코드', '주상병코드']])

# print(f"훈련 세트 크기: {len(train_df)}")
# print(f"검증 세트 크기: {len(val_df)}")
# print(f"테스트 세트 크기: {len(test_df)}")

# # 7. 클래스 분포 확인
# print("\n진료과목코드 분포:")
# print(train_df['진료과목코드'].value_counts())
# print("\n주상병코드 분포:")
# print(train_df['주상병코드'].value_counts())

In [5]:
# Show the (rows, columns) shape of the train, test and validation frames, one per line.
print(train_df.shape, test_df.shape, val_df.shape, sep="\n")

(10200, 11)
(3400, 11)
(3400, 11)


In [6]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def set_seed(seed):
    """Seed every random number generator this notebook can draw from.

    Seeds Python's ``random`` module, NumPy's legacy global generator and
    PyTorch (CPU generator plus all CUDA devices).

    Args:
        seed (int): Seed value applied to all generators.
    """
    # Local import: the notebook's import cell does not import `random`.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Safe to call without a GPU; the CUDA seeding is deferred until CUDA is initialised.
    torch.cuda.manual_seed_all(seed)


set_seed(42)

In [7]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, AutoConfig
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from tqdm import tqdm


class MedicalDataset(Dataset):
    """Torch dataset pairing a symptom text with tabular features and two labels.

    Each item holds the tokenised '증상' text, a float feature vector built from
    the numeric/categorical columns, and the integer-encoded department
    ('진료과목코드') and disease ('주상병코드') labels.

    Args:
        df (pd.DataFrame): Source rows. Must contain the columns '증상',
            '진료과목코드', '주상병코드', '연령대코드', '요양일수', '입내원일수',
            '총처방일수' and '성별코드'.
        tokenizer: Callable tokenizer invoked with the keyword arguments used
            in ``__getitem__``; it must return 'input_ids' and 'attention_mask'
            tensors with a leading batch dimension of 1.
        max_length (int): Padding/truncation length passed to the tokenizer.
        dept_encoder (LabelEncoder, optional): Already-fitted department
            encoder. When given it is reused (``transform`` only), so this
            dataset shares its label ids with the dataset that fitted it.
        disease_encoder (LabelEncoder, optional): Same, for disease codes.

    Raises:
        ValueError: From ``LabelEncoder.transform`` when a supplied encoder
            meets a label it was not fitted on.
    """

    def __init__(self, df, tokenizer, max_length=512, dept_encoder=None, disease_encoder=None):
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length

        # Label encoding. Fitting a fresh encoder per dataset gives train and
        # validation different id<->code tables, so callers can pass the
        # training encoders to keep one shared label space.
        if dept_encoder is None:
            self.dept_encoder = LabelEncoder()
            self.dept_labels = self.dept_encoder.fit_transform(df['진료과목코드'])
        else:
            self.dept_encoder = dept_encoder
            self.dept_labels = dept_encoder.transform(df['진료과목코드'])

        if disease_encoder is None:
            self.disease_encoder = LabelEncoder()
            self.disease_labels = self.disease_encoder.fit_transform(df['주상병코드'])
        else:
            self.disease_encoder = disease_encoder
            self.disease_labels = disease_encoder.transform(df['주상병코드'])

        # Tabular features: kept as a DataFrame (callers read `.columns`) and
        # cached as a contiguous float32 array so item access does not go
        # through pandas row indexing.
        self.tabular_features = self._prepare_tabular_features()
        self._tabular_array = np.ascontiguousarray(
            self.tabular_features.to_numpy(dtype=np.float32)
        )

        # Map each encoded department id to the encoded disease ids seen with it.
        self.dept_disease_mapping = {}
        for dept in df['진료과목코드'].unique():
            dept_idx = self.dept_encoder.transform([dept])[0]
            dept_diseases = df[df['진료과목코드'] == dept]['주상병코드'].unique()
            self.dept_disease_mapping[dept_idx] = self.disease_encoder.transform(dept_diseases)

    def _prepare_tabular_features(self):
        """Build the tabular feature frame: z-scored numerics plus one-hot categoricals.

        Returns:
            pd.DataFrame: One float column per numeric feature followed by one
            float indicator column per category value.
        """
        numerical_features = ['연령대코드', '요양일수', '입내원일수', '총처방일수']
        categorical_features = ['성별코드']

        # Z-score the numeric columns. A constant column has std 0 (and a
        # single-row frame has std NaN); dividing by that would fill the column
        # with NaN, so those stds are replaced by 1 and the column becomes 0.
        numerical_data = self.df[numerical_features].astype(float)
        column_std = numerical_data.std().replace(0.0, 1.0).fillna(1.0)
        normalized_numerical = (numerical_data - numerical_data.mean()) / column_std

        # One-hot encode the categorical columns. `columns=` forces encoding
        # even when the codes are stored as integers (get_dummies otherwise
        # leaves numeric columns untouched), and `dtype=float` avoids the bool
        # columns newer pandas returns, which cannot be mixed into a float tensor.
        categorical_data = pd.get_dummies(
            self.df[categorical_features], columns=categorical_features, dtype=float
        )

        # Numeric columns first, then the indicator columns.
        return pd.concat([normalized_numerical, categorical_data], axis=1)

    def __len__(self):
        """Return the number of rows in the underlying frame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return one sample as a dict of tensors.

        Keys: 'input_ids' and 'attention_mask' (tokenizer output with the batch
        dimension removed), 'tabular' (float32 feature vector), 'dept_label'
        and 'disease_label' (0-dim int64 tensors).
        """
        # Symptom text; non-string values (e.g. NaN) are stringified.
        text = str(self.df['증상'].iloc[idx])

        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            # squeeze(0) removes only the batch dimension; a bare squeeze()
            # would also drop the sequence dimension when max_length == 1.
            'input_ids': encoding['input_ids'].squeeze(0).contiguous(),
            'attention_mask': encoding['attention_mask'].squeeze(0).contiguous(),
            'tabular': torch.from_numpy(self._tabular_array[idx]).contiguous(),
            'dept_label': torch.tensor(int(self.dept_labels[idx]), dtype=torch.long),
            'disease_label': torch.tensor(int(self.disease_labels[idx]), dtype=torch.long)
        }

In [16]:
class MultiModalMedicalModel(nn.Module):
    """Text + tabular model predicting a department and a department-aware disease.

    A BERT encoder embeds the symptom text and a small MLP embeds the tabular
    features. The fused representation feeds a department classifier, and one
    disease head per department. The per-department disease logits are mixed
    using the predicted department probabilities.

    Args:
        num_dept_classes (int): Number of department classes.
        disease_classes_per_dept (dict): Department id -> number of distinct
            diseases seen for it. Only used to record ``max_disease_classes``.
        num_disease_classes (int): Size of the global disease label space.
            Every disease head has this width because disease labels are
            indices into the global encoder, not per-department indices.
        tabular_size (int): Number of tabular input features.
    """

    def __init__(self, num_dept_classes, disease_classes_per_dept, num_disease_classes, tabular_size):
        super().__init__()

        # Overrides written onto the pretrained config. The architecture values
        # are BERT-base sized (768 hidden, 12 heads, 12 layers); nothing is reduced.
        # NOTE(review): vocab_size is hard-coded and the checkpoint is loaded
        # with ignore_mismatched_sizes=True, so a mismatch with the checkpoint
        # would silently re-initialise the word embeddings - confirm the value.
        self.bert_config = {
            "attention_probs_dropout_prob": 0.1,
            "hidden_act": "gelu",
            "hidden_dropout_prob": 0.1,
            "hidden_size": 768,
            "initializer_range": 0.02,
            "intermediate_size": 3072,
            "max_position_embeddings": 512,
            "num_attention_heads": 12,
            "num_hidden_layers": 12,
            "type_vocab_size": 2,
            "vocab_size": 26986,
            "layer_norm_eps": 1e-12
        }

        self.bert = self._load_encoder()
        self.hidden_size = self.bert_config["hidden_size"]
        dropout_prob = self.bert_config["hidden_dropout_prob"]
        layer_norm_eps = self.bert_config["layer_norm_eps"]

        # Class counts.
        self.num_dept_classes = num_dept_classes
        self.num_disease_classes = num_disease_classes
        # Largest per-department disease count (kept for inspection only).
        self.max_disease_classes = max(disease_classes_per_dept.values(), default=0)

        # Projects the tabular features into the text hidden size.
        self.tabular_network = nn.Sequential(
            nn.Linear(tabular_size, self.hidden_size),
            nn.LayerNorm(self.hidden_size, eps=layer_norm_eps),
            nn.Dropout(dropout_prob)
        )

        # Fuses the concatenated [text, tabular] representation.
        self.fusion_layer = nn.Sequential(
            nn.Linear(self.hidden_size * 2, self.hidden_size),
            nn.LayerNorm(self.hidden_size, eps=layer_norm_eps),
            nn.GELU(),
            nn.Dropout(dropout_prob)
        )

        # Department classifier.
        self.dept_classifier = nn.Sequential(
            nn.Linear(self.hidden_size, self.hidden_size),
            nn.LayerNorm(self.hidden_size),
            nn.GELU(),
            nn.Dropout(dropout_prob),
            nn.Linear(self.hidden_size, num_dept_classes)
        )

        # Shared trunk of the disease classifier; its input is the fused
        # features concatenated with a department-conditioned vector.
        self.disease_classifier_shared = nn.Sequential(
            nn.Linear(self.hidden_size + num_dept_classes, self.hidden_size),
            nn.LayerNorm(self.hidden_size),
            nn.GELU(),
            nn.Dropout(dropout_prob)
        )

        # One output head per department id, because forward() indexes the
        # heads by range(num_dept_classes). Each head spans the global disease
        # label space so every valid disease label is a valid logit column.
        self.disease_classifiers_output = nn.ModuleDict({
            str(dept_id): nn.Linear(self.hidden_size, num_disease_classes)
            for dept_id in range(num_dept_classes)
        })

        # Learned department embedding (one vector of size num_dept_classes per department).
        self.dept_embedding = nn.Embedding(num_dept_classes, num_dept_classes)

        # Initialise only the newly added modules. Calling self.apply() here
        # would also visit self.bert and overwrite the pretrained weights.
        for child_name, child in self.named_children():
            if child_name != "bert":
                child.apply(self._init_weights)

    def _load_encoder(self):
        """Load the pretrained "madatnlp/km-bert" encoder with ``self.bert_config`` applied."""
        bert_config_obj = AutoConfig.from_pretrained("madatnlp/km-bert")
        for key, value in self.bert_config.items():
            setattr(bert_config_obj, key, value)

        encoder = AutoModel.from_pretrained(
            "madatnlp/km-bert", config=bert_config_obj, ignore_mismatched_sizes=True
        )
        # Trade compute for activation memory during training.
        encoder.gradient_checkpointing_enable()
        return encoder

    def _init_weights(self, module):
        """BERT-style init: N(0, initializer_range) weights, zero biases, identity LayerNorm."""
        if isinstance(module, (nn.Linear, nn.Embedding)):
            module.weight.data.normal_(mean=0.0, std=self.bert_config["initializer_range"])
            if isinstance(module, nn.Linear) and module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, input_ids, attention_mask, tabular):
        """Run the model.

        Args:
            input_ids: Token ids, indexed as (batch, sequence).
            attention_mask: Attention mask matching ``input_ids``.
            tabular: Float features, (batch, tabular_size).

        Returns:
            tuple: ``(dept_logits, disease_logits)`` with shapes
            (batch, num_dept_classes) and (batch, num_disease_classes).
        """
        # Text encoder. Mixed precision is only requested on CUDA inputs so
        # the module also runs on CPU.
        with torch.cuda.amp.autocast(enabled=input_ids.is_cuda):
            bert_outputs = self.bert(
                input_ids,
                attention_mask,
                output_hidden_states=False
            )
        # First-token ([CLS] position) representation, back in float32.
        text_features = bert_outputs.last_hidden_state[:, 0, :].float()
        del bert_outputs

        # Tabular branch and fusion.
        tabular_features = self.tabular_network(tabular)
        fused_features = self.fusion_layer(torch.cat([text_features, tabular_features], dim=1))

        # Department prediction. torch.softmax avoids depending on the `F`
        # alias, which is imported in a later notebook cell.
        dept_logits = self.dept_classifier(fused_features)
        dept_probs = torch.softmax(dept_logits, dim=1)

        # Disease prediction, one head per department.
        per_dept_logits = []
        for dept_id in range(self.num_dept_classes):
            # (1, num_dept_classes) embedding row, broadcast over the batch.
            dept_embed = self.dept_embedding.weight[dept_id].unsqueeze(0)
            dept_specific_features = torch.cat(
                [fused_features, dept_probs * dept_embed], dim=1
            )
            shared_features = self.disease_classifier_shared(dept_specific_features)
            per_dept_logits.append(self.disease_classifiers_output[str(dept_id)](shared_features))

        # (batch, num_dept_classes, num_disease_classes)
        disease_logits = torch.stack(per_dept_logits, dim=1)

        # Mix the per-department logits with the predicted department probabilities.
        final_disease_logits = torch.sum(disease_logits * dept_probs.unsqueeze(-1), dim=1)

        return dept_logits, final_disease_logits

In [17]:
import torch
import torch.nn.functional as F
from sklearn.metrics import accuracy_score, f1_score
import numpy as np
from tqdm import tqdm

class HierarchicalFocalLoss(nn.Module):
    """Focal loss over two heads: department (weight alpha) and disease (weight 1 - alpha).

    For each head the per-sample cross entropy ``ce`` is scaled by
    ``(1 - exp(-ce)) ** gamma``; the two per-head means are then averaged.

    Args:
        gamma (float): Focusing exponent.
        alpha (float): Weight of the department term; the disease term gets ``1 - alpha``.
    """

    def __init__(self, gamma=2.0, alpha=0.25):
        super().__init__()
        self.gamma = gamma
        self.alpha = alpha
        self.ce = nn.CrossEntropyLoss(reduction='none')

    @staticmethod
    def _check_labels(name, labels, num_classes):
        """Raise ValueError when any label falls outside ``[0, num_classes)``.

        Checking here produces a readable Python error instead of an opaque
        out-of-range failure inside the cross-entropy kernel.
        """
        if labels.numel() == 0:
            return
        lowest, highest = int(labels.min()), int(labels.max())
        if lowest < 0 or highest >= num_classes:
            raise ValueError(
                f"Invalid {name}. Range: [{lowest}, {highest}], Expected max: {num_classes - 1}"
            )

    def _focal_term(self, logits, labels, weight):
        """Mean focal-weighted cross entropy for one head."""
        ce = self.ce(logits, labels)
        pt = torch.exp(-ce)  # probability assigned to the true class
        return (weight * (1 - pt) ** self.gamma * ce).mean()

    def forward(self, dept_logits, disease_logits, dept_labels, disease_labels):
        """Compute the combined loss.

        Args:
            dept_logits: (batch, num_dept_classes) logits.
            disease_logits: (batch, num_disease_classes) logits.
            dept_labels: (batch,) integer class ids for ``dept_logits``.
            disease_labels: (batch,) integer class ids for ``disease_logits``.

        Returns:
            torch.Tensor: Scalar loss.

        Raises:
            ValueError: If a label is negative or not smaller than the number
                of logit columns of its head.
        """
        self._check_labels("dept_labels", dept_labels, dept_logits.size(1))
        self._check_labels("disease_labels", disease_labels, disease_logits.size(1))

        dept_loss = self._focal_term(dept_logits, dept_labels, self.alpha)
        disease_loss = self._focal_term(disease_logits, disease_labels, 1 - self.alpha)

        return (dept_loss + disease_loss) / 2

In [18]:
import warnings
# Install a global "ignore" filter: from this cell on, every Python warning
# raised in the kernel is suppressed.
# NOTE(review): this also hides deprecation and numerical warnings from
# torch/transformers/pandas; consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

class PerformanceTracker:
    """Measures throughput (samples per second) between consecutive ``step`` calls.

    With CUDA available, timing uses CUDA events and synchronises the device so
    queued GPU work is included. Without CUDA it falls back to a wall-clock
    timer, so the tracker can also be constructed on CPU-only machines.
    """

    def __init__(self):
        self._use_cuda = torch.cuda.is_available()
        if self._use_cuda:
            self.start = torch.cuda.Event(enable_timing=True)
            self.end = torch.cuda.Event(enable_timing=True)
            torch.cuda.synchronize()  # drain pending work before starting the clock
            self.start.record()
        else:
            # Local import: `time` is not among the notebook's imports.
            import time
            self._clock = time.perf_counter
            self.start = self._clock()
            self.end = self.start

    def step(self, batch_size):
        """Return ``batch_size / seconds since the previous step`` and restart the timer.

        Returns 0 when no measurable time has elapsed.
        """
        if self._use_cuda:
            self.end.record()
            torch.cuda.synchronize()  # the events must have completed before reading them
            elapsed = self.start.elapsed_time(self.end) / 1000.0  # ms -> s
            self.start.record()
        else:
            self.end = self._clock()
            elapsed = self.end - self.start
            self.start = self.end
        return batch_size / elapsed if elapsed > 0 else 0


def create_optimized_dataloader(dataset, batch_size=16, shuffle=True, num_workers=4, drop_last=None):
    """Build a DataLoader with the notebook's worker and memory settings.

    Args:
        dataset: Map-style dataset to load from.
        batch_size (int): Samples per batch.
        shuffle (bool): Whether to reshuffle every epoch.
        num_workers (int): Worker processes; 0 loads in the main process.
        drop_last (bool, optional): Drop the final incomplete batch. Defaults
            to ``shuffle``: shuffled (training) loaders drop it, while
            sequential (evaluation) loaders keep every sample so metrics cover
            the whole split.

    Returns:
        DataLoader: The configured loader.
    """
    if drop_last is None:
        drop_last = shuffle

    # prefetch_factor / persistent_workers are only accepted with worker processes.
    worker_options = {}
    if num_workers > 0:
        worker_options = {'prefetch_factor': 1, 'persistent_workers': True}

    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
        # Pinned memory only helps host-to-GPU copies; skip it on CPU-only machines.
        pin_memory=torch.cuda.is_available(),
        drop_last=drop_last,
        **worker_options
    )

def train_model(model, train_loader, val_loader, num_epochs=1):
    """Train ``model`` and evaluate it on ``val_loader`` after every epoch.

    Args:
        model: Model exposing a ``bert`` submodule and returning
            ``(dept_logits, disease_logits)`` from
            ``model(input_ids, attention_mask, tabular)``.
        train_loader: Loader yielding dict batches with the keys 'input_ids',
            'attention_mask', 'tabular', 'dept_label' and 'disease_label'.
        val_loader: Loader with the same batch format, used for evaluation.
        num_epochs (int): Number of passes over ``train_loader``.

    Returns:
        dict: ``{'history': [...], 'best_accuracy': float}`` where each history
        entry holds the epoch number plus the train and validation metrics
        ('loss', 'dept_acc', 'disease_acc'), and ``best_accuracy`` is the best
        validation disease accuracy seen.

    Raises:
        RuntimeError: If every batch of an epoch failed.
    """
    # Run on whatever device the model already lives on instead of assuming CUDA.
    device = next(model.parameters()).device
    use_amp = device.type == 'cuda'

    criterion = HierarchicalFocalLoss(gamma=2.0)
    # Smaller learning rate for the pretrained encoder than for the new heads.
    optimizer = torch.optim.AdamW([
        {'params': model.bert.parameters(), 'lr': 2e-5},
        {'params': [p for n, p in model.named_parameters() if not n.startswith('bert.')], 'lr': 1e-4}
    ])

    scaler = GradScaler(enabled=use_amp)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_epochs)
    best_accuracy = 0.0
    history = []

    def run_epoch(dataloader, mode='train'):
        """Run one pass over ``dataloader``; updates weights only when mode == 'train'."""
        is_train = mode == 'train'
        model.train(is_train)
        total_loss = 0.0
        attempted_batches = 0
        completed_batches = 0
        all_dept_preds, all_disease_preds = [], []
        all_dept_labels, all_disease_labels = [], []

        if is_train:
            optimizer.zero_grad(set_to_none=True)

        for batch_idx, batch in enumerate(tqdm(dataloader, desc=f"{mode.capitalize()}")):
            attempted_batches += 1
            try:
                batch = {k: v.to(device, non_blocking=True) for k, v in batch.items()}

                # Autograd is disabled during evaluation to save memory.
                with torch.set_grad_enabled(is_train), torch.cuda.amp.autocast(enabled=use_amp):
                    dept_logits, disease_logits = model(
                        batch['input_ids'],
                        batch['attention_mask'],
                        batch['tabular']
                    )

                    if torch.isnan(dept_logits).any():
                        raise ValueError("NaN in dept_logits")
                    if torch.isnan(disease_logits).any():
                        raise ValueError("NaN in disease_logits")

                    loss = criterion(
                        dept_logits,
                        disease_logits,
                        batch['dept_label'],
                        batch['disease_label']
                    )

                if is_train:
                    scaler.scale(loss).backward()
                    # Unscale first so clipping sees the true gradient magnitudes.
                    scaler.unscale_(optimizer)
                    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                    scaler.step(optimizer)
                    scaler.update()
                    optimizer.zero_grad(set_to_none=True)

                # Collect predictions and labels for the epoch metrics.
                all_dept_preds.extend(torch.argmax(dept_logits.detach(), dim=1).cpu().numpy())
                all_disease_preds.extend(torch.argmax(disease_logits.detach(), dim=1).cpu().numpy())
                all_dept_labels.extend(batch['dept_label'].cpu().numpy())
                all_disease_labels.extend(batch['disease_label'].cpu().numpy())

                total_loss += loss.item()
                completed_batches += 1

            except Exception as e:
                # Best effort: report the failing batch and move on.
                print(f"Error in batch {batch_idx}:")
                print(f"Exception: {str(e)}")
                print("Batch shapes:")
                for k, v in batch.items():
                    print(f"{k}: {v.shape}")
                if is_train:
                    # Do not let a half-finished backward leak into the next step.
                    optimizer.zero_grad(set_to_none=True)
                continue

            if use_amp and batch_idx % 10 == 0:
                torch.cuda.empty_cache()

        # Skipping is only tolerable for isolated batches; if nothing succeeded
        # the reported metrics would be meaningless.
        if attempted_batches > 0 and completed_batches == 0:
            raise RuntimeError(f"All {attempted_batches} batches failed during {mode}")

        if completed_batches == 0:
            return {'loss': float('nan'), 'dept_acc': float('nan'), 'disease_acc': float('nan')}

        return {
            # Average over the batches that actually contributed a loss.
            'loss': total_loss / completed_batches,
            'dept_acc': accuracy_score(all_dept_labels, all_dept_preds),
            'disease_acc': accuracy_score(all_disease_labels, all_disease_preds)
        }

    for epoch in range(num_epochs):
        train_metrics = run_epoch(train_loader, mode='train')
        val_metrics = run_epoch(val_loader, mode='eval')
        scheduler.step()

        best_accuracy = max(best_accuracy, val_metrics['disease_acc'])
        history.append({'epoch': epoch + 1, 'train': train_metrics, 'val': val_metrics})
        print(f"Epoch {epoch + 1}/{num_epochs} - train: {train_metrics} - val: {val_metrics}")

    return {'history': history, 'best_accuracy': best_accuracy}

In [19]:
def main():
    """Build the datasets, loaders and model from the notebook globals and train.

    Reads the module-level ``train_df`` and ``val_df`` frames. Errors raised
    while building or training the model are reported together with the GPU
    memory status and then re-raised.
    """
    # Debug aid: synchronous kernel launches make CUDA errors surface at the
    # failing call. It is set before any CUDA call in this function because the
    # variable is read when CUDA initialises (a kernel that already used CUDA
    # needs a restart for it to take effect).
    os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    torch.cuda.empty_cache()

    print(f"Using device: {device}")
    print(f"CUDA available: {torch.cuda.is_available()}")
    if torch.cuda.is_available():
        print(f"CUDA device: {torch.cuda.get_device_name(0)}")
        print(f"Memory allocated: {torch.cuda.memory_allocated()/1e9:.2f}GB")

    # Tokenizer.
    # NOTE(review): the tokenizer comes from "snunlp/KR-BERT-char16424" while
    # the encoder weights come from "madatnlp/km-bert" - confirm both share the
    # same vocabulary.
    tokenizer = AutoTokenizer.from_pretrained("snunlp/KR-BERT-char16424")

    train_dataset = MedicalDataset(train_df, tokenizer)

    # Validation labels must use the training label ids. MedicalDataset fits
    # its own encoders, so keep only rows whose codes were seen in training and
    # re-encode them with the training encoders.
    known_rows = (
        val_df['진료과목코드'].isin(train_dataset.dept_encoder.classes_)
        & val_df['주상병코드'].isin(train_dataset.disease_encoder.classes_)
    )
    val_known_df = val_df[known_rows]
    dropped_rows = len(val_df) - len(val_known_df)
    if dropped_rows:
        print(f"Dropped {dropped_rows} validation rows with codes unseen in training")

    val_dataset = MedicalDataset(val_known_df, tokenizer)
    val_dataset.dept_encoder = train_dataset.dept_encoder
    val_dataset.disease_encoder = train_dataset.disease_encoder
    val_dataset.dept_labels = train_dataset.dept_encoder.transform(val_known_df['진료과목코드'])
    val_dataset.disease_labels = train_dataset.disease_encoder.transform(val_known_df['주상병코드'])

    # Number of tabular input features expected by the model.
    tabular_size = len(train_dataset.tabular_features.columns)
    if len(val_dataset.tabular_features.columns) != tabular_size:
        raise ValueError(
            "Train and validation tabular features have different widths: "
            f"{tabular_size} vs {len(val_dataset.tabular_features.columns)}"
        )

    train_loader = create_optimized_dataloader(train_dataset, batch_size=8)
    val_loader = create_optimized_dataloader(val_dataset, batch_size=8, shuffle=False)

    # Number of distinct diseases per encoded department id. LabelEncoder ids
    # are positions in `classes_`, so enumerate() yields the encoded id directly.
    disease_classes_per_dept = {}
    for dept_idx, dept in enumerate(train_dataset.dept_encoder.classes_):
        dept_mask = train_df['진료과목코드'] == dept
        unique_diseases = train_df[dept_mask]['주상병코드'].nunique()
        disease_classes_per_dept[str(dept_idx)] = unique_diseases
        print(f"Dept {dept} (idx {dept_idx}): {unique_diseases} diseases")

    try:
        model = MultiModalMedicalModel(
            num_dept_classes=len(train_dataset.dept_encoder.classes_),
            disease_classes_per_dept=disease_classes_per_dept,
            num_disease_classes=len(train_dataset.disease_encoder.classes_),
            tabular_size=tabular_size
        )

        print("\nModel structure:")
        print(model)

        print("\nBefore moving to GPU:")
        print(f"Memory allocated: {torch.cuda.memory_allocated()/1e9:.2f}GB")

        model = model.to(device)

        print("\nAfter moving to GPU:")
        print(f"Memory allocated: {torch.cuda.memory_allocated()/1e9:.2f}GB")

        train_model(model, train_loader, val_loader)

    except Exception as e:
        print(f"Error occurred: {str(e)}")
        print("\nCurrent GPU memory status:")
        print(f"Memory allocated: {torch.cuda.memory_allocated()/1e9:.2f}GB")
        print(f"Max memory allocated: {torch.cuda.max_memory_allocated()/1e9:.2f}GB")
        # Bare raise keeps the original traceback intact.
        raise

    finally:
        # Release cached GPU memory and collect garbage whether or not training succeeded.
        torch.cuda.empty_cache()
        gc.collect()

if __name__ == "__main__":
    main()

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
