In [None]:
import pandas as pd
import numpy as np
import os
import warnings
import logging
from pathlib import Path
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel, AutoTokenizer, AutoModel
from transformers import AdamW, get_linear_schedule_with_warmup
import torch.nn as nn
import xgboost as xgb
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, f1_score, precision_score, recall_score
from tqdm import tqdm
import json
from datetime import datetime
import gc
from torch.cuda.amp import autocast, GradScaler
from torch.utils.checkpoint import checkpoint
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel
import gc
from torch.cuda import Event

In [None]:
# Read the raw medical records CSV from the path below and show its
# (rows, columns) shape as the cell's output.
df = pd.read_csv(
    "/content/drive/MyDrive/University/4-2/정보기술학회/data/medical_data.csv",
    encoding="utf-8",
)
df.shape

(2891197, 11)

In [None]:
# Count rows per department code ('진료과목코드') and keep only the codes
# that appear at least 1000 times.
dept_counts = df['진료과목코드'].value_counts()
valid_depts = dept_counts.loc[dept_counts >= 1000].index.tolist()

# Restrict the frame to rows whose department code survived the filter.
filtered_df = df.loc[df['진료과목코드'].isin(valid_depts)]

# Draw a fixed-seed sample of exactly 1000 rows from every kept department,
# in the same order as `valid_depts`.
balanced_dfs = [
    filtered_df.loc[filtered_df['진료과목코드'] == dept].sample(n=1000, random_state=42)
    for dept in valid_depts
]

# Stack the per-department samples into one class-balanced dataset.
final_df = pd.concat(balanced_dfs, ignore_index=True)

# Report the total size and the per-department row counts.
print("최종 데이터셋 크기:", len(final_df))
print("\n각 진료과목코드별 샘플 수:")
print(final_df['진료과목코드'].value_counts())

최종 데이터셋 크기: 17000

각 진료과목코드별 샘플 수:
진료과목코드
1     1000
13    1000
12    1000
11    1000
23    1000
4     1000
24    1000
2     1000
5     1000
6     1000
0     1000
21    1000
9     1000
7     1000
14    1000
10    1000
16    1000
Name: count, dtype: int64


In [None]:
# Load the previously saved balanced dataset from disk; this rebinds
# `final_df`, replacing any in-memory frame of the same name.
final_df = pd.read_csv(
    "/content/drive/MyDrive/University/4-2/정보기술학회/data/final_df.csv",
    encoding="utf-8",
)
final_df.shape

(17000, 11)

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
from torch import nn
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import Dataset, DataLoader
import re

class MedicalDataPreprocessor:
    """Cleans symptom text, encodes department labels and splits the data.

    Attributes:
        tokenizer: Tokenizer loaded from the "madatnlp/km-bert" checkpoint.
        dept_to_idx: Mapping from each distinct '진료과목코드' value to a
            contiguous integer label. Only available after
            ``prepare_dataset`` has been called.
    """

    def __init__(self):
        self.tokenizer = AutoTokenizer.from_pretrained("madatnlp/km-bert")

    def clean_text(self, text):
        """Normalize a single symptom description.

        Args:
            text: Raw symptom value. Non-string values are converted with
                ``str``; missing values (None / NaN / NA) yield ``''``.

        Returns:
            The cleaned string: every character other than Hangul syllables,
            ASCII letters, digits, '.', ',' and whitespace is replaced by a
            space, whitespace runs are collapsed to a single space and
            trimmed, and runs of '.' are collapsed to a single '.'.
        """
        # Without this guard a missing value would be stringified into the
        # literal token "nan"/"None" and passed on as if it were real text.
        if pd.api.types.is_scalar(text) and pd.isna(text):
            return ''

        # Drop special characters but keep meaningful separators ('.' and ',').
        text = re.sub(r'[^가-힣a-zA-Z0-9.,\s]', ' ', str(text))
        # Collapse repeated whitespace.
        text = re.sub(r'\s+', ' ', text).strip()
        # Collapse repeated periods.
        text = re.sub(r'\.+', '.', text)
        return text

    def prepare_dataset(self, df):
        """Build cleaned text and integer labels, then split 8:1:1.

        Args:
            df: DataFrame containing a '증상' (symptom text) column and a
                '진료과목코드' (department code) column. It is not modified.

        Returns:
            ``(train_df, val_df, test_df)``: stratified splits of a copy of
            ``df`` that additionally carry the 'cleaned_symptoms' and 'label'
            columns.

        Side effects:
            Sets ``self.dept_to_idx``.
        """
        # Work on a copy so the caller's frame is not mutated as a side effect.
        df = df.copy()

        # Text cleaning.
        df['cleaned_symptoms'] = df['증상'].apply(self.clean_text)

        # Label encoding: sorted department codes -> 0..K-1.
        unique_departments = sorted(df['진료과목코드'].unique())
        self.dept_to_idx = {dept: idx for idx, dept in enumerate(unique_departments)}
        df['label'] = df['진료과목코드'].map(self.dept_to_idx)

        # Stratified train/val/test split (8:1:1): first hold out 20%, then
        # halve the held-out part into validation and test.
        train_df, temp_df = train_test_split(
            df, test_size=0.2, stratify=df['label'], random_state=42
        )
        val_df, test_df = train_test_split(
            temp_df, test_size=0.5, stratify=temp_df['label'], random_state=42
        )

        return train_df, val_df, test_df

class MedicalDataset(Dataset):
    """Torch dataset that tokenizes cleaned symptom text on access.

    Args:
        df: DataFrame with a 'cleaned_symptoms' column (text) and a 'label'
            column (integer class index).
        tokenizer: Callable tokenizer invoked with ``max_length``,
            ``padding``, ``truncation`` and ``return_tensors`` keyword
            arguments; its result must expose 'input_ids' and
            'attention_mask' entries.
        max_length: Length every sequence is padded / truncated to.
    """

    def __init__(self, df, tokenizer, max_length=512):
        self.texts = df['cleaned_symptoms'].tolist()
        self.labels = df['label'].tolist()
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize sample ``idx``.

        Returns:
            Dict with 'input_ids' and 'attention_mask' (the tokenizer's
            tensors with the leading batch axis removed) and 'label'
            (a 0-d ``torch.long`` tensor).
        """
        text = self.texts[idx]
        label = self.labels[idx]

        # Tokenize and pad/truncate to a fixed length.
        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # squeeze(0) removes only the leading batch axis. A bare squeeze()
        # would also collapse the sequence axis when max_length == 1.
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': torch.tensor(label, dtype=torch.long)
        }

def create_dataloaders(train_df, val_df, test_df, tokenizer, batch_size=64):
    """Wrap the three splits in ``MedicalDataset`` objects and DataLoaders.

    Every loader uses ``batch_size``, two worker processes and pinned
    memory; only the training loader shuffles.

    Returns:
        ``(train_loader, val_loader, test_loader)``
    """
    # Settings common to all three loaders.
    shared_kwargs = dict(batch_size=batch_size, num_workers=2, pin_memory=True)

    # (split frame, shuffle?) in train / val / test order.
    split_plan = ((train_df, True), (val_df, False), (test_df, False))

    train_loader, val_loader, test_loader = (
        DataLoader(MedicalDataset(split_df, tokenizer), shuffle=do_shuffle, **shared_kwargs)
        for split_df, do_shuffle in split_plan
    )
    return train_loader, val_loader, test_loader

# 사용 예시
# Example usage: build the preprocessor, split `final_df` (the balanced
# dataset produced in an earlier cell), then wrap each split in a DataLoader.
preprocessor = MedicalDataPreprocessor()
train_df, val_df, test_df = preprocessor.prepare_dataset(final_df)

train_loader, val_loader, test_loader = create_dataloaders(
    train_df=train_df,
    val_df=val_df,
    test_df=test_df,
    tokenizer=preprocessor.tokenizer,
)

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoModel, get_linear_schedule_with_warmup
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix
import numpy as np
from tqdm import tqdm
import wandb

class MedicalBertClassifier(nn.Module):
    """KM-BERT encoder followed by a two-layer classification head.

    Args:
        num_classes: Number of output logits.
        dropout_rate: Dropout probability applied to the pooled encoder
            output and inside the head.
    """

    def __init__(self, num_classes, dropout_rate=0.3):
        super().__init__()
        # Pretrained encoder from the "madatnlp/km-bert" checkpoint.
        self.bert = AutoModel.from_pretrained("madatnlp/km-bert")

        # Dropout on the pooled output, then hidden -> hidden/2 -> num_classes.
        self.dropout = nn.Dropout(dropout_rate)
        width = self.bert.config.hidden_size
        bottleneck = width // 2
        head_layers = [
            nn.Linear(width, bottleneck),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(bottleneck, num_classes),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, input_ids, attention_mask):
        """Return the classifier logits for a batch.

        The encoder's ``pooler_output`` is passed through dropout and then
        through the classification head.
        """
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled = self.dropout(encoded.pooler_output)
        return self.classifier(pooled)

In [None]:
class ModelTrainer:
    """Fine-tunes a classifier with early stopping on validation F1.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` and
            returning per-class logits.
        train_loader / val_loader / test_loader: Iterables of batches that
            are dicts with 'input_ids', 'attention_mask' and 'label' tensors.
        device: Device the batches are moved to.
        num_classes: Number of classes (used to one-hot encode labels for
            ROC-AUC).
        patience: Consecutive non-improving epochs tolerated before stopping.
        min_delta: Minimum increase in validation F1 that counts as an
            improvement.
        num_epochs: Number of epochs the learning-rate schedule is planned
            for. Should match the ``num_epochs`` later passed to ``train``;
            defaults to 10, the value that was previously hard-coded.

    Side effects:
        Starts a Weights & Biases run on construction. ``train`` writes
        'best_model.pt' and 'model_metadata.json' to the working directory.
    """

    def __init__(self, model, train_loader, val_loader, test_loader, device, num_classes,
                 patience=3, min_delta=0.001, num_epochs=10):
        self.model = model
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.test_loader = test_loader
        self.device = device
        self.num_classes = num_classes

        # Early stopping parameters
        self.patience = patience
        self.min_delta = min_delta
        self.best_val_f1 = 0
        self.patience_counter = 0
        # True once this trainer has written 'best_model.pt'; guards the
        # final reload against a missing or stale checkpoint file.
        self._best_checkpoint_saved = False

        # Learning components initialization: AdamW with linear warmup over
        # the first 10% of the planned optimizer steps, then linear decay.
        self.optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
        num_training_steps = len(train_loader) * num_epochs
        num_warmup_steps = num_training_steps // 10
        self.scheduler = get_linear_schedule_with_warmup(
            self.optimizer,
            num_warmup_steps=num_warmup_steps,
            num_training_steps=num_training_steps
        )

        # Weights & Biases initialization
        wandb.init(project="medical-department-classification")

    def train_epoch(self):
        """Run one training epoch.

        Returns:
            Dict with 'loss' (mean of per-batch losses), 'accuracy' and 'f1'
            (weighted) computed over the predictions made during the epoch.
        """
        self.model.train()
        total_loss = 0
        all_preds = []
        all_labels = []

        progress_bar = tqdm(self.train_loader, desc="Training")
        for batch in progress_bar:
            # Move the batch to the target device.
            input_ids = batch['input_ids'].to(self.device)
            attention_mask = batch['attention_mask'].to(self.device)
            labels = batch['label'].to(self.device)

            # Forward pass and loss.
            outputs = self.model(input_ids, attention_mask)
            loss = F.cross_entropy(outputs, labels)

            # Backward pass, gradient clipping, optimizer and scheduler step.
            self.optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=1.0)
            self.optimizer.step()
            self.scheduler.step()

            # Collect metrics.
            total_loss += loss.item()
            preds = torch.argmax(outputs, dim=1).cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(labels.cpu().numpy())

            # Show the current batch loss on the progress bar.
            progress_bar.set_postfix({'loss': f'{loss.item():.4f}'})

        # Epoch-level metrics.
        epoch_loss = total_loss / len(self.train_loader)
        epoch_accuracy = accuracy_score(all_labels, all_preds)
        epoch_f1 = f1_score(all_labels, all_preds, average='weighted')

        # Log to Weights & Biases.
        wandb.log({
            'train_loss': epoch_loss,
            'train_accuracy': epoch_accuracy,
            'train_f1': epoch_f1
        })

        return {
            'loss': epoch_loss,
            'accuracy': epoch_accuracy,
            'f1': epoch_f1
        }

    def evaluate(self, dataloader, mode='val'):
        """Evaluate the model on ``dataloader`` without gradient tracking.

        Args:
            dataloader: Iterable of batches in the same format as training.
            mode: Prefix used for the metric names (e.g. 'val', 'test').

        Returns:
            ``(metrics, conf_matrix)`` where ``metrics`` maps
            '{mode}_loss', '{mode}_accuracy', '{mode}_f1' and
            '{mode}_roc_auc' to floats and ``conf_matrix`` is the confusion
            matrix of labels vs. predictions.
        """
        self.model.eval()
        total_loss = 0
        all_preds = []
        all_labels = []
        all_probs = []

        with torch.no_grad():
            for batch in tqdm(dataloader, desc=f"Evaluating ({mode})"):
                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                labels = batch['label'].to(self.device)

                outputs = self.model(input_ids, attention_mask)
                loss = F.cross_entropy(outputs, labels)

                probs = F.softmax(outputs, dim=1)
                preds = torch.argmax(outputs, dim=1)

                total_loss += loss.item()
                all_preds.extend(preds.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())
                all_probs.extend(probs.cpu().numpy())

        # Aggregate metrics (loss is the mean of per-batch losses).
        avg_loss = total_loss / len(dataloader)
        accuracy = accuracy_score(all_labels, all_preds)
        f1 = f1_score(all_labels, all_preds, average='weighted')

        # ROC-AUC (one-vs-rest) on one-hot labels vs. softmax probabilities.
        roc_auc = roc_auc_score(
            np.eye(self.num_classes)[all_labels],
            np.array(all_probs),
            multi_class='ovr',
            average='weighted'
        )

        # Confusion Matrix
        conf_matrix = confusion_matrix(all_labels, all_preds)

        metrics = {
            f'{mode}_loss': avg_loss,
            f'{mode}_accuracy': accuracy,
            f'{mode}_f1': f1,
            f'{mode}_roc_auc': roc_auc
        }

        # Log to Weights & Biases.
        wandb.log(metrics)

        return metrics, conf_matrix

    def train(self, num_epochs=10):
        """Train with early stopping, then evaluate the best model on test.

        After each epoch the model is validated. When validation F1 exceeds
        the best so far by more than ``min_delta`` the weights are saved to
        'best_model.pt' (with metadata in 'model_metadata.json'); otherwise
        the patience counter is incremented and training stops once it
        reaches ``patience``.

        Args:
            num_epochs: Maximum number of epochs to run.

        Returns:
            ``(test_metrics, test_conf_matrix)`` from ``evaluate`` on the
            test loader, using the best saved weights when this trainer has
            saved any, otherwise the current weights.
        """
        early_stop = False

        for epoch in range(num_epochs):
            print(f"\nEpoch {epoch + 1}/{num_epochs}")

            # Training
            train_metrics = self.train_epoch()
            print(f"Training metrics: {train_metrics}")

            # Validation
            val_metrics, val_conf_matrix = self.evaluate(self.val_loader, mode='val')
            print(f"Validation metrics: {val_metrics}")

            current_val_f1 = val_metrics['val_f1']

            # Early stopping logic
            if current_val_f1 > self.best_val_f1 + self.min_delta:
                # Validation F1 improved.
                self.best_val_f1 = current_val_f1
                self.patience_counter = 0

                # Save the improved weights.
                torch.save(self.model.state_dict(), 'best_model.pt')
                self._best_checkpoint_saved = True
                print(f"New best model saved! Validation F1: {current_val_f1:.4f}")

                # Save metadata describing the checkpoint.
                model_metadata = {
                    'epoch': epoch + 1,
                    'val_f1': current_val_f1,
                    'val_accuracy': val_metrics['val_accuracy'],
                    'val_loss': val_metrics['val_loss'],
                    'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                }
                with open('model_metadata.json', 'w') as f:
                    json.dump(model_metadata, f, indent=4)

            else:
                # No improvement this epoch.
                self.patience_counter += 1
                print(f"No improvement in validation F1. Patience: {self.patience_counter}/{self.patience}")

                if self.patience_counter >= self.patience:
                    print("\nEarly stopping triggered!")
                    print(f"Best validation F1: {self.best_val_f1:.4f}")
                    early_stop = True
                    break

            # Log the epoch summary to Weights & Biases.
            wandb.log({
                'epoch': epoch + 1,
                'patience_counter': self.patience_counter,
                'best_val_f1': self.best_val_f1
            })

        # Post-training summary.
        if early_stop:
            print("\nTraining stopped early due to no improvement in validation F1")
        else:
            print("\nTraining completed for all epochs")

        # Reload the best weights for the final evaluation, but only if this
        # trainer actually wrote them. Otherwise the file may be missing or
        # left over from an unrelated earlier run.
        if self._best_checkpoint_saved:
            print("\nLoading best model for final evaluation...")
            # map_location keeps the load valid when the checkpoint device
            # differs from the current one; weights_only restricts unpickling
            # to tensors/containers, which is all a state_dict holds.
            best_state = torch.load(
                'best_model.pt', map_location=self.device, weights_only=True
            )
            self.model.load_state_dict(best_state)
        else:
            print("\nNo checkpoint was saved during training; evaluating current weights.")

        test_metrics, test_conf_matrix = self.evaluate(self.test_loader, mode='test')
        print("\nFinal Test Results:", test_metrics)

        return test_metrics, test_conf_matrix

In [None]:
# Run training: pick the device, size the classifier from the label mapping
# built by `preprocessor` in an earlier cell, then train and evaluate.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
num_classes = len(preprocessor.dept_to_idx)

model = MedicalBertClassifier(num_classes=num_classes)
model = model.to(device)

trainer = ModelTrainer(
    model,
    train_loader,
    val_loader,
    test_loader,
    device,
    num_classes,
    patience=3,       # stop if no improvement for 3 epochs
    min_delta=0.001,  # minimum improvement required
)

test_metrics, test_conf_matrix = trainer.train(num_epochs=10)

0,1
best_val_f1,▁▃███
epoch,▁▃▅▆█
patience_counter,▁▁▁▅█
test_accuracy,▁
test_f1,▁
test_loss,▁
test_roc_auc,▁
train_accuracy,▁▇████
train_f1,▁▇████
train_loss,█▂▂▁▁▁

0,1
best_val_f1,0.23643
epoch,5.0
patience_counter,2.0
test_accuracy,0.28235
test_f1,0.23806
test_loss,2.21661
test_roc_auc,0.76511
train_accuracy,0.27324
train_f1,0.2387
train_loss,2.20487



Epoch 1/10


Training: 100%|██████████| 213/213 [04:01<00:00,  1.13s/it, loss=2.5803]


Training metrics: {'loss': 2.6553469543725674, 'accuracy': 0.17161764705882354, 'f1': 0.15626367299149388}


Evaluating (val): 100%|██████████| 27/27 [00:09<00:00,  2.80it/s]


Validation metrics: {'val_loss': 2.3629452034279153, 'val_accuracy': 0.2752941176470588, 'val_f1': 0.2048727895114093, 'val_roc_auc': 0.7449136029411765}
New best model saved! Validation F1: 0.2049

Epoch 2/10


Training: 100%|██████████| 213/213 [04:01<00:00,  1.13s/it, loss=1.9838]


Training metrics: {'loss': 2.3071634584749248, 'accuracy': 0.26323529411764707, 'f1': 0.22756081757540736}


Evaluating (val): 100%|██████████| 27/27 [00:09<00:00,  2.80it/s]


Validation metrics: {'val_loss': 2.2662588711138123, 'val_accuracy': 0.27705882352941175, 'val_f1': 0.2094570143161646, 'val_roc_auc': 0.75823125}
New best model saved! Validation F1: 0.2095

Epoch 3/10


Training: 100%|██████████| 213/213 [04:01<00:00,  1.13s/it, loss=2.4600]


Training metrics: {'loss': 2.2517077687760474, 'accuracy': 0.2701470588235294, 'f1': 0.2363241764339537}


Evaluating (val): 100%|██████████| 27/27 [00:09<00:00,  2.80it/s]


Validation metrics: {'val_loss': 2.2369064622455173, 'val_accuracy': 0.28, 'val_f1': 0.2113294363028019, 'val_roc_auc': 0.7626194852941178}
New best model saved! Validation F1: 0.2113

Epoch 4/10


Training: 100%|██████████| 213/213 [04:01<00:00,  1.13s/it, loss=2.2954]


Training metrics: {'loss': 2.2332052305830477, 'accuracy': 0.27595588235294116, 'f1': 0.24031887114133932}


Evaluating (val): 100%|██████████| 27/27 [00:09<00:00,  2.80it/s]


Validation metrics: {'val_loss': 2.2259222489816173, 'val_accuracy': 0.2782352941176471, 'val_f1': 0.22333453148697277, 'val_roc_auc': 0.7658919117647058}
New best model saved! Validation F1: 0.2233

Epoch 5/10


Training: 100%|██████████| 213/213 [04:01<00:00,  1.13s/it, loss=1.9035]


Training metrics: {'loss': 2.21730607216347, 'accuracy': 0.27683823529411766, 'f1': 0.24581321509165832}


Evaluating (val): 100%|██████████| 27/27 [00:09<00:00,  2.80it/s]


Validation metrics: {'val_loss': 2.222959845154374, 'val_accuracy': 0.27705882352941175, 'val_f1': 0.22654779796763883, 'val_roc_auc': 0.7690011029411765}
New best model saved! Validation F1: 0.2265

Epoch 6/10


Training: 100%|██████████| 213/213 [04:01<00:00,  1.13s/it, loss=2.3131]


Training metrics: {'loss': 2.2127111433817186, 'accuracy': 0.27330882352941177, 'f1': 0.24191522463194506}


Evaluating (val): 100%|██████████| 27/27 [00:09<00:00,  2.80it/s]


Validation metrics: {'val_loss': 2.218348834249708, 'val_accuracy': 0.2817647058823529, 'val_f1': 0.22248497832250033, 'val_roc_auc': 0.7687323529411765}
No improvement in validation F1. Patience: 1/3

Epoch 7/10


Training: 100%|██████████| 213/213 [04:01<00:00,  1.13s/it, loss=2.2001]


Training metrics: {'loss': 2.2105909513195914, 'accuracy': 0.2716911764705882, 'f1': 0.24002198133676164}


Evaluating (val): 100%|██████████| 27/27 [00:09<00:00,  2.80it/s]


Validation metrics: {'val_loss': 2.213340198552167, 'val_accuracy': 0.2847058823529412, 'val_f1': 0.22772377384953826, 'val_roc_auc': 0.7694687499999998}
New best model saved! Validation F1: 0.2277

Epoch 8/10


Training: 100%|██████████| 213/213 [04:01<00:00,  1.13s/it, loss=2.2517]


Training metrics: {'loss': 2.2031891547458273, 'accuracy': 0.2758088235294118, 'f1': 0.24390716247463778}


Evaluating (val): 100%|██████████| 27/27 [00:09<00:00,  2.80it/s]


Validation metrics: {'val_loss': 2.212839192814297, 'val_accuracy': 0.28411764705882353, 'val_f1': 0.2250889510818152, 'val_roc_auc': 0.7695933823529413}
No improvement in validation F1. Patience: 1/3

Epoch 9/10


Training: 100%|██████████| 213/213 [04:01<00:00,  1.13s/it, loss=1.9573]


Training metrics: {'loss': 2.20091785623434, 'accuracy': 0.2757352941176471, 'f1': 0.2458459651400451}


Evaluating (val): 100%|██████████| 27/27 [00:09<00:00,  2.80it/s]


Validation metrics: {'val_loss': 2.212196288285432, 'val_accuracy': 0.28294117647058825, 'val_f1': 0.2229444050932327, 'val_roc_auc': 0.7690988970588236}
No improvement in validation F1. Patience: 2/3

Epoch 10/10


Training: 100%|██████████| 213/213 [04:01<00:00,  1.13s/it, loss=2.3785]


Training metrics: {'loss': 2.1968427700615827, 'accuracy': 0.2757352941176471, 'f1': 0.2464425810116824}


Evaluating (val): 100%|██████████| 27/27 [00:09<00:00,  2.80it/s]


Validation metrics: {'val_loss': 2.2120639041618064, 'val_accuracy': 0.28058823529411764, 'val_f1': 0.23095083495996885, 'val_roc_auc': 0.7699485294117648}
New best model saved! Validation F1: 0.2310

Training completed for all epochs

Loading best model for final evaluation...


  self.model.load_state_dict(torch.load('best_model.pt'))
Evaluating (test): 100%|██████████| 27/27 [00:09<00:00,  2.80it/s]


Final Test Results: {'test_loss': 2.2008956052638866, 'test_accuracy': 0.2811764705882353, 'test_f1': 0.23037198319437865, 'test_roc_auc': 0.7700801470588235}



