In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import gc
import json
import torch
import logging
import warnings
import numpy as np
import pandas as pd
import xgboost as xgb
from tqdm import tqdm
import torch.nn as nn
from pathlib import Path
from torch.cuda import Event
from typing import List, Dict, Tuple
from datetime import datetime
import torch.distributed as dist
from xgboost import XGBClassifier
from torch.utils.checkpoint import checkpoint
from torch.cuda.amp import autocast, GradScaler
from torch.utils.data import Dataset, DataLoader
from torch.nn.parallel import DistributedDataParallel
from transformers import AdamW, get_linear_schedule_with_warmup, get_scheduler, AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split, StratifiedKFold
from transformers import BertTokenizer, BertModel, AutoTokenizer, AutoModel
from sklearn.metrics import accuracy_score, classification_report, f1_score, precision_score, recall_score
import warnings
warnings.filterwarnings(action='ignore')
from sklearn.preprocessing import StandardScaler, LabelEncoder
import joblib
import re

In [None]:
df = pd.read_csv("/content/drive/MyDrive/데이터/medical_data.csv", encoding = 'utf-8')
df.shape

(48915, 11)

# 고도화된 코드

In [None]:
class TextDataset(Dataset):
    """KM-BERT를 위한 데이터셋"""
    def __init__(self, texts: List[str], labels: np.ndarray,
                 tokenizer, max_length=512):
        self.texts = texts
        self.labels = torch.LongTensor(labels)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': self.labels[idx]
        }

In [None]:
class KMBertModel:
    """KM-BERT 모델 클래스"""
    def __init__(self, num_labels, device='cuda'):
        self.device = torch.device(device if torch.cuda.is_available() else 'cpu')
        self.tokenizer = AutoTokenizer.from_pretrained("snunlp/KR-BERT-char16424")
        self.model = AutoModelForSequenceClassification.from_pretrained("madatnlp/km-bert",num_labels=num_labels).to(self.device)

    # KMBertModel의 train_and_predict 메서드 수정
    def train_and_predict(self, train_texts, train_labels, val_texts, val_labels, position = 0):
        # 데이터셋 생성
        train_dataset = TextDataset(train_texts, train_labels, self.tokenizer)
        val_dataset = TextDataset(val_texts, val_labels, self.tokenizer)

        train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=32)

        # 학습
        optimizer = torch.optim.AdamW(self.model.parameters(), lr=2e-5)
        criterion = nn.CrossEntropyLoss()

        for epoch in range(1):
            # 학습 루프
            self.model.train()
            train_pbar = tqdm(train_loader,
                     desc=f'Epoch {epoch+1}/5 [Train]',
                     leave=True,  # 진행바를 유지
                     position=0,   # 진행바 위치
                     ncols=100)    # 진행바 길이
            total_loss = 0

            for batch in train_pbar:
                optimizer.zero_grad()
                outputs = self.model(
                    input_ids=batch['input_ids'].to(self.device),
                    attention_mask=batch['attention_mask'].to(self.device)
                )
                loss = criterion(outputs.logits, batch['labels'].to(self.device))
                loss.backward()
                optimizer.step()

                total_loss += loss.item()
                train_pbar.set_postfix({'loss': f'{total_loss/(train_pbar.n+1):.4f}'})

        # 예측
        self.model.eval()
        predictions = []
        val_pbar = tqdm(val_loader,
                   desc='Predicting',
                   leave=False,     # 이전 진행바 지우기
                   position=0,      # 진행바 위치 고정
                   ncols=100,       # 진행바 길이 고정
                   dynamic_ncols=False  # 동적 길이 조정 비활성화
                   )

        with torch.no_grad():
            for batch in val_pbar:
                outputs = self.model(
                    input_ids=batch['input_ids'].to(self.device),
                    attention_mask=batch['attention_mask'].to(self.device)
                )
                preds = torch.softmax(outputs.logits, dim=1)
                predictions.append(preds.cpu().numpy())

        return np.vstack(predictions)

In [None]:
!pip install -q catboost

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m20.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier

class TabularModel:
    """정형 데이터 처리 모델"""
    def __init__(self):
        self.models = {
            'xgboost': xgb.XGBClassifier(
                n_estimators=200,
                learning_rate=0.1,
                max_depth=7
            ),
            'lightgbm': lgb.LGBMClassifier(
                n_estimators=200,
                learning_rate=0.1,
                verbose = -1
            ),
            'catboost': CatBoostClassifier(
                iterations=200,
                learning_rate=0.1,
                verbose=False
            )
        }
        self.encoders = {}
        self.scalers = {}

    def preprocess(self, data, categorical_cols, numeric_cols, is_training=True):
        data = data.copy()

        # 수치형 특징 처리
        if is_training:
            for col in numeric_cols:
                self.scalers[col] = StandardScaler()
                data[col] = self.scalers[col].fit_transform(data[[col]])
        else:
            for col in numeric_cols:
                data[col] = self.scalers[col].transform(data[[col]])

        # 범주형 특징 처리
        if is_training:
            for col in categorical_cols:
                self.encoders[col] = LabelEncoder()
                data[col] = self.encoders[col].fit_transform(data[col])
        else:
            for col in categorical_cols:
                data[col] = self.encoders[col].transform(data[col])

        return data

    # TabularModel의 train_and_predict 메서드 수정
    def train_and_predict(self, train_data, train_labels, val_data, position=0):
        predictions = {}

        # train_labels가 0부터 시작하는 연속된 정수를 갖도록 보장합니다.
        unique_labels = np.unique(train_labels)
        label_mapping = {label: i for i, label in enumerate(unique_labels)}
        train_labels = np.array([label_mapping[label] for label in train_labels])

        with tqdm(self.models.items(),
                desc="모델 학습 중",
                leave=True,
                position=0,
                ncols=100) as pbar:
            for name, model in pbar:
                pbar.set_postfix({'model': name})

                # 모델이 XGBoost인지 확인하고 목적 함수를 설정합니다.
                if name == 'xgboost':
                    # 다중 클래스 분류가 필요한지 확인합니다.
                    if len(unique_labels) > 2:
                        model.set_params(objective='multi:softprob', num_class=len(unique_labels))
                    else:
                        model.set_params(objective='binary:logistic')

                # val_data.index를 사용하는 대신, TwoStagePredictor 클래스의
                # _get_tabular_predictions 메서드에 제공된 원래 레이블을 사용합니다.
                # 이는 val_data에 대한 레이블이 'labels' 인수로 올바르게 전달되었다고 가정합니다.
                val_labels_original = [label for label in val_data.index]
                val_labels = np.array([label_mapping.get(label, -1) for label in val_labels_original])

                # 레이블이 -1(알 수 없는 레이블)인 행을 필터링합니다.
                valid_indices = np.where(val_labels != -1)[0]
                val_data_filtered = val_data.iloc[valid_indices]
                val_labels_filtered = val_labels[valid_indices]

                model.fit(
                    train_data,
                    train_labels,
                    eval_set=[(val_data_filtered, val_labels_filtered)],  # eval_set에 레이블을 포함합니다.

                )
                predictions[name] = model.predict_proba(val_data)

        return np.mean([pred for pred in predictions.values()], axis=0)

In [None]:
class TwoStagePredictor:
    """2단계 예측 시스템"""
    def __init__(self):
        self.stage1_text_model = None  # KM-BERT for 진료과목코드
        self.stage1_tabular_model = TabularModel()  # ML models for 진료과목코드
        self.stage2_text_model = None  # KM-BERT for 주상병코드
        self.stage2_tabular_model = TabularModel()  # ML models for 주상병코드

        self.stage1_stacker = xgb.XGBClassifier()  # Stack ensemble for 진료과목코드
        self.stage2_stacker = xgb.XGBClassifier()  # Stack ensemble for 주상병코드

    def predict(self, df: pd.DataFrame, test_size=0.2):
        print("\n=== Stage 1: 진료과목코드 예측 ===")
        train_idx = int(len(df) * (1 - test_size))

        # Stage 1 진행바 수정
        with tqdm(total=2,
                 desc="Stage 1 Processing",
                 leave=True,
                 position=0,
                 ncols=100,
                 bar_format='{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]'
                 ) as pbar:
            # Text 처리
            stage1_text_preds = self._get_text_predictions(
                df['증상'].values,
                df['진료과목코드'].values,
                train_idx,
                is_stage1=True,
                position=1  # 중첩된 진행바 위치 지정
            )
            pbar.update(1)

            # Tabular 처리
            stage1_tabular_preds = self._get_tabular_predictions(
                df,
                df['진료과목코드'].values,
                train_idx,
                is_stage1=True,
                position=1  # 중첩된 진행바 위치 지정
            )
            pbar.update(1)

        # Stack Ensemble
        print("Performing Stage 1 Stack Ensemble...")
        stage1_features = np.hstack([stage1_text_preds, stage1_tabular_preds])
        self.stage1_stacker.fit(
            stage1_features[:train_idx],
            df['진료과목코드'].values[:train_idx]
        )
        stage1_predictions = self.stage1_stacker.predict(stage1_features[train_idx:])

        # Stage 1 평가
        stage1_metrics = evaluate_multiclass(
            df['진료과목코드'].values[train_idx:],
            stage1_predictions,
            prefix='dept_'
        )

        print("\n=== Stage 2: 주상병코드 예측 ===")
        # 2단계: 주상병코드 예측
        # 1단계 예측 결과를 데이터에 추가
        df_stage2 = df.copy()
        df_stage2.loc[train_idx:, '진료과목코드'] = stage1_predictions

        with tqdm(total=2,
                 desc="Stage 2 Processing",
                 leave=True,
                 position=0,
                 ncols=100,
                 bar_format='{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]'
                 ) as pbar:
            stage2_text_preds = self._get_text_predictions(
                df_stage2['증상'].values,
                df_stage2['주상병코드'].values,
                train_idx,
                is_stage1=False,
                position=1  # 중첩된 진행바 위치 지정
            )
            pbar.update(1)

            stage2_tabular_preds = self._get_tabular_predictions(
                df_stage2,
                df_stage2['주상병코드'].values,
                train_idx,
                is_stage1=False,
                position=1  # 중첩된 진행바 위치 지정
            )
            pbar.update(1)

        # Stack Ensemble
        print("Performing Stage 2 Stack Ensemble...")
        stage2_features = np.hstack([stage2_text_preds, stage2_tabular_preds])
        self.stage2_stacker.fit(
            stage2_features[:train_idx],
            df['주상병코드'].values[:train_idx]
        )
        stage2_predictions = self.stage2_stacker.predict(stage2_features[train_idx:])

        # Stage 2 평가
        stage2_metrics = evaluate_multiclass(
            df['주상병코드'].values[train_idx:],
            stage2_predictions,
            prefix='disease_'
        )

        # 결과 저장 및 분석
        results_df, final_metrics = save_and_analyze_results(
            df[train_idx:],
            stage1_predictions,
            stage2_predictions
        )

        return stage1_predictions, stage2_predictions, stage1_metrics, stage2_metrics

    def _get_text_predictions(self, texts, labels, train_idx, is_stage1=True, position=0):
        stage_name = "Stage 1" if is_stage1 else "Stage 2"
        print(f"\nProcessing {stage_name} Text Data with KM-BERT...")

        num_labels = len(np.unique(labels))
        model = KMBertModel(num_labels=num_labels)

        return model.train_and_predict(
            texts[:train_idx],
            labels[:train_idx],
            texts[train_idx:],
            labels[train_idx:],
            position=position  # tqdm 위치 전달
        )

    def _get_tabular_predictions(self, df, labels, train_idx, is_stage1=True, position=0):
        stage_name = "Stage 1" if is_stage1 else "Stage 2"
        print(f"\nProcessing {stage_name} Tabular Data...")

        categorical_cols = ['성별코드', '연령대코드']
        numeric_cols = ['요양일수', '입내원일수', '총처방일수']

        if not is_stage1:
            categorical_cols.append('진료과목코드')

        model = TabularModel()
        processed_data = model.preprocess(
            df[categorical_cols + numeric_cols],
            categorical_cols,
            numeric_cols
        )

        return model.train_and_predict(
            processed_data[:train_idx],
            labels[:train_idx],
            processed_data[train_idx:],
            position=position  # tqdm 위치 전달
        )

def evaluate_multiclass(y_true, y_pred, prefix=''):
    """다중분류 평가 메트릭 계산"""
    return {
        f'{prefix}accuracy': accuracy_score(y_true, y_pred),
        f'{prefix}macro_f1': f1_score(y_true, y_pred, average='macro'),
        f'{prefix}weighted_f1': f1_score(y_true, y_pred, average='weighted'),
        f'{prefix}macro_precision': precision_score(y_true, y_pred, average='macro'),
        f'{prefix}weighted_precision': precision_score(y_true, y_pred, average='weighted'),
        f'{prefix}macro_recall': recall_score(y_true, y_pred, average='macro'),
        f'{prefix}weighted_recall': recall_score(y_true, y_pred, average='weighted')
    }

def save_and_analyze_results(test_df, dept_predictions, disease_predictions):
    """예측 결과 저장 및 분석"""
    results_df = pd.DataFrame({
        'Original_Dept': test_df['진료과목코드'].values,
        'Predicted_Dept': dept_predictions,
        'Original_Disease': test_df['주상병코드'].values,
        'Predicted_Disease': disease_predictions
    })

    # 진료과목 평가
    dept_metrics = evaluate_multiclass(
        results_df['Original_Dept'],
        results_df['Predicted_Dept'],
        prefix='dept_'
    )

    # 주상병 평가
    disease_metrics = evaluate_multiclass(
        results_df['Original_Disease'],
        results_df['Predicted_Disease'],
        prefix='disease_'
    )

    # 결과 저장
    results_df.to_csv('prediction_results.csv', index=False)

    # 메트릭 출력
    print("\n=== Final Results ===")
    print("\n진료과목코드 예측 성능:")
    for metric, value in dept_metrics.items():
        print(f"{metric}: {value:.4f}")

    print("\n주상병코드 예측 성능:")
    for metric, value in disease_metrics.items():
        print(f"{metric}: {value:.4f}")

    # 메트릭 저장
    metrics_df = pd.DataFrame({
        **dept_metrics,
        **disease_metrics
    }, index=[0])
    metrics_df.to_csv('evaluation_metrics.csv', index=False)

    return results_df, {**dept_metrics, **disease_metrics}

In [None]:
def evaluate_multiclass(y_true, y_pred, prefix=''):
    """
    다중분류 평가 메트릭 계산
    """
    return {
        f'{prefix}accuracy': accuracy_score(y_true, y_pred),
        f'{prefix}macro_f1': f1_score(y_true, y_pred, average='macro'),
        f'{prefix}weighted_f1': f1_score(y_true, y_pred, average='weighted'),
        f'{prefix}macro_precision': precision_score(y_true, y_pred, average='macro'),
        f'{prefix}weighted_precision': precision_score(y_true, y_pred, average='weighted'),
        f'{prefix}macro_recall': recall_score(y_true, y_pred, average='macro'),
        f'{prefix}weighted_recall': recall_score(y_true, y_pred, average='weighted')
    }

In [None]:
def save_and_analyze_results(df, dept_predictions, disease_predictions):
    """
    예측 결과 저장 및 분석
    """
    # 결과 DataFrame 생성
    results_df = pd.DataFrame({
        'Original_Dept': df.loc[int(len(df)*0.8):, '진료과목코드'],
        'Predicted_Dept': dept_predictions,
        'Original_Disease': df.loc[int(len(df)*0.8):, '주상병코드'],
        'Predicted_Disease': disease_predictions
    })

    # 진료과목 예측 평가
    dept_metrics = evaluate_multiclass(
        results_df['Original_Dept'],
        results_df['Predicted_Dept'],
        prefix='dept_'
    )

    # 주상병 예측 평가
    disease_metrics = evaluate_multiclass(
        results_df['Original_Disease'],
        results_df['Predicted_Disease'],
        prefix='disease_'
    )

    # 오류 분석
    dept_errors = results_df[results_df['Original_Dept'] != results_df['Predicted_Dept']]
    disease_errors = results_df[results_df['Original_Disease'] != results_df['Predicted_Disease']]

    # 결과 저장
    results_df.to_csv('prediction_results.csv', index=False)

    # 메트릭 출력
    print("\n=== 진료과목 예측 성능 ===")
    for metric, value in dept_metrics.items():
        print(f"{metric}: {value:.4f}")

    print("\n=== 주상병 예측 성능 ===")
    for metric, value in disease_metrics.items():
        print(f"{metric}: {value:.4f}")

    print(f"\n=== 오류 분석 ===")
    print(f"진료과목 오류 케이스 수: {len(dept_errors)}")
    print(f"주상병 오류 케이스 수: {len(disease_errors)}")

    # 전체 메트릭 저장
    metrics_df = pd.DataFrame({
        **dept_metrics,
        **disease_metrics
    }, index=[0])
    metrics_df.to_csv('evaluation_metrics.csv', index=False)

    return results_df, dept_metrics, disease_metrics


In [None]:
def main():
    # 데이터 로드
    df = pd.read_csv("/content/drive/MyDrive/데이터/medical_data.csv", encoding = 'utf-8')

    # GPU 메모리 정리
    gc.collect()
    torch.cuda.empty_cache()

    # 2단계 예측 실행
    predictor = TwoStagePredictor()
    dept_predictions, disease_predictions = predictor.predict(df)

    # 결과 저장
    results_df = pd.DataFrame({
        'Original_Dept': df.loc[int(len(df)*0.8):, '진료과목코드'],
        'Predicted_Dept': dept_predictions,
        'Original_Disease': df.loc[int(len(df)*0.8):, '주상병코드'],
        'Predicted_Disease': disease_predictions
    })

    results_df.to_csv('prediction_results.csv', index=False)
    return results_df

if __name__ == "__main__":
    main()


=== Stage 1: 진료과목코드 예측 ===


Stage 1 Processing:   0%|                                                            | 0/2 [00:00<?]


Processing Stage 1 Text Data with KM-BERT...


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/104k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/679 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/395M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at madatnlp/km-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/5 [Train]: 100%|███████████████████████████| 1223/1223 [12:22<00:00,  1.65it/s, loss=2.0221]
Stage 1 Processing:  50%|████████████████████████████                            | 1/2 [13:34<13:34]


Processing Stage 1 Tabular Data...


모델 학습 중:   0%|                                            | 0/3 [00:00<?, ?it/s, model=xgboost]

[0]	validation_0-mlogloss:nan
[1]	validation_0-mlogloss:nan
[2]	validation_0-mlogloss:nan
[3]	validation_0-mlogloss:nan
[4]	validation_0-mlogloss:nan
[5]	validation_0-mlogloss:nan
[6]	validation_0-mlogloss:nan
[7]	validation_0-mlogloss:nan
[8]	validation_0-mlogloss:nan
[9]	validation_0-mlogloss:nan
[10]	validation_0-mlogloss:nan
[11]	validation_0-mlogloss:nan
[12]	validation_0-mlogloss:nan
[13]	validation_0-mlogloss:nan
[14]	validation_0-mlogloss:nan
[15]	validation_0-mlogloss:nan
[16]	validation_0-mlogloss:nan
[17]	validation_0-mlogloss:nan
[18]	validation_0-mlogloss:nan
[19]	validation_0-mlogloss:nan
[20]	validation_0-mlogloss:nan
[21]	validation_0-mlogloss:nan
[22]	validation_0-mlogloss:nan
[23]	validation_0-mlogloss:nan
[24]	validation_0-mlogloss:nan
[25]	validation_0-mlogloss:nan
[26]	validation_0-mlogloss:nan
[27]	validation_0-mlogloss:nan
[28]	validation_0-mlogloss:nan
[29]	validation_0-mlogloss:nan
[30]	validation_0-mlogloss:nan
[31]	validation_0-mlogloss:nan
[32]	validation_0-

모델 학습 중:  33%|███████████▋                       | 1/3 [00:04<00:08,  4.01s/it, model=lightgbm]
Stage 1 Processing:  50%|████████████████████████████                            | 1/2 [13:38<13:38]


ValueError: Input data must be 2 dimensional and non empty.

# 구현 완료

In [None]:
import re

# 데이터 전처리 및 준비
def preprocess_data(data):
    data.dropna(subset=['증상', '진료과목코드', '주상병코드'], inplace=True)
    return data

def clean_text(text):
    text = text.lower()  # 소문자 변환
    text = re.sub(r"[^가-힣a-zA-Z0-9\\s]", "", text)  # 특수문자 제거
    text = re.sub(r"\\s+", " ", text).strip()  # 공백 정리
    return text

# Custom Dataset 정의
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        tokens = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
            max_length=self.max_length
        )
        return {
            'input_ids': tokens['input_ids'].squeeze(0),
            'attention_mask': tokens['attention_mask'].squeeze(0),
            'label': torch.tensor(label, dtype=torch.long)
        }

# KM-BERT 임베딩 함수 (Batch 처리)
def get_embeddings_with_dataset(dataset, model, batch_size=64, num_workers=4):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    dataloader = DataLoader(dataset, batch_size=batch_size, num_workers=num_workers, shuffle=False)
    embeddings = []

    for batch in tqdm(dataloader, desc="Generating embeddings"):
        input_ids = batch["input_ids"].squeeze(1).to(device)
        attention_mask = batch["attention_mask"].squeeze(1).to(device)
        with torch.no_grad():
            output = model(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state[:, 0, :].cpu().numpy()
        embeddings.append(output)

    return np.vstack(embeddings)

# 모델 학습 및 평가 클래스 정의
class ModelTrainer:
    def __init__(self, model, train_loader, val_loader, test_loader, device, num_classes, num_epochs=10):
        self.model = model
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.test_loader = test_loader
        self.device = device
        self.num_classes = num_classes
        self.num_epochs = num_epochs

        # Optimizer and Scheduler
        self.optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
        num_training_steps = len(train_loader) * self.num_epochs
        num_warmup_steps = num_training_steps // 10
        self.scheduler = get_scheduler(
            "linear",
            optimizer=self.optimizer,
            num_warmup_steps=num_warmup_steps,
            num_training_steps=num_training_steps
        )

        # Loss function
        self.criterion = nn.CrossEntropyLoss()

    def train_epoch(self):
        """한 에폭의 학습을 수행하는 메서드"""
        self.model.train()
        total_loss = 0
        all_preds = []
        all_labels = []

        progress_bar = tqdm(self.train_loader, desc="Training")
        for batch in progress_bar:
            input_ids = batch['input_ids'].to(self.device)
            attention_mask = batch['attention_mask'].to(self.device)
            labels = batch['label'].to(self.device)

            self.optimizer.zero_grad()
            outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            loss = self.criterion(logits, labels)

            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=1.0)
            self.optimizer.step()
            self.scheduler.step()

            total_loss += loss.item()
            preds = torch.argmax(logits, dim=1).cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(labels.cpu().numpy())

            progress_bar.set_postfix({'loss': f'{loss.item():.4f}'})

        epoch_loss = total_loss / len(self.train_loader)
        epoch_accuracy = accuracy_score(all_labels, all_preds)
        epoch_f1 = f1_score(all_labels, all_preds, average='weighted')

        return {
            'loss': epoch_loss,
            'accuracy': epoch_accuracy,
            'f1': epoch_f1
        }

    def evaluate(self, dataloader, mode='val'):
        """Validation 또는 Test 평가 메서드"""
        self.model.eval()
        total_loss = 0
        all_preds = []
        all_labels = []

        with torch.no_grad():
            for batch in tqdm(dataloader, desc=f"Evaluating ({mode})"):
                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                labels = batch['label'].to(self.device)

                outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
                logits = outputs.logits
                loss = self.criterion(logits, labels)

                total_loss += loss.item()
                preds = torch.argmax(logits, dim=1)
                all_preds.extend(preds.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())

        avg_loss = total_loss / len(dataloader)
        accuracy = accuracy_score(all_labels, all_preds)
        f1 = f1_score(all_labels, all_preds, average='weighted')

        metrics = {
            f'{mode}_loss': avg_loss,
            f'{mode}_accuracy': accuracy,
            f'{mode}_f1': f1
        }

        return metrics

    def train(self):
        """전체 학습 수행"""
        for epoch in range(self.num_epochs):
            print(f"\nEpoch {epoch + 1}/{self.num_epochs}")

            # Training
            train_metrics = self.train_epoch()
            print(f"Training metrics: {train_metrics}")

            # Validation
            val_metrics = self.evaluate(self.val_loader, mode='val')
            print(f"Validation metrics: {val_metrics}")

        # 최종 Test 평가
        test_metrics = self.evaluate(self.test_loader, mode='test')
        print("\nFinal Test Results:", test_metrics)

        return test_metrics

In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

data = preprocess_data(df)

# Train-Test Split
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# 라벨 인코딩
label_encoder_diagnosis = LabelEncoder()
label_encoder_code = LabelEncoder()
train_data['진료과목코드'] = label_encoder_diagnosis.fit_transform(train_data['진료과목코드'])
test_data['진료과목코드'] = label_encoder_diagnosis.transform(test_data['진료과목코드'])
train_data['주상병코드'] = label_encoder_code.fit_transform(train_data['주상병코드'])
test_data['주상병코드'] = label_encoder_code.transform(test_data['주상병코드'])

In [None]:
#gpu 초기화
gc.collect()
torch.cuda.empty_cache()

In [None]:
# KM-BERT 모델 및 토크나이저 준비
tokenizer = AutoTokenizer.from_pretrained("snunlp/KR-BERT-char16424")
model = AutoModelForSequenceClassification.from_pretrained("madatnlp/km-bert", num_labels=len(label_encoder_diagnosis.classes_))

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# 텍스트 데이터셋 및 데이터로더
train_dataset = TextDataset(train_data['증상'].tolist(), train_data['진료과목코드'].tolist(), tokenizer)
test_dataset = TextDataset(test_data['증상'].tolist(), test_data['진료과목코드'].tolist(), tokenizer)

train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# 데이터 확인
for batch in train_dataloader:
    input_ids = batch['input_ids']
    attention_mask = batch['attention_mask']
    labels = batch['label']
    print(f"KM-BERT Input IDs shape: {input_ids.shape}")
    print(f"KM-BERT Attention Mask shape: {attention_mask.shape}")
    print(f"KM-BERT Labels shape: {labels.shape}")
    break

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at madatnlp/km-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


KM-BERT Input IDs shape: torch.Size([64, 512])
KM-BERT Attention Mask shape: torch.Size([64, 512])
KM-BERT Labels shape: torch.Size([64])


In [None]:
# ModelTrainer 초기화 및 학습
trainer = ModelTrainer(
    model=model,
    train_loader=train_dataloader,
    val_loader=test_dataloader,
    test_loader=test_dataloader,
    device=device,
    num_classes=len(label_encoder_diagnosis.classes_),
    num_epochs=1
)

trainer.train()


Epoch 1/1


Training: 100%|██████████| 612/612 [12:00<00:00,  1.18s/it, loss=2.2825]


Training metrics: {'loss': 2.2933085967902263, 'accuracy': 0.26773484616170906, 'f1': 0.22950174206191673}


Evaluating (val): 100%|██████████| 153/153 [01:01<00:00,  2.50it/s]


Validation metrics: {'val_loss': 2.1887322812298544, 'val_accuracy': 0.29479709700500867, 'val_f1': 0.23644439492580255}


Evaluating (test): 100%|██████████| 153/153 [01:01<00:00,  2.50it/s]


Final Test Results: {'test_loss': 2.1887322812298544, 'test_accuracy': 0.29479709700500867, 'test_f1': 0.23644439492580255}





{'test_loss': 2.1887322812298544,
 'test_accuracy': 0.29479709700500867,
 'test_f1': 0.23644439492580255}

In [None]:
# KM-BERT 전체 데이터셋 예측
def predict_kmbert(model, dataloader, device):
    model.eval()
    predictions = []
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Predicting with KM-BERT"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            probs = torch.softmax(logits, dim=1).cpu().numpy()
            predictions.append(probs)
    return np.vstack(predictions)

kmbert_train_probs = predict_kmbert(model, train_dataloader, device)
kmbert_test_probs = predict_kmbert(model, test_dataloader, device)

# KM-BERT 출력 크기 확인
print(f"KM-BERT Train Probs Shape: {kmbert_train_probs.shape}")
print(f"KM-BERT Test Probs Shape: {kmbert_test_probs.shape}")

Predicting with KM-BERT: 100%|██████████| 612/612 [04:05<00:00,  2.49it/s]
Predicting with KM-BERT: 100%|██████████| 153/153 [01:01<00:00,  2.50it/s]

KM-BERT Train Probs Shape: (39132, 18)
KM-BERT Test Probs Shape: (9783, 18)





In [None]:
# 1차 분류: XGBoost 학습
# 정형 데이터 준비
X_train_tabular = train_data[['성별코드', '연령대코드', '요양일수', '입내원일수', '총처방일수']]
X_test_tabular = test_data[['성별코드', '연령대코드', '요양일수', '입내원일수', '총처방일수']]

one_hot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
scaler = StandardScaler()

X_train_tabular = scaler.fit_transform(one_hot_encoder.fit_transform(X_train_tabular))
X_test_tabular = scaler.transform(one_hot_encoder.transform(X_test_tabular))

# 1차 분류: XGBoost
xgb_model = XGBClassifier()
xgb_model.fit(X_train_tabular, train_data['진료과목코드'])
xgb_train_probs = xgb_model.predict_proba(X_train_tabular)
xgb_test_probs = xgb_model.predict_proba(X_test_tabular)

# XGBoost 출력 크기 확인
print(f"XGBoost Train Probs Shape: {xgb_train_probs.shape}")
print(f"XGBoost Test Probs Shape: {xgb_test_probs.shape}")

XGBoost Train Probs Shape: (39132, 18)
XGBoost Test Probs Shape: (9783, 18)


In [None]:
# 1차 분류: Stack Ensemble
stack_train_input = np.hstack([kmbert_train_probs, xgb_train_probs])
stack_test_input = np.hstack([kmbert_test_probs, xgb_test_probs])

# Stack Ensemble 입력 크기 확인
print(f"Stack Train Input Shape: {stack_train_input.shape}")
print(f"Stack Test Input Shape: {stack_test_input.shape}")

stack_model = XGBClassifier()
stack_model.fit(stack_train_input, train_data['진료과목코드'])
stack_preds = stack_model.predict(stack_test_input)

# 1차 분류 성능 평가
print(f"1차 분류 Accuracy: {accuracy_score(test_data['진료과목코드'], stack_preds):.4f}")
# print("Classification Report for 1차 분류:")
# print(classification_report(test_data['진료과목코드'], stack_preds, target_names=label_encoder_diagnosis.classes_))

Stack Train Input Shape: (39132, 36)
Stack Test Input Shape: (9783, 36)
1차 분류 Accuracy: 0.2540


In [None]:
# 2차 분류: 주상병코드 예측
stack_input_2_train = np.hstack([stack_train_input, train_data['진료과목코드'].values.reshape(-1, 1)])
stack_input_2_test = np.hstack([stack_test_input, test_data['진료과목코드'].values.reshape(-1, 1)])

second_model = XGBClassifier()
second_model.fit(stack_input_2_train, train_data['주상병코드'])
y_second_pred = second_model.predict(stack_input_2_test)

print(f"2차 분류 Accuracy: {accuracy_score(test_data['주상병코드'], y_second_pred):.4f}")
# print("Classification Report for 2차 분류:")
# print(classification_report(test_data['주상병코드'], y_second_pred, target_names=label_encoder_code.classes_))

2차 분류 Accuracy: 0.3609


In [None]:
torch.save(model.state_dict(), "/content/drive/MyDrive/University/4-2/정보기술학회/data/model_filekmbert_finetuned_model.pt")

# 스택 모델 및 2차 모델 저장
joblib.dump(stack_model, "/content/drive/MyDrive/University/4-2/정보기술학회/data/model_filestack_model_1.pkl")
joblib.dump(second_model, "/content/drive/MyDrive/University/4-2/정보기술학회/data/model_filestack_model_2.pkl")

# XGB 모델 저장 (1차 분류용)
joblib.dump(xgb_model, "/content/drive/MyDrive/University/4-2/정보기술학회/data/model_filexgb_model_for_1st_stage.pkl")

# 인코더 및 스케일러 저장
joblib.dump(label_encoder_diagnosis, "/content/drive/MyDrive/University/4-2/정보기술학회/data/model_filelabel_encoder_diagnosis.pkl")
joblib.dump(label_encoder_code, "/content/drive/MyDrive/University/4-2/정보기술학회/data/model_filelabel_encoder_code.pkl")
joblib.dump(one_hot_encoder, "/content/drive/MyDrive/University/4-2/정보기술학회/data/model_fileonehot_encoder.pkl")
joblib.dump(scaler, "/content/drive/MyDrive/University/4-2/정보기술학회/data/model_filescaler.pkl")

print("모델 및 인코더 저장 완료.")

모델 및 인코더 저장 완료.
