In [None]:
# Mount Google Drive into the Colab filesystem so the project files below
# are reachable under /content/drive.
from google.colab import drive
drive.mount('/content/drive')
import pandas as pd


# Base directory of the project on the mounted drive. Other cells build paths
# by plain string concatenation (BASE_DIR + '...'), so the trailing slash is
# required.
BASE_DIR = '/content/drive/MyDrive/Dacon_FakeText/'


Mounted at /content/drive


In [None]:
!pip install iterative-stratification
!pip install pyarrow

Collecting iterative-stratification
  Downloading iterative_stratification-0.1.9-py3-none-any.whl.metadata (1.3 kB)
Downloading iterative_stratification-0.1.9-py3-none-any.whl (8.5 kB)
Installing collected packages: iterative-stratification
Successfully installed iterative-stratification-0.1.9


In [None]:
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from tqdm import tqdm
import os
import gc

In [None]:
# 1. Hyperparameters and runtime configuration
n_splits = 5               # number of cross-validation folds
batch_size = 512           # mini-batch size for the train and validation loaders
epochs = 30                # upper bound on epochs per fold
early_stopping_rounds = 3  # non-improving epochs tolerated before stopping

# Run on the GPU when one is visible to PyTorch, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [None]:
# 2. Load the precomputed embeddings and the paragraph-level metadata
embedding_path = BASE_DIR + 'data/embeddings/train_concat.npy'
meta_path = BASE_DIR + 'data/train_paragraph.csv'

X_train = np.load(embedding_path)
X_train = X_train.astype(np.float16)  # features are kept as float16
train_meta = pd.read_csv(meta_path)

# Every embedding row must line up with exactly one metadata row.
assert len(X_train) == len(train_meta), "X_train과 train_meta의 길이가 다릅니다."

# Training target and the per-row group key handed to the CV splitter.
y_train = train_meta['generated'].values.astype(np.float32)
groups = train_meta['title'].values

print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")

X_train shape: (1226364, 775), y_train shape: (1226364,)


In [None]:
# 3. PyTorch Dataset wrapper
class CustomDataset(Dataset):
    """Serve ``(features, label)`` pairs from two parallel in-memory arrays."""

    def __init__(self, X, y):
        # Both arrays are stored by reference; nothing is copied or converted.
        self.X = X
        self.y = y

    def __len__(self):
        """Return the number of samples, i.e. ``len(X)``."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a ``(features, label)`` tuple of tensors."""
        # from_numpy keeps the dtype of X; callers cast to float32 if needed.
        features = torch.from_numpy(self.X[idx])
        label = torch.tensor(self.y[idx], dtype=torch.float32)
        return features, label

In [None]:
class StratifiedGroupKFold:
    """Group-aware stratified K-fold splitter.

    Every group is assigned one binary label (1 when the mean of ``y`` over the
    group's rows is >= 0.5, else 0). ``StratifiedKFold`` is then run over the
    *groups*, and each group-level fold is expanded back to row indices, so all
    rows of a group always end up on the same side of a split.

    Args:
        n_splits: Number of folds, forwarded to ``StratifiedKFold``.
        shuffle: Whether groups are shuffled before splitting.
        random_state: Seed forwarded to ``StratifiedKFold``.
    """

    def __init__(self, n_splits=5, shuffle=True, random_state=None):
        self.n_splits = n_splits
        self.shuffle = shuffle
        self.random_state = random_state

    def split(self, X, y, groups):
        """Yield ``(train_indices, val_indices)`` row-index arrays per fold.

        Args:
            X: Unused; accepted so the call signature mirrors sklearn splitters.
            y: Array-like of per-row targets; averaged per group and
                thresholded at 0.5 to obtain the group label.
            groups: Array-like of per-row group keys (numeric or not).

        Raises:
            ValueError: If ``y`` and ``groups`` have different lengths.
        """
        # Coerce to ndarrays: the code below relies on `.dtype` and on
        # positional fancy indexing, which a list does not support and a
        # pandas Series would silently turn into label-based lookup.
        y = np.asarray(y)
        groups = np.asarray(groups)
        if len(y) != len(groups):
            raise ValueError(
                f"y and groups must have the same length "
                f"(got {len(y)} and {len(groups)})."
            )

        # Encode non-numeric group keys as integers.
        if not np.issubdtype(groups.dtype, np.number):
            groups = LabelEncoder().fit_transform(groups)

        # Collect the row indices of each group; the dict keeps groups in
        # order of first appearance, which fixes the order seen by the splitter.
        group_to_indices = {}
        for idx, g in enumerate(groups):
            group_to_indices.setdefault(g, []).append(idx)

        unique_groups = np.array(list(group_to_indices.keys()))
        # One label per group: majority vote over the group's rows.
        group_y = np.array([
            int(y[group_to_indices[g]].mean() >= 0.5) for g in unique_groups
        ])

        skf = StratifiedKFold(
            n_splits=self.n_splits,
            shuffle=self.shuffle,
            random_state=self.random_state
        )

        for group_train_idx, group_val_idx in skf.split(unique_groups, group_y):
            train_indices, val_indices = [], []

            # Expand group-level folds back to row-level indices.
            for gi in group_train_idx:
                train_indices.extend(group_to_indices[unique_groups[gi]])
            for gi in group_val_idx:
                val_indices.extend(group_to_indices[unique_groups[gi]])

            yield np.array(train_indices), np.array(val_indices)

In [None]:
# 4. MLP model
class MLP(nn.Module):
    """Feed-forward network that emits one sigmoid output per input row.

    Layout: input_dim -> 1024 -> 512 -> 256 -> 128 -> 64 -> 1, where every
    hidden Linear is followed by ReLU and Dropout, and the final Linear is
    followed by Sigmoid.
    """

    def __init__(self, input_dim):
        super().__init__()

        # (hidden width, dropout probability) of each hidden stage, in order.
        hidden_spec = [(1024, 0.4), (512, 0.4), (256, 0.3), (128, 0.3), (64, 0.2)]

        modules = []
        in_features = input_dim
        for out_features, drop_p in hidden_spec:
            modules.append(nn.Linear(in_features, out_features))
            modules.append(nn.ReLU())
            modules.append(nn.Dropout(drop_p))
            in_features = out_features

        # Output head: a single unit squashed into (0, 1).
        modules.append(nn.Linear(in_features, 1))
        modules.append(nn.Sigmoid())

        # Positional Sequential keeps parameter names as ``layers.<index>.*``.
        self.layers = nn.Sequential(*modules)

    def forward(self, x):
        """Run ``x`` through the stack and return a ``(batch, 1)`` tensor."""
        return self.layers(x)

In [None]:
# 5. Group-aware stratified K-fold training with per-epoch logging
# Use the configured fold count so the splitter and the progress messages agree.
sgkf = StratifiedGroupKFold(n_splits=n_splits, shuffle=True, random_state=42)
oof_preds = np.zeros(len(X_train))

all_logs = []  # per-epoch log records collected across all folds
y_train = y_train.astype(np.float32)

# torch.save does not create missing directories, so make sure the checkpoint
# folder exists before the first save.
model_dir = BASE_DIR + 'model/mlp_bce/'
os.makedirs(model_dir, exist_ok=True)

for fold, (train_idx, val_idx) in enumerate(sgkf.split(X_train, y_train, groups)):
    print(f'\n=== Fold {fold+1}/{n_splits} 학습 시작 ===')

    X_tr, y_tr = X_train[train_idx], y_train[train_idx]
    X_val, y_val = X_train[val_idx], y_train[val_idx]

    train_dataset = CustomDataset(X_tr, y_tr)
    val_dataset = CustomDataset(X_val, y_val)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=2, pin_memory=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=2, pin_memory=True)

    model = MLP(input_dim=X_train.shape[1]).to(device)
    criterion = nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

    # `verbose=True` is deprecated on PyTorch LR schedulers; LR changes are
    # reported explicitly after scheduler.step() below instead.
    # NOTE(review): the scheduler's patience (3) is not smaller than
    # early_stopping_rounds (3), so early stopping will usually fire before the
    # learning rate is ever reduced — confirm this is intended.
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        mode='max',           # validation AUC should increase
        factor=0.5,           # halve the learning rate on plateau
        patience=3,           # epochs without improvement before reducing
        min_lr=1e-6           # lower bound on the learning rate
    )

    # Start below any reachable AUC so the first epoch always checkpoints.
    best_auc = float('-inf')
    best_val_preds = None  # validation predictions of the best (saved) epoch
    patience_counter = 0
    fold_log = []  # per-epoch log records for this fold

    for epoch in range(epochs):
        model.train()
        train_loss = 0
        for xb, yb in train_loader:
            # Features are stored as float16; cast to float32 for the model.
            xb, yb = xb.to(device).float(), yb.to(device)
            optimizer.zero_grad()
            preds = model(xb)
            loss = criterion(preds, yb.view(-1, 1))
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        # Validation
        model.eval()
        val_batches = []
        with torch.no_grad():
            for xb, _ in val_loader:
                xb = xb.to(device).float()
                # reshape(-1) keeps a 1-D result even when the final batch
                # holds a single sample (a bare squeeze() would yield 0-d).
                val_batches.append(model(xb).reshape(-1).cpu().numpy())
        val_preds = np.concatenate(val_batches)

        auc = roc_auc_score(y_val, val_preds)
        avg_train_loss = train_loss / len(train_loader)

        print(f"Fold {fold+1} | Epoch {epoch+1} | Train Loss: {avg_train_loss:.4f} | Val AUC: {auc:.4f}")

        # Update the LR scheduler and report any learning-rate change.
        lr_before = optimizer.param_groups[0]['lr']
        scheduler.step(auc)
        lr_after = optimizer.param_groups[0]['lr']
        if lr_after != lr_before:
            print(f"Fold {fold+1} | LR reduced: {lr_before:.2e} -> {lr_after:.2e}")

        # Record this epoch
        fold_log.append({
            'fold': fold + 1,
            'epoch': epoch + 1,
            'train_loss': avg_train_loss,
            'val_auc': auc
        })

        # Early stopping / checkpointing
        if auc > best_auc:
            best_auc = auc
            best_val_preds = val_preds  # keep predictions matching the checkpoint
            patience_counter = 0
            torch.save(model.state_dict(), model_dir + f'mlp_fold{fold+1}.pt')
            print(f"Fold {fold+1} 모델 저장 (Best AUC: {best_auc:.4f})")
        else:
            patience_counter += 1
            if patience_counter >= early_stopping_rounds:
                print(f"Early Stopping (patience {early_stopping_rounds} 도달)")
                break

    # Collect this fold's log records
    all_logs.extend(fold_log)

    # Out-of-fold predictions come from the best epoch (the saved checkpoint),
    # not from whichever epoch happened to run last before stopping.
    if best_val_preds is not None:
        oof_preds[val_idx] = best_val_preds

    del model, train_loader, val_loader, train_dataset, val_dataset
    torch.cuda.empty_cache()
    gc.collect()




=== Fold 1/5 학습 시작 ===




Fold 1 | Epoch 1 | Train Loss: 0.2461 | Val AUC: 0.6958
✅ Fold 1 모델 저장 (Best AUC: 0.6958)
Fold 1 | Epoch 2 | Train Loss: 0.2277 | Val AUC: 0.7024
✅ Fold 1 모델 저장 (Best AUC: 0.7024)
Fold 1 | Epoch 3 | Train Loss: 0.2247 | Val AUC: 0.7024
Fold 1 | Epoch 4 | Train Loss: 0.2224 | Val AUC: 0.7062
✅ Fold 1 모델 저장 (Best AUC: 0.7062)
Fold 1 | Epoch 5 | Train Loss: 0.2202 | Val AUC: 0.7097
✅ Fold 1 모델 저장 (Best AUC: 0.7097)
Fold 1 | Epoch 6 | Train Loss: 0.2181 | Val AUC: 0.7095
Fold 1 | Epoch 7 | Train Loss: 0.2167 | Val AUC: 0.7099
✅ Fold 1 모델 저장 (Best AUC: 0.7099)
Fold 1 | Epoch 8 | Train Loss: 0.2147 | Val AUC: 0.7104
✅ Fold 1 모델 저장 (Best AUC: 0.7104)
Fold 1 | Epoch 9 | Train Loss: 0.2134 | Val AUC: 0.7094
Fold 1 | Epoch 10 | Train Loss: 0.2113 | Val AUC: 0.7123
✅ Fold 1 모델 저장 (Best AUC: 0.7123)
Fold 1 | Epoch 11 | Train Loss: 0.2099 | Val AUC: 0.7088
Fold 1 | Epoch 12 | Train Loss: 0.2086 | Val AUC: 0.7107
Fold 1 | Epoch 13 | Train Loss: 0.2066 | Val AUC: 0.7083
❌ Early Stopping (patience 3 도



Fold 2 | Epoch 1 | Train Loss: 0.2474 | Val AUC: 0.6939
✅ Fold 2 모델 저장 (Best AUC: 0.6939)
Fold 2 | Epoch 2 | Train Loss: 0.2287 | Val AUC: 0.6975
✅ Fold 2 모델 저장 (Best AUC: 0.6975)
Fold 2 | Epoch 3 | Train Loss: 0.2257 | Val AUC: 0.6988
✅ Fold 2 모델 저장 (Best AUC: 0.6988)
Fold 2 | Epoch 4 | Train Loss: 0.2235 | Val AUC: 0.7025
✅ Fold 2 모델 저장 (Best AUC: 0.7025)
Fold 2 | Epoch 5 | Train Loss: 0.2215 | Val AUC: 0.7041
✅ Fold 2 모델 저장 (Best AUC: 0.7041)
Fold 2 | Epoch 6 | Train Loss: 0.2197 | Val AUC: 0.7025
Fold 2 | Epoch 7 | Train Loss: 0.2179 | Val AUC: 0.7055
✅ Fold 2 모델 저장 (Best AUC: 0.7055)
Fold 2 | Epoch 8 | Train Loss: 0.2159 | Val AUC: 0.7046
Fold 2 | Epoch 9 | Train Loss: 0.2142 | Val AUC: 0.7063
✅ Fold 2 모델 저장 (Best AUC: 0.7063)
Fold 2 | Epoch 10 | Train Loss: 0.2128 | Val AUC: 0.7016
Fold 2 | Epoch 11 | Train Loss: 0.2111 | Val AUC: 0.7024
Fold 2 | Epoch 12 | Train Loss: 0.2094 | Val AUC: 0.7095
✅ Fold 2 모델 저장 (Best AUC: 0.7095)
Fold 2 | Epoch 13 | Train Loss: 0.2079 | Val AUC: 0.7



Fold 3 | Epoch 1 | Train Loss: 0.2476 | Val AUC: 0.7001
✅ Fold 3 모델 저장 (Best AUC: 0.7001)
Fold 3 | Epoch 2 | Train Loss: 0.2301 | Val AUC: 0.7059
✅ Fold 3 모델 저장 (Best AUC: 0.7059)
Fold 3 | Epoch 3 | Train Loss: 0.2265 | Val AUC: 0.7059
Fold 3 | Epoch 4 | Train Loss: 0.2246 | Val AUC: 0.7124
✅ Fold 3 모델 저장 (Best AUC: 0.7124)
Fold 3 | Epoch 5 | Train Loss: 0.2227 | Val AUC: 0.7109
Fold 3 | Epoch 6 | Train Loss: 0.2209 | Val AUC: 0.7124
Fold 3 | Epoch 7 | Train Loss: 0.2192 | Val AUC: 0.7166
✅ Fold 3 모델 저장 (Best AUC: 0.7166)
Fold 3 | Epoch 8 | Train Loss: 0.2174 | Val AUC: 0.7133
Fold 3 | Epoch 9 | Train Loss: 0.2160 | Val AUC: 0.7080
Fold 3 | Epoch 10 | Train Loss: 0.2145 | Val AUC: 0.7118
❌ Early Stopping (patience 3 도달)

=== Fold 4/5 학습 시작 ===




Fold 4 | Epoch 1 | Train Loss: 0.2460 | Val AUC: 0.7011
✅ Fold 4 모델 저장 (Best AUC: 0.7011)
Fold 4 | Epoch 2 | Train Loss: 0.2282 | Val AUC: 0.7091
✅ Fold 4 모델 저장 (Best AUC: 0.7091)
Fold 4 | Epoch 3 | Train Loss: 0.2250 | Val AUC: 0.6995
Fold 4 | Epoch 4 | Train Loss: 0.2229 | Val AUC: 0.7028
Fold 4 | Epoch 5 | Train Loss: 0.2210 | Val AUC: 0.7082
❌ Early Stopping (patience 3 도달)

=== Fold 5/5 학습 시작 ===




Fold 5 | Epoch 1 | Train Loss: 0.2440 | Val AUC: 0.7029
✅ Fold 5 모델 저장 (Best AUC: 0.7029)
Fold 5 | Epoch 2 | Train Loss: 0.2269 | Val AUC: 0.6996
Fold 5 | Epoch 3 | Train Loss: 0.2239 | Val AUC: 0.7073
✅ Fold 5 모델 저장 (Best AUC: 0.7073)
Fold 5 | Epoch 4 | Train Loss: 0.2218 | Val AUC: 0.7092
✅ Fold 5 모델 저장 (Best AUC: 0.7092)
Fold 5 | Epoch 5 | Train Loss: 0.2198 | Val AUC: 0.7100
✅ Fold 5 모델 저장 (Best AUC: 0.7100)
Fold 5 | Epoch 6 | Train Loss: 0.2185 | Val AUC: 0.7157
✅ Fold 5 모델 저장 (Best AUC: 0.7157)
Fold 5 | Epoch 7 | Train Loss: 0.2167 | Val AUC: 0.7180
✅ Fold 5 모델 저장 (Best AUC: 0.7180)
Fold 5 | Epoch 8 | Train Loss: 0.2148 | Val AUC: 0.7135
Fold 5 | Epoch 9 | Train Loss: 0.2133 | Val AUC: 0.7137
Fold 5 | Epoch 10 | Train Loss: 0.2118 | Val AUC: 0.7133
❌ Early Stopping (patience 3 도달)


In [None]:
# 6. Overall out-of-fold AUC
final_auc = roc_auc_score(y_train, oof_preds)
print(f"\n전체 OOF AUC: {final_auc:.4f}")

# 6-1. Append the overall OOF result to the log as a sentinel row
#      (fold=0, epoch=0, no train loss).
log_df = pd.DataFrame(all_logs)
log_df = pd.concat([
    log_df,
    pd.DataFrame([{
        'fold': 0,
        'epoch': 0,
        'train_loss': np.nan,
        'val_auc': final_auc
    }])
], ignore_index=True)

# 7. Save the full training log as CSV.
# A single path variable feeds both the write and the confirmation message, so
# the message always names the file that was actually written.
log_path = BASE_DIR + 'logs/deep_mlp_training_log_5.csv'
os.makedirs(os.path.dirname(log_path), exist_ok=True)
log_df.to_csv(log_path, index=False, encoding='utf-8')
print(f"전체 학습 로그 저장 완료: {log_path}")


✅ 전체 OOF AUC: 0.7039
✅ 전체 학습 로그 저장 완료: /content/drive/MyDrive/Dacon_FakeText/logs/deep_mlp_training_log_1.csv
