In [None]:
# Colab setup: mount Google Drive, then derive the project paths from it.
import pandas as pd
from google.colab import drive

drive.mount('/content/drive')

# Project root on the mounted drive (trailing slash is relied on by later concatenations)
BASE_DIR = '/content/drive/MyDrive/Dacon_FakeText/'
# Path of the train_concat.npy embeddings file under the project root
SAVE_PATH = f'{BASE_DIR}data/embeddings/train_concat.npy'

Mounted at /content/drive


In [None]:
!pip install iterative-stratification

Collecting iterative-stratification
  Downloading iterative_stratification-0.1.9-py3-none-any.whl.metadata (1.3 kB)
Downloading iterative_stratification-0.1.9-py3-none-any.whl (8.5 kB)
Installing collected packages: iterative-stratification
Successfully installed iterative-stratification-0.1.9


In [None]:
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from tqdm import tqdm
import os

In [None]:
# 1. Run configuration
n_splits = 5                 # number of cross-validation folds
batch_size = 512             # mini-batch size for the train and validation loaders
epochs = 30                  # maximum number of epochs per fold
early_stopping_rounds = 4    # epochs without a new best validation AUC before a fold stops
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Seed the global RNGs so weight initialisation, dropout masks and DataLoader
# shuffling are repeatable between runs (GPU kernels may still be nondeterministic).
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)

In [None]:
# 2. Load data
# Embedding matrix (one row per sample) and the metadata CSV it is indexed against.
X_train = np.load(BASE_DIR + 'data/embeddings/train_concat.npy')
train_meta = pd.read_csv(BASE_DIR + 'data/train_paragraph.csv')

# Labels and groups are taken from the CSV and later indexed with the same row
# positions as X_train, so a row-count mismatch must fail here rather than
# surface as an index error (or silent misalignment) during training.
if len(X_train) != len(train_meta):
    raise ValueError(
        f"Row count mismatch: embeddings have {len(X_train)} rows, "
        f"metadata has {len(train_meta)} rows"
    )

y_train = train_meta['generated'].values   # binary target column
groups = train_meta['title'].values        # grouping key passed to the splitter

print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")

X_train shape: (1226364, 775), y_train shape: (1226364,)


In [None]:
# 3. PyTorch Dataset class
class CustomDataset(Dataset):
    """In-memory dataset of (features, label) pairs stored as float32 tensors.

    Args:
        X: array-like of features; the first dimension indexes samples.
        y: array-like of labels with the same number of entries as ``X``.

    Raises:
        ValueError: if ``X`` and ``y`` do not contain the same number of samples.
    """

    def __init__(self, X, y):
        # Fail early: a mismatch would otherwise only show up as an IndexError
        # (or silently unused rows) once a DataLoader starts iterating.
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same length, got {len(X)} and {len(y)}"
            )
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)

    def __len__(self):
        """Return the number of samples."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair at position ``idx``."""
        return self.X[idx], self.y[idx]


In [None]:
# Focal loss computed from raw logits
class FocalLossWithLogits(nn.Module):
    """Binary focal loss that takes raw (pre-sigmoid) logits.

    Per element the loss is ``alpha * (1 - p_t) ** gamma * BCE(logit, target)``,
    where ``p_t`` is the predicted probability assigned to the target class.
    Note that ``alpha`` is applied as one constant factor to every element; it is
    not switched between the positive and negative class.

    Args:
        alpha: constant scale applied to every element of the loss.
        gamma: focusing exponent; larger values down-weight well-classified samples.
        reduction: ``'mean'``, ``'sum'`` or ``'none'`` (return the per-element loss).

    Raises:
        ValueError: if ``reduction`` is not one of the supported values.
    """

    _REDUCTIONS = ('mean', 'sum', 'none')

    def __init__(self, alpha=0.25, gamma=2.0, reduction='mean'):
        super(FocalLossWithLogits, self).__init__()
        # Previously any value other than 'mean' silently fell through to a sum.
        if reduction not in self._REDUCTIONS:
            raise ValueError(
                f"reduction must be one of {self._REDUCTIONS}, got {reduction!r}"
            )
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, logits, targets):
        """Compute the focal loss.

        Args:
            logits: raw model outputs, one per sample (any shape with N elements).
            targets: float labels in [0, 1], one per sample.

        Returns:
            A scalar tensor for ``'mean'``/``'sum'``, or an ``(N, 1)`` tensor for ``'none'``.
        """
        # Bring both operands to (N, 1) so 1-D logits are accepted as well.
        logits = logits.reshape(-1, 1)
        targets = targets.reshape(-1, 1)

        # binary_cross_entropy_with_logits applies the sigmoid internally.
        bce_loss = F.binary_cross_entropy_with_logits(logits, targets, reduction='none')
        pred_prob = torch.sigmoid(logits)
        # Probability the model assigns to the target class.
        pt = pred_prob * targets + (1 - pred_prob) * (1 - targets)
        focal_loss = self.alpha * (1 - pt) ** self.gamma * bce_loss

        if self.reduction == 'mean':
            return focal_loss.mean()
        if self.reduction == 'sum':
            return focal_loss.sum()
        return focal_loss

In [None]:
class StratifiedGroupKFold:
    """K-fold splitter that keeps each group in one fold and stratifies by group label.

    Every group is assigned a single binary label (1 when the mean of its members'
    labels is >= 0.5, else 0). The groups are then divided with ``StratifiedKFold``
    on those labels, and each group's sample indices follow it into train or validation.

    Args:
        n_splits: number of folds.
        shuffle: forwarded to ``StratifiedKFold``.
        random_state: forwarded to ``StratifiedKFold``.
    """

    def __init__(self, n_splits=5, shuffle=True, random_state=None):
        self.n_splits = n_splits
        self.shuffle = shuffle
        self.random_state = random_state

    def split(self, X, y, groups):
        """Yield ``(train_indices, val_indices)`` arrays of sample positions.

        Args:
            X: unused; accepted for signature parity with other splitters.
            y: binary labels, one per sample.
            groups: group key per sample (numeric or not).

        Raises:
            ValueError: if ``y`` and ``groups`` have different lengths.
        """
        # Accept lists and other sequences, not only NumPy arrays.
        y = np.asarray(y)
        groups = np.asarray(groups)
        if len(y) != len(groups):
            raise ValueError(
                f"y and groups must have the same length, got {len(y)} and {len(groups)}"
            )

        # Encode non-numeric group keys as integers.
        if not np.issubdtype(groups.dtype, np.number):
            groups = LabelEncoder().fit_transform(groups)

        # Sample positions per group, kept in first-appearance order so the
        # fold assignment for a given random_state stays the same as before.
        group_to_indices = {}
        for idx, g in enumerate(groups):
            group_to_indices.setdefault(g, []).append(idx)

        unique_groups = np.array(list(group_to_indices.keys()))
        index_lists = list(group_to_indices.values())  # aligned with unique_groups

        # One label per group: the majority label of its members.
        group_y = np.array([
            int(y[members].mean() >= 0.5) for members in index_lists
        ])

        skf = StratifiedKFold(
            n_splits=self.n_splits,
            shuffle=self.shuffle,
            random_state=self.random_state
        )

        for group_train_idx, group_val_idx in skf.split(unique_groups, group_y):
            train_indices, val_indices = [], []

            for gi in group_train_idx:
                train_indices.extend(index_lists[gi])
            for gi in group_val_idx:
                val_indices.extend(index_lists[gi])

            yield np.array(train_indices), np.array(val_indices)

In [None]:
# 4. MLP model class
class MLP(nn.Module):
    """Feed-forward binary classifier that outputs a single raw logit per sample.

    Five hidden stages (Linear -> ReLU -> Dropout) narrow the representation from
    ``input_dim`` down to 64 units, followed by a final Linear layer with one output.
    """

    def __init__(self, input_dim):
        super().__init__()

        # (output width, dropout probability) of each hidden stage, in order
        hidden_spec = [(1024, 0.4), (512, 0.4), (256, 0.3), (128, 0.3), (64, 0.2)]

        stack = []
        width_in = input_dim
        for width_out, drop_p in hidden_spec:
            stack += [nn.Linear(width_in, width_out), nn.ReLU(), nn.Dropout(drop_p)]
            width_in = width_out
        # Output head: one logit, no activation
        stack.append(nn.Linear(width_in, 1))

        self.layers = nn.Sequential(*stack)

    def forward(self, x):
        """Run ``x`` through the layer stack and return the logits."""
        logits = self.layers(x)
        return logits

In [None]:
# NOTE(review): `optimizer` is not defined in any cell above this one; the only
# visible assignment is inside the per-fold training loop further down. On a fresh
# kernel this cell therefore raises NameError, and when it does run it wraps
# whichever optimizer object happened to be bound at that moment.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='max',           # the stepped metric (val AUC per the original note) is higher-is-better
    factor=0.5,           # halve the learning rate on each reduction
    patience=3,           # non-improving epochs tolerated before the LR is reduced
    verbose=True,
    min_lr=1e-5           # lower bound on the learning rate
)



In [None]:
# 5. Stratified group K-fold training with per-epoch logging
sgkf = StratifiedGroupKFold(n_splits=n_splits, shuffle=True, random_state=42)
oof_preds = np.zeros(len(X_train))

all_logs = []  # per-epoch log records across all folds
y_train = y_train.astype(np.float32)

# torch.save does not create missing directories.
os.makedirs(BASE_DIR + 'model', exist_ok=True)

for fold, (train_idx, val_idx) in enumerate(sgkf.split(X_train, y_train, groups)):
    print(f'\n=== Fold {fold+1}/{n_splits} 학습 시작 ===')

    X_tr, X_val = X_train[train_idx], X_train[val_idx]
    y_tr, y_val = y_train[train_idx], y_train[val_idx]

    train_dataset = CustomDataset(X_tr, y_tr)
    val_dataset = CustomDataset(X_val, y_val)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    model = MLP(input_dim=X_train.shape[1]).to(device)
    criterion = FocalLossWithLogits(alpha=0.25, gamma=2.0)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-4)

    # The scheduler must wrap THIS fold's optimizer. A scheduler built once outside
    # the loop would keep adjusting a stale optimizer and carry its plateau
    # counters over from one fold to the next.
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        mode='max',      # validation AUC is higher-is-better
        factor=0.5,      # halve the learning rate on each reduction
        patience=3,      # non-improving epochs tolerated before reducing
        min_lr=1e-5      # lower bound on the learning rate
    )

    best_auc = float('-inf')
    best_val_preds = None   # validation predictions of the best (saved) epoch
    stale_epochs = 0        # consecutive epochs without a new best AUC
    fold_log = []           # per-epoch log records of this fold

    for epoch in range(epochs):
        model.train()
        train_loss = 0
        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device)
            optimizer.zero_grad()
            preds = model(xb)
            loss = criterion(preds, yb.view(-1,1))
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        # Validation
        model.eval()
        val_batches = []
        with torch.no_grad():
            for xb, _ in val_loader:
                # view(-1) keeps a 1-D result even when the last batch holds one sample
                logits = model(xb.to(device)).view(-1)
                val_batches.append(torch.sigmoid(logits).cpu().numpy())
        val_preds = np.concatenate(val_batches)

        auc = roc_auc_score(y_val, val_preds)
        avg_train_loss = train_loss / len(train_loader)

        print(f"Fold {fold+1} | Epoch {epoch+1} | Train Loss: {avg_train_loss:.4f} | Val AUC: {auc:.4f}")

        # LR scheduler update; report reductions explicitly
        lr_before = optimizer.param_groups[0]['lr']
        scheduler.step(auc)
        lr_after = optimizer.param_groups[0]['lr']
        if lr_after < lr_before:
            print(f"Fold {fold+1} | Epoch {epoch+1} | LR reduced: {lr_before:.2e} -> {lr_after:.2e}")

        # Record this epoch
        fold_log.append({
            'fold': fold + 1,
            'epoch': epoch + 1,
            'train_loss': avg_train_loss,
            'val_auc': auc
        })

        # Early stopping / checkpointing
        if auc > best_auc:
            best_auc = auc
            best_val_preds = val_preds
            stale_epochs = 0
            torch.save(model.state_dict(), BASE_DIR + f'model/mlp_fold{fold+1}.pt')
            print(f"Fold {fold+1} 모델 저장 (Best AUC: {best_auc:.4f})")
        else:
            stale_epochs += 1
            if stale_epochs >= early_stopping_rounds:
                print(f"Early Stopping (patience {early_stopping_rounds} 도달)")
                break

    # Collect this fold's log records
    all_logs.extend(fold_log)

    # Out-of-fold predictions come from the best epoch, i.e. the checkpoint that
    # was saved, not from whatever epoch training happened to stop on.
    if best_val_preds is not None:
        oof_preds[val_idx] = best_val_preds




=== Fold 1/5 학습 시작 ===
Fold 1 | Epoch 1 | Train Loss: 0.0187 | Val AUC: 0.6951
✅ Fold 1 모델 저장 (Best AUC: 0.6951)
Fold 1 | Epoch 2 | Train Loss: 0.0163 | Val AUC: 0.7010
✅ Fold 1 모델 저장 (Best AUC: 0.7010)
Fold 1 | Epoch 3 | Train Loss: 0.0159 | Val AUC: 0.7099
✅ Fold 1 모델 저장 (Best AUC: 0.7099)
Fold 1 | Epoch 4 | Train Loss: 0.0157 | Val AUC: 0.7137
✅ Fold 1 모델 저장 (Best AUC: 0.7137)
Fold 1 | Epoch 5 | Train Loss: 0.0156 | Val AUC: 0.7143
✅ Fold 1 모델 저장 (Best AUC: 0.7143)
Fold 1 | Epoch 6 | Train Loss: 0.0156 | Val AUC: 0.7166
✅ Fold 1 모델 저장 (Best AUC: 0.7166)
Fold 1 | Epoch 7 | Train Loss: 0.0156 | Val AUC: 0.7201
✅ Fold 1 모델 저장 (Best AUC: 0.7201)
Fold 1 | Epoch 8 | Train Loss: 0.0155 | Val AUC: 0.7236
✅ Fold 1 모델 저장 (Best AUC: 0.7236)
Fold 1 | Epoch 9 | Train Loss: 0.0155 | Val AUC: 0.7217
Fold 1 | Epoch 10 | Train Loss: 0.0154 | Val AUC: 0.7238
✅ Fold 1 모델 저장 (Best AUC: 0.7238)
Fold 1 | Epoch 11 | Train Loss: 0.0154 | Val AUC: 0.7250
✅ Fold 1 모델 저장 (Best AUC: 0.7250)
Fold 1 | Epoch 12 

In [None]:
# 6. Overall out-of-fold AUC
final_auc = roc_auc_score(y_train, oof_preds)
print(f"\n전체 OOF AUC: {final_auc:.4f}")

# 6-1. Add the overall OOF result to the log as a summary row (fold 0 / epoch 0).
# The frame is built in one shot from a new list: this avoids concatenating a
# frame whose train_loss column is entirely missing, and leaves all_logs
# untouched so the cell can be re-run safely.
summary_row = {
    'fold': 0,
    'epoch': 0,
    'train_loss': np.nan,
    'val_auc': final_auc
}
log_df = pd.DataFrame(all_logs + [summary_row])

# 7. Save the final log as CSV (create the target directory if it is missing)
os.makedirs(BASE_DIR + 'logs', exist_ok=True)
log_df.to_csv(BASE_DIR + 'logs/deep_mlp_training_log_1.csv', index=False, encoding='utf-8')
print(f"전체 학습 로그 저장 완료: {BASE_DIR}logs/deep_mlp_training_log_1.csv")


✅ 전체 OOF AUC: 0.7298
✅ 전체 학습 로그 저장 완료: /content/drive/MyDrive/Dacon_FakeText/logs/deep_mlp_training_log_1.csv


  log_df = pd.concat([
