In [None]:
# Attach Google Drive to the Colab runtime; it is mounted at /content/drive,
# which is the prefix of the data and model paths used in this notebook.
from google.colab import drive
drive.mount('/content/drive')
import pandas as pd


# Base path settings
BASE_DIR = '/content/drive/MyDrive/Dacon_FakeText/'
# Path of the train_concat.npy embedding file underneath BASE_DIR.
SAVE_PATH = f'{BASE_DIR}data/embeddings/train_concat.npy'

Mounted at /content/drive


In [None]:
import numpy as np
from sklearn.model_selection import GroupKFold
from sklearn.metrics import roc_auc_score
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import os

In [None]:
# 1. Settings
SEED = 42                  # fixed seed for the NumPy / PyTorch global generators
n_splits = 5               # number of GroupKFold folds
batch_size = 256           # samples per mini-batch for the train and validation loaders
epochs = 30                # upper bound on training epochs per fold
early_stopping_rounds = 3  # non-improving epochs tolerated before a fold stops
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Seed once, up front, so that weight initialisation, dropout masks and
# DataLoader shuffling are repeatable after a kernel restart.
# NOTE: some CUDA kernels may still be non-deterministic; this does not force
# deterministic algorithms.
np.random.seed(SEED)
torch.manual_seed(SEED)

In [None]:
# 2. Load data
embeddings_file = BASE_DIR + 'data/embeddings/train_concat.npy'
metadata_file = BASE_DIR + 'data/train_paragraph.csv'

X_train = np.load(embeddings_file)
train_meta = pd.read_csv(metadata_file)

# Labels come from the 'generated' column; 'title' supplies the grouping key
# used for the group-aware cross-validation split.
y_train, groups = (train_meta[column].values for column in ('generated', 'title'))

print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")

X_train shape: (1226364, 775), y_train shape: (1226364,)


In [None]:
# 3. PyTorch Dataset class
class CustomDataset(Dataset):
    """In-memory dataset that pairs each feature row with its label.

    Both inputs are converted to float32 tensors once, at construction time,
    and are exposed as the attributes ``X`` and ``y``.
    """

    def __init__(self, X, y):
        """Store ``X`` (features) and ``y`` (labels) as float32 tensors."""
        as_float32 = {'dtype': torch.float32}
        self.X = torch.tensor(X, **as_float32)
        self.y = torch.tensor(y, **as_float32)

    def __len__(self):
        """Return the number of samples, i.e. the length of the feature tensor."""
        n_samples = len(self.X)
        return n_samples

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at position ``idx``."""
        features = self.X[idx]
        label = self.y[idx]
        return features, label


In [None]:
# 4. MLP model class
class MLP(nn.Module):
    """Feed-forward binary classifier: input_dim -> 512 -> 256 -> 1.

    Each hidden Linear layer is followed by ReLU and Dropout(0.2). The single
    output unit passes through a sigmoid, so ``forward`` returns values in
    [0, 1] with a trailing dimension of size 1.
    """

    def __init__(self, input_dim):
        """Build the network for inputs with ``input_dim`` features."""
        super().__init__()
        first_hidden = [nn.Linear(input_dim, 512), nn.ReLU(), nn.Dropout(0.2)]
        second_hidden = [nn.Linear(512, 256), nn.ReLU(), nn.Dropout(0.2)]
        output_head = [nn.Linear(256, 1), nn.Sigmoid()]
        # One flat Sequential, so the parameter names remain
        # layers.0.*, layers.3.* and layers.6.* in the state dict.
        self.layers = nn.Sequential(*first_hidden, *second_hidden, *output_head)

    def forward(self, x):
        """Run ``x`` through the stack and return the sigmoid outputs."""
        probabilities = self.layers(x)
        return probabilities

In [None]:
# 5. GroupKFold + training + log collection
# Folds are split on `groups`, so rows sharing a group value never land on both
# the training and the validation side of the same fold.
gkf = GroupKFold(n_splits=n_splits)
oof_preds = np.zeros(len(X_train))

all_logs = []  # per-epoch log records gathered from every fold

# torch.save does not create missing directories, so make sure the target exists.
os.makedirs(BASE_DIR + 'model', exist_ok=True)

for fold, (train_idx, val_idx) in enumerate(gkf.split(X_train, y_train, groups)):
    print(f'\n=== Fold {fold+1}/{n_splits} 학습 시작 ===')

    X_tr, X_val = X_train[train_idx], X_train[val_idx]
    y_tr, y_val = y_train[train_idx], y_train[val_idx]

    train_dataset = CustomDataset(X_tr, y_tr)
    val_dataset = CustomDataset(X_val, y_val)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    # A fresh model and optimizer per fold: nothing is carried over between folds.
    model = MLP(input_dim=X_train.shape[1]).to(device)
    criterion = nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

    best_auc = 0
    best_val_preds = None  # validation predictions from the best-AUC epoch
    patience = 0
    fold_log = []  # per-epoch log records for this fold

    for epoch in range(epochs):
        model.train()
        train_loss = 0
        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device)
            optimizer.zero_grad()
            # squeeze(1) removes only the output-feature axis. A bare squeeze()
            # would also collapse a final batch of one sample to a 0-d tensor,
            # which no longer matches yb's shape and makes BCELoss raise.
            preds = model(xb).squeeze(1)
            loss = criterion(preds, yb)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        # Validation
        model.eval()
        val_batches = []
        with torch.no_grad():
            for xb, _ in val_loader:
                xb = xb.to(device)
                pred = model(xb).squeeze(1)
                val_batches.append(pred.cpu().numpy())
        # val_loader is not shuffled, so the concatenated order matches y_val.
        val_preds = np.concatenate(val_batches)

        auc = roc_auc_score(y_val, val_preds)
        # Mean of the per-batch mean losses (each batch weighted equally).
        avg_train_loss = train_loss / len(train_loader)

        print(f"Fold {fold+1} | Epoch {epoch+1} | Train Loss: {avg_train_loss:.4f} | Val AUC: {auc:.4f}")

        # Record this epoch
        fold_log.append({
            'fold': fold + 1,
            'epoch': epoch + 1,
            'train_loss': avg_train_loss,
            'val_auc': auc
        })

        # Early stopping
        if auc > best_auc:
            best_auc = auc
            best_val_preds = val_preds  # keep in step with the checkpoint saved below
            patience = 0
            torch.save(model.state_dict(), BASE_DIR + f'model/mlp_fold{fold+1}.pt')
            print(f"Fold {fold+1} 모델 저장 (Best AUC: {best_auc:.4f})")
        else:
            patience += 1
            if patience >= early_stopping_rounds:
                print(f"Early Stopping (patience {early_stopping_rounds} 도달)")
                break

    # Collect this fold's log records
    all_logs.extend(fold_log)

    # Fold OOF: store the predictions of the best epoch, i.e. the ones produced
    # by the weights that were saved. The last epoch's predictions come from a
    # model that has already trained past the best point when early stopping
    # fires. Fall back to the last epoch only if no epoch ever improved.
    oof_preds[val_idx] = val_preds if best_val_preds is None else best_val_preds




=== Fold 1/5 학습 시작 ===
Fold 1 | Epoch 1 | Train Loss: 0.2381 | Val AUC: 0.6983
✅ Fold 1 모델 저장 (Best AUC: 0.6983)
Fold 1 | Epoch 2 | Train Loss: 0.2270 | Val AUC: 0.6976
Fold 1 | Epoch 3 | Train Loss: 0.2226 | Val AUC: 0.7072
✅ Fold 1 모델 저장 (Best AUC: 0.7072)
Fold 1 | Epoch 4 | Train Loss: 0.2192 | Val AUC: 0.7101
✅ Fold 1 모델 저장 (Best AUC: 0.7101)
Fold 1 | Epoch 5 | Train Loss: 0.2162 | Val AUC: 0.7080
Fold 1 | Epoch 6 | Train Loss: 0.2130 | Val AUC: 0.7056
Fold 1 | Epoch 7 | Train Loss: 0.2096 | Val AUC: 0.7050
❌ Early Stopping (patience 3 도달)

=== Fold 2/5 학습 시작 ===
Fold 2 | Epoch 1 | Train Loss: 0.2375 | Val AUC: 0.7044
✅ Fold 2 모델 저장 (Best AUC: 0.7044)
Fold 2 | Epoch 2 | Train Loss: 0.2251 | Val AUC: 0.7057
✅ Fold 2 모델 저장 (Best AUC: 0.7057)
Fold 2 | Epoch 3 | Train Loss: 0.2209 | Val AUC: 0.7126
✅ Fold 2 모델 저장 (Best AUC: 0.7126)
Fold 2 | Epoch 4 | Train Loss: 0.2177 | Val AUC: 0.7107
Fold 2 | Epoch 5 | Train Loss: 0.2147 | Val AUC: 0.7112
Fold 2 | Epoch 6 | Train Loss: 0.2117 | Val

In [None]:
# 6. Overall OOF AUC
final_auc = roc_auc_score(y_train, oof_preds)
print(f"\n전체 OOF AUC: {final_auc:.4f}")

# 7. Save the full training log as CSV
log_df = pd.DataFrame(all_logs)
log_path = BASE_DIR + 'logs/mlp_training_log_concat.csv'
# DataFrame.to_csv does not create missing directories.
os.makedirs(os.path.dirname(log_path), exist_ok=True)
log_df.to_csv(log_path, index=False, encoding='utf-8')
# Report the path that was actually written; the message previously named
# 'mlp_training_log.csv', a file this cell never creates.
print(f"전체 학습 로그 저장 완료: {log_path}")


✅ 전체 OOF AUC: 0.7066
✅ 전체 학습 로그 저장 완료: /content/drive/MyDrive/Dacon_FakeText/logs/mlp_training_log.csv
