In [None]:
import warnings
warnings.filterwarnings('ignore')

import copy
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision.transforms import (
    Compose, ToPILImage, Resize, ToTensor, Normalize,
    RandomHorizontalFlip, RandomAffine, RandomErasing
)
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm
from torch.optim.lr_scheduler import CosineAnnealingLR

# Hyperparameters
N_EPOCHS = 100     # training epochs per fold
BATCH_SIZE = 16    # mini-batch size for every DataLoader
LR = 1e-3          # base learning rate (same value the optimizer is built with)
N_FOLDS = 10       # number of stratified cross-validation splits
SEED = 42          # random_state handed to the fold splitter

# Load Data
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

# Swap the raw labels for integer class ids; the fitted encoder is kept so
# predictions can be mapped back to the original label values later.
encoder = LabelEncoder().fit(train['label'])
train['label'] = encoder.transform(train['label'])

# Single-channel normalisation statistics shared by both transform pipelines.
mean = std = 0.5

# Augmentation
# Training pipeline: upsample to 64x64, apply random geometric augmentation on
# the PIL image, convert to a tensor, normalise, then randomly erase a patch.
# RandomErasing comes last because it operates on tensors, not PIL images.
train_transform = Compose([
    ToPILImage(),
    Resize(size=(64, 64)),
    RandomHorizontalFlip(p=0.5),
    RandomAffine(degrees=15, translate=(0.1, 0.1), scale=(0.9, 1.1)),
    ToTensor(),
    Normalize(mean=[mean], std=[std]),
    RandomErasing(p=0.3, scale=(0.02, 0.2), ratio=(0.3, 3.3)),
])

# Validation / test pipeline: the same resize and normalisation with no
# randomness, so evaluation is deterministic.
valid_transform = Compose([
    ToPILImage(),
    Resize(size=(64, 64)),
    ToTensor(),
    Normalize(mean=[mean], std=[std]),
])

# Dataset
class CustomDataset(Dataset):
    """Dataset of flattened grayscale images stored one row per sample.

    Args:
        pixel_df: DataFrame whose rows each hold the flattened pixel values
            (0-255) of one image.
        label_df: Optional Series of integer class ids aligned row-for-row
            with ``pixel_df``. When omitted, items are images only.
        transform: Optional callable applied to each image. It receives a
            ``(1, H, W)`` ``torch.uint8`` tensor.
        image_size: ``(height, width)`` every row is reshaped to.

    Returns (per item):
        ``(image, label)`` when labels were given, otherwise ``image``.
        Without a transform the image is a ``(1, H, W)`` float32 tensor
        holding the raw 0-255 values.
    """

    def __init__(self, pixel_df, label_df=None, transform=None, image_size=(32, 32)):
        # Positional indexing is used below, so drop whatever index the
        # caller's slice carried over.
        self.pixel_df = pixel_df.reset_index(drop=True)
        self.label_df = label_df.reset_index(drop=True) if label_df is not None else None
        self.transform = transform
        self.image_size = tuple(image_size)

    def __len__(self):
        return len(self.pixel_df)

    def __getitem__(self, idx):
        pixels = self.pixel_df.iloc[idx].to_numpy().astype(np.uint8)
        image = torch.from_numpy(pixels.reshape(self.image_size)).unsqueeze(0)

        if self.transform is not None:
            # Hand the transform a uint8 tensor. torchvision's ToPILImage
            # treats floating-point tensors as being in [0, 1] and multiplies
            # them by 255, so a float tensor holding 0-255 values would
            # overflow when cast back to uint8.
            image = self.transform(image)
        else:
            # No transform: keep returning float32 with the raw pixel values.
            image = image.to(torch.float32)

        if self.label_df is not None:
            label = torch.tensor(self.label_df.iloc[idx], dtype=torch.long)
            return image, label
        return image

# Label Smoothing Loss
class LabelSmoothingCrossEntropy(nn.Module):
    """Cross-entropy against a smoothed target distribution.

    The true class receives probability ``1 - smoothing`` and the remaining
    ``smoothing`` mass is split evenly over the other classes.

    Args:
        smoothing: Total probability mass moved away from the true class.
    """

    def __init__(self, smoothing=0.1):
        super().__init__()
        self.smoothing = smoothing
        self.confidence = 1.0 - smoothing

    def forward(self, pred, target):
        """Return the mean smoothed cross-entropy over the batch.

        Args:
            pred: Raw logits of shape ``(batch, n_classes)``.
            target: Integer class ids of shape ``(batch,)``.
        """
        n_classes = pred.size(1)
        log_probs = torch.log_softmax(pred, dim=-1)

        # Every wrong class gets an equal share of the smoothing mass ...
        off_target = self.smoothing / (n_classes - 1)
        true_dist = torch.full_like(log_probs, off_target)
        # ... and the true class is overwritten with the confidence value.
        true_dist.scatter_(1, target.unsqueeze(1), self.confidence)

        per_sample = torch.sum(-true_dist * log_probs, dim=-1)
        return per_sample.mean()

class SimpleCNN(nn.Module):
    """Four-stage convolutional classifier for single-channel 64x64 images.

    Each stage is Conv(3x3, padding=1) -> LeakyReLU(0.1) -> [MaxPool(2)] ->
    Dropout. The first three stages halve the spatial size (64 -> 32 -> 16
    -> 8); the last one keeps it at 8x8, which is what the first linear
    layer's ``512 * 8 * 8`` input size relies on. The head outputs 10 logits.
    """

    def __init__(self):
        super().__init__()

        # (in_channels, out_channels, apply_max_pool, dropout_probability)
        stage_specs = (
            (1, 64, True, 0.1),
            (64, 128, True, 0.2),
            (128, 256, True, 0.3),
            (256, 512, False, 0.4),  # kept without MaxPool here
        )

        layers = []
        for in_ch, out_ch, pooled, drop_p in stage_specs:
            layers.append(nn.Conv2d(in_ch, out_ch, 3, padding=1))
            layers.append(nn.LeakyReLU(0.1))
            if pooled:
                layers.append(nn.MaxPool2d(2))
            layers.append(nn.Dropout(drop_p))

        # Classifier head.
        layers.append(nn.Flatten())
        layers.append(nn.Linear(512 * 8 * 8, 512))
        layers.append(nn.LeakyReLU(0.1))
        layers.append(nn.Dropout(0.5))
        layers.append(nn.Linear(512, 10))

        # A single flat Sequential keeps the layer indices (and therefore the
        # state_dict keys) in one ordered namespace under ``net``.
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Map a ``(batch, 1, 64, 64)`` tensor to ``(batch, 10)`` logits."""
        return self.net(x)

# Device
# Pick the compute backend in order of preference: Apple MPS, then CUDA,
# falling back to the CPU when neither accelerator is available.
device = torch.device(
    "mps" if torch.backends.mps.is_available()
    else "cuda" if torch.cuda.is_available()
    else "cpu"
)

# K-Fold
# Stratified K-fold training. For every fold a fresh model is trained for
# N_EPOCHS, the checkpoint with the lowest validation loss is kept, and that
# checkpoint predicts the test set. Per-fold test predictions are collected
# in `all_predictions`; the validation accuracy of each selected checkpoint
# is collected in `all_val_acc`.
skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)
all_val_acc = []
all_predictions = []

# The test data does not depend on the fold, so build its loader once.
# Assumes column 0 of test.csv is the ID and the rest are pixels -- TODO confirm.
test_dataset = CustomDataset(test.iloc[:, 1:], transform=valid_transform)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Assumes the first two columns of train.csv are non-pixel columns (ID and
# label) -- TODO confirm against the file.
pixel_columns = train.iloc[:, 2:]
labels = train['label']  # the encoded labels, same column used for stratification

for fold, (train_idx, valid_idx) in enumerate(skf.split(pixel_columns, labels)):
    print(f'\n--- Fold {fold+1}/{N_FOLDS} ---')

    train_dataset = CustomDataset(pixel_columns.iloc[train_idx], labels.iloc[train_idx], transform=train_transform)
    valid_dataset = CustomDataset(pixel_columns.iloc[valid_idx], labels.iloc[valid_idx], transform=valid_transform)

    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)

    model = SimpleCNN().to(device)
    criterion = LabelSmoothingCrossEntropy(smoothing=0.1)
    optimizer = optim.Adam(model.parameters(), lr=LR)
    scheduler = CosineAnnealingLR(optimizer, T_max=N_EPOCHS, eta_min=1e-5)

    best_loss = float('inf')
    best_acc = float('nan')
    best_model = None
    val_acc = float('nan')  # defined even if the epoch loop never runs

    for epoch in range(N_EPOCHS):
        # ---- train ----
        model.train()
        running_loss = 0.0
        for images, labels_batch in train_loader:
            images, labels_batch = images.to(device), labels_batch.to(device)
            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels_batch)
            loss.backward()
            optimizer.step()
            # loss is a batch mean; weight by batch size for a per-sample average
            running_loss += loss.item() * images.size(0)
        train_loss = running_loss / len(train_loader.dataset)

        # ---- validate ----
        model.eval()
        correct, total, val_loss = 0, 0, 0.0
        with torch.no_grad():
            for images, labels_batch in valid_loader:
                images, labels_batch = images.to(device), labels_batch.to(device)
                outputs = model(images)
                loss = criterion(outputs, labels_batch)
                val_loss += loss.item() * images.size(0)
                preds = outputs.argmax(dim=1)
                correct += (preds == labels_batch).sum().item()
                total += labels_batch.size(0)
        val_loss /= len(valid_loader.dataset)
        val_acc = correct / total
        print(f"Epoch [{epoch+1}/{N_EPOCHS}] Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f}")

        # Checkpoint on validation loss and remember the accuracy that goes
        # with it, so the reported metric describes the model actually used
        # for inference rather than whatever the last epoch produced.
        if val_loss < best_loss:
            best_loss = val_loss
            best_acc = val_acc
            best_model = copy.deepcopy(model)

        scheduler.step()

    if best_model is None:
        # No epoch ever beat the initial `inf` (zero epochs, or a NaN loss):
        # fall back to the final weights instead of crashing at inference.
        best_model = model
        best_acc = val_acc

    all_val_acc.append(best_acc)

    # Inference
    best_model.eval()
    fold_preds = []
    with torch.no_grad():
        for images in test_loader:
            images = images.to(device)
            outputs = best_model(images)
            fold_preds.extend(outputs.argmax(dim=1).cpu().tolist())
    all_predictions.append(fold_preds)

# Majority Voting
# Stack the per-fold prediction lists and transpose, so each row holds every
# fold's vote for one test sample.
final_preds = np.transpose(np.array(all_predictions))
# bincount + argmax selects the most frequent class id for the sample; when
# several ids tie, argmax returns the smallest one.
ensemble_preds = [np.argmax(np.bincount(votes)) for votes in final_preds]

# Save Submission
# Map the voted class ids back to the original label values for the output.
submission = pd.DataFrame({'ID': test['ID']})
submission['label'] = encoder.inverse_transform(ensemble_preds)
submission.to_csv('kfold_simplecnn_submission.csv', index=False)
print("Saved submission as 'kfold_simplecnn_submission.csv'")


# Final Accuracy
# Mean of the per-fold validation accuracies, printed as a percentage.
print(f"\nAverage Validation Accuracy over {N_FOLDS} folds: {np.mean(all_val_acc) * 100:.2f}%")


--- Fold 1/10 ---
Epoch [1/100] Train Loss: 2.2894 | Val Loss: 2.2205 | Val Acc: 0.1688
Epoch [2/100] Train Loss: 2.1909 | Val Loss: 2.3361 | Val Acc: 0.1948
Epoch [3/100] Train Loss: 2.0066 | Val Loss: 2.4348 | Val Acc: 0.2078
Epoch [4/100] Train Loss: 1.9016 | Val Loss: 1.7161 | Val Acc: 0.4545
Epoch [5/100] Train Loss: 1.7673 | Val Loss: 1.4558 | Val Acc: 0.5714
Epoch [6/100] Train Loss: 1.6225 | Val Loss: 1.4931 | Val Acc: 0.5584
Epoch [7/100] Train Loss: 1.5322 | Val Loss: 1.3302 | Val Acc: 0.5974
Epoch [8/100] Train Loss: 1.4601 | Val Loss: 1.3991 | Val Acc: 0.6623
Epoch [9/100] Train Loss: 1.3320 | Val Loss: 1.2420 | Val Acc: 0.7013
Epoch [10/100] Train Loss: 1.3332 | Val Loss: 1.1224 | Val Acc: 0.7922
Epoch [11/100] Train Loss: 1.2437 | Val Loss: 1.2056 | Val Acc: 0.7013
Epoch [12/100] Train Loss: 1.1774 | Val Loss: 1.2340 | Val Acc: 0.7532
Epoch [13/100] Train Loss: 1.1420 | Val Loss: 0.9486 | Val Acc: 0.8442
Epoch [14/100] Train Loss: 1.1720 | Val Loss: 1.0369 | Val Acc: 0.8

KeyboardInterrupt: 