# CV 문서분류 경진대회 - ConvNeXt Base 384 (간결화 버전)
## 5-Fold Cross Validation + Ensemble TTA

**성능 목표**: CV F1 0.95+ 유지

## 1. 환경 설정 및 라이브러리

In [1]:
import os
import time
import random
import copy

import timm
import torch
import cv2
import albumentations as A
import pandas as pd
import numpy as np
import torch.nn as nn
from albumentations.pytorch import ToTensorV2
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch.cuda.amp import autocast, GradScaler

from PIL import Image
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import StratifiedKFold

import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt

# Font setup for plots.
# NOTE(review): the original comment described this as Korean (Hangul) font
# setup, but the family selected is DejaVu Sans — confirm it renders Hangul labels.
plt.rcParams['font.family'] = ['DejaVu Sans']

In [2]:
# Fix every random seed so that runs are repeatable.
SEED = 42
# PYTHONHASHSEED is read at interpreter start-up, so setting it here only
# reaches child interpreters started after this point.
os.environ['PYTHONHASHSEED'] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
# cuDNN autotuning (benchmark=True) may pick different convolution algorithms
# from run to run, which defeats the seeding above. Disable it and request
# deterministic kernels instead (this trades some speed for repeatability).
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

## 2. 설정 및 데이터셋

In [3]:
# Single place for every run setting (device, paths, model, hyper-parameters).
CONFIG = dict(
    device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
    data_path='../data/',
    model_name='convnext_base_384_in22ft1k',
    img_size=384,
    lr=2e-4,
    epochs=20,
    batch_size=32,
    num_workers=32,
    n_folds=5,
    label_smoothing=0.05,
)

# Echo the two settings that matter most when reading the logs.
print(f"Device: {CONFIG['device']}")
print(f"Model: {CONFIG['model_name']}")

Device: cuda
Model: convnext_base_384_in22ft1k


In [4]:
def mixup_data(x, y, alpha=1.0):
    """Blend each sample of a batch with another sample of the same batch (mixup).

    Args:
        x: Input batch tensor; the first dimension is the batch dimension.
        y: Targets aligned with ``x`` along the first dimension.
        alpha: Beta-distribution parameter. When ``alpha <= 0`` no mixing
            happens (``lam`` is 1 and ``mixed_x`` equals ``x``).

    Returns:
        ``(mixed_x, y_a, y_b, lam)`` where ``mixed_x = lam * x + (1 - lam) * x[perm]``,
        ``y_a`` is ``y``, ``y_b`` is ``y[perm]`` and ``lam`` is the mixing weight.
    """
    # Convert to a plain Python number so it multiplies tensors on any device.
    lam = float(np.random.beta(alpha, alpha)) if alpha > 0 else 1

    batch_size = x.size(0)
    # Follow the device of the input instead of assuming CUDA, so the function
    # also works on CPU-only machines.
    index = torch.randperm(batch_size).to(x.device)

    mixed_x = lam * x + (1 - lam) * x[index]
    y_a, y_b = y, y[index]
    return mixed_x, y_a, y_b, lam

class ImageDataset(Dataset):
    """Image/label dataset whose share of hard augmentation grows with the epoch.

    Args:
        data: CSV path or DataFrame; each row is unpacked as ``(file name, target)``.
        path: Directory the file names are joined onto.
        total_epochs: Denominator used to scale the hard-augmentation probability.
        is_train: When False, only the "normal" pipeline is ever applied.
    """

    def __init__(self, data, path, total_epochs=10, is_train=True):
        frame = pd.read_csv(data) if isinstance(data, str) else data
        self.df = frame.values
        self.path = path
        self.is_train = is_train
        self.total_epochs = total_epochs
        self.current_epoch = 0  # advanced through set_epoch()

        self._update_transforms()

    def set_epoch(self, epoch):
        """Record the current epoch and rebuild the pipelines for it."""
        self.current_epoch = epoch
        self._update_transforms()

    def _update_transforms(self):
        """Rebuild both pipelines and recompute the hard-augmentation probability."""
        # Probability of the hard pipeline: 0.2 at epoch 0, growing linearly
        # with current_epoch / total_epochs; always 0 outside training.
        if self.is_train:
            hard_probability = 0.2 + 0.3 * (self.current_epoch / self.total_epochs)
        else:
            hard_probability = 0

        side = CONFIG['img_size']

        def resize_and_pad():
            # Fit the longest side to `side`, then pad to a square.
            return [
                A.LongestMaxSize(max_size=side),
                A.PadIfNeeded(min_height=side, min_width=side, border_mode=0, value=0),
            ]

        def right_angle_rotations():
            return [A.Rotate(limit=[angle, angle], p=1.0) for angle in (90, 180, 270)]

        def normalize_and_tensorize():
            return [
                A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
                ToTensorV2(),
            ]

        # Normal augmentation
        normal_steps = resize_and_pad()
        normal_steps += [
            A.OneOf(right_angle_rotations(), p=0.6),
            A.RandomBrightnessContrast(brightness_limit=0.3, contrast_limit=0.3, p=0.8),
            A.GaussNoise(var_limit=(30.0, 100.0), p=0.7),
            A.HorizontalFlip(p=0.5),
        ]
        normal_steps += normalize_and_tensorize()
        self.normal_aug = A.Compose(normal_steps)

        # Hard augmentation: adds a small free rotation, blur, stronger
        # brightness/contrast and noise, and JPEG compression.
        hard_steps = resize_and_pad()
        hard_steps += [
            A.OneOf(right_angle_rotations() + [A.Rotate(limit=[-15, 15], p=1.0)], p=0.8),
            A.OneOf([
                A.MotionBlur(blur_limit=15, p=1.0),
                A.GaussianBlur(blur_limit=15, p=1.0),
            ], p=0.95),
            A.RandomBrightnessContrast(brightness_limit=0.5, contrast_limit=0.5, p=0.9),
            A.GaussNoise(var_limit=(50.0, 150.0), p=0.8),
            A.JpegCompression(quality_lower=70, quality_upper=100, p=0.5),
            A.HorizontalFlip(p=0.5),
        ]
        hard_steps += normalize_and_tensorize()
        self.hard_aug = A.Compose(hard_steps)

        self.p_hard = hard_probability

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Load one image as RGB, augment it and return ``(image, target)``."""
        name, target = self.df[idx]
        file_path = os.path.join(self.path, name)
        pixels = np.array(Image.open(file_path).convert("RGB"))

        # The random draw only happens in training mode.
        use_hard = self.is_train and random.random() < self.p_hard
        pipeline = self.hard_aug if use_hard else self.normal_aug

        return pipeline(image=pixels)['image'], target

## 3. 학습 및 검증 함수

In [None]:
def train_one_epoch(loader, model, optimizer, loss_fn, device, scaler=None):
    """Run one mixed-precision training pass over ``loader``.

    Args:
        loader: Iterable of ``(image, targets)`` batches.
        model: Model to train; switched to train mode here.
        optimizer: Optimizer holding the model parameters.
        loss_fn: Callable ``loss_fn(preds, targets)`` returning a scalar loss.
        device: Device the batches are moved to.
        scaler: Optional ``GradScaler`` to reuse across epochs so the loss
            scale is not reset every epoch. A fresh one is created when omitted.

    Returns:
        Dict with ``train_loss`` (mean over batches), ``train_acc`` and
        ``train_f1`` (macro). On mixup batches the metrics compare the
        predictions against the original, un-permuted targets.
    """
    if scaler is None:
        scaler = GradScaler()
    model.train()
    total_loss, preds_list, targets_list = 0.0, [], []

    for image, targets in tqdm(loader, desc="Training"):
        image, targets = image.to(device), targets.to(device)

        # Apply mixup to roughly 30% of the batches.
        # The loss is computed inside autocast so that it is evaluated with
        # autocast's precision rules rather than on raw half-precision logits.
        if random.random() < 0.3:
            mixed_x, y_a, y_b, lam = mixup_data(image, targets, alpha=1.0)
            with autocast():
                preds = model(mixed_x)
                loss = lam * loss_fn(preds, y_a) + (1 - lam) * loss_fn(preds, y_b)
        else:
            with autocast():
                preds = model(image)
                loss = loss_fn(preds, targets)

        model.zero_grad(set_to_none=True)
        scaler.scale(loss).backward()
        # Gradients are still multiplied by the loss scale at this point;
        # unscale them first so the clipping threshold of 1.0 applies to the
        # true gradient norm.
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        scaler.step(optimizer)
        scaler.update()

        total_loss += loss.item()
        preds_list.extend(preds.argmax(dim=1).detach().cpu().numpy())
        targets_list.extend(targets.detach().cpu().numpy())

    return {
        # max(1, ...) avoids a division by zero on an empty loader.
        "train_loss": total_loss / max(1, len(loader)),
        "train_acc": accuracy_score(targets_list, preds_list),
        "train_f1": f1_score(targets_list, preds_list, average='macro'),
    }

def validate_one_epoch(loader, model, loss_fn, device):
    """Evaluate ``model`` on ``loader`` without gradient tracking.

    Returns:
        Dict with ``val_loss`` (sample-weighted mean), ``val_acc`` and
        ``val_f1`` (macro).
    """
    model.eval()

    amp_enabled = torch.cuda.is_available()
    loss_sum = 0.0
    seen = 0
    predicted, expected = [], []

    # inference_mode is used here in place of no_grad.
    with torch.inference_mode():
        progress = tqdm(loader, desc="Validating", leave=False)
        for images, targets in progress:
            # non_blocking transfers are requested for both tensors.
            images = images.to(device, non_blocking=True)
            targets = targets.to(device, non_blocking=True)

            # Forward pass and loss both run under autocast (when CUDA exists).
            with torch.cuda.amp.autocast(enabled=amp_enabled):
                logits = model(images)
                loss = loss_fn(logits, targets)

            # Weight the batch loss by its size so the final mean is per sample.
            batch_len = images.size(0)
            loss_sum += loss.item() * batch_len
            seen += batch_len

            # Collect predictions and labels for the metrics computed below.
            predicted += logits.argmax(dim=1).detach().cpu().tolist()
            expected += targets.detach().cpu().tolist()

    return {
        "val_loss": loss_sum / max(1, seen),
        "val_acc": accuracy_score(expected, predicted),
        "val_f1": f1_score(expected, predicted, average="macro"),
    }

## 4. K-Fold Cross Validation

In [6]:
# train_single_fold: updates the training dataset's epoch before every epoch.
def train_single_fold(fold, train_idx, val_idx, train_df):
    """Train one cross-validation fold and return its best checkpoint.

    Args:
        fold: Zero-based fold index (used for logging only).
        train_idx: Positional row indices of ``train_df`` used for training.
        val_idx: Positional row indices of ``train_df`` used for validation.
        train_df: Full training DataFrame.

    Returns:
        ``(best_val_f1, best_model)`` where ``best_model`` is a state dict
        held on the CPU, taken at the epoch with the highest validation F1.
        ``best_model`` is ``None`` only when ``CONFIG['epochs']`` is 0.
    """
    print(f"\n{'='*50}\nFOLD {fold + 1}/{CONFIG['n_folds']}\n{'='*50}")

    # Split the data for this fold.
    train_fold_df = train_df.iloc[train_idx].reset_index(drop=True)
    val_fold_df = train_df.iloc[val_idx].reset_index(drop=True)

    # Datasets and loaders.
    trn_dataset = ImageDataset(train_fold_df, CONFIG['data_path'] + "train/",
                               total_epochs=CONFIG['epochs'], is_train=True)
    val_dataset = ImageDataset(val_fold_df, CONFIG['data_path'] + "train/",
                               total_epochs=CONFIG['epochs'], is_train=False)

    trn_loader = DataLoader(trn_dataset, batch_size=CONFIG['batch_size'], shuffle=True,
                            num_workers=CONFIG['num_workers'], pin_memory=True, drop_last=False)
    val_loader = DataLoader(val_dataset, batch_size=CONFIG['batch_size'], shuffle=False,
                            num_workers=CONFIG['num_workers'], pin_memory=True)

    print(f"Train samples: {len(trn_dataset)}, Validation samples: {len(val_dataset)}")

    # Model, loss, optimizer and learning-rate schedule.
    model = timm.create_model(CONFIG['model_name'], pretrained=True, num_classes=17).to(CONFIG['device'])
    loss_fn = nn.CrossEntropyLoss(label_smoothing=CONFIG['label_smoothing'])
    optimizer = Adam(model.parameters(), lr=CONFIG['lr'])
    scheduler = CosineAnnealingLR(optimizer, T_max=CONFIG['epochs'])

    best_val_f1 = 0.0
    best_model = None

    # Training loop.
    for epoch in range(CONFIG['epochs']):
        # Key step: tell the dataset which epoch it is so it can update its
        # hard-augmentation probability.
        trn_dataset.set_epoch(epoch)

        train_ret = train_one_epoch(trn_loader, model, optimizer, loss_fn, CONFIG['device'])
        val_ret = validate_one_epoch(val_loader, model, loss_fn, CONFIG['device'])
        scheduler.step()

        print(f"Epoch {epoch+1:2d} | Train Loss: {train_ret['train_loss']:.4f} | "
              f"Train F1: {train_ret['train_f1']:.4f} | Val Loss: {val_ret['val_loss']:.4f} | "
              f"Val F1: {val_ret['val_f1']:.4f}")

        # `best_model is None` guarantees a checkpoint exists even if the
        # validation F1 never rises above 0.0.
        if best_model is None or val_ret['val_f1'] > best_val_f1:
            best_val_f1 = val_ret['val_f1']
            snapshot = copy.deepcopy(model.state_dict())
            # Keep the checkpoint on the CPU so finished folds do not occupy
            # GPU memory while later folds train.
            for key in list(snapshot):
                snapshot[key] = snapshot[key].cpu()
            best_model = snapshot

    # Free GPU memory: drop the references first, otherwise empty_cache()
    # cannot release the memory the model and optimizer still hold.
    del model, optimizer, scheduler
    torch.cuda.empty_cache()

    print(f"Fold {fold + 1} Best Validation F1: {best_val_f1:.4f}")
    return best_val_f1, best_model

# Run stratified K-fold training, keeping each fold's best F1 and state dict.
train_df = pd.read_csv(CONFIG['data_path'] + "train.csv")
skf = StratifiedKFold(
    n_splits=CONFIG['n_folds'],
    shuffle=True,
    random_state=SEED,
)

fold_results, fold_models = [], []

print(f"Starting {CONFIG['n_folds']}-Fold Cross Validation...")

fold_splits = skf.split(train_df, train_df['target'])
for fold, (train_idx, val_idx) in enumerate(fold_splits):
    fold_outcome = train_single_fold(fold, train_idx, val_idx, train_df)
    fold_results.append(fold_outcome[0])
    fold_models.append(fold_outcome[1])

# Summary of the per-fold scores.
mean_f1 = np.mean(fold_results)
std_f1 = np.std(fold_results)
print(f"\n{'='*60}\nK-FOLD CROSS VALIDATION RESULTS\n{'='*60}")
for i, f1 in enumerate(fold_results):
    print(f"Fold {i+1}: {f1:.4f}")
print(f"\nMean CV F1: {mean_f1:.4f} ± {std_f1:.4f}")
print(f"Best single fold: {max(fold_results):.4f}")

Starting 5-Fold Cross Validation...

FOLD 1/5
Train samples: 1256, Validation samples: 314


Training: 100%|██████████| 40/40 [00:29<00:00,  1.33it/s]
                                                           

TypeError: tuple indices must be integers or slices, not str

## 5. TTA 추론 및 앙상블

In [None]:
# Test-time augmentation views.
def _make_tta_pipeline(*extra_steps):
    """Build one TTA pipeline: resize, pad, optional extra steps, normalize, tensorize."""
    side = CONFIG['img_size']
    return A.Compose([
        A.LongestMaxSize(max_size=side),
        A.PadIfNeeded(min_height=side, min_width=side, border_mode=0, value=0),
        *extra_steps,
        A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ToTensorV2(),
    ])


tta_transforms = [
    # Original image
    _make_tta_pipeline(),
    # Right-angle rotations
    _make_tta_pipeline(A.Rotate(limit=[90, 90], p=1.0)),
    _make_tta_pipeline(A.Rotate(limit=[180, 180], p=1.0)),
    _make_tta_pipeline(A.Rotate(limit=[-90, -90], p=1.0)),
    # Fixed brightness/contrast adjustment
    _make_tta_pipeline(
        A.RandomBrightnessContrast(brightness_limit=[0.3, 0.3], contrast_limit=[0.3, 0.3], p=1.0),
    ),
]

In [None]:
class TTADataset(Dataset):
    """Dataset that returns every TTA view of an image.

    Args:
        data: CSV path or DataFrame; each row is unpacked as ``(file name, target)``.
        path: Directory the file names are joined onto.
        transforms: Sequence of callables invoked as ``transform(image=array)``
            whose result is indexed with ``'image'``.
    """

    def __init__(self, data, path, transforms):
        self.df = pd.read_csv(data).values if isinstance(data, str) else data.values
        self.path = path
        self.transforms = transforms

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(list of transformed views, target)`` for row ``idx``."""
        name, target = self.df[idx]
        # Force 3-channel RGB, as ImageDataset does: grayscale, palette or
        # RGBA files would otherwise not match the 3-channel normalization.
        img = np.array(Image.open(os.path.join(self.path, name)).convert("RGB"))

        augmented_images = [transform(image=img)['image'] for transform in self.transforms]

        return augmented_images, target

def ensemble_tta_inference(models, loader):
    """Predict classes by averaging softmax outputs over all models and TTA views.

    Args:
        models: Non-empty sequence of models; each is switched to eval mode.
        loader: Iterable of ``(images_list, _)`` batches, where ``images_list``
            holds one batched tensor per TTA view.

    Returns:
        List with one predicted class index per sample, in loader order.

    Raises:
        ValueError: If ``models`` is empty or a batch carries no TTA views.
    """
    if not models:
        raise ValueError("ensemble_tta_inference requires at least one model")

    device = CONFIG['device']
    # Make sure no model is left in training mode during inference.
    for model in models:
        model.eval()

    all_predictions = []

    with torch.no_grad():
        for images_list, _ in tqdm(loader, desc="Ensemble TTA"):
            if not images_list:
                raise ValueError("loader yielded a batch without any TTA view")

            n_votes = len(models) * len(images_list)
            ensemble_probs = None

            # Accumulate the averaged probabilities of every model and view.
            for model in models:
                for images in images_list:
                    logits = model(images.to(device))
                    probs = torch.softmax(logits, dim=1)
                    if ensemble_probs is None:
                        # Size the accumulator from the model output instead
                        # of hard-coding the number of classes.
                        ensemble_probs = torch.zeros_like(probs, dtype=torch.float32)
                    ensemble_probs += probs / n_votes

            final_preds = torch.argmax(ensemble_probs, dim=1)
            all_predictions.extend(final_preds.cpu().numpy())

    return all_predictions

In [None]:
# Rebuild one model per fold from the saved state dicts.
ensemble_models = []
for state_dict in fold_models:
    model = timm.create_model(CONFIG['model_name'], pretrained=False, num_classes=17).to(CONFIG['device'])
    model.load_state_dict(state_dict)
    model.eval()
    ensemble_models.append(model)


def _seed_worker(worker_id):
    """Seed NumPy and ``random`` in a DataLoader worker from its torch seed."""
    worker_seed = torch.initial_seed() % 2**32
    np.random.seed(worker_seed)
    random.seed(worker_seed)


# Generator handed to the DataLoader below. The loader referenced
# `_seed_worker` and `g` without defining them, which raised NameError.
g = torch.Generator()
g.manual_seed(SEED)

# TTA dataset and loader.
tta_dataset = TTADataset(CONFIG['data_path'] + "sample_submission.csv",
                         CONFIG['data_path'] + "test/", tta_transforms)
tta_loader = DataLoader(tta_dataset, batch_size=64, shuffle=False,
                        num_workers=8, pin_memory=True, persistent_workers=True,
                        worker_init_fn=_seed_worker, generator=g)

print(f"Using ensemble of {len(ensemble_models)} fold models for inference")
print(f"TTA Dataset size: {len(tta_dataset)}")

Using ensemble of 5 fold models for inference
TTA Dataset size: 3140


In [None]:
# Run the ensemble TTA inference and time it with wall-clock timestamps.
print("Starting Ensemble TTA inference...")
inference_started_at = time.time()
tta_predictions = ensemble_tta_inference(ensemble_models, tta_loader)
inference_finished_at = time.time()
inference_time = inference_finished_at - inference_started_at

print(f"Inference completed in {inference_time//60:.0f}m {inference_time%60:.0f}s")

Starting Ensemble TTA inference...


Ensemble TTA: 100%|██████████| 50/50 [09:25<00:00, 11.31s/it]

Inference completed in 9m 25s





## 6. 결과 저장 및 검증

In [None]:
# Build the submission frame: rows from the TTA dataset, with the target
# column replaced by the ensemble predictions.
tta_pred_df = pd.DataFrame(tta_dataset.df, columns=['ID', 'target']).assign(
    target=tta_predictions,
)

In [None]:
# Check that the prediction IDs line up with the sample submission.
sample_submission_df = pd.read_csv(CONFIG['data_path'] + "sample_submission.csv")
ids_match = (sample_submission_df['ID'] == tta_pred_df['ID']).all()
try:
    assert ids_match
except AssertionError:
    print("✗ Submission format error")
    raise
else:
    print("✓ Submission format verified")

✓ Submission format verified


In [None]:
# Final save.
submission_path = "../submission/choice.csv"
# Create the output directory first so to_csv does not fail when it is missing.
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
tta_pred_df.to_csv(submission_path, index=False)
print("\n✓ Final predictions saved to choice.csv")
print(f"✓ CV Performance: {mean_f1:.4f} ± {std_f1:.4f}")
print(f"✓ Inference Time: {inference_time//60:.0f}m {inference_time%60:.0f}s")


✓ Final predictions saved to choice.csv
✓ CV Performance: 0.9378 ± 0.0062
✓ Inference Time: 9m 25s


In [None]:
# Preview the first rows of the prediction frame; as the last expression of
# the cell, the head() result is what the notebook displays.
print("\nPrediction sample:")
tta_pred_df.head()


Prediction sample:


Unnamed: 0,ID,target
0,0008fdb22ddce0ce.jpg,2
1,00091bffdffd83de.jpg,12
2,00396fbc1f6cc21d.jpg,5
3,00471f8038d9c4b6.jpg,12
4,00901f504008d884.jpg,2
