In [None]:
""" dataset.ipynb """

# import
import glob
import os
import random
import time

import numpy as np
import torch
import torchvision.transforms as transforms
from PIL import Image
from torch.utils.data import Dataset, DataLoader, random_split, Subset
from tqdm import tqdm

In [None]:
# Google Drive
!cp /content/drive/MyDrive/img_align_celeba.zip /content/ # 데이터 저장 위치를 작성

!unzip -q /content/drive/MyDrive/img_align_celeba.zip -d /content/data/ # 데이터 저장 위치 작성

data_dir = '/content/data/img_align_celeba'

## Dataset

In [None]:
# Parameters
seed: int = 42              # random seed used when sampling image indices
sample_size: int = 30_000   # upper bound on how many images are sampled
batch_size: int = 16        # batch size passed to every DataLoader

In [None]:
# Transform definition: center-crop to 178x178, resize to 64x64, convert to a
# tensor, then normalize each of the three channels as (x - 0.5) / 0.5.
transform = transforms.Compose(
    [
        transforms.CenterCrop(size=178),
        transforms.Resize(size=(64, 64)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
    ]
)

# CelebA dataset
class CelebADataset(Dataset):
    """Dataset over the ``*.jpg`` files located directly inside one directory.

    Items are returned in sorted-path order. Each item is the image converted
    to RGB and, when a truthy ``transform`` was supplied, passed through it.
    """

    def __init__(self, img_dir, transform=None):
        """Collect the sorted ``*.jpg`` paths under ``img_dir`` and keep ``transform``."""
        matches = glob.glob(os.path.join(img_dir, "*.jpg"))
        matches.sort()
        self.img_paths = matches
        self.transform = transform

    def __len__(self):
        """Return how many image paths were collected."""
        return len(self.img_paths)

    def __getitem__(self, idx):
        """Open image ``idx`` as RGB and return it, transformed when a transform is set."""
        rgb = Image.open(self.img_paths[idx]).convert("RGB")
        if not self.transform:
            return rgb
        return self.transform(rgb)

# Load the dataset
full_dataset = CelebADataset(data_dir, transform=transform)
if len(full_dataset) == 0:
    # Fail fast with a clear message instead of silently building empty loaders.
    raise FileNotFoundError(f"No .jpg images found in {data_dir!r}")
full_indices = list(range(len(full_dataset)))

# Sample up to `sample_size` images; seeded so the same indices are drawn every run
sampled_count = min(sample_size, len(full_dataset))
random.seed(seed)
sampled_indices = random.sample(full_indices, sampled_count)
np.save('/content/drive/MyDrive/ProGAN/train_indices.npy', sampled_indices)

subset_dataset = Subset(full_dataset, sampled_indices)

# Measure loading speed by iterating once over the whole sampled subset
load_loader = DataLoader(subset_dataset, batch_size=batch_size, shuffle=False, num_workers=4, pin_memory=True)
start = time.time()
# The label reports the real sample count, which may be below `sample_size`.
for _ in tqdm(load_loader, desc=f"Loading {sampled_count} images"):
    pass
print(f"Loaded {sampled_count} images in {(time.time()-start):.2f}s")

# Train/Val split (80% / 20%)
train_img = int(0.8 * sampled_count)
val_img = sampled_count - train_img
# Seed the split with its own generator: random.seed() above does not affect
# torch's RNG, so without this the train/val membership changes on every run.
split_generator = torch.Generator().manual_seed(seed)
train_ds, val_ds = random_split(subset_dataset, [train_img, val_img], generator=split_generator)

# Create the DataLoaders
train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=2, pin_memory=True)
val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False, num_workers=2, pin_memory=True)

# Save the indices that were NOT sampled above so evaluation can use images
# never seen during training/validation - used in evaluate.
# Built in ascending order (a set difference has no guaranteed order), with an
# explicit integer dtype so an empty result is still saved as an index array.
sampled_lookup = set(sampled_indices)
unused_indices = [i for i in full_indices if i not in sampled_lookup]
np.save('/content/drive/MyDrive/ProGAN/unused_indices_for_eval.npy', np.asarray(unused_indices, dtype=np.int64))