In [1]:
# Cell 0: 모듈 import를 위한 경로 설정
import os, sys
sys.path.append(os.path.abspath(".."))  # shared, models 디렉토리 접근 가능하도록 경로 추가


In [2]:
# Cell 1: 환경 확인
import torch

print(f"✅ PyTorch version: {torch.__version__}")
print(f"🚀 GPU available: {torch.cuda.is_available()}")

if torch.cuda.is_available():
    device = torch.device("cuda:0")
    print(f"🧠 GPU name: {torch.cuda.get_device_name(0)}")
    
    total_memory = torch.cuda.get_device_properties(device).total_memory / 1024**3  # GiB
    reserved_memory = torch.cuda.memory_reserved(device) / 1024**3  # GiB
    allocated_memory = torch.cuda.memory_allocated(device) / 1024**3  # GiB
    free_memory = reserved_memory - allocated_memory  # GiB

    print(f"💾 Total memory: {total_memory:.2f} GiB")
    print(f"📦 Reserved memory: {reserved_memory:.2f} GiB")
    print(f"📈 Allocated memory: {allocated_memory:.2f} GiB")
    print(f"🟢 Free memory in reserved: {free_memory:.2f} GiB")


✅ PyTorch version: 2.6.0+cu124
🚀 GPU available: True
🧠 GPU name: Quadro RTX 5000
💾 Total memory: 15.73 GiB
📦 Reserved memory: 0.00 GiB
📈 Allocated memory: 0.00 GiB
🟢 Free memory in reserved: 0.00 GiB


In [3]:
# Cell 2: 데이터셋 로딩
from torch.utils.data import DataLoader
from shared.data_loader import HDF5Dataset
import os

input_dir = "/caefs/data/IllustrisTNG/subcube/input"
output_dir = "/caefs/data/IllustrisTNG/subcube/output"

input_files = sorted([os.path.join(input_dir, f) for f in os.listdir(input_dir) if f.endswith(".h5")])
output_files = sorted([os.path.join(output_dir, f) for f in os.listdir(output_dir) if f.endswith(".h5")])

dataset = HDF5Dataset(input_files, output_files)
loader = DataLoader(dataset, batch_size=2, shuffle=True)

x, y = next(iter(loader))
print(f"✅ Sample loaded: input shape = {x.shape}, output shape = {y.shape}")


2025-06-17 10:18:22,442 | INFO | data_loader | 🔍 Initializing dataset with 12 file pairs.
2025-06-17 10:18:22,468 | INFO | data_loader | 📦 Total samples across all files: 110592


✅ Sample loaded: input shape = torch.Size([2, 1, 60, 60, 60]), output shape = torch.Size([2, 1, 60, 60, 60])


In [4]:
from models.fno.model import FNO
import torch
from torchinfo import summary
import torch.nn as nn

# 디바이스 설정
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# FNO 모델 초기화
model = FNO(
    in_channels=1,
    out_channels=1,
    modes1=32,
    modes2=32,
    modes3=32,
    width=128,
    lifting_channels=128,
    add_grid=True,
    activation=nn.ReLU
).to(device)

model.train()
print("✅ FNO model loaded and set to training mode.")

2025-06-17 10:20:59,963 | INFO | models.fno.model | ✅ FNO model initialized successfully.


✅ FNO model loaded and set to training mode.


In [5]:
summary(model, input_size=(2, 1, 60, 60, 60),
        col_names=["input_size", "output_size", "num_params", "kernel_size"])


2025-06-17 10:21:00,073 | INFO | models.fno.model | 🚀 FNO forward pass started. Input shape: torch.Size([2, 1, 60, 60, 60])
2025-06-17 10:21:00,073 | INFO | models.fno.model | 🌐 Generating coordinate grid with shape: [60, 60, 60]
2025-06-17 10:21:00,078 | INFO | models.fno.model | ✅ Coordinate grid generated.
2025-06-17 10:21:00,087 | INFO | models.fno.model | 🔗 Added grid to input. New shape: torch.Size([2, 4, 60, 60, 60])
2025-06-17 10:21:00,271 | INFO | models.fno.model | 🔁 Passed through Fourier layer 1/4
2025-06-17 10:21:00,273 | INFO | models.fno.model | 🔁 Passed through Fourier layer 2/4
2025-06-17 10:21:00,275 | INFO | models.fno.model | 🔁 Passed through Fourier layer 3/4
2025-06-17 10:21:00,278 | INFO | models.fno.model | 🔁 Passed through Fourier layer 4/4
2025-06-17 10:21:00,624 | INFO | models.fno.model | ✅ Forward pass completed. Output shape: torch.Size([2, 1, 60, 60, 60])


Layer (type:depth-idx)                   Input Shape               Output Shape              Param #                   Kernel Shape
FNO                                      [2, 1, 60, 60, 60]        [2, 1, 60, 60, 60]        --                        --
├─Linear: 1-1                            [432000, 4]               [432000, 64]              320                       --
├─Linear: 1-2                            [432000, 64]              [432000, 128]             8,320                     --
├─ModuleList: 1-3                        --                        --                        --                        --
│    └─SpectralConvolution: 2-1          [2, 128, 60, 60, 60]      [2, 128, 60, 60, 60]      70,528                    --
│    └─SpectralConvolution: 2-2          [2, 128, 60, 60, 60]      [2, 128, 60, 60, 60]      70,528                    --
│    └─SpectralConvolution: 2-3          [2, 128, 60, 60, 60]      [2, 128, 60, 60, 60]      70,528                    --
│    └─Spectra

In [6]:
def test_batch_size(batch_size):
    try:
        loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
        model.eval()
        with torch.no_grad():
            for x, y in loader:
                x, y = x.to(device), y.to(device)
                _ = model(x)
                print(f"✅ Success with batch_size={batch_size}")
                break
    except RuntimeError as e:
        print(f"❌ Failed with batch_size={batch_size}: {str(e).splitlines()[0]}")

for bs in [32, 16, 8, 4, 2, 1]:
    test_batch_size(bs)


2025-06-17 10:21:00,779 | INFO | models.fno.model | 🚀 FNO forward pass started. Input shape: torch.Size([32, 1, 60, 60, 60])
2025-06-17 10:21:00,780 | INFO | models.fno.model | 🌐 Generating coordinate grid with shape: [60, 60, 60]
2025-06-17 10:21:00,781 | INFO | models.fno.model | ✅ Coordinate grid generated.
2025-06-17 10:21:00,781 | INFO | models.fno.model | 🔗 Added grid to input. New shape: torch.Size([32, 4, 60, 60, 60])
2025-06-17 10:21:01,294 | INFO | models.fno.model | 🚀 FNO forward pass started. Input shape: torch.Size([16, 1, 60, 60, 60])
2025-06-17 10:21:01,295 | INFO | models.fno.model | 🌐 Generating coordinate grid with shape: [60, 60, 60]
2025-06-17 10:21:01,296 | INFO | models.fno.model | ✅ Coordinate grid generated.
2025-06-17 10:21:01,296 | INFO | models.fno.model | 🔗 Added grid to input. New shape: torch.Size([16, 4, 60, 60, 60])


❌ Failed with batch_size=32: CUDA out of memory. Tried to allocate 6.59 GiB. GPU 0 has a total capacity of 15.73 GiB of which 504.50 MiB is free. Process 12502 has 26.06 MiB memory in use. Process 133015 has 5.05 GiB memory in use. Including non-PyTorch memory, this process has 10.16 GiB memory in use. Of the allocated memory 9.95 GiB is allocated by PyTorch, and 63.50 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


2025-06-17 10:21:01,669 | INFO | models.fno.model | 🚀 FNO forward pass started. Input shape: torch.Size([8, 1, 60, 60, 60])
2025-06-17 10:21:01,670 | INFO | models.fno.model | 🌐 Generating coordinate grid with shape: [60, 60, 60]
2025-06-17 10:21:01,670 | INFO | models.fno.model | ✅ Coordinate grid generated.
2025-06-17 10:21:01,671 | INFO | models.fno.model | 🔗 Added grid to input. New shape: torch.Size([8, 4, 60, 60, 60])
2025-06-17 10:21:01,751 | INFO | models.fno.model | 🔁 Passed through Fourier layer 1/4
2025-06-17 10:21:01,753 | INFO | models.fno.model | 🔁 Passed through Fourier layer 2/4
2025-06-17 10:21:01,755 | INFO | models.fno.model | 🔁 Passed through Fourier layer 3/4
2025-06-17 10:21:01,757 | INFO | models.fno.model | 🔁 Passed through Fourier layer 4/4
2025-06-17 10:21:01,758 | INFO | models.fno.model | ✅ Forward pass completed. Output shape: torch.Size([8, 1, 60, 60, 60])


❌ Failed with batch_size=16: CUDA out of memory. Tried to allocate 3.30 GiB. GPU 0 has a total capacity of 15.73 GiB of which 504.50 MiB is free. Process 12502 has 26.06 MiB memory in use. Process 133015 has 5.05 GiB memory in use. Including non-PyTorch memory, this process has 10.16 GiB memory in use. Of the allocated memory 8.28 GiB is allocated by PyTorch, and 1.74 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
✅ Success with batch_size=8


2025-06-17 10:21:02,554 | INFO | models.fno.model | 🚀 FNO forward pass started. Input shape: torch.Size([4, 1, 60, 60, 60])
2025-06-17 10:21:02,555 | INFO | models.fno.model | 🌐 Generating coordinate grid with shape: [60, 60, 60]
2025-06-17 10:21:02,556 | INFO | models.fno.model | ✅ Coordinate grid generated.
2025-06-17 10:21:02,556 | INFO | models.fno.model | 🔗 Added grid to input. New shape: torch.Size([4, 4, 60, 60, 60])
2025-06-17 10:21:02,601 | INFO | models.fno.model | 🔁 Passed through Fourier layer 1/4
2025-06-17 10:21:02,604 | INFO | models.fno.model | 🔁 Passed through Fourier layer 2/4
2025-06-17 10:21:02,606 | INFO | models.fno.model | 🔁 Passed through Fourier layer 3/4
2025-06-17 10:21:02,608 | INFO | models.fno.model | 🔁 Passed through Fourier layer 4/4
2025-06-17 10:21:02,609 | INFO | models.fno.model | ✅ Forward pass completed. Output shape: torch.Size([4, 1, 60, 60, 60])


✅ Success with batch_size=4


2025-06-17 10:21:03,166 | INFO | models.fno.model | 🚀 FNO forward pass started. Input shape: torch.Size([2, 1, 60, 60, 60])
2025-06-17 10:21:03,167 | INFO | models.fno.model | 🌐 Generating coordinate grid with shape: [60, 60, 60]
2025-06-17 10:21:03,168 | INFO | models.fno.model | ✅ Coordinate grid generated.
2025-06-17 10:21:03,168 | INFO | models.fno.model | 🔗 Added grid to input. New shape: torch.Size([2, 4, 60, 60, 60])
2025-06-17 10:21:03,171 | INFO | models.fno.model | 🔁 Passed through Fourier layer 1/4
2025-06-17 10:21:03,173 | INFO | models.fno.model | 🔁 Passed through Fourier layer 2/4
2025-06-17 10:21:03,175 | INFO | models.fno.model | 🔁 Passed through Fourier layer 3/4
2025-06-17 10:21:03,177 | INFO | models.fno.model | 🔁 Passed through Fourier layer 4/4
2025-06-17 10:21:03,178 | INFO | models.fno.model | ✅ Forward pass completed. Output shape: torch.Size([2, 1, 60, 60, 60])


✅ Success with batch_size=2


2025-06-17 10:21:03,640 | INFO | models.fno.model | 🚀 FNO forward pass started. Input shape: torch.Size([1, 1, 60, 60, 60])
2025-06-17 10:21:03,641 | INFO | models.fno.model | 🌐 Generating coordinate grid with shape: [60, 60, 60]
2025-06-17 10:21:03,642 | INFO | models.fno.model | ✅ Coordinate grid generated.
2025-06-17 10:21:03,642 | INFO | models.fno.model | 🔗 Added grid to input. New shape: torch.Size([1, 4, 60, 60, 60])
2025-06-17 10:21:03,659 | INFO | models.fno.model | 🔁 Passed through Fourier layer 1/4
2025-06-17 10:21:03,661 | INFO | models.fno.model | 🔁 Passed through Fourier layer 2/4
2025-06-17 10:21:03,663 | INFO | models.fno.model | 🔁 Passed through Fourier layer 3/4
2025-06-17 10:21:03,666 | INFO | models.fno.model | 🔁 Passed through Fourier layer 4/4
2025-06-17 10:21:03,667 | INFO | models.fno.model | ✅ Forward pass completed. Output shape: torch.Size([1, 1, 60, 60, 60])


✅ Success with batch_size=1


In [7]:
from shared.losses import mse_loss, spectral_loss

loss_val = mse_loss(x.to(device), y.to(device))
print(f"✅ MSE Loss on sample batch: {loss_val.item():.4f}")


✅ MSE Loss on sample batch: 16.1741


In [8]:
import torch.optim as optim

optimizer = optim.Adam(model.parameters(), lr=1e-4)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=10)
print("✅ Optimizer and LR scheduler initialized.")


✅ Optimizer and LR scheduler initialized.


In [9]:
from tqdm import tqdm

model.train()
loader = DataLoader(dataset, batch_size=4, shuffle=True)
n_batch = 10

for epoch in range(3):
    total_loss = 0.0
    for i, (inputs, targets) in enumerate(tqdm(loader, desc=f"Epoch {epoch+1}")):
        if i >= n_batch:
            break
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = mse_loss(outputs, targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    scheduler.step()
    print(f"📉 Epoch {epoch+1} Loss: {total_loss / n_batch:.4f} | LR: {scheduler.get_last_lr()[0]:.2e}")


Epoch 1:   0%|          | 0/27648 [00:00<?, ?it/s]2025-06-17 10:21:06,769 | INFO | models.fno.model | 🚀 FNO forward pass started. Input shape: torch.Size([4, 1, 60, 60, 60])
2025-06-17 10:21:06,770 | INFO | models.fno.model | 🌐 Generating coordinate grid with shape: [60, 60, 60]
2025-06-17 10:21:06,771 | INFO | models.fno.model | ✅ Coordinate grid generated.
2025-06-17 10:21:06,772 | INFO | models.fno.model | 🔗 Added grid to input. New shape: torch.Size([4, 4, 60, 60, 60])
2025-06-17 10:21:06,778 | INFO | models.fno.model | 🔁 Passed through Fourier layer 1/4
2025-06-17 10:21:06,783 | INFO | models.fno.model | 🔁 Passed through Fourier layer 2/4
Epoch 1:   0%|          | 0/27648 [00:00<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 844.00 MiB. GPU 0 has a total capacity of 15.73 GiB of which 504.50 MiB is free. Process 12502 has 26.06 MiB memory in use. Process 133015 has 5.05 GiB memory in use. Including non-PyTorch memory, this process has 10.16 GiB memory in use. Of the allocated memory 8.57 GiB is allocated by PyTorch, and 1.44 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# Cell 4: Trainer 설정 (디버깅용)
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import EarlyStopping
from pytorch_lightning.loggers import CSVLogger

trainer = Trainer(
    max_epochs=2,
    accelerator="gpu" if torch.cuda.is_available() else "cpu",
    log_every_n_steps=1,
    enable_progress_bar=True,
    detect_anomaly=True,
    logger=CSVLogger("logs/debug_fno", name="fno_test"),
    callbacks=[
        EarlyStopping(monitor="val_loss", patience=2, mode="min", verbose=True)
    ]
)

In [None]:
save_path = "fno_test_model.pt"
torch.save(model.state_dict(), save_path)
print(f"✅ FNO model saved to {save_path}")

state_dict = torch.load(save_path, map_location='cpu')
print(f"🔍 저장된 키 개수: {len(state_dict)}")
print("예시 키:", list(state_dict.keys())[:5])
