In [4]:
!export NCCL_P2P_DISABLE=1
!export NCCL_IB_DISABLE=1
!export CUDA_VISIBLE_DEVICES=0,1,2,3,4


In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

# カスタムデータセットの定義
class RandomDataset(Dataset):
    def __init__(self, input_size, output_size, data_size):
        self.input_size = input_size
        self.output_size = output_size
        self.data_size = data_size
        self.data = torch.randn(data_size, input_size)
        self.labels = torch.randn(data_size, output_size)

    def __len__(self):
        return self.data_size

    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]

# シンプルなモデルの定義
class Model(nn.Module):
    def __init__(self, input_size, output_size):
        super(Model, self).__init__()
        self.fc = nn.Linear(input_size, output_size)

    def forward(self, x):
        return self.fc(x)

# メイン関数
def main():
    input_size = 204800
    output_size = 102400
    data_size = 10000
    batch_size = 256

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    rand_loader = DataLoader(dataset=RandomDataset(input_size, output_size, data_size),
                             batch_size=batch_size, shuffle=True)

    model = Model(input_size, output_size)

    model.to(device)


    if torch.cuda.device_count() > 1:
        print(f"Using {torch.cuda.device_count()} GPUs.")
        #model = nn.DataParallel(model)


    loss_fn = nn.MSELoss()
    optimizer = optim.SGD(model.parameters(), lr=0.001)

    for i, (data, label) in enumerate(rand_loader):
        data, label = data.to(device), label.to(device)

        # フォワードパス
        output = model(data)

        # 損失計算
        loss = loss_fn(output, label)

        # 勾配の更新
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # 損失の出力
        print(f"Batch {i + 1}, Loss: {loss.item()}")

        # GPUメモリ使用量の出力
        allocated = torch.cuda.memory_allocated(device) / (1024 ** 3)  # GB単位
        reserved = torch.cuda.memory_reserved(device) / (1024 ** 3)  # GB単位
        print(f"GPU Memory Allocated: {allocated:.2f} GB")
        print(f"GPU Memory Reserved: {reserved:.2f} GB")

if __name__ == "__main__":
    main()


OutOfMemoryError: CUDA out of memory. Tried to allocate 78.12 GiB. GPU 0 has a total capacity of 15.64 GiB of which 15.38 GiB is free. Including non-PyTorch memory, this process has 242.00 MiB memory in use. Of the allocated memory 16.25 MiB is allocated by PyTorch, and 23.75 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)