In [1]:
import torch
import torch.nn as nn
import torch.nn.parallel
import numpy as np
import time

class SimpleNet(nn.Module):
    """Small fully connected network: ``Linear`` layers with ``ReLU`` in between.

    With the default arguments the architecture is
    ``Linear(1000, 2000) -> ReLU -> Linear(2000, 500) -> ReLU -> Linear(500, 10)``.

    Args:
        in_features: Size of the last dimension of the input tensor.
        hidden_sizes: Output width of each hidden ``Linear`` layer, in order.
            Every hidden layer is followed by a ``ReLU``. May be empty, in
            which case the network is a single ``Linear`` layer.
        out_features: Size of the last dimension of the output tensor.
    """

    def __init__(self, in_features=1000, hidden_sizes=(2000, 500), out_features=10):
        super().__init__()
        modules = []
        width = in_features
        for hidden in hidden_sizes:
            modules.append(nn.Linear(width, hidden))
            modules.append(nn.ReLU())
            width = hidden
        # Final projection has no activation after it.
        modules.append(nn.Linear(width, out_features))
        # Kept as one ``nn.Sequential`` called ``layers`` so that, for the
        # default sizes, parameter names stay ``layers.0.*``, ``layers.2.*``
        # and ``layers.4.*``.
        self.layers = nn.Sequential(*modules)

    def forward(self, x):
        """Apply the layer stack to ``x`` and return the result."""
        return self.layers(x)

def test_gpu_parallel(num_iterations=10, batch_size=256):
    """Smoke-test training ``SimpleNet`` on all GPUs through ``nn.DataParallel``.

    Prints the visible GPUs, runs ``num_iterations`` Adam/MSE optimisation
    steps on one fixed random batch, then reports the elapsed wall-clock time
    and the per-GPU memory usage. If CUDA is not available, a message is
    printed and the function returns without doing any GPU work.

    Args:
        num_iterations: Number of optimisation steps to run (must be >= 1).
        batch_size: Number of rows in the random input batch (must be >= 1).

    Returns:
        None. All results are reported via ``print``.

    Raises:
        ValueError: If ``num_iterations`` or ``batch_size`` is less than 1.
    """
    if num_iterations < 1:
        raise ValueError(f"num_iterations must be >= 1, got {num_iterations}")
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # Check CUDA availability
    if not torch.cuda.is_available():
        print("CUDA is not available. Please check your PyTorch installation.")
        return

    # Print GPU information
    gpu_count = torch.cuda.device_count()
    print(f"Number of GPUs available: {gpu_count}")
    for i in range(gpu_count):
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")

    def _sync_all_devices():
        # CUDA kernels are launched asynchronously; wait for every device so
        # that wall-clock readings bracket the work that was actually done.
        for device_index in range(gpu_count):
            torch.cuda.synchronize(device_index)

    # Create model and move to GPU
    model = SimpleNet()
    model = nn.DataParallel(model)
    model = model.cuda()
    print("\nModel moved to GPU and wrapped in DataParallel")

    # Create sample input data (shapes match SimpleNet's default 1000 -> 10)
    input_data = torch.randn(batch_size, 1000).cuda()
    target = torch.randn(batch_size, 10).cuda()
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    # Test forward and backward pass
    print("\nTesting forward and backward pass...")
    # Drain pending work (model/data transfers) before starting the clock.
    _sync_all_devices()
    start_time = time.perf_counter()

    # Run for a few iterations to test
    for i in range(num_iterations):
        optimizer.zero_grad()
        output = model(input_data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()

        if i % 2 == 0:
            print(f"Iteration {i}, Loss: {loss.item():.6f}")

    # Without this, the final iteration may still be running on the GPU when
    # the end time is read, under-reporting the elapsed time.
    _sync_all_devices()
    end_time = time.perf_counter()
    print(f"\nTime taken for {num_iterations} iterations: {end_time - start_time:.2f} seconds")

    # Verify GPU memory usage
    print("\nGPU Memory Usage:")
    for i in range(gpu_count):
        memory_allocated = torch.cuda.memory_allocated(i) / 1024**2
        memory_cached = torch.cuda.memory_reserved(i) / 1024**2
        print(f"GPU {i}:")
        print(f"  Allocated: {memory_allocated:.2f} MB")
        print(f"  Cached:    {memory_cached:.2f} MB")

# Entry point: run the DataParallel smoke test when this code is executed
# directly rather than imported as a module.
if __name__ == "__main__":
    test_gpu_parallel()

: 

In [None]:
# Force the 'spawn' start method for torch.multiprocessing, then run the test.
# NOTE(review): nn.DataParallel is documented as single-process, multi-threaded,
# so the start method presumably has no effect on the call below - confirm
# whether this line is still needed.
torch.multiprocessing.set_start_method('spawn', force=True)
test_gpu_parallel()

Number of GPUs available: 8
GPU 0: NVIDIA A16
GPU 1: NVIDIA A16
GPU 2: NVIDIA A16
GPU 3: NVIDIA A16
GPU 4: NVIDIA A16
GPU 5: NVIDIA A16
GPU 6: NVIDIA A16
GPU 7: NVIDIA A16
Current CUDA device: 0

Creating model...
Model created
Model on CPU
Creating input data...
Input data created
Moving model to CUDA...
Model on CUDA
Wrapping in DataParallel...
Model wrapped in DataParallel
Moving data to CUDA...
Data moved to CUDA

Testing forward and backward pass...
Starting iteration 0
Forward pass 0
frostrend:1297655:1297655 [0] NCCL INFO cudaDriverVersion 12060
frostrend:1297655:1297655 [0] NCCL INFO Bootstrap : Using ens97f0:10.10.133.215<0>
frostrend:1297655:1297655 [0] NCCL INFO NET/Plugin : dlerror=libnccl-net.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net.so), using internal implementation
NCCL version 2.20.5+cuda11.8
frostrend:1297655:1297712 [1] NCCL INFO Failed to open libibverbs.so[.1]
frostrend:1297655:1297712 [1] NCCL INFO NET/Socket : Usin