# GPU Setup for Remote Server

**Optimized for remote execution** - Sets CUDA_VISIBLE_DEVICES before importing torch to prevent hanging.

In [2]:
# CRITICAL: Set CUDA_VISIBLE_DEVICES BEFORE importing torch
# This prevents PyTorch from hanging on remote servers
import os
import subprocess

# Find free GPUs (less than 5% memory used)
def find_free_gpus():
    try:
        result = subprocess.run(
            ['nvidia-smi', '--query-gpu=index,memory.used,memory.total', '--format=csv,noheader,nounits'],
            capture_output=True, text=True, check=True, timeout=5
        )
        free_gpus = []
        for line in result.stdout.strip().split('\n'):
            if line.strip():
                parts = [p.strip() for p in line.split(',')]
                gpu_id = int(parts[0])
                used = int(parts[1])
                total = int(parts[2])
                if (used / total) * 100 < 5:
                    free_gpus.append(gpu_id)
        return free_gpus
    except:
        return [1, 2, 3, 5]  # Default safe GPUs

free_gpus = find_free_gpus()
os.environ['CUDA_VISIBLE_DEVICES'] = ','.join(map(str, free_gpus))
print(f"✓ Set CUDA_VISIBLE_DEVICES={os.environ['CUDA_VISIBLE_DEVICES']}")
print(f"✓ Free GPUs: {free_gpus}")

✓ Set CUDA_VISIBLE_DEVICES=1,2,3,5
✓ Free GPUs: [1, 2, 3, 5]


In [3]:
# Now import torch (it will only see the GPUs in CUDA_VISIBLE_DEVICES)
import torch
import sys

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

if torch.cuda.is_available():
    print(f"CUDA version: {torch.version.cuda}")
    print(f"Visible GPUs: {torch.cuda.device_count()}")
    
    # Test GPU access (with timeout protection)
    try:
        for i in range(torch.cuda.device_count()):
            name = torch.cuda.get_device_name(i)
            print(f"  GPU {i}: {name}")
    except Exception as e:
        print(f"  Warning: {e}")
else:
    print("CUDA not available")

PyTorch version: 2.5.1+cu121
CUDA available: True
CUDA version: 12.1
Visible GPUs: 4
  GPU 0: NVIDIA A100-PCIE-40GB
  GPU 1: NVIDIA A100-PCIE-40GB
  GPU 2: NVIDIA A100-PCIE-40GB
  GPU 3: NVIDIA A100-PCIE-40GB


In [4]:
# Set default device and test computation
if torch.cuda.is_available():
    device = torch.device("cuda:0")
    print(f"Using device: {device}")
    
    # Simple GPU test
    try:
        x = torch.randn(100, 100, device=device)
        y = torch.randn(100, 100, device=device)
        z = torch.matmul(x, y)
        print(f"✓ GPU computation successful!")
        print(f"  Result shape: {z.shape}, device: {z.device}")
    except Exception as e:
        print(f"✗ GPU test failed: {e}")
else:
    device = torch.device("cpu")
    print("Using CPU")

Using device: cuda:0
✓ GPU computation successful!
  Result shape: torch.Size([100, 100]), device: cuda:0


In [5]:
# Multi-GPU example
if torch.cuda.is_available() and torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs")
    
    class SimpleModel(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.linear = torch.nn.Linear(100, 100)
        def forward(self, x):
            return self.linear(x)
    
    model = SimpleModel()
    model = torch.nn.DataParallel(model)
    model = model.to(device)
    
    test_input = torch.randn(10, 100).to(device)
    output = model(test_input)
    print(f"✓ Multi-GPU test successful! Output: {output.shape}")
elif torch.cuda.is_available():
    print("Single GPU mode")
else:
    print("CUDA not available")

Using 4 GPUs


RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
