# GPU Setup and Detection Notebook

This notebook detects and configures available GPUs for use with PyTorch and other deep learning frameworks.

## Available GPUs
The system has 8 NVIDIA A100 GPUs (40GB each) available.


In [1]:
# ============================================================================
# CRITICAL: Run this cell FIRST. If you see an error, restart kernel first!
# ============================================================================
import sys

# Safety check: torch must NOT be imported yet
if 'torch' in sys.modules:
    print("\n" + "="*70)
    print("⚠️  ERROR: torch is already imported!")
    print("="*70)
    print("\nYou MUST:")
    print("  1. Restart the kernel (Kernel → Restart Kernel)")
    print("  2. Run this cell FIRST (before any other cells)")
    print("\nCUDA_VISIBLE_DEVICES must be set BEFORE importing torch.")
    print("="*70)
    raise RuntimeError("Please restart kernel and run this cell first!")

# STEP 1: Find free GPUs BEFORE importing torch
import os
import subprocess

def find_free_gpus():
    """Find GPUs with low memory usage and available"""
    try:
        result = subprocess.run(
            ['nvidia-smi', '--query-gpu=index,memory.used,memory.total,utilization.gpu', '--format=csv,noheader,nounits'],
            capture_output=True, text=True, check=True, timeout=5
        )
        free_gpus = []
        for line in result.stdout.strip().split('\n'):
            if line.strip():
                parts = [p.strip() for p in line.split(',')]
                gpu_id = int(parts[0])
                used = int(parts[1])
                total = int(parts[2])
                util = parts[3] if len(parts) > 3 else '0'
                
                usage_pct = (used / total) * 100
                free_mb = total - used
                
                # More conservative: require < 5% usage AND > 35GB free AND not busy
                if usage_pct < 5 and free_mb > 35000 and util != '[N/A]':
                    try:
                        util_val = int(util) if util != '[N/A]' else 100
                        if util_val < 10:  # Less than 10% utilization
                            free_gpus.append(gpu_id)
                    except:
                        pass
        
        # If no GPUs found with strict criteria, be more lenient
        if not free_gpus:
            print("⚠️  No GPUs found with strict criteria, using lenient check...")
            for line in result.stdout.strip().split('\n'):
                if line.strip():
                    parts = [p.strip() for p in line.split(',')]
                    gpu_id = int(parts[0])
                    used = int(parts[1])
                    total = int(parts[2])
                    usage_pct = (used / total) * 100
                    free_mb = total - used
                    if usage_pct < 10 and free_mb > 30000:
                        free_gpus.append(gpu_id)
        
        return free_gpus if free_gpus else [5]  # Default to GPU 5 if none found
    except Exception as e:
        print(f"Warning: Could not check GPU status: {e}")
        return [5]  # Default safe GPU

free_gpus = find_free_gpus()

# STEP 2: Set CUDA_VISIBLE_DEVICES BEFORE importing torch
os.environ['CUDA_VISIBLE_DEVICES'] = ','.join(map(str, free_gpus))
print(f"\n✓ Set CUDA_VISIBLE_DEVICES={os.environ['CUDA_VISIBLE_DEVICES']}")
print(f"✓ Free GPUs detected: {free_gpus} ({len(free_gpus)} GPU{'s' if len(free_gpus) != 1 else ''})")

# STEP 3: NOW import torch (it will only see the free GPUs)
import torch

print(f"\nPyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA version: {torch.version.cuda}")
    print(f"cuDNN version: {torch.backends.cudnn.version()}")
    num_visible = torch.cuda.device_count()
    print(f"✓ Visible GPUs: {num_visible}")
    if num_visible != len(free_gpus):
        print("\n⚠️  WARNING: GPU count mismatch! Restart kernel and run this cell again.")



✓ Set CUDA_VISIBLE_DEVICES=1,3,5
✓ Free GPUs detected: [1, 3, 5] (3 GPUs)

PyTorch version: 2.5.1+cu121
CUDA available: True
CUDA version: 12.1
cuDNN version: 90100
✓ Visible GPUs: 3


In [3]:
# Display detailed GPU information

# Check if CUDA_VISIBLE_DEVICES was set (should be set in Cell 1)
cuda_visible = os.environ.get('CUDA_VISIBLE_DEVICES', '')
if not cuda_visible:
    print("\n" + "="*70)
    print("⚠️  WARNING: CUDA_VISIBLE_DEVICES is not set!")
    print("="*70)
    print("\nYou MUST:")
    print("  1. Restart the kernel (Kernel → Restart Kernel)")
    print("  2. Run Cell 1 FIRST (it sets CUDA_VISIBLE_DEVICES)")
    print("  3. Then run this cell")
    print("Without CUDA_VISIBLE_DEVICES, PyTorch will see all 8 GPUs")
    print("and may encounter initialization errors.")
    print("="*70)
    raise RuntimeError("Please run Cell 1 first after restarting kernel!")

if torch.cuda.is_available():
    print("\n" + "="*60)
    print("GPU Information (Only Free GPUs are Visible)")
    print("="*60)
    
    num_gpus = torch.cuda.device_count()
    print(f"\n✓ Number of visible GPUs: {num_gpus}")
    print(f"✓ CUDA_VISIBLE_DEVICES: {cuda_visible}")
    
    # Verify GPU count matches expected
    expected_gpus = len(cuda_visible.split(','))
    if num_gpus != expected_gpus:
        print(f"\n⚠️  WARNING: Expected {expected_gpus} GPUs but PyTorch sees {num_gpus}")
        print("   This might mean Cell 1 didn't run properly. Restart kernel and run Cell 1 first.")
    
    # Show all GPU status from nvidia-smi
    try:
        result = subprocess.run(
            ['nvidia-smi', '--query-gpu=index,name,memory.used,memory.total,utilization.gpu', '--format=csv,noheader,nounits'],
            capture_output=True, text=True, check=True, timeout=5
        )
        print("\nAll GPU Status (from nvidia-smi):")
        print(result.stdout)
    except Exception as e:
        print(f"Could not get GPU status: {e}")
    
    # Show PyTorch visible GPUs (with error handling)
    print("\nPyTorch Visible GPUs:")
    for i in range(num_gpus):
        try:
            print(f"\n  GPU {i}:")
            print(f"    Name: {torch.cuda.get_device_name(i)}")
            props = torch.cuda.get_device_properties(i)
            print(f"    Memory: {props.total_memory / 1024**3:.2f} GB")
            print(f"    Compute Capability: {props.major}.{props.minor}")
        except Exception as e:
            print(f"    ⚠️  Error accessing GPU {i}: {str(e)[:100]}...")
            print(f"    This GPU may be busy or have initialization issues.")
    
    # Set default device
    device = torch.device("cuda:0")
    print(f"\n✓ Default device set to: {device}")
else:
    print("CUDA is not available.")
    device = torch.device("cpu")



GPU Information (Only Free GPUs are Visible)

✓ Number of visible GPUs: 3
✓ CUDA_VISIBLE_DEVICES: 1,3,5

All GPU Status (from nvidia-smi):
0, NVIDIA A100-PCIE-40GB, 18679, 40960, 0
1, NVIDIA A100-PCIE-40GB, 463, 40960, 0
2, NVIDIA A100-PCIE-40GB, 4, 40960, [N/A]
3, NVIDIA A100-PCIE-40GB, 431, 40960, 0
4, NVIDIA A100-PCIE-40GB, 9496, 40960, 33
5, NVIDIA A100-PCIE-40GB, 4, 40960, 0
6, NVIDIA A100-PCIE-40GB, 40401, 40960, 100
7, NVIDIA A100-PCIE-40GB, 28733, 40960, 0


PyTorch Visible GPUs:

  GPU 0:
    Name: NVIDIA A100-PCIE-40GB
    Memory: 39.49 GB
    Compute Capability: 8.0

  GPU 1:
    Name: NVIDIA A100-PCIE-40GB
    Memory: 39.49 GB
    Compute Capability: 8.0

  GPU 2:
    Name: NVIDIA A100-PCIE-40GB
    Memory: 39.49 GB
    Compute Capability: 8.0

✓ Default device set to: cuda:0


In [4]:
# Set default device (if not already set)
if 'device' not in globals():
    if torch.cuda.is_available():
        device = torch.device("cuda:0")
        print(f"✓ Set device: {device}")
    else:
        device = torch.device("cpu")
        print("CUDA not available, using CPU")
else:
    print(f"Using existing device: {device}")

# Clear GPU cache before testing
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    print("✓ GPU cache cleared")

# Example: GPU testing (using single GPU for reliability)
if torch.cuda.is_available():
    num_gpus = torch.cuda.device_count()
    print(f"\n✓ Available GPUs: {num_gpus}")
    
    # Check memory on each GPU
    print("\nGPU Memory Status:")
    for i in range(num_gpus):
        try:
            props = torch.cuda.get_device_properties(i)
            allocated = torch.cuda.memory_allocated(i) / 1e9
            reserved = torch.cuda.memory_reserved(i) / 1e9
            total = props.total_memory / 1e9
            free = total - reserved
            print(f"  GPU {i}: {free:.1f} GB free (total: {total:.1f} GB, reserved: {reserved:.2f} GB)")
        except Exception as e:
            print(f"  GPU {i}: Error checking memory - {e}")
    
    # Example model (small for testing)
    class SimpleModel(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.linear = torch.nn.Linear(100, 100)  # Small model
        
        def forward(self, x):
            return self.linear(x)
    
    # Use single GPU mode (more reliable than DataParallel)
    print("\n✓ Using single GPU mode for reliability")
    model = SimpleModel().to(device)
    print(f"✓ Model moved to device: {next(model.parameters()).device}")
    
    # Test forward pass
    try:
        test_input = torch.randn(8, 100, device=device)
        output = model(test_input)
        print(f"\n✓ Forward pass successful!")
        print(f"  Input shape: {test_input.shape}")
        print(f"  Output shape: {output.shape}")
        print(f"  Output device: {output.device}")
        
        # Clean up
        del test_input, output
        torch.cuda.empty_cache()
        print("✓ Test tensors cleaned up")
        
        # Optional: Test DataParallel if user wants
        print("\n--- Optional: Testing DataParallel ---")
        if num_gpus > 1:
            try:
                torch.cuda.empty_cache()
                parallel_model = torch.nn.DataParallel(SimpleModel()).to(device)
                test_input = torch.randn(4, 100, device=device)  # Smaller batch for DataParallel
                output = parallel_model(test_input)
                print(f"✓ DataParallel test successful! Output: {output.shape}")
                del parallel_model, test_input, output
                torch.cuda.empty_cache()
            except Exception as e:
                print(f"⚠️  DataParallel test failed: {e}")
                print("  Single GPU mode works fine - use that for your work.")
        else:
            print("  Only 1 GPU available, skipping DataParallel test")
            
    except RuntimeError as e:
        if "out of memory" in str(e).lower():
            print(f"\n✗ Out of memory error: {e}")
            print("\n  Troubleshooting:")
            print("    1. Check GPU status: nvidia-smi")
            print("    2. Some GPUs may appear free but have reserved memory")
            print("    3. Try using a different GPU or reduce batch size")
        else:
            print(f"\n✗ Forward pass failed: {e}")
    except Exception as e:
        print(f"\n✗ Unexpected error: {e}")
else:
    print("CUDA not available - cannot test GPU functionality")


Using existing device: cuda:0
✓ GPU cache cleared

✓ Available GPUs: 3

GPU Memory Status:
  GPU 0: 42.4 GB free (total: 42.4 GB, reserved: 0.00 GB)
  GPU 1: 42.4 GB free (total: 42.4 GB, reserved: 0.00 GB)
  GPU 2: 42.4 GB free (total: 42.4 GB, reserved: 0.00 GB)

✓ Using single GPU mode for reliability
✓ Model moved to device: cuda:0

✓ Forward pass successful!
  Input shape: torch.Size([8, 100])
  Output shape: torch.Size([8, 100])
  Output device: cuda:0
✓ Test tensors cleaned up

--- Optional: Testing DataParallel ---
✓ DataParallel test successful! Output: torch.Size([4, 100])


  return F.linear(input, self.weight, self.bias)


In [5]:
# Example: Setup for multi-GPU training
def setup_multi_gpu(model, device_ids=None):
    """
    Setup model for multi-GPU training
    
    Args:
        model: PyTorch model
        device_ids: List of GPU IDs to use (default: all available)
    
    Returns:
        Model wrapped in DataParallel
    """
    if not torch.cuda.is_available():
        return model.to(torch.device("cpu"))
    
    if device_ids is None:
        device_ids = list(range(torch.cuda.device_count()))
    
    if len(device_ids) > 1:
        model = torch.nn.DataParallel(model, device_ids=device_ids)
        print(f"Model wrapped in DataParallel for GPUs: {device_ids}")
    else:
        model = model.to(torch.device(f"cuda:{device_ids[0]}"))
        print(f"Model moved to GPU {device_ids[0]}")
    
    return model

# Example usage:
# model = YourModel()
# model = setup_multi_gpu(model)  # Use all GPUs
# model = setup_multi_gpu(model, device_ids=[0, 1, 2, 3])  # Use specific GPUs


## Environment Variables

You can control GPU visibility using environment variables:


In [10]:
!pip -q install "transformers>=4.44" accelerate torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu121
!pip -q install pillow opencv-python rapidfuzz tqdm matplotlib

In [None]:
!pip install datasets


In [6]:
# Point to the folder that contains dataset_dict.json (NOT directly to train/)
VQARAD_PATH = "/home/namrah/project/data/VQA_RAD"

# Do the same for SLAKE when you place it (adjust this to your actual path):
SLAKE_PATH  = "/home/namrah/project/data/SLAKE"

OUT_DIR = "/home/namrah/project/data/outputs_cfproxy"
!mkdir -p "$OUT_DIR"

In [9]:
from datasets import load_from_disk

vqa_rad = load_from_disk(VQARAD_PATH)  # returns a DatasetDict because dataset_dict.json exists
print(vqa_rad)                         # Expect: DatasetDict with 'train' and 'test'

# Pick a split
vqa_rad_train = vqa_rad["train"]
vqa_rad_test  = vqa_rad["test"]

# Quick sanity check
row = vqa_rad_test[0]
print(row.keys())       # -> dict_keys(['image','question','answer', ...])
print(row["question"], "->", row["answer"])
row["image"].show()

DatasetDict({
    train: Dataset({
        features: ['image', 'question', 'answer'],
        num_rows: 1793
    })
    test: Dataset({
        features: ['image', 'question', 'answer'],
        num_rows: 451
    })
})
dict_keys(['image', 'question', 'answer'])
is there evidence of an aortic aneurysm? -> yes


In [11]:
from pathlib import Path
Path(OUT_DIR).mkdir(parents=True, exist_ok=True)

from datasets import load_from_disk
vqa = load_from_disk(VQARAD_PATH)
ds_train, ds_test = vqa["train"], vqa["test"]

def iter_samples(ds, n=None):
    m = len(ds) if n is None else min(n, len(ds))
    for i in range(m):
        r = ds[i]
        yield i, r["image"], r["question"], r["answer"]

In [14]:
from transformers import AutoProcessor, AutoModelForImageTextToText
import torch
import os
from huggingface_hub.utils import HfHubHTTPError

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
MODEL_ID = "Qwen/Qwen2-VL-2B-Instruct"

print(f"Loading model: {MODEL_ID}")
print(f"Device: {DEVICE}")

# Method 1: Try loading with error handling for missing optional files
try:
    processor = AutoProcessor.from_pretrained(
        MODEL_ID,
        trust_remote_code=True
    )
    print("✓ Processor loaded successfully")
except (HfHubHTTPError, Exception) as e:
    if "404" in str(e) or "additional_chat_templates" in str(e):
        print(f"⚠️  Warning: Optional file missing ({e}), continuing anyway...")
        # Try loading without the optional file
        try:
            # Set environment variable to skip optional files
            os.environ["HF_HUB_DISABLE_EXPERIMENTAL_WARNING"] = "1"
            processor = AutoProcessor.from_pretrained(
                MODEL_ID,
                trust_remote_code=True,
                local_files_only=False
            )
            print("✓ Processor loaded (ignoring missing optional files)")
        except Exception as e2:
            print(f"✗ Could not load processor: {e2}")
            raise
    else:
        raise

# Load model
try:
    model = AutoModelForImageTextToText.from_pretrained(
        MODEL_ID,
        device_map="auto",
        torch_dtype=torch.float16 if DEVICE=="cuda" else torch.float32,
        trust_remote_code=True
    ).eval()
    print("✓ Model loaded successfully")
    print(f"✓ Model on device: {next(model.parameters()).device}")
except Exception as e:
    print(f"⚠️  Error with device_map='auto': {e}")
    print("Trying manual device placement...")
    model = AutoModelForImageTextToText.from_pretrained(
        MODEL_ID,
        torch_dtype=torch.float16 if DEVICE=="cuda" else torch.float32,
        trust_remote_code=True
    )
    model = model.to(DEVICE).eval()
    print("✓ Model loaded and moved to device")


Loading model: Qwen/Qwen2-VL-2B-Instruct
Device: cuda

Entry Not Found for url: https://huggingface.co/api/models/Qwen/Qwen2-VL-2B-Instruct/tree/main/additional_chat_templates?recursive=false&expand=false.
additional_chat_templates does not exist on "main"), continuing anyway...
✗ Could not load processor: 404 Client Error. (Request ID: Root=1-691b89b2-6e0156535b770da2200d5812;4e982243-0128-425c-8e7c-2b7d756cdc8e)

Entry Not Found for url: https://huggingface.co/api/models/Qwen/Qwen2-VL-2B-Instruct/tree/main/additional_chat_templates?recursive=false&expand=false.
additional_chat_templates does not exist on "main"


RemoteEntryNotFoundError: 404 Client Error. (Request ID: Root=1-691b89b2-6e0156535b770da2200d5812;4e982243-0128-425c-8e7c-2b7d756cdc8e)

Entry Not Found for url: https://huggingface.co/api/models/Qwen/Qwen2-VL-2B-Instruct/tree/main/additional_chat_templates?recursive=false&expand=false.
additional_chat_templates does not exist on "main"

In [None]:
def prompt_think_answer(q):
    return f'''You are a medical VQA assistant. Think carefully but return ONLY valid JSON:
{{"answer":"<short answer>"}}

Rules:
- Do not use code fences or backticks.
- No extra keys or text beyond the JSON.
- If your output is not valid JSON, immediately try again and output ONLY valid JSON.
Question: "{q}"'''

def prompt_caption_reason_answer(q):
    return f'''You are a medical VQA assistant. First DESCRIBE the image, then REASON, then ANSWER.
Return ONLY valid JSON with exactly these keys:
{{
 "caption":"<1-2 precise sentences about visible anatomy/findings>",
 "reasoning":["<step1>","<step2>","<step3>"],
 "boxes":[[x1,y1,x2,y2]],
 "answer":"<short answer>"
}}
Rules:
- Do not use code fences or backticks.
- No extra keys or text beyond the JSON.
- Use integers for box coordinates within image bounds.
- If your output is not valid JSON, immediately try again and output ONLY valid JSON.
- Output at most 1 box tightly enclosing the most diagnostic finding (avoid full-organ boxes).
- The box must be as small as possible while still covering the key evidence.

Question: "{q}"'''

In [15]:
import torch
import gc

# Delete variables
if 'model' in globals():
    del model
if 'processor' in globals():
    del processor

# Clear cache
gc.collect()
torch.cuda.empty_cache()

print("GPU memory freed!")

GPU memory freed!
