In [1]:
# 1. System Information Gathering
# Each `!` line is handed to the system shell by IPython, so the two
# `python -c` commands below start fresh interpreter processes instead of
# running inside the notebook kernel.
# NOTE(review): those processes use whichever `python` the shell finds on PATH
# -- presumably the same environment as the kernel; confirm.
!nvidia-smi  # Check if GPU is recognized at system level
!nvcc --version  # Check CUDA compiler version
!python -c "import tensorflow as tf; print(tf.__version__)"  # TF version
!python -c "import tensorflow as tf; print('Num GPUs Available:', len(tf.config.list_physical_devices('GPU')))"

# 2. Detailed GPU Configuration Check
import tensorflow as tf
import os

def check_gpu_configuration():
    """Print a step-by-step diagnostic of TensorFlow's view of the GPU.

    Four sections are printed, in order:

    1. The physical GPU devices TensorFlow can see.
    2. The ``CUDA_VISIBLE_DEVICES`` and ``XLA_FLAGS`` environment variables.
    3. An eager matmul requested on ``/GPU:0``.
    4. The same matmul wrapped in a ``tf.function`` and requested on ``/GPU:0``.

    For sections 3 and 4 the device of the *result tensor* is inspected before
    success is reported. Requesting ``/GPU:0`` alone proves nothing, because
    TensorFlow's soft device placement can silently run the op on the CPU when
    no GPU is usable.

    Returns:
        None. All findings are printed; failures are reported, not raised.
    """

    def _placed_on_gpu(tensor):
        # An eager tensor's `.device` is a string such as
        # '/job:localhost/replica:0/task:0/device:GPU:0'.
        return 'GPU' in tensor.device

    # Check if TF can see the GPU
    gpus = tf.config.list_physical_devices('GPU')
    print("\n=== GPU Devices ===")
    print(f"Found {len(gpus)} GPU(s)")
    for gpu in gpus:
        print(f"Name: {gpu.name}, Type: {gpu.device_type}")

    # Show the environment variables that influence device visibility
    print("\n=== CUDA Configuration ===")
    print(f"CUDA_VISIBLE_DEVICES: {os.environ.get('CUDA_VISIBLE_DEVICES', 'Not Set')}")
    print(f"XLA_FLAGS: {os.environ.get('XLA_FLAGS', 'Not Set')}")

    # Test GPU memory allocation
    print("\n=== Memory Allocation Test ===")
    try:
        with tf.device('/GPU:0'):
            a = tf.random.normal([1000, 1000])
            b = tf.random.normal([1000, 1000])
            product = tf.matmul(a, b)
        if _placed_on_gpu(product):
            print("Successfully allocated GPU memory and performed computation")
        else:
            # The op ran, but soft placement moved it off the GPU.
            print(f"GPU memory allocation failed: computation was placed on {product.device}")
    except Exception as e:
        print(f"GPU memory allocation failed: {str(e)}")

    # Check if GPU is actually being used
    print("\n=== Device Placement ===")
    print("Checking where operations are being placed...")

    @tf.function
    def test_func():
        with tf.device('/GPU:0'):
            a = tf.random.normal([1000, 1000])
            b = tf.random.normal([1000, 1000])
            return tf.matmul(a, b)

    # Guarded like the allocation test so one failing check still lets the
    # function finish and report.
    try:
        with tf.device('/GPU:0'):
            result = test_func()
        if _placed_on_gpu(result):
            print("Operation successfully ran on GPU")
        else:
            print(f"Operation did NOT run on GPU; it was placed on {result.device}")
    except Exception as e:
        print(f"Device placement test failed: {str(e)}")

# Run all checks: prints the visible GPU devices, the CUDA-related environment
# variables, and the outcome of the eager and tf.function matmul tests.
check_gpu_configuration()

# 3. Environment Variable Check Command
import os
print("=== Environment Variables ===")
print(f"CUDA_HOME: {os.environ.get('CUDA_HOME', 'Not Set')}")
print(f"CUDA_PATH: {os.environ.get('CUDA_PATH', 'Not Set')}")  # Windows often uses this instead
print(f"PATH: {[p for p in os.environ.get('PATH', '').split(';') if 'cuda' in p.lower()]}")

# 4. Test GPU Performance
import tensorflow as tf
import time

def test_gpu_performance(size=5000, iterations=10):
    """Time repeated square matrix multiplications requested on ``/GPU:0``.

    GPU kernels are launched asynchronously, so reading the clock right after
    ``tf.matmul`` returns would mostly measure launch overhead. A single
    element of the last result is therefore copied to the host before each
    clock read; that copy cannot complete until the queued matmuls have.

    Args:
        size: Side length of the two random square matrices. Defaults to 5000.
        iterations: Number of timed multiplications. Defaults to 10.

    Returns:
        None. The elapsed time is printed.
    """
    with tf.device('/GPU:0'):
        # Create large tensors
        a = tf.random.normal([size, size])
        b = tf.random.normal([size, size])

        # Warm-up: absorbs one-time initialisation cost, then waits for it to
        # finish so it cannot leak into the timed region.
        product = tf.matmul(a, b)
        product[0, 0].numpy()

        # Actual test
        start_time = time.perf_counter()
        for _ in range(iterations):
            product = tf.matmul(a, b)
        product[0, 0].numpy()  # block until the queued work is done
        end_time = time.perf_counter()

    print("\n=== GPU Performance Test ===")
    if 'GPU' not in product.device:
        # Soft device placement may have moved the work to the CPU.
        print(f"Warning: computation was placed on {product.device}, not on a GPU")
    print(f"Time taken for {iterations} {size}x{size} matrix multiplications: {end_time - start_time:.2f} seconds")

# Run the matmul timing test and print the elapsed time.
test_gpu_performance()

Sun Nov 17 02:12:37 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 565.72                 Driver Version: 566.14         CUDA Version: 12.7     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3070        On  |   00000000:2B:00.0  On |                  N/A |
| 30%   38C    P8             21W /  220W |     703MiB /   8192MiB |      6%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

2024-11-17 02:12:41.655401: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-17 02:12:41.655457: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-17 02:12:41.655485: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-17 02:12:41.662348: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.



=== GPU Devices ===
Found 1 GPU(s)
Name: /physical_device:GPU:0, Type: GPU

=== CUDA Configuration ===
CUDA_VISIBLE_DEVICES: Not Set
XLA_FLAGS: Not Set

=== Memory Allocation Test ===


2024-11-17 02:12:43.148541: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:880] could not open file to read NUMA node: /sys/bus/pci/devices/0000:2b:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-11-17 02:12:43.152764: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:880] could not open file to read NUMA node: /sys/bus/pci/devices/0000:2b:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-11-17 02:12:43.152803: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:880] could not open file to read NUMA node: /sys/bus/pci/devices/0000:2b:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-11-17 02:12:43.156737: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:880] could not open file to read NUMA node: /sys/bus/pci/devices/0000:2b:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-11-17 02:12:43.156781: I tensorflow/compile

Successfully allocated GPU memory and performed computation

=== Device Placement ===
Checking where operations are being placed...
Operation successfully ran on GPU
=== Environment Variables ===
CUDA_HOME: Not Set
CUDA_PATH: Not Set
PATH: ['/home/megger/miniconda3/envs/pressure_predict/bin:/home/megger/miniconda3/condabin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/usr/lib/wsl/lib:/mnt/c/ProgramData/miniconda3/condabin:/mnt/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v11.2/bin:/mnt/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v11.2/libnvvp:/mnt/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v11.2/lib:/mnt/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v11.2/lib/x64:/mnt/c/Windows/system32:/mnt/c/Program Files/Common Files/Oracle/Java/javapath:/mnt/c/Program Files (x86)/Common Files/Oracle/Java/javapath:/mnt/c/Program Files (x86)/Common Files/Intel/Shared Libraries/redist/intel64/compiler:/mnt/c/Windows:/mnt/c/Windows/Syste

In [2]:
import tensorflow as tf
import os
import sys

def check_gpu_setup():
    """
    Comprehensive check of TensorFlow GPU setup and configuration.
    Prints detailed information about the environment and GPU availability.

    Printed, in order: TensorFlow and Python versions, the physical GPU
    devices, ``CUDA_VISIBLE_DEVICES``, the result of a small matmul requested
    on ``/GPU:0``, the memory info for ``GPU:0``, whether TensorFlow was built
    with CUDA, and per-device details when the API is available.

    The matmul test only reports success when the result tensor actually lives
    on a GPU: with soft device placement the op can run on the CPU without
    raising, which would otherwise look like a pass.

    Returns:
        None. Failures are printed, not raised.
    """
    print("=" * 50)
    print("TensorFlow Version:", tf.__version__)
    print("Python Version:", sys.version)

    # Check if GPU is visible to TensorFlow (queried once, reused below)
    gpus = tf.config.list_physical_devices('GPU')
    print("\nGPU Devices:")
    print(gpus)

    # Check CUDA availability
    print("\nCUDA Visible Devices:", os.environ.get('CUDA_VISIBLE_DEVICES', 'Not Set'))

    # Try to perform a simple GPU operation
    try:
        with tf.device('/GPU:0'):
            a = tf.constant([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
            b = tf.constant([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
            c = tf.matmul(a, b)
        print("\nGPU Test Operation Result:")
        print(c)
        if 'GPU' in c.device:
            print("\nGPU test passed successfully!")
        else:
            print("\nGPU test failed: operation was placed on", c.device)
    except Exception as e:
        print("\nGPU test failed with error:")
        print(str(e))

    # Print GPU memory info if available
    try:
        print("\nGPU Memory Details:")
        print(tf.config.experimental.get_memory_info('GPU:0'))
    except Exception as e:
        # Catch Exception rather than everything, so Ctrl-C still interrupts,
        # and show why the lookup failed.
        print("\nUnable to get GPU memory details")
        print(str(e))

    # Check if CUDA is built with TensorFlow
    print("\nBuilt with CUDA:", tf.test.is_built_with_cuda())

    # get_device_details is not present in every TensorFlow release
    if hasattr(tf.config.experimental, 'get_device_details'):
        for device in gpus:
            details = tf.config.experimental.get_device_details(device)
            print("\nGPU Details:", details)

# Run the check when this code executes as the main module. In a notebook
# kernel __name__ is "__main__", so the check runs whenever the cell is run.
if __name__ == "__main__":
    check_gpu_setup()

TensorFlow Version: 2.14.1
Python Version: 3.9.19 | packaged by conda-forge | (main, Mar 20 2024, 12:50:21) 
[GCC 12.3.0]

GPU Devices:
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

CUDA Visible Devices: Not Set

GPU Test Operation Result:
tf.Tensor(
[[22. 28.]
 [49. 64.]], shape=(2, 2), dtype=float32)

GPU test passed successfully!

GPU Memory Details:
{'current': 2560, 'peak': 400001792}

Built with CUDA: True

GPU Details: {'compute_capability': (8, 6), 'device_name': 'NVIDIA GeForce RTX 3070'}


2024-11-17 02:13:00.214925: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:880] could not open file to read NUMA node: /sys/bus/pci/devices/0000:2b:00.0/numa_node
Your kernel may have been built without NUMA support.


In [1]:
# DELETE SMALL LOG DIRECTORIES

import os
import shutil

def delete_small_directories(root_dir: str, size_threshold: int):
    """
    Traverse a directory and delete all subdirectories smaller than the given size.

    The tree is walked bottom-up, so a parent is measured after its small
    children have already been removed; a directory made up only of small
    subdirectories can therefore end up deleted as well. ``root_dir`` itself
    is never deleted.

    Symbolic links to directories are skipped: ``shutil.rmtree`` refuses to
    operate on them, and measuring them would follow the link outside the tree.
    Files whose size cannot be read (dangling symlinks, files removed during
    the scan) count as 0 bytes instead of aborting the sweep.

    Parameters:
        root_dir (str): Path to the root directory.
        size_threshold (int): Size threshold in bytes. Subdirectories smaller than this will be deleted.
    """

    def _tree_size(top: str) -> int:
        # Sum of the sizes of every file currently found under `top`.
        total = 0
        for current_dir, _, files in os.walk(top):
            for file in files:
                try:
                    total += os.path.getsize(os.path.join(current_dir, file))
                except OSError:
                    # Dangling symlink or file removed mid-scan.
                    continue
        return total

    for dirpath, dirnames, _ in os.walk(root_dir, topdown=False):
        for dirname in dirnames:
            subdir_path = os.path.join(dirpath, dirname)
            if os.path.islink(subdir_path):
                # Leave links alone: neither delete them nor follow them.
                continue
            total_size = _tree_size(subdir_path)
            if total_size < size_threshold:
                shutil.rmtree(subdir_path)
                print(f"Deleted: {subdir_path} (size: {total_size} bytes)")

# Destructive: removes every subdirectory under ./logs whose files total less
# than 6000 bytes. Re-running the cell repeats the sweep on whatever remains.
delete_small_directories("./logs", 6000)

Deleted: ./logs/20241117-183429 (size: 2459 bytes)


In [None]:

    # def consistency_regularization(self, real_images, fake_images):
    #     """Additional regularization term for stability"""
    #     # Add small random perturbation
    #     epsilon = 1e-3
    #     perturbed_real = real_images + tf.random.normal(tf.shape(real_images)) * epsilon
    #     perturbed_fake = fake_images + tf.random.normal(tf.shape(fake_images)) * epsilon
        
    #     # Get discriminator outputs for perturbed images
    #     d_real = self.discriminator([perturbed_real, real_images])
    #     d_fake = self.discriminator([perturbed_fake, fake_images])
        
    #     # Compute consistency loss
    #     consistency_loss = (
    #         tf.reduce_mean(tf.square(d_real - self.discriminator([real_images, real_images]))) +
    #         tf.reduce_mean(tf.square(d_fake - self.discriminator([fake_images, fake_images])))
    #     )
        
    #     return consistency_loss