In [4]:
# 1. System Information Gathering
# Each `!` line is passed by IPython to the system shell, so these commands
# run in separate processes rather than inside the notebook kernel.
# NOTE(review): `python` below is whatever the shell's PATH resolves to —
# presumably the same environment as the kernel; confirm if the versions
# reported here disagree with what the kernel itself reports.
!nvidia-smi  # Check if GPU is recognized at system level
!nvcc --version  # Check CUDA compiler version
!python -c "import tensorflow as tf; print(tf.__version__)"  # TF version
!python -c "import tensorflow as tf; print('Num GPUs Available:', len(tf.config.list_physical_devices('GPU')))"

# 2. Detailed GPU Configuration Check
import tensorflow as tf
import os

def check_gpu_configuration():
    """Print a diagnostic report of how TensorFlow sees and uses the GPU.

    The report has four sections:

    1. The physical GPU devices TensorFlow lists.
    2. The ``CUDA_VISIBLE_DEVICES`` and ``XLA_FLAGS`` environment variables.
    3. An eager matrix multiplication requested on ``/GPU:0``.
    4. The same multiplication wrapped in a ``tf.function``.

    For sections 3 and 4 success is only reported after inspecting the
    result tensor's ``.device``: when soft device placement is enabled,
    TensorFlow may run an op on a different device than the one requested,
    so "no exception" alone does not prove the GPU was used.

    Returns:
        None. All results are printed.
    """
    def _report_placement(tensor, success_message):
        # Print the success message only if the tensor really lives on a GPU.
        device = tensor.device
        if 'GPU' in device.upper():
            print(success_message)
        else:
            print(f"WARNING: operation was placed on {device}, not on a GPU")

    # Check if TF can see the GPU
    gpus = tf.config.list_physical_devices('GPU')
    print("\n=== GPU Devices ===")
    print(f"Found {len(gpus)} GPU(s)")
    for gpu in gpus:
        print(f"Name: {gpu.name}, Type: {gpu.device_type}")

    # Check if CUDA is properly linked
    print("\n=== CUDA Configuration ===")
    print(f"CUDA_VISIBLE_DEVICES: {os.environ.get('CUDA_VISIBLE_DEVICES', 'Not Set')}")
    print(f"XLA_FLAGS: {os.environ.get('XLA_FLAGS', 'Not Set')}")

    # Test GPU memory allocation
    print("\n=== Memory Allocation Test ===")
    try:
        with tf.device('/GPU:0'):
            a = tf.random.normal([1000, 1000])
            b = tf.random.normal([1000, 1000])
            c = tf.matmul(a, b)
        _report_placement(c, "Successfully allocated GPU memory and performed computation")
    except Exception as e:
        print(f"GPU memory allocation failed: {str(e)}")

    # Check if GPU is actually being used
    print("\n=== Device Placement ===")
    print("Checking where operations are being placed...")

    @tf.function
    def test_func():
        with tf.device('/GPU:0'):
            a = tf.random.normal([1000, 1000])
            b = tf.random.normal([1000, 1000])
            return tf.matmul(a, b)

    # Guarded like the memory test above so one failing check does not abort
    # the whole report with a traceback.
    try:
        with tf.device('/GPU:0'):
            result = test_func()
        _report_placement(result, "Operation successfully ran on GPU")
    except Exception as e:
        print(f"Device placement check failed: {str(e)}")

# Run all checks: prints the detected GPU devices, the CUDA-related
# environment variables, and the outcome of the two '/GPU:0' test computations.
check_gpu_configuration()

# 3. Environment Variable Check Command
import os

print("=== Environment Variables ===")
print(f"CUDA_HOME: {os.environ.get('CUDA_HOME', 'Not Set')}")
print(f"CUDA_PATH: {os.environ.get('CUDA_PATH', 'Not Set')}")  # Windows often uses this instead

# PATH entries are separated by os.pathsep (':' on POSIX, ';' on Windows).
# Splitting on a hard-coded ';' leaves a POSIX PATH as one unsplit string, so
# the filter would print the entire PATH (or nothing) instead of the CUDA
# entries.
cuda_path_entries = [
    entry
    for entry in os.environ.get('PATH', '').split(os.pathsep)
    if 'cuda' in entry.lower()
]
print(f"PATH: {cuda_path_entries}")

# 4. Test GPU Performance
import tensorflow as tf
import time

def test_gpu_performance(size=5000, iterations=10, device='/GPU:0'):
    """Time repeated square matrix multiplications on the requested device.

    Args:
        size: Side length of the two random square matrices.
        iterations: Number of timed ``tf.matmul`` calls.
        device: Device string passed to ``tf.device``.

    Returns:
        None. The elapsed time and the device that produced the result are
        printed.
    """
    with tf.device(device):
        a = tf.random.normal([size, size])
        b = tf.random.normal([size, size])

        # Warm-up. Reading a scalar back to the host forces the work to
        # finish, so one-time initialisation cost stays out of the timing.
        tf.reduce_sum(tf.matmul(a, b)).numpy()

        # Actual test
        product = None
        start_time = time.perf_counter()  # monotonic, meant for intervals
        for _ in range(iterations):
            product = tf.matmul(a, b)
        if product is not None:
            # tf.matmul can return before the multiplication has finished
            # (the kernel may only have been enqueued). Pulling a scalar
            # derived from the last result back to the host waits for the
            # queued work while copying almost no data.
            tf.reduce_sum(product).numpy()
        end_time = time.perf_counter()

    print(f"\n=== GPU Performance Test ===")
    print(f"Time taken for {iterations} {size}x{size} matrix multiplications: {end_time - start_time:.2f} seconds")
    if product is not None:
        # Shows whether the work really ran on the requested device.
        print(f"Executed on: {product.device}")

# Run the matrix-multiplication timing test and print the elapsed time.
test_gpu_performance()

Thu Dec  5 12:13:40 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 565.72                 Driver Version: 566.14         CUDA Version: 12.7     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3070        On  |   00000000:2B:00.0  On |                  N/A |
| 30%   61C    P2             94W /  132W |    7906MiB /   8192MiB |     13%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
import tensorflow as tf
import os
import sys

def check_gpu_setup():
    """
    Comprehensive check of TensorFlow GPU setup and configuration.

    Prints, in order: the TensorFlow and Python versions, the physical GPU
    devices TensorFlow lists, ``CUDA_VISIBLE_DEVICES``, the result of a small
    matrix multiplication requested on ``/GPU:0``, the memory info for
    ``GPU:0``, whether TensorFlow was built with CUDA, and per-device details
    when ``tf.config.experimental.get_device_details`` exists.

    Returns:
        None. All results are printed.
    """
    print("=" * 50)
    print("TensorFlow Version:", tf.__version__)
    print("Python Version:", sys.version)

    # Check if GPU is visible to TensorFlow (queried once, reused below)
    gpus = tf.config.list_physical_devices('GPU')
    print("\nGPU Devices:")
    print(gpus)

    # Check CUDA availability
    print("\nCUDA Visible Devices:", os.environ.get('CUDA_VISIBLE_DEVICES', 'Not Set'))

    # Try to perform a simple GPU operation
    try:
        with tf.device('/GPU:0'):
            a = tf.constant([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
            b = tf.constant([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
            c = tf.matmul(a, b)
        print("\nGPU Test Operation Result:")
        print(c)
        # A missing exception is not proof of GPU execution: with soft device
        # placement enabled the op may have run elsewhere, so check the
        # device recorded on the result tensor.
        if 'GPU' in c.device.upper():
            print("\nGPU test passed successfully!")
        else:
            print(f"\nGPU test did not run on a GPU (executed on {c.device})")
    except Exception as e:
        print("\nGPU test failed with error:")
        print(str(e))

    # Print GPU memory info if available
    try:
        memory_info = tf.config.experimental.get_memory_info('GPU:0')
        print("\nGPU Memory Details:")
        print(memory_info)
    except Exception as e:
        # Catch Exception rather than everything, so KeyboardInterrupt and
        # SystemExit still propagate, and show why the query failed.
        print(f"\nUnable to get GPU memory details: {e}")

    # Check if CUDA is built with TensorFlow
    print("\nBuilt with CUDA:", tf.test.is_built_with_cuda())

    if hasattr(tf.config.experimental, 'get_device_details'):
        for device in gpus:
            details = tf.config.experimental.get_device_details(device)
            print("\nGPU Details:", details)

# In a notebook kernel __name__ is "__main__", so this guard always passes
# here; it only matters if the cell is copied into an importable module.
if __name__ == "__main__":
    check_gpu_setup()

TensorFlow Version: 2.14.1
Python Version: 3.9.19 | packaged by conda-forge | (main, Mar 20 2024, 12:50:21) 
[GCC 12.3.0]

GPU Devices:
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

CUDA Visible Devices: Not Set

GPU Test Operation Result:
tf.Tensor(
[[22. 28.]
 [49. 64.]], shape=(2, 2), dtype=float32)

GPU test passed successfully!

GPU Memory Details:
{'current': 2560, 'peak': 400001792}

Built with CUDA: True

GPU Details: {'compute_capability': (8, 6), 'device_name': 'NVIDIA GeForce RTX 3070'}


2024-11-17 02:13:00.214925: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:880] could not open file to read NUMA node: /sys/bus/pci/devices/0000:2b:00.0/numa_node
Your kernel may have been built without NUMA support.


In [2]:
# DELETE SMALL LOG DIRECTORIES

import os
import shutil

def delete_small_directories(root_dir: str, size_threshold: int, dry_run: bool = False):
    """
    Traverse a directory and delete all subdirectories smaller than the given size.

    The tree is walked bottom-up, so nested subdirectories are evaluated (and
    possibly removed) before their parents. A directory's size is the sum of
    the sizes of all files beneath it. ``root_dir`` itself is never deleted.

    Parameters:
        root_dir (str): Path to the root directory.
        size_threshold (int): Size threshold in bytes. Subdirectories smaller than this will be deleted.
        dry_run (bool): When True, only report what would be deleted and
            leave the filesystem untouched. Defaults to False.

    Returns:
        None. One line is printed per deleted (or would-be deleted) directory.
    """
    def _file_size(path: str) -> int:
        # Files that cannot be stat'ed (e.g. dangling symlinks) count as
        # 0 bytes instead of aborting the whole clean-up.
        try:
            return os.path.getsize(path)
        except OSError:
            return 0

    def _tree_size(top: str) -> int:
        # Total size in bytes of every file below `top`.
        return sum(
            _file_size(os.path.join(root, name))
            for root, _, files in os.walk(top)
            for name in files
        )

    for dirpath, dirnames, _ in os.walk(root_dir, topdown=False):
        for dirname in dirnames:
            subdir_path = os.path.join(dirpath, dirname)

            # os.walk lists symlinks to directories among dirnames, but
            # shutil.rmtree refuses to operate on a symlink; skip them.
            if os.path.islink(subdir_path):
                continue

            total_size = _tree_size(subdir_path)
            if total_size >= size_threshold:
                continue

            if dry_run:
                print(f"Would delete: {subdir_path} (size: {total_size} bytes)")
            else:
                shutil.rmtree(subdir_path)
                print(f"Deleted: {subdir_path} (size: {total_size} bytes)")

# Destructive: permanently removes every subdirectory under ./logs whose files
# total less than 6000 bytes.
# NOTE(review): 6000 looks like a cut-off for runs that logged almost nothing —
# confirm the intended threshold before re-running.
delete_small_directories("./logs", 6000)

Deleted: ./logs/20241118-162136 (size: 78 bytes)
Deleted: ./logs/20241118-162202 (size: 78 bytes)


In [None]:

    # def consistency_regularization(self, real_images, fake_images):
    #     """Additional regularization term for stability"""
    #     # Add small random perturbation
    #     epsilon = 1e-3
    #     perturbed_real = real_images + tf.random.normal(tf.shape(real_images)) * epsilon
    #     perturbed_fake = fake_images + tf.random.normal(tf.shape(fake_images)) * epsilon
        
    #     # Get discriminator outputs for perturbed images
    #     d_real = self.discriminator([perturbed_real, real_images])
    #     d_fake = self.discriminator([perturbed_fake, fake_images])
        
    #     # Compute consistency loss
    #     consistency_loss = (
    #         tf.reduce_mean(tf.square(d_real - self.discriminator([real_images, real_images]))) +
    #         tf.reduce_mean(tf.square(d_fake - self.discriminator([fake_images, fake_images])))
    #     )
        
    #     return consistency_loss

In [2]:
import tensorflow as tf
import tf2onnx
import onnx

import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

# Create a simple model that maintains input dimensions
def create_mock_model():
    """Build a small channels-first Keras model: Conv -> BN -> ReLU -> Conv -> BN.

    Both convolutions use 3 filters, a 3x3 kernel and 'same' padding, so the
    output keeps the input's channel count and spatial size. Height and width
    are left unspecified in the input spec.

    Returns:
        tf.keras.Model: the uncompiled functional model.
    """
    channel_axis = 1  # channels_first layout puts channels right after batch

    def same_shape_conv():
        # A fresh 3-filter, 3x3, 'same'-padded NCHW convolution layer.
        return tf.keras.layers.Conv2D(
            filters=3,
            kernel_size=3,
            padding='same',
            data_format='channels_first'
        )

    # Layers are created in the same order as before so Keras assigns the
    # same auto-generated layer names.
    net_input = tf.keras.Input(shape=(3, None, None), name='input')

    hidden = same_shape_conv()(net_input)
    hidden = tf.keras.layers.BatchNormalization(axis=channel_axis)(hidden)
    hidden = tf.keras.layers.ReLU()(hidden)
    hidden = same_shape_conv()(hidden)
    net_output = tf.keras.layers.BatchNormalization(axis=channel_axis)(hidden)

    return tf.keras.Model(inputs=net_input, outputs=net_output)

def export_to_onnx(model, save_path, opset=13, input_signature=None):
    """Convert a Keras model to ONNX and write it to ``save_path``.

    Args:
        model: Keras model to convert.
        save_path: Destination file path, passed to tf2onnx as ``output_path``.
        opset: ONNX opset version to target. Defaults to 13.
        input_signature: Optional list of ``tf.TensorSpec`` describing the
            model inputs. When omitted, a single float32 input named
            ``'input'`` with shape ``[None, 3, None, None]`` is used.

    Returns:
        The first element of the tuple returned by
        ``tf2onnx.convert.from_keras`` (the converted ONNX model).
    """
    if input_signature is None:
        # Default signature: dynamic batch and spatial dims, 3 channels (NCHW).
        input_signature = [tf.TensorSpec([None, 3, None, None], tf.float32, name='input')]

    # Convert the model to ONNX; the second tuple element is not needed here.
    onnx_model, _ = tf2onnx.convert.from_keras(
        model,
        input_signature=input_signature,
        opset=opset,
        output_path=save_path
    )
    return onnx_model

# Create and compile the model
model = create_mock_model()
model.compile(
    optimizer='adam',
    loss='mse'
)

# Test the model with sample data
sample_data = tf.random.normal((1, 3, 64, 64))
test_output = model(sample_data)
print(f"Input shape: {sample_data.shape}")
print(f"Output shape: {test_output.shape}")

# Export the model
onnx_path = "mock_model.onnx"
onnx_model = export_to_onnx(model, onnx_path)

# Verify the ONNX model: reload what was written to disk and run the ONNX
# checker so a structurally invalid file fails here instead of downstream.
onnx_model = onnx.load(onnx_path)
onnx.checker.check_model(onnx_model)


def _static_dims(value_info):
    """Return the dims of a graph input/output, with None where dim_value is 0."""
    return [
        dim.dim_value if dim.dim_value != 0 else None
        for dim in value_info.type.tensor_type.shape.dim
    ]


print("\nONNX Model Input/Output Shapes:")
# Loop variables are deliberately not called `input`: in a notebook the name
# would stay bound after the cell and shadow the builtin input() for every
# later cell.
for graph_input in onnx_model.graph.input:
    print(f"Input: {graph_input.name}, Shape: {_static_dims(graph_input)}")
for graph_output in onnx_model.graph.output:
    print(f"Output: {graph_output.name}, Shape: {_static_dims(graph_output)}")

2024-12-05 12:13:26.674414: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:442] Loaded cuDNN version 8700
2024-12-05 12:13:26.998877: I tensorflow/tsl/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2024-12-05 12:13:27.382366: I tensorflow/tsl/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory


Input shape: (1, 3, 64, 64)
Output shape: (1, 3, 64, 64)

ONNX Model Input/Output Shapes:
Input: input, Shape: [None, 3, None, None]
Output: batch_normalization_1, Shape: [None, 3, None, None]


2024-12-05 12:13:27.612493: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:880] could not open file to read NUMA node: /sys/bus/pci/devices/0000:2b:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-12-05 12:13:27.612551: I tensorflow/core/grappler/devices.cc:66] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 1
2024-12-05 12:13:27.612718: I tensorflow/core/grappler/clusters/single_machine.cc:361] Starting new session
2024-12-05 12:13:27.613153: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:880] could not open file to read NUMA node: /sys/bus/pci/devices/0000:2b:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-12-05 12:13:27.613201: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:880] could not open file to read NUMA node: /sys/bus/pci/devices/0000:2b:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-12-05 12:13:27.613261: I tensor