In [1]:
# Compare all timings in one place
import tensorflow as tf
import time
import numpy as np

size = 2048
results = {}

# CPU baseline
with tf.device('/CPU:0'):
    a = tf.random.normal((size, size))
    b = tf.random.normal((size, size))
    start = time.time()
    c = tf.matmul(a, b)
    _ = c.numpy()
    cpu_time = time.time() - start
    results['CPU'] = cpu_time
    print(f"CPU matmul time: {cpu_time:.4f} seconds")

# GPU baseline
if tf.config.list_physical_devices('GPU'):
    with tf.device('/GPU:0'):
        a = tf.random.normal((size, size))
        b = tf.random.normal((size, size))
        start = time.time()
        c = tf.matmul(a, b)
        _ = c.numpy()
        gpu_time = time.time() - start
        results['GPU'] = gpu_time
        print(f"GPU matmul time: {gpu_time:.4f} seconds")

# XLA on GPU
if tf.config.list_physical_devices('GPU'):
    @tf.function(jit_compile=True)
    def xla_gpu(a, b):
        return tf.matmul(a, b)
    with tf.device('/GPU:0'):
        a = tf.random.normal((size, size))
        b = tf.random.normal((size, size))
        start = time.time()
        c = xla_gpu(a, b)
        _ = c.numpy()
        xla_gpu_time = time.time() - start
        results['XLA_GPU'] = xla_gpu_time
        print(f"XLA-optimized GPU matmul time: {xla_gpu_time:.4f} seconds")

# XLA on CPU
@tf.function(jit_compile=True)
def xla_cpu(a, b):
    return tf.matmul(a, b)
with tf.device('/CPU:0'):
    a = tf.random.normal((size, size))
    b = tf.random.normal((size, size))
    start = time.time()
    c = xla_cpu(a, b)
    _ = c.numpy()
    xla_cpu_time = time.time() - start
    results['XLA_CPU'] = xla_cpu_time
    print(f"XLA-optimized CPU matmul time: {xla_cpu_time:.4f} seconds")

print("\nSummary:")
for k, v in results.items():
    print(f"{k}: {v:.4f} seconds")

2026-01-11 22:29:20.842987: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2026-01-11 22:29:20.880873: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2026-01-11 22:29:21.944329: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
W0000 00:00:1768170562.623721     788 gpu_device.cc:2431] TensorFlow was not built with CUD

CPU matmul time: 0.1007 seconds


2026-01-11 22:29:23.327428: W tensorflow/compiler/mlir/tools/kernel_gen/tf_gpu_runtime_wrappers.cc:40] 'cuModuleLoadData(&module, data)' failed with 'CUDA_ERROR_INVALID_PTX'

2026-01-11 22:29:23.327447: W tensorflow/compiler/mlir/tools/kernel_gen/tf_gpu_runtime_wrappers.cc:40] 'cuModuleGetFunction(&function, module, kernel_name)' failed with 'CUDA_ERROR_INVALID_HANDLE'

2026-01-11 22:29:23.327453: W tensorflow/core/framework/op_kernel.cc:1842] INTERNAL: 'cuLaunchKernel(function, gridX, gridY, gridZ, blockX, blockY, blockZ, 0, reinterpret_cast<CUstream>(stream), params, nullptr)' failed with 'CUDA_ERROR_INVALID_HANDLE'
2026-01-11 22:29:23.327462: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: INTERNAL: 'cuLaunchKernel(function, gridX, gridY, gridZ, blockX, blockY, blockZ, 0, reinterpret_cast<CUstream>(stream), params, nullptr)' failed with 'CUDA_ERROR_INVALID_HANDLE'


InternalError: {{function_node __wrapped__Mul_device_/job:localhost/replica:0/task:0/device:GPU:0}} 'cuLaunchKernel(function, gridX, gridY, gridZ, blockX, blockY, blockZ, 0, reinterpret_cast<CUstream>(stream), params, nullptr)' failed with 'CUDA_ERROR_INVALID_HANDLE' [Op:Mul] name: 

# Compilation Overhead and Repeated Benchmarking
This cell will run each operation multiple times to show the effect of TensorFlow's graph/XLA compilation and warm-up. The first run may be slower due to compilation.

In [2]:
import tensorflow as tf
import time
import numpy as np

size = 1024
repeats = 5

# Prepare data
cpu_a = tf.random.normal((size, size))
cpu_b = tf.random.normal((size, size))

# Only create GPU tensors if GPU is available
if tf.config.list_physical_devices('GPU'):
    gpu_a = tf.random.normal((size, size))
    gpu_b = tf.random.normal((size, size))
else:
    gpu_a = gpu_b = None

# Define XLA function
@tf.function(jit_compile=True)
def xla_matmul(a, b):
    return tf.matmul(a, b)

def bench(fn, a, b, device, label):
    times = []
    for i in range(repeats):
        with tf.device(device):
            start = time.time()
            c = fn(a, b)
            _ = c.numpy()
            elapsed = time.time() - start
            times.append(elapsed)
            print(f"{label} run {i+1}: {elapsed:.4f} seconds")
    print(f"{label} avg: {np.mean(times):.4f} seconds\n")

print("CPU baseline:")
bench(tf.matmul, cpu_a, cpu_b, '/CPU:0', 'CPU')

if tf.config.list_physical_devices('GPU'):
    print("GPU baseline:")
    bench(tf.matmul, gpu_a, gpu_b, '/GPU:0', 'GPU')
    print("XLA-optimized GPU:")
    bench(xla_matmul, gpu_a, gpu_b, '/GPU:0', 'XLA_GPU')
else:
    print("XLA-optimized CPU:")
    bench(xla_matmul, cpu_a, cpu_b, '/CPU:0', 'XLA_CPU')

CPU baseline:
CPU run 1: 0.0152 seconds
CPU run 2: 0.0138 seconds
CPU run 3: 0.0092 seconds
CPU run 4: 0.0075 seconds
CPU run 5: 0.0091 seconds
CPU avg: 0.0109 seconds

XLA-optimized CPU:
XLA_CPU run 1: 0.0245 seconds
XLA_CPU run 2: 0.0043 seconds
XLA_CPU run 3: 0.0038 seconds
XLA_CPU run 4: 0.0036 seconds
XLA_CPU run 5: 0.0040 seconds
XLA_CPU avg: 0.0080 seconds

