In [1]:
# Compare all timings in one place
import tensorflow as tf
import time
import numpy as np

# Benchmark configuration: every variant below multiplies two (size x size)
# random-normal matrices and stores its elapsed seconds in `results` by label.
size = 2048
results = {}

# CPU baseline: a single eager matmul pinned to the CPU.
with tf.device('/CPU:0'):
    a = tf.random.normal((size, size))
    b = tf.random.normal((size, size))
    # perf_counter() is monotonic and high-resolution, which is what a short
    # interval measurement needs; time.time() follows the adjustable wall clock.
    start = time.perf_counter()
    c = tf.matmul(a, b)
    # .numpy() materializes the result, so the matmul has finished before the
    # clock is stopped.
    _ = c.numpy()
    cpu_time = time.perf_counter() - start
    results['CPU'] = cpu_time
    print(f"CPU matmul time: {cpu_time:.4f} seconds")

# GPU baseline: the same eager matmul, run only when a GPU is visible.
if tf.config.list_physical_devices('GPU'):
    with tf.device('/GPU:0'):
        a = tf.random.normal((size, size))
        b = tf.random.normal((size, size))
        # perf_counter() is monotonic and high-resolution; time.time() follows
        # the adjustable wall clock and is a poor fit for short intervals.
        start = time.perf_counter()
        c = tf.matmul(a, b)
        # .numpy() waits for the result and brings it back to host memory, so
        # the measured time covers the matmul plus that transfer.
        _ = c.numpy()
        gpu_time = time.perf_counter() - start
        results['GPU'] = gpu_time
        print(f"GPU matmul time: {gpu_time:.4f} seconds")

# XLA on GPU: matmul wrapped in a tf.function compiled with XLA.
if tf.config.list_physical_devices('GPU'):
    @tf.function(jit_compile=True)
    def xla_gpu(a, b):
        """Return the matrix product of `a` and `b` (XLA-compiled tf.function)."""
        return tf.matmul(a, b)
    with tf.device('/GPU:0'):
        a = tf.random.normal((size, size))
        b = tf.random.normal((size, size))
        # perf_counter() is monotonic and high-resolution; time.time() follows
        # the adjustable wall clock and is a poor fit for short intervals.
        start = time.perf_counter()
        # NOTE: this is the first call of xla_gpu, so the measurement includes
        # one-time tracing and XLA compilation in addition to the matmul itself.
        c = xla_gpu(a, b)
        # .numpy() waits for the result and brings it back to host memory.
        _ = c.numpy()
        xla_gpu_time = time.perf_counter() - start
        results['XLA_GPU'] = xla_gpu_time
        print(f"XLA-optimized GPU matmul time: {xla_gpu_time:.4f} seconds")

# XLA on CPU: matmul wrapped in a tf.function compiled with XLA.
@tf.function(jit_compile=True)
def xla_cpu(a, b):
    """Return the matrix product of `a` and `b` (XLA-compiled tf.function)."""
    return tf.matmul(a, b)
with tf.device('/CPU:0'):
    a = tf.random.normal((size, size))
    b = tf.random.normal((size, size))
    # perf_counter() is monotonic and high-resolution; time.time() follows the
    # adjustable wall clock and is a poor fit for short intervals.
    start = time.perf_counter()
    # NOTE: this is the first call of xla_cpu, so the measurement includes
    # one-time tracing and XLA compilation in addition to the matmul itself.
    c = xla_cpu(a, b)
    # .numpy() materializes the result, so the work is done before the clock stops.
    _ = c.numpy()
    xla_cpu_time = time.perf_counter() - start
    results['XLA_CPU'] = xla_cpu_time
    print(f"XLA-optimized CPU matmul time: {xla_cpu_time:.4f} seconds")

# Recap: a header followed by one "<label>: <seconds>" line per recorded
# variant, in the order the timings were stored in `results`.
summary_lines = ["\nSummary:"]
summary_lines.extend(f"{k}: {v:.4f} seconds" for k, v in results.items())
print("\n".join(summary_lines))

I0000 00:00:1768159840.824615     119 port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
I0000 00:00:1768159840.865719     119 cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
I0000 00:00:1768159841.963600     119 port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
W0000 00:00:1768159842.836805     154 cuda_executor.cc:1767] Failed to determine cuDNN version (Note that this is expected if the appl

CPU matmul time: 0.0782 seconds
XLA-optimized CPU matmul time: 0.0878 seconds

Summary:
CPU: 0.0782 seconds
XLA_CPU: 0.0878 seconds


# Compilation Overhead and Repeated Benchmarking
The next cell runs each operation several times (`repeats` runs) to show the effect of TensorFlow's graph/XLA compilation and warm-up. The first run is often slower because it includes one-time tracing and compilation; later runs reuse the compiled function.

In [2]:
import tensorflow as tf
import time
import numpy as np

size = 1024
repeats = 5

# Prepare data.
# Without an explicit device scope TensorFlow prefers the GPU whenever one is
# visible, so the "CPU" operands would be created on the GPU and the CPU
# benchmark would also time a cross-device copy. Pin each operand pair to the
# device it is named after.
with tf.device('/CPU:0'):
    cpu_a = tf.random.normal((size, size))
    cpu_b = tf.random.normal((size, size))

# Only create GPU tensors if GPU is available
if tf.config.list_physical_devices('GPU'):
    with tf.device('/GPU:0'):
        gpu_a = tf.random.normal((size, size))
        gpu_b = tf.random.normal((size, size))
else:
    # Placeholders so the names exist either way; never passed to bench()
    # because the GPU runs are guarded by the same availability check.
    gpu_a = gpu_b = None

# Define XLA function
@tf.function(jit_compile=True)
def xla_matmul(a, b):
    """Multiply `a` by `b` inside a tf.function that is compiled with XLA."""
    # tf.linalg.matmul is the same op that tf.matmul aliases.
    product = tf.linalg.matmul(a, b)
    return product

def bench(fn, a, b, device, label, n_runs=None):
    """Time `fn(a, b)` repeatedly under `device`, printing every run and the averages.

    Args:
        fn: Callable invoked as `fn(a, b)`; its result must expose `.numpy()`.
        a: First operand, forwarded to `fn` unchanged.
        b: Second operand, forwarded to `fn` unchanged.
        device: Device string handed to `tf.device`, e.g. '/CPU:0'.
        label: Prefix for every printed line.
        n_runs: Number of timed runs. Defaults to the module-level `repeats`.

    Returns:
        A list with the elapsed seconds of each run, in run order.
    """
    if n_runs is None:
        n_runs = repeats

    times = []
    for i in range(n_runs):
        with tf.device(device):
            # perf_counter() is monotonic and high-resolution; time.time()
            # follows the adjustable wall clock and is too coarse on some
            # platforms for millisecond-scale runs.
            start = time.perf_counter()
            c = fn(a, b)
            # .numpy() materializes the result so the work is finished before
            # the clock is stopped.
            _ = c.numpy()
            elapsed = time.perf_counter() - start
        times.append(elapsed)
        print(f"{label} run {i+1}: {elapsed:.4f} seconds")

    if times:
        print(f"{label} avg: {np.mean(times):.4f} seconds")
        if len(times) > 1:
            # The first run can include one-time tracing/compilation, which
            # skews the plain mean; also report the average without it.
            print(f"{label} avg excluding run 1 (warm-up): {np.mean(times[1:]):.4f} seconds")
    print()
    return times

# CPU variants run on every machine, so the eager-vs-XLA comparison on CPU is
# always available (previously XLA-on-CPU was skipped whenever a GPU existed).
print("CPU baseline:")
bench(tf.matmul, cpu_a, cpu_b, '/CPU:0', 'CPU')

print("XLA-optimized CPU:")
bench(xla_matmul, cpu_a, cpu_b, '/CPU:0', 'XLA_CPU')

# GPU variants are added only when a GPU is visible.
if tf.config.list_physical_devices('GPU'):
    print("GPU baseline:")
    bench(tf.matmul, gpu_a, gpu_b, '/GPU:0', 'GPU')
    print("XLA-optimized GPU:")
    bench(xla_matmul, gpu_a, gpu_b, '/GPU:0', 'XLA_GPU')

CPU baseline:
CPU run 1: 0.0156 seconds
CPU run 2: 0.0178 seconds
CPU run 3: 0.0124 seconds
CPU run 4: 0.0101 seconds
CPU run 5: 0.0110 seconds
CPU avg: 0.0134 seconds

XLA-optimized CPU:
XLA_CPU run 1: 0.0276 seconds
XLA_CPU run 2: 0.0062 seconds
XLA_CPU run 3: 0.0054 seconds
XLA_CPU run 4: 0.0048 seconds
XLA_CPU run 5: 0.0054 seconds
XLA_CPU avg: 0.0099 seconds

