In [2]:
# Compare all timings in one place
import tensorflow as tf
import time
import numpy as np

size = 2048
results = {}

# CPU baseline
with tf.device('/CPU:0'):
    a = tf.random.normal((size, size))
    b = tf.random.normal((size, size))
    start = time.time()
    c = tf.matmul(a, b)
    _ = c.numpy()
    cpu_time = time.time() - start
    results['CPU'] = cpu_time
    print(f"CPU matmul time: {cpu_time:.4f} seconds")

# GPU baseline
if tf.config.list_physical_devices('GPU'):
    with tf.device('/GPU:0'):
        a = tf.random.normal((size, size))
        b = tf.random.normal((size, size))
        start = time.time()
        c = tf.matmul(a, b)
        _ = c.numpy()
        gpu_time = time.time() - start
        results['GPU'] = gpu_time
        print(f"GPU matmul time: {gpu_time:.4f} seconds")

# XLA on GPU
if tf.config.list_physical_devices('GPU'):
    @tf.function(jit_compile=True)
    def xla_gpu(a, b):
        return tf.matmul(a, b)
    with tf.device('/GPU:0'):
        a = tf.random.normal((size, size))
        b = tf.random.normal((size, size))
        start = time.time()
        c = xla_gpu(a, b)
        _ = c.numpy()
        xla_gpu_time = time.time() - start
        results['XLA_GPU'] = xla_gpu_time
        print(f"XLA-optimized GPU matmul time: {xla_gpu_time:.4f} seconds")

# XLA on CPU
@tf.function(jit_compile=True)
def xla_cpu(a, b):
    return tf.matmul(a, b)
with tf.device('/CPU:0'):
    a = tf.random.normal((size, size))
    b = tf.random.normal((size, size))
    start = time.time()
    c = xla_cpu(a, b)
    _ = c.numpy()
    xla_cpu_time = time.time() - start
    results['XLA_CPU'] = xla_cpu_time
    print(f"XLA-optimized CPU matmul time: {xla_cpu_time:.4f} seconds")

print("\nSummary:")
for k, v in results.items():
    print(f"{k}: {v:.4f} seconds")

CPU matmul time: 0.0554 seconds
GPU matmul time: 0.0053 seconds
XLA-optimized GPU matmul time: 0.0124 seconds
XLA-optimized CPU matmul time: 0.0633 seconds

Summary:
CPU: 0.0554 seconds
GPU: 0.0053 seconds
XLA_GPU: 0.0124 seconds
XLA_CPU: 0.0633 seconds


# Compilation Overhead and Repeated Benchmarking
This cell will run each operation multiple times to show the effect of TensorFlow's graph/XLA compilation and warm-up. The first run may be slower due to compilation.

In [3]:
import tensorflow as tf
import time
import numpy as np

size = 1024
repeats = 5

# Prepare data
cpu_a = tf.random.normal((size, size))
cpu_b = tf.random.normal((size, size))

# Only create GPU tensors if GPU is available
if tf.config.list_physical_devices('GPU'):
    gpu_a = tf.random.normal((size, size))
    gpu_b = tf.random.normal((size, size))
else:
    gpu_a = gpu_b = None

# Define XLA function
@tf.function(jit_compile=True)
def xla_matmul(a, b):
    return tf.matmul(a, b)

def bench(fn, a, b, device, label):
    times = []
    for i in range(repeats):
        with tf.device(device):
            start = time.time()
            c = fn(a, b)
            _ = c.numpy()
            elapsed = time.time() - start
            times.append(elapsed)
            print(f"{label} run {i+1}: {elapsed:.4f} seconds")
    print(f"{label} avg: {np.mean(times):.4f} seconds\n")

print("CPU baseline:")
bench(tf.matmul, cpu_a, cpu_b, '/CPU:0', 'CPU')

if tf.config.list_physical_devices('GPU'):
    print("GPU baseline:")
    bench(tf.matmul, gpu_a, gpu_b, '/GPU:0', 'GPU')
    print("XLA-optimized GPU:")
    bench(xla_matmul, gpu_a, gpu_b, '/GPU:0', 'XLA_GPU')
else:
    print("XLA-optimized CPU:")
    bench(xla_matmul, cpu_a, cpu_b, '/CPU:0', 'XLA_CPU')

CPU baseline:
CPU run 1: 0.0171 seconds
CPU run 2: 0.0167 seconds
CPU run 3: 0.0145 seconds
CPU run 4: 0.0134 seconds
CPU run 5: 0.0110 seconds
CPU avg: 0.0145 seconds

GPU baseline:
GPU run 1: 0.0030 seconds
GPU run 2: 0.0019 seconds
GPU run 3: 0.0020 seconds
GPU run 4: 0.0017 seconds
GPU run 5: 0.0017 seconds
GPU avg: 0.0021 seconds

XLA-optimized GPU:
XLA_GPU run 1: 0.0226 seconds
XLA_GPU run 2: 0.0020 seconds
XLA_GPU run 3: 0.0039 seconds
XLA_GPU run 4: 0.0017 seconds
XLA_GPU run 5: 0.0016 seconds
XLA_GPU avg: 0.0064 seconds

