In [1]:
from clf_funcs import setup, PerfCounterCallback, env_builder

import pandas as pd

import tensorflow as tf
from tqdm import trange

# Project-local initialisation imported from clf_funcs; what it configures is
# not visible from this notebook.
setup()

# Suffix of the results file: the save cell reads and writes
# tensorflow-batch-size-comp-{INDEX}.csv.
INDEX = 3

2025-01-20 01:17:06.958789: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-01-20 01:17:33.148693: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-01-20 01:17:33.511485: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-01-

In [2]:
# Column-oriented measurement log: one list per column, later turned into a
# DataFrame. Every timed repetition contributes one entry to each column.
telemetry = {
	column: []
	for column in ('framework', 'model_name', 'rep', 'batch_size', 'elapsed_time')
}

# Untimed iterations before each measurement, and timed iterations per measurement.
warmup_steps, repetitions = 100, 50

# Shared timing callback; the benchmark cell reads its `latency_ref` list.
# NOTE(review): the second argument is presumably that latency list — confirm in clf_funcs.
perf_callback = PerfCounterCallback(None, [])

# Settings handed to env_builder; 'batch_size' is overwritten per iteration
# by the benchmark loop.
config = dict(
	batch_size=1,
	test_batch_size=1,
	inputs=tf.keras.layers.Input(shape=(32, 32, 3)),
)

In [3]:
# The benchmark is split over two notebook runs; `second_run` selects which
# group of models this execution measures (the save cell merges both halves).
second_run = True

if second_run:
	models = ['MobileNet-v2', 'ConvNeXt-Tiny']
else:
	models = ['FullyConnectedNet', 'SimpleConvNet', 'ResNet-50', 'DenseNet-121']

batch_sizes = [1, 16, 32, 64, 96, 128, 192, 256]

# Start from empty columns so that re-executing this cell does not stack a
# second copy of the measurements on top of rows from a previous execution.
for column in telemetry.values():
	column.clear()


def _benchmark(framework, desc, infer, model_name, batch_size, manual_timing=False):
	"""Warm up one inference API, then time it `repetitions` times into `telemetry`.

	Args:
		framework: Label stored in the 'framework' column for every repetition.
		desc: Prefix of the progress-bar text for the timed loop.
		infer: Callable taking the `callbacks` value to use and running one
			inference. It receives None during warmup (and during manually
			timed runs) and [perf_callback] during callback-timed runs.
		model_name: Name stored in the 'model_name' column.
		batch_size: Value stored in the 'batch_size' column.
		manual_timing: Set for APIs that accept no callbacks; the callback
			hooks are then invoked by hand around each call.

	Raises:
		RuntimeError: If the callback did not record exactly one latency per
			repetition, which would leave the telemetry columns misaligned.
	"""
	for _ in trange(warmup_steps, desc=f"Warmup for {model_name} (batch of {batch_size})"):
		infer(None)

	perf_callback.latency_ref.clear()
	for rep in trange(repetitions, desc=f"{desc} for {model_name} (batch of {batch_size})"):
		if manual_timing:
			# predict_on_batch() and __call__ take no `callbacks` argument, so
			# the begin/end hooks are triggered manually around the call.
			perf_callback.on_predict_begin()
			infer(None)
			perf_callback.on_predict_end()
		else:
			infer([perf_callback])

		telemetry['framework'].append(framework)
		telemetry['model_name'].append(model_name)
		telemetry['rep'].append(rep)
		telemetry['batch_size'].append(batch_size)

	# Fail right here rather than much later when the DataFrame is built from
	# columns of unequal length.
	if len(perf_callback.latency_ref) != repetitions:
		raise RuntimeError(
			f"expected {repetitions} latencies for {framework} / {model_name} "
			f"(batch of {batch_size}), got {len(perf_callback.latency_ref)}"
		)
	telemetry['elapsed_time'].extend(perf_callback.latency_ref)


for model_name in models:
	for batch_size in batch_sizes:
		config["batch_size"] = batch_size
		model, ds, _ = env_builder(model_name, config)

		# env_builder returns either a tuple (its first element is sliced to one
		# batch) or an iterable whose first item's first element is used.
		if isinstance(ds, tuple):
			sample = tf.convert_to_tensor(ds[0][:batch_size], dtype=tf.float32)
		else:
			sample = next(iter(ds))[0]

		# model.predict(batch_size=None) =================================================
		_benchmark(
			"TF (batch_size=None)", "Predict without bsize",
			lambda callbacks: model.predict(sample, verbose=0, callbacks=callbacks, batch_size=None),
			model_name, batch_size,
		)

		# model.predict(batch_size=batch_size) ===========================================
		_benchmark(
			"TF (batch_size=N)", "Predict with bsize",
			lambda callbacks: model.predict(sample, verbose=0, callbacks=callbacks, batch_size=batch_size),
			model_name, batch_size,
		)

		# model.predict_on_batch() =======================================================
		_benchmark(
			"TF (predict_on_batch)", "Predict on batch",
			lambda callbacks: model.predict_on_batch(sample),
			model_name, batch_size, manual_timing=True,
		)

		# model(x) =======================================================================
		# NOTE(review): the result tensor is never read back here — confirm that the
		# timing hooks capture the full device execution for this path.
		_benchmark(
			"TF (__call__)", "__call__",
			lambda callbacks: model(sample, training=False),
			model_name, batch_size, manual_timing=True,
		)

		del model

2025-01-20 01:17:34.062456: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-01-20 01:17:34.063054: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-01-20 01:17:34.063657: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysf

In [4]:
# Persist this run's measurements. Both halves of the benchmark share one CSV,
# whose name is derived from INDEX.
results_path = f"../../results_ultimate_0/tensorflow-batch-size-comp-{INDEX}.csv"
results = pd.DataFrame(telemetry)

if second_run:
	# The second run extends the file written by the first run.
	first_part = pd.read_csv(results_path)
	# Discard previously saved rows for models measured again in this run, so
	# re-executing this cell replaces them instead of appending duplicates.
	already_measured = first_part["model_name"].isin(results["model_name"].unique())
	first_part = first_part[~already_measured]
	results = pd.concat([first_part, results], ignore_index=True)

results.to_csv(results_path, index=False)

In [5]:
# Summarise the measurements. Stage-specific names are used instead of
# rebinding `results`, so this cell can be re-executed without first
# re-running the cell that builds `results`.
results_no_rep = results.drop(columns=["rep"])
display(results_no_rep.head())

# Mean of the remaining numeric column(s) per (framework, model, batch size).
results_grouped = results_no_rep.groupby(["framework", "model_name", "batch_size"])
results_grouped.mean().head(15).reset_index()

Unnamed: 0,framework,model_name,batch_size,elapsed_time
0,TF (batch_size=None),FullyConnectedNet,1,13692269
1,TF (batch_size=None),FullyConnectedNet,1,14100517
2,TF (batch_size=None),FullyConnectedNet,1,13857866
3,TF (batch_size=None),FullyConnectedNet,1,13737609
4,TF (batch_size=None),FullyConnectedNet,1,13772256


Unnamed: 0,framework,model_name,batch_size,elapsed_time
0,TF (__call__),ConvNeXt-Tiny,1,104764200.0
1,TF (__call__),ConvNeXt-Tiny,16,102927200.0
2,TF (__call__),ConvNeXt-Tiny,32,103900800.0
3,TF (__call__),ConvNeXt-Tiny,64,103909500.0
4,TF (__call__),ConvNeXt-Tiny,96,118967300.0
5,TF (__call__),ConvNeXt-Tiny,128,104587200.0
6,TF (__call__),ConvNeXt-Tiny,192,104436900.0
7,TF (__call__),ConvNeXt-Tiny,256,103708800.0
8,TF (__call__),DenseNet-121,1,142417000.0
9,TF (__call__),DenseNet-121,16,142080400.0


In [7]:
# NOTE(review): relies on state left behind by the benchmark loop: `model_name`
# is the last model it iterated over and config["batch_size"] the last batch
# size it set. On a fresh kernel this cell fails unless that loop has run first.
model, ds, _ = env_builder(model_name, config)