#### Understanding the TensorFlow Pipeline - Prefetch and Caching Usage for Optimized Data Loading

In [12]:
import tensorflow as tf
import time

Prefetch

In [22]:
# Mock File Reading Function
class FileReader(tf.data.Dataset):
    """Mock file reader that simulates slow, element-by-element file reads.

    Calling ``FileReader(count)`` does not produce a ``FileReader`` instance:
    ``__new__`` returns a generator-backed ``tf.data.Dataset`` whose elements
    are the scalar ``tf.int32`` values ``0 .. count - 1``.
    """

    @staticmethod
    def read_file_in_batches(count):
        """Yield ``0 .. count - 1``, sleeping 0.03 s before each value.

        Declared static because it takes neither ``self`` nor ``cls``; it is
        only handed to ``from_generator`` as a plain callable.
        """
        for i in range(count):
            time.sleep(0.03)  # Simulate file read delay
            yield i

    def __new__(cls, count=6):
        """Build the mock dataset.

        Args:
            count: Number of elements the generator yields per iteration.

        Returns:
            The dataset created by ``tf.data.Dataset.from_generator`` with a
            scalar ``tf.int32`` output signature.
        """
        return tf.data.Dataset.from_generator(
            cls.read_file_in_batches,
            output_signature=tf.TensorSpec(shape=(), dtype=tf.int32),
            args=(count,),
        )

In [23]:
# Benchmarking Function
def benchmark(dataset, epochs=10, step_delay=0.01):
    """Iterate over ``dataset`` for several epochs, simulating a training step.

    Args:
        dataset: Any iterable; it is iterated from the start once per epoch.
        epochs: Number of full passes over ``dataset``.
        step_delay: Seconds to sleep per element to simulate one training
            step. Defaults to 0.01, the value that was previously hard-coded.

    Returns:
        None. The function exists only for its elapsed time.
    """
    for _ in range(epochs):
        for _ in dataset:
            time.sleep(step_delay)  # Simulate training step delay

In [24]:
%%timeit
# Benchmarking without any optimizations (no prefetch, no cache).
# benchmark() uses its default number of epochs over a 5-element FileReader dataset.
benchmark(FileReader(count=5))

1.92 s ± 10.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [25]:
%%timeit
# Benchmarking with prefetch, letting tf.data.AUTOTUNE choose the buffer size
benchmark(FileReader(count=5).prefetch(tf.data.AUTOTUNE))

1.93 s ± 14.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [26]:
%%timeit
# Benchmarking with prefetch and a fixed buffer size of 4 elements
benchmark(FileReader(count=5).prefetch(4))

1.95 s ± 26.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


Caching

In [49]:
# Square the integers 0..5 and cache the mapped elements in a file on disk.
# NOTE(review): a file-backed cache is reused when the cache file already
# exists, so a stale "cached_data.tfdata" left by an earlier run (presumably
# one that used range(10)) is read back instead of recomputing. That would
# explain a printed list with more than 6 elements. Delete the cache files to
# get output that matches range(6).
dataset = (
    tf.data.Dataset.range(6)
    .map(lambda x: x**2)
    .cache("cached_data.tfdata")  # caching in a file
)
# Print whatever the cache serves, converted to plain Python ints.
print([value.item() for value in dataset.as_numpy_iterator()])

[0, 1, 4, 9, 16, 25, 36, 49, 64, 81]


In [41]:
# delay function
def delay_process(x):
    """Return ``x`` unchanged after scheduling a simulated 0.03 s delay.

    Used as a ``Dataset.map`` function to mimic an expensive per-element
    preprocessing step. The sleep is wrapped in ``tf.py_function`` (no inputs,
    no outputs) so the Python call can be placed inside the pipeline.
    """
    def _simulate_work():
        # Stand-in for costly preprocessing; produces no value.
        time.sleep(0.03)

    tf.py_function(_simulate_work, [], [])
    return x

In [43]:
%%timeit -r1 -n1
# Baseline for the caching comparison: nothing is cached, so the mock read and
# the delayed map step are both part of the pipeline on every epoch.
benchmark(FileReader(count=5).map(delay_process))

3.55 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [44]:
%%timeit -r1 -n1
# Same pipeline with cache() and no filename, i.e. an in-memory cache.
# benchmark() re-iterates the same dataset object each epoch, so after the
# first full pass the elements should be served from the cache rather than
# from the slow read + map steps.
benchmark(FileReader(count=5).map(delay_process).cache())

951 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
