#### When loading data from local storage or from memory, the GPU can be starved of data because of the potential inefficiencies of manual data loaders. The tf.data API is designed to address this issue.

#### A typical manual data loader is as follows (local storage)

In [None]:
# Class method
class DataLoader:
  """Skeleton of a hand-written, class-based batch loader.

  Both methods are placeholders: they only describe the steps a real loader
  would carry out and currently do nothing.
  """

  def __init__(self):
    """Prepare the loader state (placeholder, nothing is stored yet).

    Steps a concrete implementation would perform:
      * initialise the loader with the file paths (inputs and, if present,
        labels);
      * if there is no label file per input, load the label text and build
        an {input file name: label} mapping dict;
      * initialise the batch counter;
      * specify the class-balancing logic if necessary.
    """
    return None

  def call(self):
    """Return the next data batch (placeholder, currently returns None).

    Steps a concrete implementation would perform:
      * pick the next batch of {input file name: label} entries;
      * open the input files and batch them together with their labels;
      * increment the batch counter so the next call loads the next batch;
      * return the data batch.
    """
    return None

In [None]:
# Generator method
def dataloader():
  """Build and return a batch-generator function (skeleton).

  A concrete implementation would do the one-off setup here, before defining
  the generator:
    * initialise the loader with the file paths (inputs and, if present,
      labels);
    * if there is no label file per input, load the label text and build an
      {input file name: label} mapping dict;
    * initialise the batch counter;
    * specify the class-balancing logic if necessary.

  Returns:
    The generator function itself (not a started generator); call it to
    obtain an iterator over batches.
  """
  def datagenerator():
    """Yield data batches; the skeleton yields a single None placeholder."""
    # A real loader would yield each batch for the configured batch size and
    # increment the batch counter after every yield.
    yield None

  return datagenerator

#### However, the tf.data API allows one to build more complex data pipelines from a simple set of reusable APIs

In [None]:
import tensorflow as tf
import numpy as np

In [None]:
# Simple in-memory dataset: each list entry becomes one dataset element.
values = [5, 3, 2, 1, 6, 8]
simple_dataset_1 = tf.data.Dataset.from_tensor_slices(values)
for element in simple_dataset_1:
  print(element)

tf.Tensor(5, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)


In [None]:
# Simple dataset with label association: slicing a tuple of two equally long
# lists yields (input, label) pairs.
inputs = [5, 3, 2, 1, 6, 8]
labels = ["a", "b", "c", "d", "e", "f"]
simple_dataset_2 = tf.data.Dataset.from_tensor_slices((inputs, labels))
for value, label in simple_dataset_2:
  print("input: {}, label: {}".format(value, label))

input: 5, label: b'a'
input: 3, label: b'b'
input: 2, label: b'c'
input: 1, label: b'd'
input: 6, label: b'e'
input: 8, label: b'f'


#### Additionally, Python-like functional methods such as map, reduce and filter are available to perform transformations

In [None]:
# map applies a transformation element-wise; here the input is squared and
# the label is passed through unchanged.
def square_input(x, y):
  return x**2, y

simple_dataset_map = simple_dataset_2.map(square_input)
for value, label in simple_dataset_map:
  print("input: {}, label: {}".format(value, label))

input: 25, label: b'a'
input: 9, label: b'b'
input: 4, label: b'c'
input: 1, label: b'd'
input: 36, label: b'e'
input: 64, label: b'f'


In [None]:
# Reduce the whole dataset to a single value: start from an int32 zero and
# add every element to the running total.
running_total = simple_dataset_1.reduce(np.int32(0), lambda acc, value: acc + value)
simple_dataset_reduce = running_total.numpy()
print(simple_dataset_reduce)

25


In [None]:
# Filter unnecessary values: keep only the (input, label) pairs whose input
# is >= 3. The predicate receives one argument per dataset component, so it
# must accept both the input and the label.
#
# The same filter can be packaged as a reusable transformation and attached
# with Dataset.apply:
#
#   def filter_fn(ds):
#     return ds.filter(lambda x, y: x >= 3)
#
#   simple_dataset_filter = simple_dataset_2.apply(filter_fn)
simple_dataset_filter = simple_dataset_2.filter(lambda x, y: x >= 3)
for i, j in simple_dataset_filter:
  print("input: {}, label: {}".format(i, j))

input: 5, label: b'a'
input: 3, label: b'b'
input: 6, label: b'e'
input: 8, label: b'f'


#### A Python generator can be converted into a tf.data dataset

In [None]:
def get_generator():
  """Create a generator function over a fixed block of random samples.

  The samples are drawn once, when this factory is called, as a (1000, 1)
  uniform random tensor; every generator started from the returned function
  reads from that same tensor.

  Returns:
    A function `simple_generator(stop)` that yields rows 0 .. stop-1 of the
    sample tensor, one row per iteration.
  """
  data = tf.random.uniform((1000, 1))

  def simple_generator(stop):
    """Yield the first `stop` rows of the pre-drawn samples in order."""
    index = 0
    while True:
      if not index < stop:
        return
      yield data[index]
      index += 1

  return simple_generator

In [None]:
# Python approach: iterate the plain generator directly, one sample at a time.
my_generator = get_generator()
for sample in my_generator(1000):
  print(sample)

tf.Tensor([0.16083467], shape=(1,), dtype=float32)
tf.Tensor([0.4409504], shape=(1,), dtype=float32)
tf.Tensor([0.28390074], shape=(1,), dtype=float32)
tf.Tensor([0.31091225], shape=(1,), dtype=float32)
tf.Tensor([0.6245127], shape=(1,), dtype=float32)
tf.Tensor([0.1452483], shape=(1,), dtype=float32)
tf.Tensor([0.81280386], shape=(1,), dtype=float32)
tf.Tensor([0.7549505], shape=(1,), dtype=float32)
tf.Tensor([0.87484515], shape=(1,), dtype=float32)
tf.Tensor([0.32321012], shape=(1,), dtype=float32)
tf.Tensor([0.172256], shape=(1,), dtype=float32)
tf.Tensor([0.52800465], shape=(1,), dtype=float32)
tf.Tensor([0.16079545], shape=(1,), dtype=float32)
tf.Tensor([0.17790401], shape=(1,), dtype=float32)
tf.Tensor([0.64372504], shape=(1,), dtype=float32)
tf.Tensor([0.6612303], shape=(1,), dtype=float32)
tf.Tensor([0.351529], shape=(1,), dtype=float32)
tf.Tensor([0.64212215], shape=(1,), dtype=float32)
tf.Tensor([0.8795152], shape=(1,), dtype=float32)
tf.Tensor([0.37303734], shape=(1,), dtype

In [None]:
# TF data approach (with batching).
# `args` is forwarded to the generator function, so 1000 becomes its `stop`
# argument. Each yielded element is a float32 vector of length 1; the shape
# is spelled as the one-element tuple (1,) because a bare (1) is just the
# integer 1.
# NOTE: newer TensorFlow releases deprecate `output_types`/`output_shapes`
# in favour of
# `output_signature=tf.TensorSpec(shape=(1,), dtype=tf.float32)`.
generator_dataset = tf.data.Dataset.from_generator(
    my_generator,
    args=[1000],
    output_types=tf.float32,
    output_shapes=(1,),
)
for data in generator_dataset.batch(10):
  print(data)

tf.Tensor(
[[0.16083467]
 [0.4409504 ]
 [0.28390074]
 [0.31091225]
 [0.6245127 ]
 [0.1452483 ]
 [0.81280386]
 [0.7549505 ]
 [0.87484515]
 [0.32321012]], shape=(10, 1), dtype=float32)
tf.Tensor(
[[0.172256  ]
 [0.52800465]
 [0.16079545]
 [0.17790401]
 [0.64372504]
 [0.6612303 ]
 [0.351529  ]
 [0.64212215]
 [0.8795152 ]
 [0.37303734]], shape=(10, 1), dtype=float32)
tf.Tensor(
[[0.65413797]
 [0.09908831]
 [0.32144654]
 [0.68531466]
 [0.28770924]
 [0.8808857 ]
 [0.91681015]
 [0.2883761 ]
 [0.82795703]
 [0.60646725]], shape=(10, 1), dtype=float32)
tf.Tensor(
[[0.33460736]
 [0.96000373]
 [0.46340024]
 [0.9240588 ]
 [0.2875607 ]
 [0.80893123]
 [0.06501162]
 [0.06923485]
 [0.5056788 ]
 [0.05903137]], shape=(10, 1), dtype=float32)
tf.Tensor(
[[0.99903095]
 [0.02457964]
 [0.4107095 ]
 [0.45208228]
 [0.8538363 ]
 [0.53129196]
 [0.61314106]
 [0.50105524]
 [0.58193994]
 [0.17133975]], shape=(10, 1), dtype=float32)
tf.Tensor(
[[0.30836904]
 [0.386837  ]
 [0.51339734]
 [0.5141263 ]
 [0.14858496]
 [0.

#### When datasets are very large (the typical case), they may not fit into memory (CPU or GPU). To load such large datasets, TFRecord files can be used.