In [0]:
%tensorflow_version 2.x

In [0]:
import tensorflow as tf
import pathlib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Limit NumPy's printed floating-point output to 4 digits of precision.
np.set_printoptions(precision=4)

In [0]:
# Build a dataset from an in-memory list: each integer becomes one element.
dataset = tf.data.Dataset.from_tensor_slices([8, 3, 0, 8, 2, 1])
dataset

In [0]:
# Walk the dataset with a plain Python loop and show each element as a NumPy value.
for element in dataset:
  print(element.numpy())

In [0]:
# A Dataset object is a Python iterable.
# Explicitly create a Python iterator using `iter`
# and consume its elements using `next`.
it = iter(dataset)
print(next(it).numpy())

In [0]:
# Advance the same iterator `it` once more and print the element it returns.
print(next(it).numpy())

In [0]:
# The reduce transformation folds all elements into a single result.
# Starting from 0, every element is added to the running total.
print(
    dataset.reduce(
        initial_state=0,
        reduce_func=lambda running_total, item: running_total + item,
    ).numpy())

### Dataset structure
A dataset contains elements that each have the same (nested) structure, and the individual components of the structure can be of any type representable by `tf.TypeSpec`, including `Tensor`, `SparseTensor`, `RaggedTensor`, `TensorArray`, or `Dataset`.

The `Dataset.element_spec` property allows you to inspect the type of each element component. The property returns a nested structure of `tf.TypeSpec` objects, matching the structure of the element, which may be a single component, a tuple of components, or a nested tuple of components.

In [0]:
# Slice a random tensor of shape [4, 10] into dataset elements, then inspect
# the spec of a single element.
dataset1 = tf.data.Dataset.from_tensor_slices(
    tf.random.uniform(shape=[4, 10]))
dataset1.element_spec

In [0]:
# Slice a tuple of two tensors: a float vector of length 4 and a [4, 100]
# int32 matrix with values below 100. Each element is a pair of components.
dataset2 = tf.data.Dataset.from_tensor_slices((
    tf.random.uniform([4]),
    tf.random.uniform([4, 100], maxval=100, dtype=tf.int32),
))

dataset2.element_spec

In [0]:
# Zip the two datasets so each element pairs a `dataset1` element with a
# `dataset2` element, then inspect the resulting nested spec.
dataset3 = tf.data.Dataset.zip((dataset1, dataset2))

dataset3.element_spec

In [0]:
# Dataset containing a single sparse tensor with dense shape [3, 4] and two
# non-zero entries.
dataset4 = tf.data.Dataset.from_tensors(
    tf.SparseTensor(
        indices=[[0, 0], [1, 2]],
        values=[1, 2],
        dense_shape=[3, 4]))

dataset4.element_spec

In [0]:
# Use `value_type` to see the type of value represented by the element spec.
dataset4.element_spec.value_type

The `Dataset` transformations support datasets of any structure. When using the `Dataset.map()` and `Dataset.filter()` transformations, which apply a function to each element, the element structure determines the arguments of the function:

In [0]:
# Rebuild `dataset1` from a [4, 10] int32 tensor drawn from the range [1, 10).
dataset1 = tf.data.Dataset.from_tensor_slices(
    tf.random.uniform(shape=[4, 10], minval=1, maxval=10, dtype=tf.int32))

dataset1

In [0]:
# Display the first element of `dataset1` using a fresh iterator.
next(iter(dataset1))

In [0]:
# Print every element of `dataset1` as a NumPy array.
for row in dataset1:
  print(row.numpy())

In [0]:
# Rebuild `dataset2` from a pair of tensors: a float vector of length 4 and a
# [4, 100] int32 matrix with values below 100.
dataset2 = tf.data.Dataset.from_tensor_slices((
    tf.random.uniform(shape=[4]),
    tf.random.uniform(shape=[4, 100], maxval=100, dtype=tf.int32),
))

dataset2

In [0]:
# Zip the rebuilt datasets; each element has the structure (a, (b, c)).
dataset3 = tf.data.Dataset.zip((dataset1, dataset2))

dataset3

In [0]:
# Unpack the nested element structure directly in the loop header and report
# the shape of each component.
for a, (b, c) in dataset3:
  print(f'shape: {a.shape}, {b.shape}, {c.shape}')

## Reading input data
### Consuming NumPy arrays
See Loading NumPy arrays for more examples.
If all of your input data fits in memory, the simplest way to create a `Dataset` from them is to convert them to `tf.Tensor` objects and use `Dataset.from_tensor_slices()`.

In [0]:
# Load Fashion-MNIST through Keras; the result unpacks into a train part and a test part.
train, test = tf.keras.datasets.fashion_mnist.load_data()

In [0]:
# Unpack the training split and scale the image values by 1/255.
images, labels = train
images = images / 255.0

# Pair every image with its label: one dataset element per (image, label).
dataset = tf.data.Dataset.from_tensor_slices((images, labels))
dataset

The above code snippet will embed the features and labels arrays in your TensorFlow graph as `tf.constant()` operations. This works well for a small dataset, but wastes memory--because the contents of the array will be copied multiple times--and can run into the 2GB limit for the `tf.GraphDef` protocol buffer.

### Consuming Python generators
Another common data source that can easily be ingested as a `tf.data.Dataset` is the Python generator.
Caution: While this is a convenient approach, it has limited portability and scalability. It must run in the same Python process that created the generator, and is still subject to the Python GIL.

In [0]:
def count(stop):
  """Yield 0, 1, 2, ... for as long as the value is strictly below `stop`.

  Args:
    stop: Exclusive upper bound; nothing is yielded when `stop` is <= 0.

  Yields:
    Consecutive integers starting at 0.
  """
  current = 0
  while True:
    # Same exit condition as `while current < stop`, written as a guard.
    if not current < stop:
      return
    yield current
    current += 1

In [0]:
# Sanity-check the plain Python generator before wrapping it in a dataset.
for value in count(5):
  print(value)

The `Dataset.from_generator` constructor converts the Python generator to a fully functional `tf.data.Dataset`.

The constructor takes a callable as input, not an iterator. This allows it to restart the generator when it reaches the end. It takes an optional `args` argument, which is passed as the callable's arguments.

The `output_types` argument is required because `tf.data` builds a `tf.Graph` internally, and graph edges require a `tf.dtype`.

In [0]:
# Wrap `count` as a dataset of scalar int32 values; `args` supplies stop=25.
ds_counter = tf.data.Dataset.from_generator(
    count,
    args=[25],
    output_types=tf.int32,
    output_shapes=())

In [0]:
# Repeat the counter indefinitely, group it into batches of 10, and show the
# first 10 batches.
for batch in ds_counter.repeat().batch(10).take(10):
  print(batch.numpy())

In [0]:
def gen_series():
  """Endlessly yield `(index, values)` pairs.

  Yields:
    A tuple of a running integer index (starting at 0) and a 1-D NumPy array
    of normally distributed samples whose length is drawn at random from
    0 through 9 on every step.
  """
  index = 0
  while True:
    # Draw the length first, then the samples, on every iteration.
    length = np.random.randint(0, 10)
    values = np.random.normal(size=(length,))
    yield index, values
    index += 1

In [0]:
# Preview the endless generator; leave the loop once the index exceeds 5.
for index, values in gen_series():
  print(index, ':', str(values))
  if index > 5:
    break

In [0]:
# The first output is a `tf.int32` and the second is a `tf.float32`.
# The first item is a scalar, shape `()`, and the second is a vector of
# unknown length, shape `(None,)`.
ds_series = tf.data.Dataset.from_generator(
    gen_series,
    output_types=(tf.int32, tf.float32),
    output_shapes=((), (None, )))

ds_series

In [0]:
# Now it can be used like a regular `tf.data.Dataset`.
# Note that when batching a dataset with a variable shape,
# you need to use `Dataset.padded_batch`.

ds_series_batch = (
    ds_series
    .shuffle(20)
    .padded_batch(10, padded_shapes=([], [None])))

# Fetch a single batch and show the ids followed by the padded sequences.
ids, sequence_batch = next(iter(ds_series_batch))
print(ids.numpy())
print()
print(sequence_batch.numpy())

In [0]:
# Download and unpack the flower photos archive; `flowers` is the value
# returned by `get_file`.
flowers = tf.keras.utils.get_file(
    fname='flower_photos',
    origin='https://storage.googleapis.com/download.tensorflow.org/example_images/flower_photos.tgz',
    untar=True)

In [0]:
# Create the `image.ImageDataGenerator` with a 1/255 rescale factor and a
# rotation range of 20.
img_gen = tf.keras.preprocessing.image.ImageDataGenerator(
    rescale=1./255,
    rotation_range=20)

In [0]:
# Pull one batch from the directory iterator.
# NOTE(review): this rebinds `images` and `labels`, which earlier held the
# Fashion-MNIST training arrays.
images, labels = next(img_gen.flow_from_directory(flowers))

In [0]:
# Show the dtype and shape of the image batch, then of the label batch.
print(images.dtype, images.shape)
print(labels.dtype, labels.shape)

In [0]:
# Wrap the Keras image iterator in a `tf.data.Dataset`.
# The directory is bound with a lambda rather than passed through `args`:
# `from_generator` converts `args` to tensors and hands them to the generator
# as NumPy values, so a `str` path would no longer arrive as a `str`.
# The batch dimension is left as `None` because the last batch of an epoch
# can contain fewer than 32 images, which a fixed size of 32 would reject.
ds = tf.data.Dataset.from_generator(
    lambda: img_gen.flow_from_directory(flowers),
    output_types=(tf.float32, tf.float32),
    output_shapes=([None, 256, 256, 3], [None, 5]))

ds

### Consuming TFRecord data

TFRecord file format is a simple record-oriented binary format that many TensorFlow applications use for training data. The `tf.data.TFRecordDataset` class enables you to stream over the contents of one or more TFRecord files as part of an input pipeline.

In [0]:
# Download one TFRecord test file; no dataset is created in this cell.
fsns_test_file = tf.keras.utils.get_file("fsns.tfrec", "https://storage.googleapis.com/download.tensorflow.org/data/fsns-20160927/testdata/fsns-00000-of-00001")

The `filenames` argument to the `TFRecordDataset` initializer can be either a string, a list of strings, or a `tf.Tensor` of strings. Therefore, if you have two sets of files for training and validation purposes, you can create a factory method that produces the dataset, taking filenames as an input argument:

In [0]:
# Stream records from the downloaded TFRecord file (rebinds `dataset`).
dataset = tf.data.TFRecordDataset(filenames=[fsns_test_file])
dataset

Many TensorFlow projects use serialized `tf.train.Example` records in their TFRecord files. These need to be decoded before they can be inspected:

In [0]:
# Take the first serialized record and decode it as a `tf.train.Example` proto.
raw_example = next(iter(dataset))
parsed = tf.train.Example.FromString(raw_example.numpy())

# Inspect the 'image/text' feature of the decoded example.
parsed.features.feature['image/text']

In [0]:
# `tf.data.TextLineDataset` extracts lines from one or more text files and
# produces one string-valued element per line of those files.

directory_url = 'https://storage.googleapis.com/download.tensorflow.org/data/illiad/'
file_names = ['cowper.txt', 'derby.txt', 'butler.txt']

# Download each file and collect the path returned for it.
file_paths = [
    tf.keras.utils.get_file(fname=name, origin=directory_url + name)
    for name in file_names
]

In [0]:
# Read the downloaded files as one dataset of text lines (rebinds `dataset`).
dataset = tf.data.TextLineDataset(file_paths)

In [0]:
# The first few lines of the first file:
for text in dataset.take(5):
  print(text.numpy())

In [0]:
# To alternate lines between files use `Dataset.interleave`.
# This makes it easier to shuffle files together.
file_ds = tf.data.Dataset.from_tensor_slices(file_paths)
lines_ds = file_ds.interleave(tf.data.TextLineDataset, cycle_length=3)

# Print 9 lines, with a blank line before every group of three.
for position, text in enumerate(lines_ds.take(9)):
  if position % 3 == 0:
    print()
  print(text.numpy())

By default, a `TextLineDataset` yields every line of each file, which may not be desirable, for example, if the file starts with a header line, or contains comments. These lines can be removed using the `Dataset.skip()` or `Dataset.filter()` transformations. Here we skip the first line, then filter to find only survivors.

In [0]:
# Download the Titanic training CSV and read it line by line.
titanic_file = tf.keras.utils.get_file(
    fname="train.csv",
    origin="https://storage.googleapis.com/tf-datasets/titanic/train.csv")
titanic_lines = tf.data.TextLineDataset(titanic_file)

In [0]:
# Show the first 10 raw lines of the file.
for text in titanic_lines.take(10):
  print(text.numpy())

In [0]:
def survived(line):
  """Return a boolean tensor that is True when `line` does not start with '0'.

  Args:
    line: A string tensor holding one line of the file.
      Presumably the first column is the survival flag -- confirm against
      the file's header line.
  """
  first_char = tf.strings.substr(line, pos=0, len=1)
  return tf.not_equal(first_char, '0')

# Skip the first line, then keep only lines for which `survived` is True.
survivors = titanic_lines.skip(1).filter(survived)

In [0]:
# Show the first 10 lines that passed the filter.
for text in survivors.take(10):
  print(text.numpy())