# The tf.data.Dataset 
This API supports writing descriptive and efficient input pipelines. Dataset usage follows a common pattern:

In [30]:
import numpy as np 
import pandas as pd 
import tensorflow as tf
from tensorflow.data import Dataset

## Source dataset : 
The simplest way to create a dataset is to create it from a python list:

In [2]:
dataset = tf.data.Dataset.from_tensor_slices(np.arange(9))
for i in dataset : 
    print(i)

tf.Tensor(0, shape=(), dtype=int64)
tf.Tensor(1, shape=(), dtype=int64)
tf.Tensor(2, shape=(), dtype=int64)
tf.Tensor(3, shape=(), dtype=int64)
tf.Tensor(4, shape=(), dtype=int64)
tf.Tensor(5, shape=(), dtype=int64)
tf.Tensor(6, shape=(), dtype=int64)
tf.Tensor(7, shape=(), dtype=int64)
tf.Tensor(8, shape=(), dtype=int64)


## Methods

### range

In [31]:
list(Dataset.range(5).as_numpy_iterator())

list(Dataset.range(2, 5).as_numpy_iterator())

list(Dataset.range(1, 5, 2).as_numpy_iterator())

list(Dataset.range(1, 5, -2).as_numpy_iterator())

list(Dataset.range(5, 1).as_numpy_iterator())

list(Dataset.range(5, 1, -2).as_numpy_iterator())

list(Dataset.range(2, 5, output_type=tf.int32).as_numpy_iterator())

list(Dataset.range(1, 5, 2, output_type=tf.float32).as_numpy_iterator())


[1.0, 3.0]

### apply

Applies a transformation function to this dataset.
`apply` enables chaining of custom `Dataset` transformations, which are represented as functions that take one Dataset argument and return a transformed `Dataset`

In [3]:
dataset = tf.data.Dataset.range(100)
def dataset_fn(ds) : 
    return ds.filter(lambda x : x < 5 )
dataset = dataset.apply(dataset_fn)
list(dataset.as_numpy_iterator())

[0, 1, 2, 3, 4]

### as_numpy_iterator

Returns an iterator which converts all elements of the dataset to numpy.
Use `as_numpy_iterator` to inspect the content of your dataset. To see element shapes and types, print dataset elements directly instead of using `as_numpy_iterator`.

In [4]:
dataset = tf.data.Dataset.from_tensor_slices(np.arange(5))
# if we don't use as_numpu_iterator
for element in dataset:
    print(element)

tf.Tensor(0, shape=(), dtype=int64)
tf.Tensor(1, shape=(), dtype=int64)
tf.Tensor(2, shape=(), dtype=int64)
tf.Tensor(3, shape=(), dtype=int64)
tf.Tensor(4, shape=(), dtype=int64)


In [5]:
# use iterator
for element in dataset.as_numpy_iterator() :
    print(element)

0
1
2
3
4


In [6]:
list(dataset.as_numpy_iterator())

[0, 1, 2, 3, 4]

`as_numpy_iterator()` will preserve the nested structure of dataset elements.

In [7]:
dataset = tf.data.Dataset.from_tensor_slices({"a" : ([1,2],[3,4]), "b": [5,6]})
list(dataset.as_numpy_iterator())

[{'a': (1, 3), 'b': 5}, {'a': (2, 4), 'b': 6}]

### batch
batch(batch_size, drop_remainder=False)

Combines consecutive elements of this dataset into batches

In [8]:
dataset = tf.data.Dataset.range(20)
dataset = dataset.batch(4, drop_remainder=True) 
list(dataset.as_numpy_iterator())

[array([0, 1, 2, 3]),
 array([4, 5, 6, 7]),
 array([ 8,  9, 10, 11]),
 array([12, 13, 14, 15]),
 array([16, 17, 18, 19])]

In [9]:
for element in dataset.as_numpy_iterator(): 
    print(element)

[0 1 2 3]
[4 5 6 7]
[ 8  9 10 11]
[12 13 14 15]
[16 17 18 19]


The components of the resulting element will have an additional outer dimension, which will be `batch_size` (or `N % batch_size` for the last element if batch_size does not divide the number of input elements `N` evenly and drop_remainder is `False`). If your program depends on the batches having the same outer dimension, you should set the drop_remainder argument to True to prevent the smaller batch from being produced.

### concatenate

Creates a Dataset by concatenating the given dataset with this dataset

In [10]:
a = tf.data.Dataset.range(1,5) # => [1,2,3,4]
b = tf.data.Dataset.range(4,8) # => [4,5,6,7]
ds = a.concatenate(b)
print(list(a.as_numpy_iterator()))
print(list(b.as_numpy_iterator()))
print(list(ds.as_numpy_iterator()))
# The input dataset and dataset to be concatenated should have the same
# nested structures and output types.
c = list(zip(a,b))
c = tf.data.Dataset.zip((a,b))
#a.concatenate(c) # Error

[1, 2, 3, 4]
[4, 5, 6, 7]
[1, 2, 3, 4, 4, 5, 6, 7]


### enumerate


In [11]:
dataset = tf.data.Dataset.range(10,15)
dataset = dataset.enumerate(start=2)
list(dataset.as_numpy_iterator())

[(2, 10), (3, 11), (4, 12), (5, 13), (6, 14)]

In [12]:
# The nested structure of the input dataset determines the structure of elements in the resulting dataset.
dataset = tf.data.Dataset.from_tensor_slices([(7, 8), (9, 10)])
dataset = dataset.enumerate()
list(dataset.as_numpy_iterator())

[(0, array([7, 8], dtype=int32)), (1, array([ 9, 10], dtype=int32))]

### filter

In [13]:
dataset = tf.data.Dataset.range(20)
dataset = dataset.filter(lambda x : x % 2 == 0)
list(dataset.as_numpy_iterator())

[0, 2, 4, 6, 8, 10, 12, 14, 16, 18]

### flat_map

Maps `map_func` across this dataset and flattens the result.
Use `flat_map` if you want to make sure that the order of your dataset stays the same. For example, to flatten a dataset of batches into a dataset of their elements:

In [14]:
def flat_map_fn(x) :
    return tf.data.Dataset.from_tensor_slices(x)
np.random.seed(42)
dataset = tf.data.Dataset.from_tensor_slices(np.random.randint(1,10,(3,3)))
dataset = dataset.flat_map(flat_map_fn)
list(dataset.as_numpy_iterator())

[7, 4, 8, 5, 7, 3, 7, 8, 5]

### from_tensor_slices

Creates a `Dataset` whose elements are slices of the given tensors.

The given tensors are sliced along their first dimension. This operation preserves the structure of the input tensors, removing the first dimension of each tensor and using it as the dataset dimension. All input tensors must have the same size in their first dimensions.



In [15]:
# Slicing a 1D tensor produces scalar tensor elements.
dataset = tf.data.Dataset.from_tensor_slices([1,2,3,4])
list(dataset.as_numpy_iterator())

[1, 2, 3, 4]

In [16]:
# Slicing a 2D tensor produces 1D tensor elements.
dataset = tf.data.Dataset.from_tensor_slices([[1,2,3],[4,5,6]])
list(dataset.as_numpy_iterator())

[array([1, 2, 3], dtype=int32), array([4, 5, 6], dtype=int32)]

In [17]:
# Slicing a tuple of 1D tensors produces tuple elements containing scalar tensors.
dataset = tf.data.Dataset.from_tensor_slices(([1,2],[3,4],[5,6]))
list(dataset.as_numpy_iterator())

[(1, 3, 5), (2, 4, 6)]

In [18]:
# Dictionary structure is also preserved.
dataset = tf.data.Dataset.from_tensor_slices({"a" : [1,2], "b" : [3,4]})
list(dataset.as_numpy_iterator())

[{'a': 1, 'b': 3}, {'a': 2, 'b': 4}]

In [19]:
# Two tensors can be combined into one Dataset object.
features = tf.constant([[1,3], [2,1],[3,3]]) # ==> 3x2 tensor
print(features)
labels = tf.constant(["A","B","A"]) # ==> 3x1 tensor
print(labels)
dataset = tf.data.Dataset.from_tensor_slices((features,labels))
list(dataset.as_numpy_iterator())
# Both the features and the labels tensors can be converted
# to a Dataset object separately and combined after.
features_dataset = tf.data.Dataset.from_tensor_slices(features)
labels_dataset= tf.data.Dataset.from_tensor_slices(labels)
dataset = tf.data.Dataset.zip((features_dataset, labels_dataset))
list(dataset.as_numpy_iterator())
# A batched feature and label set can be converted to a Dataset
batched_features = tf.constant([[[1, 3], [2, 3]],
                                [[2, 1], [1, 2]],
                                [[3, 3], [3, 2]]], shape=(3,2,2) )
batched_labels = tf.constant([['A', 'A'],
                              ['B', 'B'],
                              ['A', 'B']], shape=(3,2,1))
dataset = tf.data.Dataset.from_tensor_slices((batched_features,batched_labels))
for element in dataset.as_numpy_iterator():
      print(element)

tf.Tensor(
[[1 3]
 [2 1]
 [3 3]], shape=(3, 2), dtype=int32)
tf.Tensor([b'A' b'B' b'A'], shape=(3,), dtype=string)
(array([[1, 3],
       [2, 3]], dtype=int32), array([[b'A'],
       [b'A']], dtype=object))
(array([[2, 1],
       [1, 2]], dtype=int32), array([[b'B'],
       [b'B']], dtype=object))
(array([[3, 3],
       [3, 2]], dtype=int32), array([[b'A'],
       [b'B']], dtype=object))


### map

`Maps` map_func across the elements of this dataset.

This transformation applies `map_func` to each element of this dataset, and returns a new dataset containing the transformed elements, in the same order as they appeared in the input. map_func can be used to change both the values and the structure of a dataset's elements. For example, adding 1 to each element, or projecting a subset of element components.

In [22]:
dataset = tf.data.Dataset.range(1,7)
dataset = dataset.map(lambda x : x * 5 )
list(dataset.as_numpy_iterator())

[5, 10, 15, 20, 25, 30]

The input signature of `map_func` is determined by the structure of each element in this dataset.

In [29]:
# Each element is a tuple containing two `tf.Tensor` objects.
elements = [(1, "foo"), (2, "bar"), (3, "baz")]
dataset = tf.data.Dataset.from_generator(lambda : elements, (tf.int32, tf.string))
print(list(dataset.as_numpy_iterator()))
# `map_func` takes two arguments of type `tf.Tensor`. This function
# projects out just the first component
result = dataset.map(lambda x_int, y_int : x_int)
list(result.as_numpy_iterator())

[(1, b'foo'), (2, b'bar'), (3, b'baz')]


[1, 2, 3]

### reduce

`reduce(initial_state, reduce_func)`

Reduces the input dataset to a single element

The transformation calls `reduce_func` successively on every element of the input dataset until the dataset is exhausted, aggregating information in its internal state. The `initial_state` argument is used for the initial state and the final state is returned as the result.

print(tf.data.Dataset.range(1,6).reduce(np.int64(0), lambda x, _ : x + 1 ).numpy())
print(tf.data.Dataset.range(1,6).reduce(np.int64(0), lambda x, y : x + y).numpy())

### repeat

`repeat(count=None)`

Repeats this dataset so each original value is seen count times.

In [46]:
dataset = tf.data.Dataset.from_tensor_slices([1, 2, 3])
dataset = dataset.repeat(count=3)
list(dataset.as_numpy_iterator())

[1, 2, 3, 1, 2, 3, 1, 2, 3]

### shuffle

`shuffle(buffer_size, seed=None, reshuffle_each_iteration=None)`

Randomly shuffles the elements of this dataset.

This dataset fills a buffer with `buffer_size` elements, then randomly samples elements from this buffer, replacing the selected elements with new elements. For perfect shuffling, a buffer size greater than or equal to the full size of the dataset is required.

For instance, if your dataset contains 10,000 elements but buffer_size is set to 1,000, then shuffle will initially select a random element from only the first 1,000 elements in the buffer. Once an element is selected, its space in the buffer is replaced by the next (i.e. 1,001-st) element, maintaining the 1,000 element buffer.

reshuffle_each_iteration controls whether the shuffle order should be different for each epoch. In TF 1.X, the idiomatic way to create epochs was through the repeat transformation:

In [61]:
dataset = tf.data.Dataset.range(6)
dataset = dataset.shuffle(5,reshuffle_each_iteration=True)
list(dataset.as_numpy_iterator())

[2, 3, 0, 5, 1, 4]

### skip 

`skip(count)`

Creates a `Dataset` that skips `count` elements from this dataset.

In [74]:
dataset = tf.data.Dataset.range(10)
dataset = dataset.skip(3)
list(dataset.as_numpy_iterator())


[3, 4, 5, 6, 7, 8, 9]

### take

`take(count)`

Creates a Dataset with at most `count` elements from this dataset.

In [77]:
dataset = tf.data.Dataset.range(10)
dataset = dataset.take(3)
list(dataset.as_numpy_iterator())

[0, 1, 2]

### unbatch

In [89]:
np.random.seed(42)
dataset = tf.data.Dataset.range(10)
dataset = dataset.batch(batch_size=3)
print(list(dataset.as_numpy_iterator()))
dataset = dataset.unbatch()
print(list(dataset.as_numpy_iterator()))

[array([0, 1, 2]), array([3, 4, 5]), array([6, 7, 8]), array([9])]
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]


### zip
`@staticmethod
zip(datasets)`

Creates a `Dataset` by zipping together the given datasets.

This method has similar semantics to the built-in `zip()` function in Python, with the main difference being that the `datasets` argument can be an arbitrary nested structure of `Dataset` objects.

In [105]:
# The nested structure of the `datasets` argument determines the
# structure of elements in the resulting dataset.
a = tf.data.Dataset.range(1,4)
b = tf.data.Dataset.range(4,7)
ds = tf.data.Dataset.zip((a,b))
print(list(ds.as_numpy_iterator()))
ds = tf.data.Dataset.zip((b,a))
print(list(ds.as_numpy_iterator()))

# The `datasets` argument may contain an arbitrary number of datasets.
c = tf.data.Dataset.range(7, 13).batch(2) 
print(list(c.as_numpy_iterator()))
ds = tf.data.Dataset.zip((a,b,c))
for ele in ds.as_numpy_iterator() : 
    print(ele)
    
# The number of elements in the resulting dataset is the same as the size of the smallest dataset in `datasets`
d = tf.data.Dataset.range(13, 15)
ds = df.ta

[(1, 4), (2, 5), (3, 6)]
[(4, 1), (5, 2), (6, 3)]
[array([7, 8]), array([ 9, 10]), array([11, 12])]
(1, 4, array([7, 8]))
(2, 5, array([ 9, 10]))
(3, 6, array([11, 12]))
