# Data API

In [1]:
import tensorflow as tf
import numpy as np

In [2]:
# Build a 1-D tensor holding the integers 0 through 9.
X = tf.range(10)
X

<tf.Tensor: shape=(10,), dtype=int32, numpy=array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=int32)>

In [3]:
# from_tensor_slices() slices X along its first dimension: one dataset element per slice.
dataset = tf.data.Dataset.from_tensor_slices(X)
dataset

<TensorSliceDataset shapes: (), types: tf.int32>

In [5]:
# Iterating over the dataset yields its elements one at a time, each as a tensor.
for item in dataset:
    print(item)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(5, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(7, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(9, shape=(), dtype=int32)


A `dataset` can be **iterated over more than once**:

In [7]:
# Second pass over the same dataset object: it yields the same ten elements again.
for item in dataset:
    print(item)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(5, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(7, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(9, shape=(), dtype=int32)


> _The_ `from_tensor_slices()` _function takes a tensor and creates a_ `tf.data.Dataset` _whose elements are all the slices of_ `X` _(along the first dimension), so the above dataset contains_ `10` _items: tensors_ `0, 1, 2, ..., 9`.

Now let's choose an `X` of a different shape and see exactly how the result of `from_tensor_slices()` changes correspondingly.

```python
X = tf.range(10).reshape((2,5))
X
```

```
AttributeError: 'tensorflow.python.framework.ops.EagerTensor' object has no attribute 'reshape'
```

In [10]:
# Tensors have no .reshape() method, so use the tf.reshape() function to get a 2x5 tensor.
X = tf.reshape(tf.range(10), (2, 5))
X

<tf.Tensor: shape=(2, 5), dtype=int32, numpy=
array([[0, 1, 2, 3, 4],
       [5, 6, 7, 8, 9]], dtype=int32)>

In this particular case, we expect that applying `from_tensor_slices()` will give us two slices/items, each a tensor of shape `(5,)`.

In [12]:
# X now has shape (2, 5), so slicing along the first dimension should give two elements.
dataset = tf.data.Dataset.from_tensor_slices(X)
dataset

<TensorSliceDataset shapes: (5,), types: tf.int32>

In [13]:
# Each element is one row of X.
for item in dataset:
    print(item)

tf.Tensor([0 1 2 3 4], shape=(5,), dtype=int32)
tf.Tensor([5 6 7 8 9], shape=(5,), dtype=int32)


Let's make one last test related to `from_tensor_slices()`: the book says that we would have obtained the same result as in our first test if we had used `dataset = tf.data.Dataset.range(10)` instead.

In [14]:
# Dataset.range(10) builds the elements 0..9 directly, with no intermediate tensor X.
dataset = tf.data.Dataset.range(10)
dataset

<RangeDataset shapes: (), types: tf.int64>

In [15]:
# Inspect the concrete class of the dataset object.
type(dataset)

tensorflow.python.data.ops.dataset_ops.RangeDataset

Well, at least the type of `dataset` is different from the one above -- `RangeDataset` vs. `TensorSliceDataset` -- and so is the element dtype (`tf.int64` vs. `tf.int32`).

In [16]:
# The values should again be 0..9, as in the first test.
for item in dataset:
    print(item)

tf.Tensor(0, shape=(), dtype=int64)
tf.Tensor(1, shape=(), dtype=int64)
tf.Tensor(2, shape=(), dtype=int64)
tf.Tensor(3, shape=(), dtype=int64)
tf.Tensor(4, shape=(), dtype=int64)
tf.Tensor(5, shape=(), dtype=int64)
tf.Tensor(6, shape=(), dtype=int64)
tf.Tensor(7, shape=(), dtype=int64)
tf.Tensor(8, shape=(), dtype=int64)
tf.Tensor(9, shape=(), dtype=int64)


## Chaining Transformations
A `Dataset` object in `tf` has transformation methods. Each method returns a new dataset (much like in functional programming, where a function should return a new value instead of modifying its input argument in place):

In [21]:
# repeat(3) returns a new dataset that yields the elements of `dataset` three times over.
transformed_dataset1 = dataset.repeat(3)
transformed_dataset1

<RepeatDataset shapes: (), types: tf.int64>

In [22]:
# Expect the sequence 0..9 three times in a row (30 items in total).
for item in transformed_dataset1:
    print(item)

tf.Tensor(0, shape=(), dtype=int64)
tf.Tensor(1, shape=(), dtype=int64)
tf.Tensor(2, shape=(), dtype=int64)
tf.Tensor(3, shape=(), dtype=int64)
tf.Tensor(4, shape=(), dtype=int64)
tf.Tensor(5, shape=(), dtype=int64)
tf.Tensor(6, shape=(), dtype=int64)
tf.Tensor(7, shape=(), dtype=int64)
tf.Tensor(8, shape=(), dtype=int64)
tf.Tensor(9, shape=(), dtype=int64)
tf.Tensor(0, shape=(), dtype=int64)
tf.Tensor(1, shape=(), dtype=int64)
tf.Tensor(2, shape=(), dtype=int64)
tf.Tensor(3, shape=(), dtype=int64)
tf.Tensor(4, shape=(), dtype=int64)
tf.Tensor(5, shape=(), dtype=int64)
tf.Tensor(6, shape=(), dtype=int64)
tf.Tensor(7, shape=(), dtype=int64)
tf.Tensor(8, shape=(), dtype=int64)
tf.Tensor(9, shape=(), dtype=int64)
tf.Tensor(0, shape=(), dtype=int64)
tf.Tensor(1, shape=(), dtype=int64)
tf.Tensor(2, shape=(), dtype=int64)
tf.Tensor(3, shape=(), dtype=int64)
tf.Tensor(4, shape=(), dtype=int64)
tf.Tensor(5, shape=(), dtype=int64)
tf.Tensor(6, shape=(), dtype=int64)
tf.Tensor(7, shape=(), dtype=int64)
tf.Tensor(8, shape=(), dtype=int64)
tf.Tensor(9, shape=(), dtype=int64)

In [19]:
# Verify that repeat() returned a new dataset and left the original `dataset` unmodified
for item in dataset:
    print(item)

tf.Tensor(0, shape=(), dtype=int64)
tf.Tensor(1, shape=(), dtype=int64)
tf.Tensor(2, shape=(), dtype=int64)
tf.Tensor(3, shape=(), dtype=int64)
tf.Tensor(4, shape=(), dtype=int64)
tf.Tensor(5, shape=(), dtype=int64)
tf.Tensor(6, shape=(), dtype=int64)
tf.Tensor(7, shape=(), dtype=int64)
tf.Tensor(8, shape=(), dtype=int64)
tf.Tensor(9, shape=(), dtype=int64)


In [23]:
# Transformations chain: repeat the 10 elements three times (30 items), then group them into batches of 7.
transformed_dataset2 = dataset.repeat(3).batch(7)
transformed_dataset2

<BatchDataset shapes: (None,), types: tf.int64>

In [24]:
# 30 items in batches of 7: four full batches plus a final batch of 2.
for item in transformed_dataset2:
    print(item)

tf.Tensor([0 1 2 3 4 5 6], shape=(7,), dtype=int64)
tf.Tensor([7 8 9 0 1 2 3], shape=(7,), dtype=int64)
tf.Tensor([4 5 6 7 8 9 0], shape=(7,), dtype=int64)
tf.Tensor([1 2 3 4 5 6 7], shape=(7,), dtype=int64)
tf.Tensor([8 9], shape=(2,), dtype=int64)


**Rmk**. The shape shown for `transformed_dataset2` above was `(None,)` because the batches do not all have the same size; there are batches
- of shape `(7,)`
- of shape `(2,)`

In [26]:
# Pass drop_remainder=True if we want every batch to have exactly 7 elements
transformed_dataset2 = dataset.repeat(3).batch(7, drop_remainder=True)
transformed_dataset2

<BatchDataset shapes: (7,), types: tf.int64>

**Rmk**. We can see that the shape is now `(7,)`, no longer `(None,)`

In [27]:
# With drop_remainder=True the short final batch [8 9] should no longer appear.
for item in transformed_dataset2:
    print(item)

tf.Tensor([0 1 2 3 4 5 6], shape=(7,), dtype=int64)
tf.Tensor([7 8 9 0 1 2 3], shape=(7,), dtype=int64)
tf.Tensor([4 5 6 7 8 9 0], shape=(7,), dtype=int64)
tf.Tensor([1 2 3 4 5 6 7], shape=(7,), dtype=int64)


The textbook mentions that the `repeat()` method, when called without an argument, will **repeat indefinitely**. Let's verify that.

In [30]:
# `dataset` has only 10 elements, so counting well past 10 shows that repeat() keeps cycling.
for i, item in enumerate(dataset.repeat()):
    print(i, end=" ")
    # Stop manually once i exceeds 200; nothing else in this loop ends it.
    if i > 200:
        break

0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 

Use the `map()` method to transform the items:

In [32]:
# map() applies the function to every element; here each value is squared.
for item in dataset.map(lambda x: x**2):
    print(item)

tf.Tensor(0, shape=(), dtype=int64)
tf.Tensor(1, shape=(), dtype=int64)
tf.Tensor(4, shape=(), dtype=int64)
tf.Tensor(9, shape=(), dtype=int64)
tf.Tensor(16, shape=(), dtype=int64)
tf.Tensor(25, shape=(), dtype=int64)
tf.Tensor(36, shape=(), dtype=int64)
tf.Tensor(49, shape=(), dtype=int64)
tf.Tensor(64, shape=(), dtype=int64)
tf.Tensor(81, shape=(), dtype=int64)


The `num_parallel_calls` arg can be convenient when the transformation applied is intensive (e.g. reshaping or rotating an image).

In [35]:
def fibo(n):
    """Return the n-th Fibonacci number, with fibo(0) == fibo(1) == 1.

    Implemented as naive double recursion, so the amount of work grows
    very quickly with n. Any n <= 1 (including negative values) yields 1.
    """
    # Base case: stop recursing at (or below) 1.
    if n <= 1:
        return 1
    return fibo(n - 1) + fibo(n - 2)

In [39]:
# First ten values of fibo, as a quick sanity check of the definition.
list(map(fibo, range(10)))

[1, 1, 2, 3, 5, 8, 13, 21, 34, 55]

```python
for item in dataset.map(fibo, num_parallel_calls=4):
    print(item)
```

```
StagingError
    <ipython-input-34-cd1e91cf183e>:2 fibo  *
        if n <= 1:
    <ipython-input-34-cd1e91cf183e>:2 fibo  *
        if n <= 1:
    <ipython-input-34-cd1e91cf183e>:2 fibo  *
        if n <= 1:
    <ipython-input-34-cd1e91cf183e>:2 fibo  *
        if n <= 1:
    <ipython-input-34-cd1e91cf183e>:2 fibo  *
        if n <= 1:
        
    RecursionError: maximum recursion depth exceeded in comparison
```

In [44]:
import time
def fake_intensive(n, delay=7):
    """Pretend to do expensive work on ``n``: sleep, then return ``n`` unchanged.

    Args:
        n: The value to pass through; it is returned as-is.
        delay: Number of seconds to sleep before returning. Defaults to 7,
            the value that used to be hard-coded, so ``fake_intensive(n)``
            behaves exactly as before.

    Returns:
        ``n``, unmodified.

    NOTE(review): when this function is passed to ``Dataset.map()``, the sleep
    presumably runs only once, while TensorFlow traces the function, rather
    than once per element -- confirm. Wrapping the sleep in ``tf.py_function``
    may be needed for a true per-element delay.
    """
    time.sleep(delay)
    return n

In [50]:
%%time
# Baseline: map fake_intensive over the 10 elements without requesting parallelism.
for item in dataset.map(fake_intensive):
    print(item)

tf.Tensor(0, shape=(), dtype=int64)
tf.Tensor(1, shape=(), dtype=int64)
tf.Tensor(2, shape=(), dtype=int64)
tf.Tensor(3, shape=(), dtype=int64)
tf.Tensor(4, shape=(), dtype=int64)
tf.Tensor(5, shape=(), dtype=int64)
tf.Tensor(6, shape=(), dtype=int64)
tf.Tensor(7, shape=(), dtype=int64)
tf.Tensor(8, shape=(), dtype=int64)
tf.Tensor(9, shape=(), dtype=int64)
CPU times: user 38 ms, sys: 387 µs, total: 38.4 ms
Wall time: 7.04 s


In [51]:
%%time
# Same mapping as the baseline, but with num_parallel_calls=8 to compare wall times.
for item in dataset.map(fake_intensive, num_parallel_calls=8):
    print(item)

tf.Tensor(0, shape=(), dtype=int64)
tf.Tensor(1, shape=(), dtype=int64)
tf.Tensor(2, shape=(), dtype=int64)
tf.Tensor(3, shape=(), dtype=int64)
tf.Tensor(4, shape=(), dtype=int64)
tf.Tensor(5, shape=(), dtype=int64)
tf.Tensor(6, shape=(), dtype=int64)
tf.Tensor(7, shape=(), dtype=int64)
tf.Tensor(8, shape=(), dtype=int64)
tf.Tensor(9, shape=(), dtype=int64)
CPU times: user 29.1 ms, sys: 0 ns, total: 29.1 ms
Wall time: 7.03 s


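Well, these timings do not showcase the power of `num_parallel_calls`: both runs took about 7 s for 10 items, rather than about 70 s, which suggests that `time.sleep(7)` ran only once (presumably when `map()` traced `fake_intensive` into a graph) instead of once per item. I don't know of a simple example that does showcase it; wrapping the sleep in `tf.py_function`, so that it runs for every item, might be one to try.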

More methods
- `apply()`
- `filter()`
- `take()`

In [54]:
tf.data.Dataset.unbatch

<function tensorflow.python.data.ops.dataset_ops.DatasetV2.unbatch(self)>

In [55]:
tf.data.experimental.unbatch

<function tensorflow.python.data.experimental.ops.batching.unbatch()>

In [56]:
# Check whether the two names refer to the very same function object.
tf.data.Dataset.unbatch is tf.data.experimental.unbatch

False

In [58]:
# Dataset.unbatch takes the dataset as `self`, so it can be handed to apply() uncalled;
# this should be equivalent to dataset.batch(7).unbatch().
for item in dataset.batch(7).apply(tf.data.Dataset.unbatch):
    print(item)

tf.Tensor(0, shape=(), dtype=int64)
tf.Tensor(1, shape=(), dtype=int64)
tf.Tensor(2, shape=(), dtype=int64)
tf.Tensor(3, shape=(), dtype=int64)
tf.Tensor(4, shape=(), dtype=int64)
tf.Tensor(5, shape=(), dtype=int64)
tf.Tensor(6, shape=(), dtype=int64)
tf.Tensor(7, shape=(), dtype=int64)
tf.Tensor(8, shape=(), dtype=int64)
tf.Tensor(9, shape=(), dtype=int64)


In [60]:
# tf.data.experimental.unbatch is called here (note the parentheses): apply() receives
# the value it returns rather than the function itself.
# NOTE(review): this experimental API is reportedly deprecated in favor of Dataset.unbatch() -- confirm.
for item in dataset.batch(7).apply(tf.data.experimental.unbatch()):
    print(item)

tf.Tensor(0, shape=(), dtype=int64)
tf.Tensor(1, shape=(), dtype=int64)
tf.Tensor(2, shape=(), dtype=int64)
tf.Tensor(3, shape=(), dtype=int64)
tf.Tensor(4, shape=(), dtype=int64)
tf.Tensor(5, shape=(), dtype=int64)
tf.Tensor(6, shape=(), dtype=int64)
tf.Tensor(7, shape=(), dtype=int64)
tf.Tensor(8, shape=(), dtype=int64)
tf.Tensor(9, shape=(), dtype=int64)


In [61]:
# This time the right-hand side is the object *returned* by calling experimental.unbatch().
tf.data.Dataset.unbatch is tf.data.experimental.unbatch()

False

In [63]:
# filter() keeps only the elements for which the predicate is true; here, the even numbers.
for item in dataset.filter(lambda n: n % 2 == 0):
    print(item)

tf.Tensor(0, shape=(), dtype=int64)
tf.Tensor(2, shape=(), dtype=int64)
tf.Tensor(4, shape=(), dtype=int64)
tf.Tensor(6, shape=(), dtype=int64)
tf.Tensor(8, shape=(), dtype=int64)


In [53]:
# take(7) keeps only the first 7 elements, so iterating the endlessly repeating dataset terminates.
for item in dataset.repeat().take(7):
    print(item)

tf.Tensor(0, shape=(), dtype=int64)
tf.Tensor(1, shape=(), dtype=int64)
tf.Tensor(2, shape=(), dtype=int64)
tf.Tensor(3, shape=(), dtype=int64)
tf.Tensor(4, shape=(), dtype=int64)
tf.Tensor(5, shape=(), dtype=int64)
tf.Tensor(6, shape=(), dtype=int64)
