Source
https://medium.com/ymedialabs-innovation/how-to-use-dataset-and-iterators-in-tensorflow-with-code-samples-3bb98b6b74ab

In [1]:
%tensorflow_version 2.x
import tensorflow as tf
import numpy as np

TensorFlow 2.x selected.


### Dataset Creation

In [0]:
# from_tensor_slices accepts a single Numpy (or Tensor) object, or a tuple of them.

# A single object -> a Dataset that emits one element at a time.
# A tuple of objects -> a Dataset that emits one tuple at a time, like zip.

# Assume batch size is 1
dataset1 = tf.data.Dataset.from_tensor_slices(tf.range(10, 15))
# Emits data of 10, 11, 12, 13, 14, (one element at a time)

dataset2 = tf.data.Dataset.from_tensor_slices((tf.range(30, 45, 3),
                                               np.arange(60, 70, 2)))
# Emits data of (30, 60), (33, 62), (36, 64), (39, 66), (42, 68)
# Emit one tuple at a time

try:
  # The components must be passed as ONE tuple; passing them as two separate
  # positional arguments would fail for an unrelated reason and hide the
  # dimension mismatch this example is meant to demonstrate.
  dataset3 = tf.data.Dataset.from_tensor_slices((tf.range(10), np.arange(5)))
except (ValueError, tf.errors.InvalidArgumentError) as err:
  # Dataset not possible as the zeroth dimensions differ (10 vs 5).
  # Show the error instead of silently swallowing it.
  print('dataset3 could not be created:', type(err).__name__, err)

In [0]:
# from_tensors
# The datasets above can be thought of as having a batch size of one,
# but from_tensors does no batching (it does not slice the input).
# All of the data comes out at once,
# so several pieces of data with different dimensions can be given together.

# Useful when the data is small, or when it must be trained on all at once.

dataset4 = tf.data.Dataset.from_tensors(tf.range(start=10, limit=15))
# Emits data of [10, 11, 12, 13, 14]
# The entire list is held as a single element.

dataset5 = tf.data.Dataset.from_tensors(
    (tf.range(start=30, limit=45, delta=3), np.arange(start=60, stop=70, step=2))
)
# Emits data of ([30, 33, 36, 39, 42], [60, 62, 64, 66, 68])

dataset6 = tf.data.Dataset.from_tensors(
    (tf.range(10), np.arange(5))
)
# Possible with from_tensors even though the zeroth dimensions of the
# constituent elements do not match.
# Emits data of ([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], [0, 1, 2, 3, 4])
# The entire tuple is held as a single element.

In [0]:
# from_generator
# Produces the data at runtime.
# For when the data is too large to fit on disk.
# Reportedly not recommended as a way of doing data augmentation.

# Assume batch size is 1
def generator(sequence_type):
  """Yield one of three demo sequences, selected by `sequence_type`.

  1 -> the scalars 10, 11, 12, 13, 14
  2 -> the pairs (30, 60), (33, 62), (36, 64), (39, 66), (42, 68)
  3 -> the pairs (n, ['Hi'] * n) for n = 1, 2, 3
  Any other value yields nothing.
  """
  if sequence_type == 1:
    yield from (10 + offset for offset in range(5))
  elif sequence_type == 2:
    yield from ((30 + 3 * step, 60 + 2 * step) for step in range(5))
  elif sequence_type == 3:
    yield from ((count, ['Hi'] * count) for count in range(1, 4))

# from_generator takes the generator callable, the dtype(s) of what it yields,
# and `args`, which are forwarded to the generator as its call arguments.
dataset7 = tf.data.Dataset.from_generator(generator, tf.int32, args=(1,))
# Emits data of 10, 11, 12, 13, 14, (One element at a time)

dataset8 = tf.data.Dataset.from_generator(generator, (tf.int32, tf.int32),
                                          args=(2,))
# Emits data of (30, 60), (33, 62), (36, 64), (39, 66), (42, 68), (One at a time)
# Emits one tuple at a time

# The second component is a list of 'Hi' strings, so its dtype must be
# tf.string; declaring it as tf.int32 makes iteration fail when the yielded
# strings cannot be converted to integers.
dataset9 = tf.data.Dataset.from_generator(generator, (tf.int32, tf.string),
                                          args=(3,))
# Emits data of (1, ['Hi']), (2, ['Hi', 'Hi']), (3, ['Hi', 'Hi', 'Hi'])
# Emits one tuple at a time

### Data Transformation
- **batch**: Sequentially divide the dataset into batches of the specified size.

01234567 -> batch(3) -> 012 345 67


- **repeat**: Whatever Dataset you generated, create duplicates of existing data in your Dataset

0123 -> repeat(2) -> 01230123

- **shuffle**: Randomly shuffle the data in Dataset.

01234567 -> shuffle(4) -> 30154276

- **map**: Apply some operation to all individual elements in the Dataset.
Particularly useful when applying data augmentation.

01234567 -> map(<<add1>>) -> 12345678

- **filter**: During the course of training, filter out some elements from the Dataset.

01234567 -> filter(pass only even) -> 0246




In [25]:
# Print every element emitted by a dataset

dataset = dataset1   # Swap in whichever dataset you want to inspect
iterator = iter(dataset)

for element in iterator:
  print(element)

tf.Tensor(10, shape=(), dtype=int32)
tf.Tensor(11, shape=(), dtype=int32)
tf.Tensor(12, shape=(), dtype=int32)
tf.Tensor(13, shape=(), dtype=int32)
tf.Tensor(14, shape=(), dtype=int32)


In [0]:
dataset10 = (
    # Start from the data [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
    tf.data.Dataset.from_tensor_slices(tf.range(10))
    # Duplicate the dataset:
    # [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
    .repeat(2)
    # Shuffle the dataset using a buffer of 5 elements
    .shuffle(5)
)
# Assumed shuffling: [3, 0, 7, 9, 4, 2, 5, 0, 1, 7, 5, 9, 4, 6, 2, 8, 6, 8, 1, 3]
# (Illustrative only: the order is random, and a buffer of 5 can only draw
# from the next few pending elements, so a real run will look different.)

In [0]:
def map_fn(x):
  """Return `x` multiplied by 3."""
  tripled = x * 3
  return tripled

In [0]:
# Multiply each element by 3 using the map transformation
# Equivalent to: dataset10 = dataset10.map(lambda x: x * 3)
dataset10 = dataset10.map(map_func=map_fn)
# Dataset: [9, 0, 21, 27, 12, 6, 15, 0, 3, 21, 15, 27, 12, 18, 6, 24, 18, 24, 3, 9]

In [0]:
def filter_fn(x):
  """Predicate for Dataset.filter: true when `x % 5` is not equal to 1."""
  keep = tf.not_equal(x % 5, 1)
  # Reshape to shape [] so that the predicate result is a scalar.
  return tf.reshape(tensor=keep, shape=[])

In [0]:
# Drop every element whose value modulo 5 equals 1
# Equivalent to:
#   dataset10 = dataset10.filter(
#       lambda x: tf.reshape(tf.not_equal(x % 5, 1), []))
dataset10 = dataset10.filter(predicate=filter_fn)
# Dataset: [9, 0, 27, 12, 15, 0, 3, 15, 27, 12, 18, 24, 18, 24, 3, 9]

In [0]:
# Group the elements into batches of 4
dataset10 = dataset10.batch(batch_size=4)
# Dataset: [9, 0, 27, 12], [15, 0, 3, 15], [27, 12, 18, 24], [18, 24, 3, 9]

In [33]:
# Print each batch of the transformed dataset
iterator = iter(dataset10)
for batch in iterator:
  print(batch)

tf.Tensor([ 9 15  0 18], shape=(4,), dtype=int32)
tf.Tensor([12  0 24 27], shape=(4,), dtype=int32)
tf.Tensor([ 9 12  3 18], shape=(4,), dtype=int32)
tf.Tensor([ 3 15 24 27], shape=(4,), dtype=int32)


### Ordering of Dataset Transformations
The resulting Dataset differs depending on the order in which the transformations are applied.

In [0]:
# Ordering #1: batch -> repeat -> shuffle
dataset_1 = (
    # Dataset: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
    tf.data.Dataset.from_tensor_slices(tf.range(10))
    # Dataset: [0, 1, 2, 3], [4, 5, 6, 7], [8, 9]
    .batch(batch_size=4)
    # Dataset: [0, 1, 2, 3], [4, 5, 6, 7], [8, 9], [0, 1, 2, 3], [4, 5, 6, 7], [8, 9]
    # Notice the 2-element batch in between
    .repeat(count=2)
    # Shuffles at batch level, because the elements are now whole batches.
    # Dataset: [0, 1, 2, 3], [4, 5, 6, 7], [8, 9], [8, 9], [0, 1, 2, 3], [4, 5, 6, 7]
    .shuffle(buffer_size=4)
)

In [0]:
# Ordering #2: shuffle -> repeat -> batch
dataset_2 = (
    # Dataset: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
    tf.data.Dataset.from_tensor_slices(tf.range(10))
    # Dataset: [3, 1, 0, 4, 5, 8, 6, 9, 7, 2]
    .shuffle(buffer_size=4)
    # Dataset: [3, 1, 0, 4, 5, 8, 6, 9, 7, 2, 3, 1, 0, 4, 5, 8, 6, 9, 7, 2]
    # (Illustrative: by default each repetition is reshuffled, so in a real
    # run the second pass usually comes out in a different order.)
    .repeat(count=2)
    # Dataset: [3, 1, 0, 4], [5, 8, 6, 9], [7, 2, 3, 1], [0, 4, 5, 8], [6, 9, 7, 2]
    .batch(batch_size=4)
)



In [42]:
# Show the batches produced by ordering #1
print('Order #1')

for batch in dataset_1:
  print(batch)

Order #1
tf.Tensor([8 9], shape=(2,), dtype=int32)
tf.Tensor([4 5 6 7], shape=(4,), dtype=int32)
tf.Tensor([8 9], shape=(2,), dtype=int32)
tf.Tensor([0 1 2 3], shape=(4,), dtype=int32)
tf.Tensor([0 1 2 3], shape=(4,), dtype=int32)
tf.Tensor([4 5 6 7], shape=(4,), dtype=int32)


In [43]:
# Show the batches produced by ordering #2
print('Order #2')

for batch in dataset_2:
  print(batch)

Order #2
tf.Tensor([3 2 5 6], shape=(4,), dtype=int32)
tf.Tensor([1 7 8 0], shape=(4,), dtype=int32)
tf.Tensor([9 4 2 4], shape=(4,), dtype=int32)
tf.Tensor([3 5 6 8], shape=(4,), dtype=int32)
tf.Tensor([9 7 1 0], shape=(4,), dtype=int32)
