In [0]:
%tensorflow_version 2.x
import tensorflow as tf
import pathlib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

np.set_printoptions(precision=4)

In [0]:
dataset = tf.data.Dataset.from_tensor_slices([8, 3, 0, 8, 2, 1])
dataset

In [0]:
for elem in dataset:
  print(elem.numpy())

In [0]:
it = iter(dataset)
print(next(it).numpy())

In [0]:
dataset.reduce(0, lambda state, value: state + value).numpy()

# Dataset structure
A dataset contains elements that each have the same (nested) structure and the individual components of the structure can be of any type representable by `tf.TypeSpec`, including `Tensor`, `SparseTensor`, `RaggedTensor`, `TensorArray`, or `Dataset`.

The `Dataset.element_spec` property allows you to inspect the type of each element component. The property returns a nested structure of `tf.TypeSpec` objects, matching the structure of the element, which may be a single component, a tuple of components, or a nested tuple of components.

In [0]:
dataset1 = tf.data.Dataset.from_tensor_slices(tf.random.uniform([4, 10]))
dataset1.element_spec

In [0]:
dataset2 = tf.data.Dataset.from_tensor_slices(
    (tf.random.uniform([4]),
     tf.random.uniform([4, 100], maxval=100, dtype=tf.int32)))
dataset2.element_spec

In [0]:
dataset3 = tf.data.Dataset.zip((dataset1, dataset2))
dataset3.element_spec

In [0]:
# Dataset containing a sparse tensor.
dataset4 = tf.data.Dataset.from_tensors(tf.SparseTensor(indices=[[0, 0], [1, 2]], values=[1, 2], dense_shape=[3, 4]))
dataset4.element_spec

In [0]:
# Use value_type to see the type of value represented by the element spec.
dataset4.element_spec.value_type
# element_spec returns a tf.TypeSpec object, so the type of the value it
# describes is read from its value_type attribute.

In [0]:
# Slice a random [4, 10] int32 tensor (minval=1, maxval=10) into a dataset.
dataset1 = tf.data.Dataset.from_tensor_slices(
    tf.random.uniform([4, 10], minval=1, maxval=10, dtype=tf.int32))

dataset1 # The repr shows the shape of each element.

In [0]:
for z in dataset1:
  print(z.numpy())

In [0]:
# Build a dataset from a tuple of two tensors: a float vector of length 4 and
# a [4, 100] int32 matrix.
dataset2 = tf.data.Dataset.from_tensor_slices(
    (tf.random.uniform([4]),
     tf.random.uniform([4, 100], maxval=100, dtype=tf.int32)))
dataset2
# Every element of a Dataset has the same (nested) structure,
# and the corresponding components of each element have the same type.

In [0]:
dataset3 = tf.data.Dataset.zip((dataset1, dataset2))
dataset3

In [0]:
for a, (b, c) in dataset3:
  print('shapes: {a.shape}, {b.shape}, {c.shape}'.format(a=a, b=b, c=c))

In [0]:
train, test = tf.keras.datasets.fashion_mnist.load_data()

In [0]:
images, labels = train
images = images / 255.0

dataset = tf.data.Dataset.from_tensor_slices((images, labels))
dataset

## Note
The above code snippet will embed the features and labels arrays in your TensorFlow graph as `tf.constant()` operations. This works well for a small dataset, but wastes memory -- because the contents of the arrays will be copied multiple times -- and can run into the 2GB limit for the `tf.GraphDef` protocol buffer.

### Consuming Python generators.
Another common data source that can easily be ingested as a `tf.data.Dataset` is the Python generator.

Caution: While this is a convenient approach, it has limited portability and scalability. It must run in the same Python process that created the generator, and is still subject to the Python GIL.

In [0]:
def count(stop):
  """Yield 0, 1, 2, ... for as long as the current value is below `stop`.

  Args:
    stop: Exclusive upper bound; nothing is yielded when `stop` is not
      greater than 0.

  Yields:
    Successive integers starting at 0.
  """
  value = 0
  while True:
    # Finish the generator as soon as the bound is reached.
    if not value < stop:
      return
    yield value
    value += 1

In [0]:
for n in count(5): # Calling a generator function returns an iterator.
  print(n)

The `Dataset.from_generator` constructor converts the python generator to a fully functional `tf.data.Dataset`.

The constructor takes a callable as input, not an iterator. This allows it to restart the generator when it reaches the end. It takes an optional `args` argument, which is passed as the callable's arguments.

The `output_types` argument is required because `tf.data` builds a `tf.Graph` internally, and graph edges require a `tf.dtype`.

In [0]:
ds_counter = tf.data.Dataset.from_generator(count, args=[25], output_types=tf.int32, output_shapes=())

In [0]:
for count_batch in ds_counter.repeat().batch(10).take(10):
  print(count_batch.numpy())

The `output_shapes` argument is not required but is highly recommended as many tensorflow operations do not support tensors with unknown rank. If the length of a particular axis is unknown or variable, set it as `None` in the `output_shapes`.


It's also important to note that the `output_shapes` and `output_types` follow the same nesting rules as other dataset methods.


In [0]:
# Here is an example generator that demonstrates both aspects,
# it returns tuple of arrays, where the second array is a vector with unknown length.
def gen_series():
  """Endlessly yield `(index, series)` pairs.

  Yields:
    A tuple of a running integer index (starting at 0) and a 1-D array of
    normally distributed samples whose length is drawn anew for every item
    from `np.random.randint(0, 10)`.
  """
  index = 0
  while True:
    # Draw the length first, then the samples, one pair per iteration.
    length = np.random.randint(0, 10)
    samples = np.random.normal(size=(length, ))
    yield index, samples
    index += 1

In [0]:
for i, series in gen_series():
  print(i, ':', str(series))
  if i > 5:
    break

In [0]:
ds_series = tf.data.Dataset.from_generator(gen_series, 
                                 output_types=(tf.int32, tf.float32), 
                                 output_shapes=((), (None, )))
ds_series

In [0]:
# Now it can be used like a regular `tf.data.Dataset`.
# Note that when batching a dataset with a variable shape,
# you need to use `Dataset.padded_batch`.

ds_series_batch = ds_series.shuffle(20).padded_batch(10, padded_shapes=((), (None, )))

ids, sequence_batch = next(iter(ds_series_batch))
print(ids.numpy())
print()
print(sequence_batch.numpy())

In [0]:
# For a more realistic example, try wrapping `preprocessing.image.ImageDataGenerator`
# as a `tf.data.Dataset`.

# First download the data.
flowers = tf.keras.utils.get_file(
  'flower_photos',
  'https://storage.googleapis.com/download.tensorflow.org/example_images/flower_photos.tgz',
  untar=True)

In [0]:
flowers

In [0]:
# Create the `image.ImageDataGenerator`
img_gen = tf.keras.preprocessing.image.ImageDataGenerator(rescale=1./255, rotation_range=20)

In [0]:
images, labels = next(img_gen.flow_from_directory(flowers))
# args are files.
# image_gen.flow_from_directory is generator because it returns iterator.

In [0]:
print(images.dtype, images.shape)
print(labels.dtype, labels.shape)

In [0]:
ds = tf.data.Dataset.from_generator(img_gen.flow_from_directory, 
                                    args=[flowers], 
                                    output_types=(tf.float32, tf.float32), 
                                    output_shapes=((32, 256, 256, 3), (32, 5)))
ds

In [0]:
# Download a single TFRecord file of FSNS test data to use as a dataset source.
fsns_test_file = tf.keras.utils.get_file("fsns.tfrec", "https://storage.googleapis.com/download.tensorflow.org/data/fsns-20160927/testdata/fsns-00000-of-00001")

In [0]:
dataset = tf.data.TFRecordDataset(filenames=[fsns_test_file])
dataset

Many TensorFlow projects use serialized `tf.train.Example` records in their TFRecord files. These need to be decoded before they can be inspected.

In [0]:
raw_example = next(iter(dataset))
parsed = tf.train.Example.FromString(raw_example.numpy())

parsed.features.feature['image/text']

The `tf.data.TextLineDataset` provides an easy way to extract lines from one or more text files.

In [0]:
directory_url = 'https://storage.googleapis.com/download.tensorflow.org/data/illiad/'
file_names = ['cowper.txt', 'derby.txt', 'butler.txt']

file_paths = [tf.keras.utils.get_file(file_name, directory_url + file_name)
  for file_name in file_names
]

In [0]:
dataset = tf.data.TextLineDataset(file_paths) # Reads the files as if they were merged into one.

In [0]:
for line in dataset.take(5):
  print(line.numpy())

To alternate lines between files use `Dataset.interleave`. This makes it easier to shuffle files together.

In [0]:
files_ds = tf.data.Dataset.from_tensor_slices(file_paths) # Dataset over the list of file paths (several files).
lines_ds = files_ds.interleave(tf.data.TextLineDataset, cycle_length=3) # Interleave lines from the files.

# Print the first 9 lines, with a blank line before every group of three.
for i, line in enumerate(lines_ds.take(9)):
  if i % 3 == 0:
    print()
  print(line.numpy())

In [0]:
next(iter(files_ds))

By default, a `TextLineDataset` yields every line of each file, which may not be desirable, for example, if the file starts with a header line, or contains comments. These lines can be removed using the `Dataset.skip()` or `Dataset.filter()` transformations. Here we skip the first line, then filter to find only survivors.

In [0]:
titanic_file = tf.keras.utils.get_file("train.csv", "https://storage.googleapis.com/tf-datasets/titanic/train.csv")
titanic_lines = tf.data.TextLineDataset(titanic_file)

In [0]:
for line in titanic_lines.take(10):
  print(line.numpy())

In [0]:
def survived(line):
  """Return whether the first character of `line` differs from '0'.

  Used as a `Dataset.filter` predicate on the Titanic CSV lines.
  NOTE(review): presumably the first CSV column is the `survived` flag, so
  this keeps survivors - confirm against the file header.
  """
  first_char = tf.strings.substr(line, 0, 1)
  return tf.not_equal(first_char, '0')

survivors = titanic_lines.skip(1).filter(survived)

In [0]:
for line in survivors.take(10):
  print(line.numpy())

In [0]:
titanic_file = tf.keras.utils.get_file("train.csv", "https://storage.googleapis.com/tf-datasets/titanic/train.csv")

In [0]:
df = pd.read_csv(titanic_file, index_col=None)
df.head()

If your data fits in memory the same `Dataset.from_tensor_slices` method works on dictionaries, allowing this data to be easily imported.

In [0]:
titanic_slices = tf.data.Dataset.from_tensor_slices(dict(df))

for feature_batch in titanic_slices.take(1):
  for key, value in feature_batch.items():
    print('  {!r:20s}: {}'.format(key, value))

In [0]:
titanic_batches = tf.data.experimental.make_csv_dataset(titanic_file, batch_size=4, label_name='survived')

In [0]:
for feature_batch, label_batch in titanic_batches.take(1):
  print('Survived: {}'.format(label_batch))
  print('features:')
  for key, value in feature_batch.items():
    print('   {!r:20s}: {}'.format(key, value))

In [0]:
# You can use `select_columns` argument if you only need a subset of columns.
titanic_batches = tf.data.experimental.make_csv_dataset(
    titanic_file, batch_size=4,
    label_name='survived', select_columns=['class', 'fare', 'survived']
)

In [0]:
for feature_batch, label_batch in titanic_batches.take(1):
  print("'survived': {}".format(label_batch))
  for key, value in feature_batch.items():
    print("  {!r:20s}: {}".format(key, value))

In [0]:
# Consuming sets of files.
flowers_root = tf.keras.utils.get_file(
    'flower_photos',
     'https://storage.googleapis.com/download.tensorflow.org/example_images/flower_photos.tgz',
     untar=True
)
flowers_root= pathlib.Path(flowers_root) # pathlib.Path로 감싸줘서 glob쓴다.
# Gives semantic file path for different OS

In [0]:
flowers_root

1. utils.get_file로 파일 다운로드, 디렉토리 경로 잡고,
2. pathlib.Path 로 general 한 file path 만들고, 
3. Dataset.list_files로 파일 리스트로 잡고
4. map으로 전처리

In [0]:
# The root directory contains a directory for each class:
for item in flowers_root.glob('*'):
  print(item)
  print(item.name) # .name on a globbed path gives the file (or directory) name.

In [0]:
# The files in each class directory are example.
list_ds = tf.data.Dataset.list_files(str(flowers_root/'*/*' ))

for f in list_ds.take(5):
  print(f.numpy())

In [0]:
# We can read the data using the `tf.io.read_file` function and extract
# the label from the path, returning (image, label) pairs.
def process_path(file_path):
  """Map a file path to a `(raw file contents, label)` pair.

  The label is the second-to-last '/'-separated component of the path,
  i.e. the name of the directory that contains the file.

  Args:
    file_path: String tensor holding the path of one file.

  Returns:
    A tuple of the file contents read with `tf.io.read_file` and the label.
  """
  path_parts = tf.strings.split(file_path, '/')
  label = path_parts[-2]
  contents = tf.io.read_file(file_path)
  return contents, label

labeled_ds = list_ds.map(process_path)

In [0]:
for image_raw, label_text in labeled_ds.take(1):
  print(repr(image_raw.numpy()[:100]))
  print()
  print(label_text.numpy())


Batching dataset elements

In [0]:
# Batching has the same constraints as tf.stack():
# every element must have tensors of exactly the same shape.
inc_dataset = tf.data.Dataset.range(100)
dec_dataset = tf.data.Dataset.range(0, -100, -1)
dataset = tf.data.Dataset.zip((inc_dataset, dec_dataset))
print(dataset.element_spec)
batched_dataset = dataset.batch(4)
print(batched_dataset.element_spec) # Batching changes element_spec (naturally, since several elements now come out at once).

# Show the first four batches as plain NumPy arrays.
for batch in batched_dataset.take(4):
  print([arr.numpy() for arr in batch])

While `tf.data` tries to propagate shape information, the default settings of `Dataset.batch` result in an unknown batch size because the last batch may not be full.

In [0]:
# Note the `None`s in the shape
batched_dataset

Use the `drop_remainder` argument to ignore the last batch, and get full shape propagation:

In [0]:
batched_dataset = dataset.batch(7, drop_remainder=True)
batched_dataset # The shape shown here is the shape of each (batched) element.

### Batching tensors with padding
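The above works for tensors that all have the same size. When the elements have varying sizes, use `Dataset.padded_batch`, which pads the elements of each batch to a common shape.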

In [0]:
dataset = tf.data.Dataset.range(100)
dataset = dataset.map(lambda x: tf.fill([tf.cast(x, tf.int32)], x))
dataset = dataset.padded_batch(4, padded_shapes=(None,))

for batch in dataset.take(2):
  print(batch.numpy())
  print()

## Training workflows

Iterate over a dataset in multiple epochs using the `Dataset.repeat()` transformation.

In [0]:
titanic_file = tf.keras.utils.get_file('train.csv', "https://storage.googleapis.com/tf-datasets/titanic/train.csv")
titanic_lines = tf.data.TextLineDataset(titanic_file)

In [0]:
def plot_batch_size(ds):
  """Draw a bar chart of the size of every batch in `ds`.

  Args:
    ds: Iterable of batches; the batch size is read from `batch.shape[0]`.
  """
  sizes = []
  for batch in ds:
    sizes.append(batch.shape[0])
  positions = range(len(sizes))
  plt.bar(positions, sizes)
  plt.xlabel('Batch number')
  plt.ylabel('Batch size')

The `Dataset.repeat` transformation concatenates its arguments without signaling the end of one epoch and the beginning of the next epoch. Because of this a `Dataset.batch` applied after `Dataset.repeat` will yield batches that straddle epoch boundaries:

In [0]:
# Applying the `Dataset.repeat()` transformation with no arguments will repeat the
# input indefinitely.
titanic_batches = titanic_lines.repeat(3).batch(128)
plot_batch_size(titanic_batches)

In [0]:
# If you need clear epoch separation,
# put `Dataset.batch` before repeat:
titanic_batches = titanic_lines.batch(128).repeat(3)
plot_batch_size(titanic_batches)

If you would like to perform a custom computation (e.g. to collect statistics) at the end of each epoch then it's simplest to restart the dataset iteration on each epoch:

In [0]:
epochs = 3
dataset = titanic_lines.batch(128)

# Restart iteration over the batched dataset once per epoch.
for epoch in range(epochs):
  for batch in dataset:
    print(batch.shape)
    print(batch[0]) # batch is a tensor of up to 128 lines; print the first one
  print('End of epoch: ', epoch)

### Randomly shuffling input data
The `Dataset.shuffle()` transformation maintains a fixed-size buffer and chooses the next element uniformly at random from that buffer.

While large `buffer_size` values shuffle more thoroughly, they can take a lot of memory, and significant time to fill. Consider using `Dataset.interleave` across files if this becomes a problem.

Dataset.interleave는 여러 파일이라서 shuffle한것처럼 됨.

In [0]:
# Add an index to the dataset so you can see the effect:
lines = tf.data.TextLineDataset(titanic_file)
counter = tf.data.experimental.Counter()

dataset = tf.data.Dataset.zip((counter, lines))
dataset = dataset.shuffle(buffer_size=100)
dataset = dataset.batch(20, drop_remainder=True)
dataset

Since the `buffer_size` is 100, and the batch size is 20, the first batch contains no elements with an index over 120.

In [0]:
n, line_batch = next(iter(dataset))
print(n.numpy())

As with `Dataset.batch` the order relative to `Dataset.repeat` matters.

`Dataset.shuffle` doesn't signal the end of an epoch until the shuffle buffer is empty. So a shuffle placed before a repeat will show every element of one epoch before moving to the next.

In [0]:
dataset = tf.data.Dataset.zip((counter, lines))
shuffled = dataset.shuffle(buffer_size=100).batch(10).repeat(2)

print('Here are the item IDs near the epoch boundary:\n')
for n, line_batch in shuffled.skip(60).take(5):
  print(n.numpy())

In [0]:
shuffle_repeat = [n.numpy().mean() for n, line_batch in shuffled]
plt.plot(shuffle_repeat, label='shuffle().repeat()')
plt.ylabel('Mean item ID')
plt.legend()

In [0]:
# But a repeat before a shuffle mixes the epoch boundaries together:
dataset = tf.data.Dataset.zip((counter, lines))
shuffled = dataset.repeat(2).shuffle(buffer_size=100).batch(10)

print('Here are the item IDs near the epoch boundary:\n')
for n, line_batch in shuffled.skip(55).take(15):
  print(n.numpy())

In [0]:
repeat_shuffle = [n.numpy().mean() for n, line_batch in shuffled]

plt.plot(shuffle_repeat, label='shuffle().repeat()')
plt.plot(repeat_shuffle, label='repeat().shuffle()')
plt.ylabel('Mean item ID')
plt.legend()

## Decoding image data and resizing it
When training a neural network on real-world image data, it is often necessary to convert images of different sizes to a common size, so that they may be batched into a fixed size.

In [0]:
list_ds = tf.data.Dataset.list_files(str(flowers_root/'*/*'))

In [0]:
for f in list_ds.take(5):
  print(f.numpy())

In [0]:
# Read an image from a file, decodes it into a dense tensor,
# and resizes it to a fixed shape.
def parse_image(filename):
  """Load one JPEG file as a float32 image resized to 128x128, with its label.

  Args:
    filename: String tensor holding the path of the image file.

  Returns:
    A tuple `(image, label)` where `label` is the second-to-last
    '/'-separated component of `filename`.
  """
  label = tf.strings.split(filename, '/')[-2]

  encoded = tf.io.read_file(filename)
  decoded = tf.image.decode_jpeg(encoded)
  # Convert to float32 before resizing to the fixed target size.
  as_float = tf.image.convert_image_dtype(decoded, tf.float32)
  resized = tf.image.resize(as_float, [128, 128])
  return resized, label

In [0]:
# Test that it works.
file_path = next(iter(list_ds))
image, label = parse_image(file_path)

In [0]:
def show(image, label):
  """Display `image` in a new figure, titled with the decoded `label`.

  Args:
    image: Image data accepted by `plt.imshow`.
    label: Object whose `.numpy()` returns UTF-8 encoded bytes.
  """
  plt.figure()
  plt.imshow(image)
  title_text = label.numpy().decode('utf-8')
  plt.title(title_text)
  plt.axis('off')

In [0]:
show(image, label)

In [0]:
# Map it over the dataset
image_ds = list_ds.map(parse_image)

for image, label in image_ds.take(3):
  show(image, label)

### Applying arbitrary Python logic

For performance reasons, we encourage you to use TensorFlow operations for preprocessing your data whenever possible. However, it is sometimes useful to call external Python libraries when parsing your input data. You can use the `tf.py_function()` operation in a `Dataset.map()` transformation.

In [0]:
# To demonstrate `tf.py_function`,
# try using the `scipy.ndimage.rotate` function instead:
import scipy.ndimage as ndimage

def random_rotate_image(image):
  """Rotate `image` by a random angle drawn uniformly from [-30, 30).

  The rotation uses `scipy.ndimage.rotate` with `reshape=False`, so the
  output keeps the shape of the input.
  """
  angle = np.random.uniform(-30, 30)
  return ndimage.rotate(image, angle, reshape=False)

In [0]:
image, label = next(iter(image_ds))
image = random_rotate_image(image)
show(image, label)

To use this function with `Dataset.map` the same caveats apply as with `Dataset.from_generator`: you need to describe the return shapes and types when you apply the function.

In [0]:
def tf_random_rotate_image(image, label):
  """Apply `random_rotate_image` to `image` through `tf.py_function`.

  Args:
    image: Image tensor to rotate.
    label: Passed through unchanged.

  Returns:
    A tuple of the rotated float32 image and `label`.
  """
  static_shape = image.shape
  (rotated,) = tf.py_function(
      func=random_rotate_image, inp=[image], Tout=[tf.float32])
  # Re-attach the input's static shape to the py_function output.
  rotated.set_shape(static_shape)
  return rotated, label

In [0]:
rot_ds = image_ds.map(tf_random_rotate_image)

for image, label in rot_ds.take(2):
  show(image, label)

### Parsing tf.Example protocol buffer message

Many input pipelines extract `tf.train.Example` protocol buffer messages from a TFRecord format. Each `tf.train.Example` record contains one or more 'features', and the input pipeline typically converts these features into tensors.

In [0]:
fsns_test_file = tf.keras.utils.get_file("fsns.tfrec", "https://storage.googleapis.com/download.tensorflow.org/data/fsns-20160927/testdata/fsns-00000-of-00001")
dataset = tf.data.TFRecordDataset(filenames=[fsns_test_file])
dataset

You can work with `tf.train.Example` protos outside of a `tf.data.Dataset` to understand data.

In [0]:
raw_example = next(iter(dataset))
parsed = tf.train.Example.FromString(raw_example.numpy())

feature = parsed.features.feature
raw_image = feature['image/encoded'].bytes_list.value[0]
img = tf.image.decode_png(raw_image)
plt.imshow(img)
plt.axis('off')
_ = plt.title(feature['image/text'].bytes_list.value[0])

In [0]:
raw_example = next(iter(dataset))

In [0]:
def tf_parse(eg):
  """Parse one serialized example into `(encoded image, text)` strings.

  Args:
    eg: A single serialized example; a leading axis is added so it can be
      handed to `tf.io.parse_example`.

  Returns:
    A tuple of the 'image/encoded' and 'image/text' feature values for
    that one example.
  """
  feature_spec = {
      'image/encoded': tf.io.FixedLenFeature(shape=(), dtype=tf.string),
      'image/text': tf.io.FixedLenFeature(shape=(), dtype=tf.string),
  }
  parsed_batch = tf.io.parse_example(eg[tf.newaxis], feature_spec)
  # Index 0 undoes the leading axis added above.
  encoded_image = parsed_batch['image/encoded'][0]
  text = parsed_batch['image/text'][0]
  return encoded_image, text

In [0]:
# Parse one raw record to check that `tf_parse` works.
img, txt = tf_parse(raw_example)
print(txt.numpy())
# Show only the first 20 bytes of the encoded image string. This must read
# `img` (returned by `tf_parse` above), not the unrelated `image` variable
# left over from earlier cells.
print(repr(img.numpy()[:20]), '...')

In [0]:
decoded = dataset.map(tf_parse)
decoded

In [0]:
image_batch, text_batch = next(iter(decoded.batch(10)))
image_batch.shape

## Time series windowing
Time series data is often organized with the time axis intact.


In [0]:
# Use a simple `Dataset.range` to demonstrate:
range_ds = tf.data.Dataset.range(100000)

Typically, models based on this sort of data will want a contiguous time slice.
The simplest approach would be to batch the data:

In [0]:
# Using batch
batches = range_ds.batch(10, drop_remainder=True)

for batch in batches.take(5):
  print(batch.numpy())

## Skip Time series windowing