In [1]:
import tensorflow as tf
import os
import pandas as pd
import numpy as np

# 1. Overview

The `tf.data` API introduces two abstractions to TensorFlow:
* `tf.data.Dataset`: represents a sequence of elements, in which each element consists of one or more components. For example, in an image pipeline, an element might be a single training example, with a pair of tensor components representing the image and its label.

  There are two distinct ways to create a dataset:

    * A data <b>source</b> constructs a `Dataset` from data stored in memory or in one or more files. For example,
      * `tf.data.Dataset.from_tensor_slices()`
      * `tf.data.Dataset.from_tensors()`
      * `tf.data.TFRecordDataset()`
      * `tf.data.experimental.make_csv_dataset()`
    * A data <b>transformation</b> constructs a dataset from one or more `tf.data.Dataset` objects. For example,
      * `Dataset.map()`
      * `Dataset.batch()`
      * `Dataset.repeat()`

* `tf.data.Iterator`: provides the main way to extract elements from a `Dataset`. The operation returned by `Iterator.get_next()` yields the next element of a `Dataset` when executed, and typically acts as the interface between input pipeline code and your model.

# 2. Reading input data

To create an input pipeline, you must start with a data <b>source</b>.

## Numpy arrays

For example, to construct a `Dataset` from data in memory, you can use `tf.data.Dataset.from_tensors()` or `tf.data.Dataset.from_tensor_slices()`.

In [2]:
# Load MNIST and keep only the first (training) split; the second split is discarded.
mnist_train, _ = tf.keras.datasets.mnist.load_data()
images, labels = mnist_train
# Slice both arrays along their first axis so each element is one (image, label) pair.
numpy_dataset = tf.data.Dataset.from_tensor_slices((images, labels))
numpy_dataset

<DatasetV1Adapter shapes: ((28, 28), ()), types: (tf.uint8, tf.uint8)>

## TFRecord data

Alternatively, if your input data is stored in a file in the recommended TFRecord format, you can use `tf.data.TFRecordDataset()`.

In [3]:
# Location of the TFRecord file, relative to the notebook.
tfrecord_path = os.path.join('data', 'images.tfrecords')
# Each element of this dataset is one raw (string) record read from the file.
tfrecord_dataset = tf.data.TFRecordDataset(tfrecord_path)
tfrecord_dataset

<TFRecordDatasetV1 shapes: (), types: tf.string>

## CSV data

* The easy way: read the CSV file into a `pandas.DataFrame` and use `tf.data.Dataset.from_tensor_slices()`.

In [4]:
# Read the Titanic training CSV into memory with pandas' default (positional) index.
titanic_csv_path = os.path.join('data', 'titanic_train.csv')
df = pd.read_csv(titanic_csv_path)
df.head()

Unnamed: 0,survived,sex,age,n_siblings_spouses,parch,fare,class,deck,embark_town,alone
0,0,male,22.0,1,0,7.25,Third,unknown,Southampton,n
1,1,female,38.0,1,0,71.2833,First,C,Cherbourg,n
2,1,female,26.0,0,0,7.925,Third,unknown,Southampton,y
3,1,female,35.0,1,0,53.1,First,C,Southampton,n
4,0,male,28.0,0,0,8.4583,Third,unknown,Queenstown,y


In [5]:
# Convert the frame to a {column name: list of values} mapping, so that every
# dataset element is a dict holding one row's value for each column.
titanic_columns = df.to_dict(orient='list')
titanic_slices = tf.data.Dataset.from_tensor_slices(titanic_columns)
titanic_slices

<DatasetV1Adapter shapes: {survived: (), sex: (), age: (), n_siblings_spouses: (), parch: (), fare: (), class: (), deck: (), embark_town: (), alone: ()}, types: {survived: tf.int32, sex: tf.string, age: tf.float32, n_siblings_spouses: tf.int32, parch: tf.int32, fare: tf.float32, class: tf.string, deck: tf.string, embark_town: tf.string, alone: tf.string}>

<b>Note</b>: this method works well if your data fits in memory. A more scalable approach is to load from disk as necessary, as below.

* A more scalable approach: use `tf.data.experimental.make_csv_dataset()` to read the CSV file from disk.

In [6]:
# Stream the CSV from disk in batches of 4 rows; the "survived" column is split
# off from the feature columns and returned as the label.
titanic_csv_file = os.path.join('data', 'titanic_train.csv')
titanic_batches = tf.data.experimental.make_csv_dataset(
    titanic_csv_file,
    batch_size=4,
    label_name="survived")
titanic_batches.output_shapes

W0804 23:29:29.158137 18260 deprecation.py:323] From f:\anaconda3\envs\tensorflow1.14\lib\site-packages\tensorflow\python\data\experimental\ops\readers.py:499: parallel_interleave (from tensorflow.python.data.experimental.ops.interleave_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use `tf.data.Dataset.interleave(map_func, cycle_length, block_length, num_parallel_calls=tf.data.experimental.AUTOTUNE)` instead. If sloppy execution is desired, use `tf.data.Options.experimental_determinstic`.
W0804 23:29:29.184829 18260 deprecation.py:323] From f:\anaconda3\envs\tensorflow1.14\lib\site-packages\tensorflow\python\data\experimental\ops\readers.py:212: shuffle_and_repeat (from tensorflow.python.data.experimental.ops.shuffle_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use `tf.data.Dataset.shuffle(buffer_size, seed)` followed by `tf.data.Dataset.repeat(count)`. Static tf.data optimizations will take care of usin

(OrderedDict([('sex', TensorShape([Dimension(4)])),
              ('age', TensorShape([Dimension(4)])),
              ('n_siblings_spouses', TensorShape([Dimension(4)])),
              ('parch', TensorShape([Dimension(4)])),
              ('fare', TensorShape([Dimension(4)])),
              ('class', TensorShape([Dimension(4)])),
              ('deck', TensorShape([Dimension(4)])),
              ('embark_town', TensorShape([Dimension(4)])),
              ('alone', TensorShape([Dimension(4)]))]),
 TensorShape([Dimension(4)]))

## Text data

The `tf.data.TextLineDataset` provides an easy way to extract lines from one or more text files. Given one or more filenames, a `TextLineDataset` will produce one string-valued element per line of those files.

In [2]:
# Paths of the three text files, in the order they will be read.
text_filenames = [os.path.join('data', name)
                  for name in ('cowper.txt', 'derby.txt', 'butler.txt')]
# One string element per line of the listed files.
text_dataset = tf.data.TextLineDataset(filenames=text_filenames)

text_dataset

<TextLineDatasetV1 shapes: (), types: tf.string>

# 3. Consuming elements of a `Dataset` using an `Iterator`

The most common way to consume values from a `Dataset` is to make an <b>iterator</b> object that provides access to one element of the dataset at a time.

A `tf.data.Iterator` provides two operations: 
* `Iterator.initializer`, which enables you to (re)initialize the iterator's state; and 
* `Iterator.get_next()`, which returns `tf.Tensor` objects that correspond to the symbolic next element.

## Creating an iterator

Once you have a `Dataset` to represent your input data, you can create an `Iterator` to access elements from that dataset.

The `tf.data` API currently supports the following iterators, in increasing level of sophistication:
* <b>one-shot</b>,
* <b>initializable</b>,
* <b>reinitializable</b>, and
* <b>feedable</b>.

We will only cover the <b>one-shot</b> iterator for simplicity. See https://www.tensorflow.org/guide/datasets#creating_an_iterator for more information about the remaining iterators.

A <b>one-shot</b> iterator is the simplest form of iterator, which only supports iterating once through a dataset, with no need for explicit initialization.

In [7]:
# Create a one-shot iterator for numpy_dataset. The module-level helper is used
# because the `Dataset.make_one_shot_iterator()` method is deprecated.
numpy_iterator = tf.compat.v1.data.make_one_shot_iterator(numpy_dataset)

W0804 23:29:29.244840 18260 deprecation.py:323] From <ipython-input-7-ed5bc28481c1>:1: DatasetV1.make_one_shot_iterator (from tensorflow.python.data.ops.dataset_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use `for ... in dataset:` to iterate over a dataset. If using `tf.estimator`, return the `Dataset` object directly from your input function. As a last resort, you can use `tf.compat.v1.data.make_one_shot_iterator(dataset)`.


## Consuming values from an iterator

* The `Iterator.get_next()` method returns one or more `tf.Tensor` objects that correspond to the symbolic next element of an iterator. Each time these tensors are evaluated, they take the value of the next element in the underlying dataset.
* If the iterator reaches the end of the dataset, executing the `Iterator.get_next()` operation will raise a `tf.errors.OutOfRangeError`.

In [8]:
# Symbolic (image, label) pair; every sess.run() on it advances the iterator.
next_element = numpy_iterator.get_next()

with tf.Session() as sess:
    count = 0
    while True:
        count += 1
        try:
            # Only this call can signal exhaustion, so keep the try block narrow.
            image, label = sess.run(next_element)
        except tf.errors.OutOfRangeError:
            print("End of dataset.")
            break
        if count == 1:
            # Inspect the very first element only.
            print('First element label: ', label)
            print('Image shape: ', image.shape)
            print('Label shape: ', label.shape)
        if count % 10000 == 0:
            # Progress report every 10000 elements.
            print('Iterated through {} elements...'.format(count))

First element label:  5
Image shape:  (28, 28)
Iabel shape:  ()
Iterated through 10000 elements...
Iterated through 20000 elements...
Iterated through 30000 elements...
Iterated through 40000 elements...
Iterated through 50000 elements...
Iterated through 60000 elements...
End of dataset.


## Consuming other data sources

* Consuming TFRecord:

In [9]:
# Create a one-shot iterator through the module-level helper; the
# `Dataset.make_one_shot_iterator()` method is deprecated.
tfrecord_iterator = tf.compat.v1.data.make_one_shot_iterator(tfrecord_dataset)
next_element = tfrecord_iterator.get_next()
with tf.Session() as sess:
    # Print the first record: a raw, still-encoded tf.train.Example message.
    print(sess.run(next_element))

b'\n\xa0\x8c\x01\n\x0e\n\x05depth\x12\x05\x1a\x03\n\x01\x03\n\x0e\n\x05label\x12\x05\x1a\x03\n\x01\x00\n\xd9\x8b\x01\n\timage_raw\x12\xca\x8b\x01\n\xc6\x8b\x01\n\xc2\x8b\x01\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x01\x00H\x00H\x00\x00\xff\xdb\x00C\x00\x06\x04\x05\x06\x05\x04\x06\x06\x05\x06\x07\x07\x06\x08\n\x10\n\n\t\t\n\x14\x0e\x0f\x0c\x10\x17\x14\x18\x18\x17\x14\x16\x16\x1a\x1d%\x1f\x1a\x1b#\x1c\x16\x16 , #&\')*)\x19\x1f-0-(0%()(\xff\xdb\x00C\x01\x07\x07\x07\n\x08\n\x13\n\n\x13(\x1a\x16\x1a((((((((((((((((((((((((((((((((((((((((((((((((((\xff\xc0\x00\x11\x08\x00\xd5\x01@\x03\x01\x11\x00\x02\x11\x01\x03\x11\x01\xff\xc4\x00\x1c\x00\x00\x02\x02\x03\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03\x04\x02\x05\x01\x06\x07\x08\x00\xff\xc4\x00?\x10\x00\x02\x01\x03\x03\x02\x05\x01\x05\x06\x05\x02\x06\x03\x00\x00\x01\x02\x03\x00\x04\x11\x05\x12!1A\x06\x13"Qaq\x07\x142\x81\x91\x08#B\xa1\xb1\xf0\x15R\xc1\xd1\xf1b\xe1\x16$3Cr\x824S\xb2\xff\xc4\x00\x1a\x01\x00\x03\x01\x01\x01\x01\x00\x00\x0

* Consuming CSV data

In [10]:
# Create a one-shot iterator through the module-level helper; the
# `Dataset.make_one_shot_iterator()` method is deprecated.
csv_iterator = tf.compat.v1.data.make_one_shot_iterator(titanic_batches)
next_element = csv_iterator.get_next()
with tf.Session() as sess:
    # Print one (features, label) batch of 4 rows.
    print(sess.run(next_element))

(OrderedDict([('sex', array([b'male', b'male', b'male', b'male'], dtype=object)), ('age', array([18., 46., 34., 32.], dtype=float32)), ('n_siblings_spouses', array([0, 1, 0, 0])), ('parch', array([0, 0, 0, 0])), ('fare', array([11.5   , 61.175 ,  6.4958,  7.8958], dtype=float32)), ('class', array([b'Second', b'First', b'Third', b'Third'], dtype=object)), ('deck', array([b'unknown', b'E', b'unknown', b'unknown'], dtype=object)), ('embark_town', array([b'Southampton', b'Southampton', b'Southampton', b'Southampton'],
      dtype=object)), ('alone', array([b'y', b'n', b'y', b'y'], dtype=object))]), array([0, 0, 0, 0]))


* Consuming text data:

  Print the first 5 lines of the first file:

In [3]:
# Create a one-shot iterator through the module-level helper; the
# `Dataset.make_one_shot_iterator()` method is deprecated.
text_iterator = tf.compat.v1.data.make_one_shot_iterator(text_dataset)
next_element = text_iterator.get_next()
with tf.Session() as sess:
    # Files are read one after another, so the first 5 elements are the
    # first 5 lines of the first file.
    for i in range(5):
        print(sess.run(next_element))

W0805 22:55:06.777168  6556 deprecation.py:323] From <ipython-input-3-83849d40f485>:1: DatasetV1.make_one_shot_iterator (from tensorflow.python.data.ops.dataset_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use `for ... in dataset:` to iterate over a dataset. If using `tf.estimator`, return the `Dataset` object directly from your input function. As a last resort, you can use `tf.compat.v1.data.make_one_shot_iterator(dataset)`.


b"\xef\xbb\xbfAchilles sing, O Goddess! Peleus' son;"
b'His wrath pernicious, who ten thousand woes'
b"Caused to Achaia's host, sent many a soul"
b'Illustrious into Ades premature,'
b'And Heroes gave (so stood the will of Jove)'


To alternate lines between files, use `Dataset.interleave()`; this makes it easier to shuffle files together. Here are the first, second and third lines from each translation:

In [7]:
# Paths of the three translations whose lines will be interleaved.
filenames = [os.path.join('data', name)
             for name in ('cowper.txt', 'derby.txt', 'butler.txt')]
files_ds = tf.data.Dataset.from_tensor_slices(filenames)
# Open one TextLineDataset per file. With cycle_length equal to the number of
# files, the output takes one line from each file in turn.
lines_ds = files_ds.interleave(tf.data.TextLineDataset, cycle_length=len(filenames))

# Create a one-shot iterator through the module-level helper; the
# `Dataset.make_one_shot_iterator()` method is deprecated.
lines_iterator = tf.compat.v1.data.make_one_shot_iterator(lines_ds)
next_element = lines_iterator.get_next()
with tf.Session() as sess:
    for ordinal in ('First', 'Second', 'Third'):
        # Blank separator and a header before each group of lines.
        print()
        print('{} line of each file: '.format(ordinal))
        # One line per file makes up a group.
        for file_index in range(len(filenames)):
            print(sess.run(next_element))


First line of each file: 
b"\xef\xbb\xbfAchilles sing, O Goddess! Peleus' son;"
b"\xef\xbb\xbfOf Peleus' son, Achilles, sing, O Muse,"
b'\xef\xbb\xbfSing, O goddess, the anger of Achilles son of Peleus, that brought'

Second line of each file: 
b'His wrath pernicious, who ten thousand woes'
b'The vengeance, deep and deadly; whence to Greece'
b'countless ills upon the Achaeans. Many a brave soul did it send'

Third line of each file: 
b"Caused to Achaia's host, sent many a soul"
b'Unnumbered ills arose; which many a soul'
b'hurrying down to Hades, and many a hero did it yield a prey to dogs and'


# 4. Transforming `Dataset`

Once you have a `Dataset` object, you can transform it into a new `Dataset` by chaining method calls on the `tf.data.Dataset` object.

For example, you can apply <b>per-element</b> transformations such as `Dataset.map()`, and <b>multi-element</b> transformations such as `Dataset.batch()`.

## Batching dataset elements

The simplest form of batching stacks `n` consecutive elements of a dataset into a single element. The `Dataset.batch()` transformation does exactly this, with the constraint that for each component <i>i</i>, all elements must have a tensor of the exact same shape.

In [11]:
# Static shapes of a single (image, label) element before batching.
original_shapes = numpy_dataset.output_shapes
print('numpy dataset original output shapes: ', original_shapes)

# Stack 4 consecutive elements into one and show the resulting static shapes.
batched_dataset = numpy_dataset.batch(4)
batched_shapes = batched_dataset.output_shapes
print('batched dataset output shapes', batched_shapes)

numpy dataset original output shapes:  (TensorShape([Dimension(28), Dimension(28)]), TensorShape([]))
batched dataset output shapes (TensorShape([Dimension(None), Dimension(28), Dimension(28)]), TensorShape([Dimension(None)]))


While `tf.data` tries to propagate shape information, the default settings of `Dataset.batch()` result in an unknown batch size because the last batch may not be full. Note the `None` in the shape.

Use the `drop_remainder` argument to ignore that last batch, and get full shape propagation:

In [12]:
# Dropping the final partial batch lets the batch dimension be known statically.
batched_dataset = numpy_dataset.batch(7, drop_remainder=True)
full_batch_shapes = batched_dataset.output_shapes
print('batched dataset output shapes', full_batch_shapes)

batched dataset output shapes (TensorShape([Dimension(7), Dimension(28), Dimension(28)]), TensorShape([Dimension(7)]))


## Processing multiple epochs

The simplest way to iterate over a dataset in multiple epochs is to use the `Dataset.repeat()` transformation. Its argument `count` represents the number of times the dataset should be repeated. If `count` is `None` or `-1` (default behavior), the dataset is repeated indefinitely.

In [15]:
# Source: three elements, each of shape (1,).
dataset = tf.data.Dataset.from_tensor_slices(np.array([[1], [2], [3]]))
# Iterate over the source twice (two epochs).
dataset = dataset.repeat(count=2)

# Create a one-shot iterator through the module-level helper; the
# `Dataset.make_one_shot_iterator()` method is deprecated.
iterator = tf.compat.v1.data.make_one_shot_iterator(dataset)
next_element = iterator.get_next()

with tf.Session() as sess:
    try:
        # Keep pulling elements until the iterator reports exhaustion.
        while True:
            print(sess.run(next_element))
    except tf.errors.OutOfRangeError:
        print("End of dataset.")

[1]
[2]
[3]
[1]
[2]
[3]
End of dataset.


The `Dataset.repeat()` transformation concatenates its arguments without signaling the end of one epoch and the beginning of the next epoch. Because of this, a `Dataset.batch()` applied after `Dataset.repeat()` will yield batches that straddle epoch boundaries:

In [17]:
dataset = tf.data.Dataset.from_tensor_slices(np.array([[1], [2], [3]]))
# Repeat twice first, then batch: batches are cut from the concatenated
# stream, so a batch can contain elements from two different epochs.
dataset = dataset.repeat(count=2).batch(2)

# Create a one-shot iterator through the module-level helper; the
# `Dataset.make_one_shot_iterator()` method is deprecated.
iterator = tf.compat.v1.data.make_one_shot_iterator(dataset)
next_element = iterator.get_next()

with tf.Session() as sess:
    try:
        # Keep pulling batches until the iterator reports exhaustion.
        while True:
            print(sess.run(next_element))
    except tf.errors.OutOfRangeError:
        print("End of dataset.")

[[1]
 [2]]
[[3]
 [1]]
[[2]
 [3]]
End of dataset.


If you need clear epoch separation, put `Dataset.batch()` before the repeat:

In [18]:
dataset = tf.data.Dataset.from_tensor_slices(np.array([[1], [2], [3]]))
# Batch first, then repeat twice: every epoch is batched on its own, so no
# batch crosses an epoch boundary (the last batch of each epoch is partial).
dataset = dataset.batch(2).repeat(count=2)

# Create a one-shot iterator through the module-level helper; the
# `Dataset.make_one_shot_iterator()` method is deprecated.
iterator = tf.compat.v1.data.make_one_shot_iterator(dataset)
next_element = iterator.get_next()

with tf.Session() as sess:
    try:
        # Keep pulling batches until the iterator reports exhaustion.
        while True:
            print(sess.run(next_element))
    except tf.errors.OutOfRangeError:
        print("End of dataset.")

[[1]
 [2]]
[[3]]
[[1]
 [2]]
[[3]]
End of dataset.


## Randomly shuffling input data

The `Dataset.shuffle()` transformation randomly shuffles the input dataset using an algorithm similar to `tf.queue.RandomShuffleQueue`: it maintains a fixed-size buffer and chooses the next element uniformly at random from that buffer.

In [19]:
# Source: three elements, each of shape (1,).
dataset = tf.data.Dataset.from_tensor_slices(np.array([[1], [2], [3]]))
# Shuffle with a buffer larger than the dataset. No seed is given, so the
# order can differ between runs.
dataset = dataset.shuffle(buffer_size=10)

# Create a one-shot iterator through the module-level helper; the
# `Dataset.make_one_shot_iterator()` method is deprecated.
iterator = tf.compat.v1.data.make_one_shot_iterator(dataset)
next_element = iterator.get_next()

with tf.Session() as sess:
    try:
        # Keep pulling elements until the iterator reports exhaustion.
        while True:
            print(sess.run(next_element))
    except tf.errors.OutOfRangeError:
        print("End of dataset.")

[2]
[1]
[3]
End of dataset.


<b>Note</b>: As with `Dataset.batch()` the order relative to `Dataset.repeat()` matters. So a shuffle placed before a repeat will show every element of one epoch before moving to the next:

In [25]:
dataset = tf.data.Dataset.from_tensor_slices(np.array([[1], [2], [3]]))
# Shuffle and batch within an epoch, then repeat: every element of one epoch
# is emitted before the next epoch starts.
dataset = dataset.shuffle(buffer_size=10, seed=1).batch(2).repeat(count=2)

# Create a one-shot iterator through the module-level helper; the
# `Dataset.make_one_shot_iterator()` method is deprecated.
iterator = tf.compat.v1.data.make_one_shot_iterator(dataset)
next_element = iterator.get_next()

with tf.Session() as sess:
    try:
        # Keep pulling batches until the iterator reports exhaustion.
        while True:
            print(sess.run(next_element))
    except tf.errors.OutOfRangeError:
        print("End of dataset.")

[[2]
 [3]]
[[1]]
[[2]
 [1]]
[[3]]
End of dataset.


But a repeat before a shuffle mixes the epoch boundaries together:

In [26]:
dataset = tf.data.Dataset.from_tensor_slices(np.array([[1], [2], [3]]))
# Repeat first, then shuffle and batch: the shuffle buffer holds elements from
# both epochs, so elements of different epochs get mixed together.
dataset = dataset.repeat(count=2).shuffle(buffer_size=10, seed=1).batch(2)

# Create a one-shot iterator through the module-level helper; the
# `Dataset.make_one_shot_iterator()` method is deprecated.
iterator = tf.compat.v1.data.make_one_shot_iterator(dataset)
next_element = iterator.get_next()

with tf.Session() as sess:
    try:
        # Keep pulling batches until the iterator reports exhaustion.
        while True:
            print(sess.run(next_element))
    except tf.errors.OutOfRangeError:
        print("End of dataset.")

[[2]
 [3]]
[[2]
 [3]]
[[1]
 [1]]
End of dataset.


To summarize, <b>put `repeat()` after `shuffle()` and `batch()` to separate the epoch boundaries.</b>

## Preprocessing data with `Dataset.map()`

The `Dataset.map(f)` transformation produces a new dataset by applying a given function `f` to each element of the input dataset.

The function `f` takes the `tf.Tensor` objects that represent a single element in the input, and returns the `tf.Tensor` objects that will represent a single element in the new dataset.

* A simple example:

In [31]:
dataset = tf.data.Dataset.from_tensor_slices([[1, 2], [4, 8]])
# Sum every element (one row of the input) ==> [3, 12]
dataset = dataset.map(lambda x: tf.reduce_sum(x))

# Create a one-shot iterator through the module-level helper; the
# `Dataset.make_one_shot_iterator()` method is deprecated.
iterator = tf.compat.v1.data.make_one_shot_iterator(dataset)
next_element = iterator.get_next()

with tf.Session() as sess:
    try:
        # Keep pulling elements until the iterator reports exhaustion.
        while True:
            print(sess.run(next_element))
    except tf.errors.OutOfRangeError:
        print("End of dataset.")

3
12
End of dataset.


* Applying arbitrary Python logic with `tf.py_function()`:

In [34]:
def tf_sum(x):
    """Sum the entries of tensor `x` with NumPy, wrapped as a TensorFlow op.

    Args:
        x: the tensor passed as the single input of the wrapped function.

    Returns:
        The `tf.py_function` output, declared as `tf.float32`.
    """
    def _numpy_sum(eager_tensor):
        # Tensors received inside a py_function are EagerTensors; .numpy()
        # converts them to their NumPy value.
        return np.sum(eager_tensor.numpy())

    # inp is the list of input tensors; Tout is the datatype of the function's
    # output (a single datatype here, a list of datatypes is also accepted).
    return tf.py_function(func=_numpy_sum, inp=[x], Tout=tf.float32)

dataset = tf.data.Dataset.from_tensor_slices([[1.0, 2.0], [3.0, 4.0]])
# Sum every element (one row of the input) ==> [3.0, 7.0]
dataset = dataset.map(tf_sum)

# Create a one-shot iterator through the module-level helper; the
# `Dataset.make_one_shot_iterator()` method is deprecated.
iterator = tf.compat.v1.data.make_one_shot_iterator(dataset)
next_element = iterator.get_next()

with tf.Session() as sess:
    try:
        # Keep pulling elements until the iterator reports exhaustion.
        while True:
            print(sess.run(next_element))
    except tf.errors.OutOfRangeError:
        print("End of dataset.")

W0805 18:11:49.526238 13696 backprop.py:820] The dtype of the watched tensor must be floating (e.g. tf.float32), got tf.int32
W0805 18:11:49.564190 17184 backprop.py:820] The dtype of the watched tensor must be floating (e.g. tf.float32), got tf.int32


3.0
7.0
End of dataset.


<b>Note</b>: Another option is relying on `AutoGraph` to convert Python code into an equivalent graph computation. It generally has better performance than `tf.py_function()`. The downside of this approach is that `AutoGraph` can convert some but not all Python code. For more information see this guide https://www.tensorflow.org/guide/autograph.

* Using `Dataset.map()` to parse `tf.train.Example` protocol buffer messages:

For more details, see <u>Tensorflow tf.Example and TFRecord.ipynb</u> and <u>Read and write image data using TFRecord.ipynb</u>

In [38]:
# The following functions can be used to convert a value to a type compatible
# with tf.Example.

def _bytes_feature(value):
  """Returns a `tf.train.Feature` holding `value` in a bytes_list.

  Args:
    value: a `bytes` object, or a `str`. A `str` is encoded as UTF-8 first,
      because a bytes_list only accepts byte strings.
  """
  if isinstance(value, str):
    value = value.encode('utf-8')
  return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

def _float_feature(value):
  """Wraps a single float / double in a `tf.train.Feature` holding a float_list."""
  single_float = tf.train.FloatList(value=[value])
  return tf.train.Feature(float_list=single_float)

def _int64_feature(value):
  """Wraps a single bool / enum / int / uint in a `tf.train.Feature` holding an int64_list."""
  single_int = tf.train.Int64List(value=[value])
  return tf.train.Feature(int64_list=single_int)

# Build a tf.train.Example message from a {feature name: tf.train.Feature} mapping.
example_features = tf.train.Features(feature={
    'feature0': _int64_feature(False),
    'feature1': _int64_feature(4),
    'feature2': _bytes_feature(b'goat'),
    'feature3': _float_feature(0.9876),
    })
example_message = tf.train.Example(features=example_features)
print(example_message)

# Serialize the message. The message object and its serialized bytes have
# separate names, so `example_proto` only ever refers to the serialized form.
example_proto = example_message.SerializeToString()

features {
  feature {
    key: "feature0"
    value {
      int64_list {
        value: 0
      }
    }
  }
  feature {
    key: "feature1"
    value {
      int64_list {
        value: 4
      }
    }
  }
  feature {
    key: "feature2"
    value {
      bytes_list {
        value: "goat"
      }
    }
  }
  feature {
    key: "feature3"
    value {
      float_list {
        value: 0.9876000285148621
      }
    }
  }
}



In [43]:
# Describe every feature to parse: a scalar (shape []) of the given dtype, with
# a default value for when the feature is missing from a message.
# `tf.io.FixedLenFeature` is used rather than the TF1-only `tf.FixedLenFeature` alias.
feature_description = {
    'feature0': tf.io.FixedLenFeature([], tf.int64, default_value=0),
    'feature1': tf.io.FixedLenFeature([], tf.int64, default_value=0),
    'feature2': tf.io.FixedLenFeature([], tf.string, default_value=''),
    'feature3': tf.io.FixedLenFeature([], tf.float32, default_value=0.0),
}

def _parse_function(example_proto):
    """Parse one serialized `tf.train.Example` message into a dict of tensors.

    Args:
        example_proto: a serialized `tf.train.Example` message.

    Returns:
        The result of `tf.io.parse_single_example`, parsed according to the
        module-level `feature_description` dictionary.
    """
    # `tf.io.parse_single_example` replaces the TF1-only `tf.parse_single_example` alias.
    return tf.io.parse_single_example(example_proto, feature_description)

In [45]:
# `from_tensor_slices` is inherited from `tf.data.Dataset`. Calling it through
# `TFRecordDataset` wrongly suggests a file-backed dataset, so call it on the
# base class: the serialized message lives in memory here.
raw_dataset = tf.data.Dataset.from_tensor_slices([example_proto])
# Decode every serialized message into a dict of feature tensors.
parsed_dataset = raw_dataset.map(_parse_function)

# Create a one-shot iterator through the module-level helper; the
# `Dataset.make_one_shot_iterator()` method is deprecated.
iterator = tf.compat.v1.data.make_one_shot_iterator(parsed_dataset)
next_element = iterator.get_next()

with tf.Session() as sess:
    try:
        # Keep pulling elements until the iterator reports exhaustion.
        while True:
            print(sess.run(next_element))
    except tf.errors.OutOfRangeError:
        print("End of dataset.")

{'feature0': 0, 'feature1': 4, 'feature2': b'goat', 'feature3': 0.9876}
End of dataset.
