In [3]:
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf

from tensorflow import keras

# Record the interpreter and library versions used for this run so the
# outputs below can be reproduced.
print(tf.__version__)
print(sys.version_info)
for lib in (mpl, np, pd, sklearn, tf, keras):
    print(f"{lib.__name__} {lib.__version__}")

2.2.0
sys.version_info(major=3, minor=8, micro=2, releaselevel='final', serial=0)
matplotlib 3.2.1
numpy 1.18.5
pandas 1.0.4
sklearn 0.23.1
tensorflow 2.2.0
tensorflow.keras 2.3.0-tf


In [4]:
# Build a dataset from an in-memory array with
# tf.data.Dataset.from_tensor_slices: each entry of np.arange(10) becomes one
# scalar element of the dataset.
dataset = tf.data.Dataset.from_tensor_slices(np.arange(10))

# Show the dataset's repr followed by its concrete class (a TensorSliceDataset).
print(dataset, type(dataset), sep="\n")

<TensorSliceDataset shapes: (), types: tf.int64>
<class 'tensorflow.python.data.ops.dataset_ops.TensorSliceDataset'>


In [5]:
# Iterate over the dataset: every element it yields is a tensor
# (its Python type is EagerTensor).
for element in dataset:
    print(element, type(element), sep="\n")

tf.Tensor(0, shape=(), dtype=int64)
<class 'tensorflow.python.framework.ops.EagerTensor'>
tf.Tensor(1, shape=(), dtype=int64)
<class 'tensorflow.python.framework.ops.EagerTensor'>
tf.Tensor(2, shape=(), dtype=int64)
<class 'tensorflow.python.framework.ops.EagerTensor'>
tf.Tensor(3, shape=(), dtype=int64)
<class 'tensorflow.python.framework.ops.EagerTensor'>
tf.Tensor(4, shape=(), dtype=int64)
<class 'tensorflow.python.framework.ops.EagerTensor'>
tf.Tensor(5, shape=(), dtype=int64)
<class 'tensorflow.python.framework.ops.EagerTensor'>
tf.Tensor(6, shape=(), dtype=int64)
<class 'tensorflow.python.framework.ops.EagerTensor'>
tf.Tensor(7, shape=(), dtype=int64)
<class 'tensorflow.python.framework.ops.EagerTensor'>
tf.Tensor(8, shape=(), dtype=int64)
<class 'tensorflow.python.framework.ops.EagerTensor'>
tf.Tensor(9, shape=(), dtype=int64)
<class 'tensorflow.python.framework.ops.EagerTensor'>


In [6]:
# .repeat(n) makes the dataset yield its elements n times over.
# .batch(k) groups consecutive elements into batches of k.
#
# The pipeline is rebuilt from the source array instead of being derived from
# the current value of `dataset`. The self-referential form
# `dataset = dataset.repeat(2).batch(5)` is not idempotent: running the cell a
# second time would batch the already-batched data and change what later cells
# see. The name `dataset` is kept because the interleave example reads it.
dataset = (
    tf.data.Dataset.from_tensor_slices(np.arange(10))
    .repeat(2)
    .batch(5)
)

# Each element is now a rank-1 tensor holding 5 values
# (10 values * 2 repeats / 5 per batch = 4 batches).
for item in dataset:
    print(item)

tf.Tensor([0 1 2 3 4], shape=(5,), dtype=int64)
tf.Tensor([5 6 7 8 9], shape=(5,), dtype=int64)
tf.Tensor([0 1 2 3 4], shape=(5,), dtype=int64)
tf.Tensor([5 6 7 8 9], shape=(5,), dtype=int64)


In [7]:
# interleave() expands every input element into its own dataset and then
# mixes the elements of those datasets together:
#   1. take `cycle_length` elements from the input dataset and apply
#      `map_func` to each, giving `cycle_length` nested datasets;
#   2. visit the nested datasets in turn, pulling `block_length` consecutive
#      items from each one per visit;
#   3. once a nested dataset is exhausted, fetch the next input element,
#      apply `map_func` to it, and continue in the same way.

dataset2 = dataset.interleave(
    # map_func: turn one batch back into a dataset of its individual values.
    map_func=lambda v: tf.data.Dataset.from_tensor_slices(v),
    # Number of input elements (batches) that are expanded at the same time.
    cycle_length=3,
    # Number of consecutive items taken from one nested dataset per visit.
    block_length=2,
)

# Note: type(dataset2) is an InterleaveDataset.

# Print every interleaved element, then the total number of elements seen.
i = 0  # stays 0 if dataset2 yields nothing
for i, item in enumerate(dataset2, start=1):
    print(item)
print(i)

tf.Tensor(0, shape=(), dtype=int64)
tf.Tensor(1, shape=(), dtype=int64)
tf.Tensor(5, shape=(), dtype=int64)
tf.Tensor(6, shape=(), dtype=int64)
tf.Tensor(0, shape=(), dtype=int64)
tf.Tensor(1, shape=(), dtype=int64)
tf.Tensor(2, shape=(), dtype=int64)
tf.Tensor(3, shape=(), dtype=int64)
tf.Tensor(7, shape=(), dtype=int64)
tf.Tensor(8, shape=(), dtype=int64)
tf.Tensor(2, shape=(), dtype=int64)
tf.Tensor(3, shape=(), dtype=int64)
tf.Tensor(4, shape=(), dtype=int64)
tf.Tensor(9, shape=(), dtype=int64)
tf.Tensor(4, shape=(), dtype=int64)
tf.Tensor(5, shape=(), dtype=int64)
tf.Tensor(6, shape=(), dtype=int64)
tf.Tensor(7, shape=(), dtype=int64)
tf.Tensor(8, shape=(), dtype=int64)
tf.Tensor(9, shape=(), dtype=int64)
20


In [15]:
# When the argument is a tuple, each dataset element is itself a tuple that
# holds one tensor per component (here: a feature row and its label).

x = np.array([[1, 2], [3, 4], [5, 6]])
y = np.array(['cat', 'dog', 'fox'])

dataset3 = tf.data.Dataset.from_tensor_slices((x, y))
# Repr first, then the concrete class (a TensorSliceDataset).
print(dataset3, type(dataset3), sep="\n")

# Unpack each element and show the plain NumPy values.
for features, label in dataset3:
    print(features.numpy(), label.numpy())

# Without unpacking, each element prints as a tuple of tensors.
for pair in dataset3:
    print(pair)

<TensorSliceDataset shapes: ((2,), ()), types: (tf.int64, tf.string)>
<class 'tensorflow.python.data.ops.dataset_ops.TensorSliceDataset'>
[1 2] b'cat'
[3 4] b'dog'
[5 6] b'fox'
(<tf.Tensor: shape=(2,), dtype=int64, numpy=array([1, 2])>, <tf.Tensor: shape=(), dtype=string, numpy=b'cat'>)
(<tf.Tensor: shape=(2,), dtype=int64, numpy=array([3, 4])>, <tf.Tensor: shape=(), dtype=string, numpy=b'dog'>)
(<tf.Tensor: shape=(2,), dtype=int64, numpy=array([5, 6])>, <tf.Tensor: shape=(), dtype=string, numpy=b'fox'>)


In [26]:
# When the argument is a dict, each dataset element is a dict with the same
# keys, mapping every key to the matching slice as a tensor.
# Relies on the arrays `x` and `y` defined in the tuple example.

dataset4 = tf.data.Dataset.from_tensor_slices({'feature': x, 'label': y})

print(dataset4, type(dataset4), sep="\n")

# Look the tensors up by key and show their NumPy values.
for record in dataset4:
    feature, label = record['feature'], record['label']
    print(feature.numpy(), label.numpy())

# Show the whole element, then the type of a value before and after .numpy():
# an EagerTensor becomes a numpy.ndarray.
for record in dataset4:
    feature = record['feature']
    print(record, type(feature), type(feature.numpy()), sep="\n")

<TensorSliceDataset shapes: {feature: (2,), label: ()}, types: {feature: tf.int64, label: tf.string}>
<class 'tensorflow.python.data.ops.dataset_ops.TensorSliceDataset'>
[1 2] b'cat'
[3 4] b'dog'
[5 6] b'fox'
{'feature': <tf.Tensor: shape=(2,), dtype=int64, numpy=array([1, 2])>, 'label': <tf.Tensor: shape=(), dtype=string, numpy=b'cat'>}
<class 'tensorflow.python.framework.ops.EagerTensor'>
<class 'numpy.ndarray'>
{'feature': <tf.Tensor: shape=(2,), dtype=int64, numpy=array([3, 4])>, 'label': <tf.Tensor: shape=(), dtype=string, numpy=b'dog'>}
<class 'tensorflow.python.framework.ops.EagerTensor'>
<class 'numpy.ndarray'>
{'feature': <tf.Tensor: shape=(2,), dtype=int64, numpy=array([5, 6])>, 'label': <tf.Tensor: shape=(), dtype=string, numpy=b'fox'>}
<class 'tensorflow.python.framework.ops.EagerTensor'>
<class 'numpy.ndarray'>


TypeError: 'TensorSliceDataset' object is not subscriptable