In [1]:
import tensorflow as tf
import pathlib
import os
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import tensorflow.data

# Print NumPy floats with 4 digits of precision so printed arrays stay compact.
np.set_printoptions(precision=4)

In [2]:
# tf.Tensor: almost everything that flows through TensorFlow is a tensor.
# Build two constant 2x2 matrices and print their matrix product.
c = tf.constant([[1.0, 2.0], [3.0, 4.0]])
d = tf.constant([[1.0, 1.0], [0.0, 1.0]])
e = tf.linalg.matmul(c, d)  # tf.matmul is the same function under a shorter name
print(e)

tf.Tensor(
[[1. 3.]
 [3. 7.]], shape=(2, 2), dtype=float32)


## tf.data.Dataset.from_tensor_slices


In [3]:
# from_tensor_slices removes the first dimension of every leaf tensor, while
# the nesting of the argument (tuple / dict) becomes the nesting of each
# element.  Every input below is sliced and printed in turn.
slice_sources = [
    # 2-D list: handled as a single tensor, so its first dimension is removed.
    [[1, 2], [3, 4]],
    # Tuple: each item is sliced on its own and the i-th slices are packed
    # back into a tuple -- read the tuple as one group of parallel columns.
    ([1, 2], [3, 4], [5, 6]),
    # List of tuples: removing the first dimension means unpacking the
    # outer list.
    [(1, 2), (3, 4), (5, 6)],
    # Dict: the values are sliced, the keys are kept.
    {"a": [1, 2], "b": [3, 4]},
    # NOTE: {"a": (1, 2), "b": (3, 4)} raises an error -- presumably because a
    # tuple counts as structure (see the tuple case above), which leaves
    # scalar leaves that have no first dimension to slice.
    # Tuple of dicts: every tuple item is a dict whose values are sliced,
    # giving ({a:1, b:3}, {c:5}), ({a:2, b:4}, {c:6}).
    ({"a": [1, 2], "b": [3, 4]}, {"c": [5, 6]}),
    # Deeper nesting works as well, giving
    # ({'a': {'aa': 1}, 'b': {'bb': 3}}, {'c': {'cc': 5}}),
    # ({'a': {'aa': 2}, 'b': {'bb': 4}}, {'c': {'cc': 6}}).
    ({"a": {"aa": [1, 2]}, "b": {"bb": [3, 4]}}, {"c": {"cc": [5, 6]}}),
]
for source in slice_sources:
    dataset = tf.data.Dataset.from_tensor_slices(source)
    print(list(dataset.as_numpy_iterator()))

# Two tensors that share their first dimension can back a single Dataset.
features = tf.constant([[1, 3], [2, 1], [3, 3]])  # shape (3, 2)
labels = tf.constant(['A', 'B', 'A'])  # shape (3,)
dataset = tf.data.Dataset.from_tensor_slices((features, labels))

# Alternative route: wrap features and labels in their own Datasets first,
# then pair them up with zip.
features_dataset = tf.data.Dataset.from_tensor_slices(features)
labels_dataset = tf.data.Dataset.from_tensor_slices(labels)
dataset = tf.data.Dataset.zip((features_dataset, labels_dataset))

# Already-batched features and labels are converted in the same way; each
# printed element is one (2, 2) feature batch with its (2, 1) label batch.
batched_features = tf.constant(
    [[[1, 3], [2, 3]], [[2, 1], [1, 2]], [[3, 3], [3, 2]]],
    shape=(3, 2, 2),
)
batched_labels = tf.constant(
    [['A', 'A'], ['B', 'B'], ['A', 'B']],
    shape=(3, 2, 1),
)
dataset = tf.data.Dataset.from_tensor_slices((batched_features, batched_labels))
for element in dataset.as_numpy_iterator():
    print(element)


[array([1, 2]), array([3, 4])]
[(1, 3, 5), (2, 4, 6)]
[array([1, 2]), array([3, 4]), array([5, 6])]
[{'a': 1, 'b': 3}, {'a': 2, 'b': 4}]
[({'a': 1, 'b': 3}, {'c': 5}), ({'a': 2, 'b': 4}, {'c': 6})]
[({'a': {'aa': 1}, 'b': {'bb': 3}}, {'c': {'cc': 5}}), ({'a': {'aa': 2}, 'b': {'bb': 4}}, {'c': {'cc': 6}})]
(array([[1, 3],
       [2, 3]]), array([[b'A'],
       [b'A']], dtype=object))
(array([[2, 1],
       [1, 2]]), array([[b'B'],
       [b'B']], dtype=object))
(array([[3, 3],
       [3, 2]]), array([[b'A'],
       [b'B']], dtype=object))


## 基本机制
tf.data API 引入了一个 tf.data.Dataset 抽象，它表示一个元素序列，其中每个元素都由一个或多个组件组成。例如，在一个图像流水线中，一个元素可能是一个训练样本，有一对表示图像及其标签的张量组件。
听起来比较像 `List<Object>`。

In [8]:
# A dataset of three scalar elements; show its repr and per-element spec.
dataset = tf.data.Dataset.from_tensor_slices([8, 3, 0])
print(dataset)
print(dataset.element_spec)

# Iterating yields one tf.Tensor per element; .numpy() unwraps the value.
for elem in dataset:
    print(elem)
    print(elem.numpy())
print("--------------------------------------------------")
# reduce folds all elements into a single tensor, starting from state 0.
total = dataset.reduce(0, lambda running_sum, item: running_sum + item)
print(total)
print(total.numpy())

<TensorSliceDataset shapes: (), types: tf.int32>
TensorSpec(shape=(), dtype=tf.int32, name=None)
tf.Tensor(8, shape=(), dtype=int32)
8
tf.Tensor(3, shape=(), dtype=int32)
3
tf.Tensor(0, shape=(), dtype=int32)
0
tf.Tensor(11, shape=(), dtype=int32)
11


In [13]:
# tf.random.uniform([4, 10]) draws a 4x10 tensor of random floats.
# Draw it once and reuse it, so the tensor printed here is exactly the one
# that backs dataset1 (two separate calls would give two unrelated draws).
random_matrix = tf.random.uniform([4, 10])
print(random_matrix)
# Slicing along the first dimension turns the 4x10 tensor into a dataset of
# four 10-element vectors.
dataset1 = tf.data.Dataset.from_tensor_slices(random_matrix)
print(dataset1)  # Dataset repr
print(dataset1.element_spec)

tf.Tensor(
[[0.0453 0.405  0.2163 0.2185 0.6849 0.9378 0.7461 0.0604 0.3186 0.2357]
 [0.0149 0.329  0.4757 0.9239 0.9556 0.5419 0.1715 0.3097 0.9043 0.2356]
 [0.3267 0.0428 0.2049 0.4435 0.4263 0.6065 0.9213 0.9657 0.8955 0.9726]
 [0.1252 0.403  0.3521 0.8339 0.1415 0.7611 0.7627 0.8215 0.7089 0.2036]], shape=(4, 10), dtype=float32)
<TensorSliceDataset shapes: (10,), types: tf.float32>
TensorSpec(shape=(10,), dtype=tf.float32, name=None)


In [18]:
# Illustrates "every dataset element is made of one or more components":
# each element here is a whole tuple holding one scalar and one
# 100-element vector.
scalar_component = tf.random.uniform([4])
vector_component = tf.random.uniform([4, 100], maxval=100, dtype=tf.int32)
dataset2 = tf.data.Dataset.from_tensor_slices((scalar_component, vector_component))
print(dataset2)
print(dataset2.element_spec)

<TensorSliceDataset shapes: ((), (100,)), types: (tf.float32, tf.int32)>
(TensorSpec(shape=(), dtype=tf.float32, name=None), TensorSpec(shape=(100,), dtype=tf.int32, name=None))


#### tf.data.Dataset.zip()讲解

In [21]:
# The nested structure of the `datasets` argument determines the structure
# of the elements in the zipped dataset.
a = tf.data.Dataset.range(1, 4)  # ==> [ 1, 2, 3 ]
b = tf.data.Dataset.range(4, 7)  # ==> [ 4, 5, 6 ]
print(a)  # Dataset repr

# Zipping (a, b) pairs the i-th element of a with the i-th element of b.
zipped_ab = tf.data.Dataset.zip((a, b))
print(zipped_ab)  # Dataset repr
print("------------------------------------------------------")
print(list(zipped_ab.as_numpy_iterator()))
print("------------------------------------------------------")
# Swapping the argument order swaps the order inside every pair.
zipped_ba = tf.data.Dataset.zip((b, a))
print(list(zipped_ba.as_numpy_iterator()))
print("------------------------------------------------------")

# The `datasets` argument may contain an arbitrary number of datasets, and
# their elements need not have the same shape.
c = tf.data.Dataset.range(7, 13).batch(2)  # ==> [ [7, 8], [9, 10], [11, 12] ]
print(c)
print(c.element_spec)
print(list(c.as_numpy_iterator()))
print("----------------------------------------------------")
ds = tf.data.Dataset.zip((a, b, c))
for element in ds.as_numpy_iterator():
    print(element)


<RangeDataset shapes: (), types: tf.int64>
<ZipDataset shapes: ((), ()), types: (tf.int64, tf.int64)>
------------------------------------------------------
[(1, 4), (2, 5), (3, 6)]
------------------------------------------------------
[(4, 1), (5, 2), (6, 3)]
------------------------------------------------------
<BatchDataset shapes: (None,), types: tf.int64>
TensorSpec(shape=(None,), dtype=tf.int64, name=None)
[array([7, 8], dtype=int64), array([ 9, 10], dtype=int64), array([11, 12], dtype=int64)]
----------------------------------------------------
(1, 4, array([7, 8], dtype=int64))
(2, 5, array([ 9, 10], dtype=int64))
(3, 6, array([11, 12], dtype=int64))
