# TFRecord 與 Dataset 測試

## 程式參考來源：
- https://www.tensorflow.org/tutorials/load_data/tfrecord
- https://medium.com/%E8%BB%9F%E9%AB%94%E4%B9%8B%E5%BF%83/%E9%81%A9%E5%90%88%E5%A4%A7%E9%87%8F%E8%B3%87%E6%96%99i-o%E7%9A%84%E5%84%B2%E5%AD%98%E6%A0%BC%E5%BC%8F-tfrecord%E7%B0%A1%E4%BB%8B%E8%88%87%E6%93%8D%E4%BD%9C%E6%95%99%E5%AD%B8-cd27e50d51ee

## 建立 TFRecord

In [1]:
import tensorflow as tf
import numpy as np
import IPython.display as display

## 定義 tf.train.Feature 轉換函數

In [2]:
# 下列函數可轉換為 tf.train.Example 的 tf.train.Feature
def _bytes_feature(value):
    """Return a ``tf.train.Feature`` holding ``value`` in a ``bytes_list``.

    Args:
        value: A ``bytes`` object, a ``str`` (encoded as UTF-8 before being
            stored), or a tensor of the type ``tf.constant`` returns (its
            payload is extracted with ``.numpy()``).

    Returns:
        A ``tf.train.Feature`` whose ``bytes_list`` contains the single value.
    """
    # A tensor cannot be stored directly; pull out the underlying value first.
    if isinstance(value, type(tf.constant(0))):
        value = value.numpy()
    # BytesList only accepts bytes, so encode text input; this makes the
    # "string" case promised by this helper work for Python str objects.
    if isinstance(value, str):
        value = value.encode('utf-8')
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

def _float_feature(value):
    """Wrap a single float / double in a ``tf.train.Feature`` (``float_list``)."""
    float_list = tf.train.FloatList(value=[value])
    return tf.train.Feature(float_list=float_list)

def _int64_feature(value):
    """Wrap a single bool / enum / int / uint in a ``tf.train.Feature`` (``int64_list``)."""
    int64_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int64_list)

## 簡單測試

In [3]:
# Bytes features: a bytes literal and a UTF-8 encoded text string.
print(
    _bytes_feature(b'test_string'),
    _bytes_feature('test_bytes'.encode('utf-8')),
    sep='\n',
)

# Float feature.
print(_float_feature(np.exp(1)))

# Int64 features: a bool and a plain int.
print(_int64_feature(True), _int64_feature(1), sep='\n')

bytes_list {
  value: "test_string"
}

bytes_list {
  value: "test_bytes"
}

float_list {
  value: 2.7182817459106445
}

int64_list {
  value: 1
}

int64_list {
  value: 1
}



## 序列化(serialization)測試

In [4]:
# Serialize a single Feature message to a byte string.
feature = _float_feature(value=np.exp(1))
feature.SerializeToString()

b'\x12\x06\n\x04T\xf8-@'

## 建立測試資料

In [5]:
# Build four parallel feature columns for the tf.train.Example demo.

# Number of observations in the dataset.
n_observations = 10_000

# Boolean feature: each entry is False or True.
feature0 = np.random.choice([False, True], size=n_observations)

# Integer feature: drawn from 0 to 4 (the upper bound 5 is exclusive).
feature1 = np.random.randint(low=0, high=5, size=n_observations)

# String feature: the animal name selected by the matching feature1 index.
strings = np.array([b'cat', b'dog', b'chicken', b'horse', b'goat'])
feature2 = np.take(strings, feature1)

# Float feature: samples from a standard normal distribution.
feature3 = np.random.standard_normal(n_observations)

## 定義tf.train.Example資料序列化函數

In [6]:
# 序列化(serialization)
def serialize(feature0, feature1, feature2, feature3):
    """Serialize one observation into a ``tf.train.Example`` byte string.

    The four values are stored under the keys ``feature0`` .. ``feature3``
    as int64, int64, bytes and float features respectively.

    Returns:
        The serialized ``tf.train.Example`` message, ready to be written
        to a file.
    """
    # Map each feature name to its tf.train.Example-compatible wrapper.
    features = tf.train.Features(
        feature={
            'feature0': _int64_feature(feature0),
            'feature1': _int64_feature(feature1),
            'feature2': _bytes_feature(feature2),
            'feature3': _float_feature(feature3),
        }
    )
    return tf.train.Example(features=features).SerializeToString()

## 序列化(serialization)測試

In [7]:
# Serialize a single hand-written observation.
# NOTE(review): example_observation is not referenced by any visible cell.
example_observation = []

serialized_example = serialize(
    feature0=False, feature1=4, feature2=b'goat', feature3=0.9876
)
serialized_example

b'\nR\n\x14\n\x08feature2\x12\x08\n\x06\n\x04goat\n\x11\n\x08feature0\x12\x05\x1a\x03\n\x01\x00\n\x11\n\x08feature1\x12\x05\x1a\x03\n\x01\x04\n\x14\n\x08feature3\x12\x08\x12\x06\n\x04[\xd3|?'

## 由序列化的字串建立 tf.train.Example 訊息

In [8]:
# Rebuild the tf.train.Example message from the serialized bytes.
example_proto = tf.train.Example()
example_proto.ParseFromString(serialized_example)
example_proto

features {
  feature {
    key: "feature0"
    value {
      int64_list {
        value: 0
      }
    }
  }
  feature {
    key: "feature1"
    value {
      int64_list {
        value: 4
      }
    }
  }
  feature {
    key: "feature2"
    value {
      bytes_list {
        value: "goat"
      }
    }
  }
  feature {
    key: "feature3"
    value {
      float_list {
        value: 0.9876000285148621
      }
    }
  }
}

## 將一筆記錄寫入 TFRecord 檔案

In [9]:
# Write the single serialized record to a TFRecord file; the `with` block
# closes the writer when the cell finishes.
with tf.io.TFRecordWriter("test.tfrecords") as writer:
    writer.write(serialized_example)

In [10]:
# Open the TFRecord file as a dataset of raw (still serialized) records.
filenames = ["test.tfrecords"]
raw_dataset = tf.data.TFRecordDataset(filenames=filenames)

# Show the serialized records (at most the first 10).
for raw_record in raw_dataset.take(count=10):
    print(f"{raw_record!r}")

<tf.Tensor: shape=(), dtype=string, numpy=b'\nR\n\x14\n\x08feature2\x12\x08\n\x06\n\x04goat\n\x11\n\x08feature0\x12\x05\x1a\x03\n\x01\x00\n\x11\n\x08feature1\x12\x05\x1a\x03\n\x01\x04\n\x14\n\x08feature3\x12\x08\x12\x06\n\x04[\xd3|?'>


## 取得反序列化(Deserialize)的資料

In [11]:
# Describe the shape, dtype and default value of each stored field.
feature_description = dict(
    feature0=tf.io.FixedLenFeature(shape=[], dtype=tf.int64, default_value=0),
    feature1=tf.io.FixedLenFeature(shape=[], dtype=tf.int64, default_value=0),
    feature2=tf.io.FixedLenFeature(shape=[], dtype=tf.string, default_value=''),
    feature3=tf.io.FixedLenFeature(shape=[], dtype=tf.float32, default_value=0.0),
)

# 將 tf.train.Example 訊息轉為 字典(dictionary)
def _parse_function(example_proto):
    """Parse one serialized ``tf.train.Example`` into a dict of tensors.

    Fields are decoded according to the module-level ``feature_description``.
    """
    parsed_features = tf.io.parse_single_example(
        serialized=example_proto, features=feature_description
    )
    return parsed_features

In [12]:
# Deserialize every raw record by mapping the parser over the dataset.
parsed_dataset = raw_dataset.map(map_func=_parse_function)

# Show each parsed record (field name -> tensor), at most the first 10.
for parsed_record in parsed_dataset.take(count=10):
    print(f"{parsed_record!r}")

{'feature0': <tf.Tensor: shape=(), dtype=int64, numpy=0>, 'feature1': <tf.Tensor: shape=(), dtype=int64, numpy=4>, 'feature2': <tf.Tensor: shape=(), dtype=string, numpy=b'goat'>, 'feature3': <tf.Tensor: shape=(), dtype=float32, numpy=0.9876000285148621>}


## TFRecord 實例測試

In [13]:
# Download a TFRecord file published on the official TensorFlow site.
file_path = (
    "https://storage.googleapis.com/download.tensorflow.org/"
    "data/fsns-20160927/testdata/fsns-00000-of-00001"
)
fsns_test_file = tf.keras.utils.get_file(fname="fsns.tfrec", origin=file_path)

# Show where the file was saved.
fsns_test_file

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/fsns-20160927/testdata/fsns-00000-of-00001
7904079/7904079 ━━━━━━━━━━━━━━━━━━━━ 1s 0us/step


'C:\\Users\\mikec\\.keras\\datasets\\fsns.tfrec'

In [14]:
# Read the downloaded TFRecord file.
dataset = tf.data.TFRecordDataset(filenames=[fsns_test_file])

# Take the first record and decode it as a tf.train.Example message.
raw_example = next(iter(dataset))
parsed = tf.train.Example()
parsed.ParseFromString(raw_example.numpy())
parsed.features.feature['image/text']

bytes_list {
  value: "Rue Perreyon"
}