In [1]:
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf

from tensorflow import keras

# Record the environment this notebook ran in: the TensorFlow version first,
# then the interpreter version, then name/version for each imported library.
print(tf.__version__)
print(sys.version_info)
for lib in (mpl, np, pd, sklearn, tf, keras):
    print(lib.__name__, lib.__version__)

2.2.0
sys.version_info(major=3, minor=8, micro=2, releaselevel='final', serial=0)
matplotlib 3.2.1
numpy 1.18.5
pandas 1.0.4
sklearn 0.23.1
tensorflow 2.2.0
tensorflow.keras 2.3.0-tf


In [4]:
# The TFRecord format is a simple format for storing a sequence of binary records.

# Each title is encoded to UTF-8 bytes before being placed in a BytesList.
favorite_books = [
    title.encode('utf-8')
    for title in ['machine learning', 'cc150']
]
print(favorite_books)

favorite_books_bytelist = tf.train.BytesList(value=favorite_books)
print(favorite_books_bytelist)
print(type(favorite_books_bytelist))  # the type is BytesList

[b'machine learning', b'cc150']
value: "machine learning"
value: "cc150"

<class 'tensorflow.core.example.feature_pb2.BytesList'>


In [7]:
# Wrap the float values in a FloatList message.
hours_floatlist = tf.train.FloatList(
    value=[15.5, 6.9, 4.0, 5.3],
)
print(hours_floatlist)
print(type(hours_floatlist))  # the type is FloatList

value: 15.5
value: 6.900000095367432
value: 4.0
value: 5.300000190734863

<class 'tensorflow.core.example.feature_pb2.FloatList'>


In [10]:
# Wrap the single integer value in an Int64List message.
age_int64list = tf.train.Int64List(
    value=[42],
)
print(age_int64list)
print(type(age_int64list))  # the type is Int64List

value: 42

<class 'tensorflow.core.example.feature_pb2.Int64List'>


In [15]:
# Bundle the three typed lists into a Features message keyed by feature name.
# Each list is wrapped in a Feature through the field matching its type.
features = tf.train.Features(feature={
    'favorite_books': tf.train.Feature(bytes_list=favorite_books_bytelist),
    'hours': tf.train.Feature(float_list=hours_floatlist),
    'age': tf.train.Feature(int64_list=age_int64list),
})
print(features)
print(type(features))  # Features is similar to JSON

feature {
  key: "age"
  value {
    int64_list {
      value: 42
    }
  }
}
feature {
  key: "favorite_books"
  value {
    bytes_list {
      value: "machine learning"
      value: "cc150"
    }
  }
}
feature {
  key: "hours"
  value {
    float_list {
      value: 15.5
      value: 6.900000095367432
      value: 4.0
      value: 5.300000190734863
    }
  }
}

<class 'tensorflow.core.example.feature_pb2.Features'>


In [16]:
# An Example message wraps the Features message built above.
example = tf.train.Example(
    features=features,
)
print(example)
print(type(example))  # Example

features {
  feature {
    key: "age"
    value {
      int64_list {
        value: 42
      }
    }
  }
  feature {
    key: "favorite_books"
    value {
      bytes_list {
        value: "machine learning"
        value: "cc150"
      }
    }
  }
  feature {
    key: "hours"
    value {
      float_list {
        value: 15.5
        value: 6.900000095367432
        value: 4.0
        value: 5.300000190734863
      }
    }
  }
}

<class 'tensorflow.core.example.example_pb2.Example'>


In [19]:
# Serialize the Example message; the result is a Python bytes object.
serialized_example = example.SerializeToString()
print(serialized_example)
print(
    type(serialized_example)
)  # bytes

b'\n\\\n-\n\x0efavorite_books\x12\x1b\n\x19\n\x10machine learning\n\x05cc150\n\x0c\n\x03age\x12\x05\x1a\x03\n\x01*\n\x1d\n\x05hours\x12\x14\x12\x12\n\x10\x00\x00xA\xcd\xcc\xdc@\x00\x00\x80@\x9a\x99\xa9@'
<class 'bytes'>


In [24]:
# Write a TFRecord file.

# Directory that will hold the output file. os.makedirs(exist_ok=True) creates
# it (and any missing parent directories) and does nothing if it already
# exists, so there is no separate exists() check that could race with another
# process creating the same directory.
output_dir = 'tfrecord_basic'
os.makedirs(output_dir, exist_ok=True)

filename = 'test.tfrecords'
filename_fullpath = os.path.join(output_dir, filename)

# tf.io.TFRecordWriter is used as a context manager so the file is closed
# when the block exits.
with tf.io.TFRecordWriter(filename_fullpath) as writer:
    # Write the same serialized Example three times: the file holds 3 records.
    for i in range(3):
        writer.write(serialized_example)

In [26]:
# Read the TFRecord file back as a dataset of serialized records.
dataset = tf.data.TFRecordDataset([filename_fullpath])

# Each element is a scalar string tensor holding one serialized record.
for raw_record in dataset:
    print(raw_record)
    print(type(raw_record))  # EagerTensor

tf.Tensor(b'\n\\\n-\n\x0efavorite_books\x12\x1b\n\x19\n\x10machine learning\n\x05cc150\n\x0c\n\x03age\x12\x05\x1a\x03\n\x01*\n\x1d\n\x05hours\x12\x14\x12\x12\n\x10\x00\x00xA\xcd\xcc\xdc@\x00\x00\x80@\x9a\x99\xa9@', shape=(), dtype=string)
<class 'tensorflow.python.framework.ops.EagerTensor'>
tf.Tensor(b'\n\\\n-\n\x0efavorite_books\x12\x1b\n\x19\n\x10machine learning\n\x05cc150\n\x0c\n\x03age\x12\x05\x1a\x03\n\x01*\n\x1d\n\x05hours\x12\x14\x12\x12\n\x10\x00\x00xA\xcd\xcc\xdc@\x00\x00\x80@\x9a\x99\xa9@', shape=(), dtype=string)
<class 'tensorflow.python.framework.ops.EagerTensor'>
tf.Tensor(b'\n\\\n-\n\x0efavorite_books\x12\x1b\n\x19\n\x10machine learning\n\x05cc150\n\x0c\n\x03age\x12\x05\x1a\x03\n\x01*\n\x1d\n\x05hours\x12\x14\x12\x12\n\x10\x00\x00xA\xcd\xcc\xdc@\x00\x00\x80@\x9a\x99\xa9@', shape=(), dtype=string)
<class 'tensorflow.python.framework.ops.EagerTensor'>


In [27]:
# Parsing schema: maps each feature name to the spec used to decode it.
# 'favorite_books' and 'hours' are variable-length features; 'age' is a
# fixed-length feature with scalar shape [].
expected_features = {
    'favorite_books': tf.io.VarLenFeature(dtype=tf.string),
    'hours': tf.io.VarLenFeature(dtype=tf.float32),
    'age': tf.io.FixedLenFeature([], dtype=tf.int64),
}

In [29]:
# Open the TFRecord file written earlier as a dataset of serialized records.
dataset = tf.data.TFRecordDataset([filename_fullpath])

# Parse every serialized record into a dict of tensors using expected_features.
for serialized_example_tensor in dataset:
    # The result is bound to parsed_example, not example, so the
    # tf.train.Example message named `example` is not overwritten by a dict;
    # otherwise re-running the example.SerializeToString() cell would fail.
    parsed_example = tf.io.parse_single_example(
        serialized_example_tensor,
        expected_features
    )
    print(parsed_example)

    # 'favorite_books' is parsed as a SparseTensor; tf.sparse.to_dense converts
    # it to a dense tensor, using default_value for any unspecified entries.
    books = tf.sparse.to_dense(parsed_example['favorite_books'],
                               default_value=b'')

    # Each element is a bytes scalar; decode it back to text for display.
    for book in books:
        print(book.numpy().decode('utf-8'))

{'favorite_books': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x7ff76538a520>, 'hours': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x7ff765533310>, 'age': <tf.Tensor: shape=(), dtype=int64, numpy=42>}
machine learning
cc150
{'favorite_books': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x7ff7658cd8b0>, 'hours': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x7ff765864b20>, 'age': <tf.Tensor: shape=(), dtype=int64, numpy=42>}
machine learning
cc150
{'favorite_books': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x7ff765533310>, 'hours': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x7ff7651c1c10>, 'age': <tf.Tensor: shape=(), dtype=int64, numpy=42>}
machine learning
cc150


In [30]:
# Write the same records again, this time GZIP-compressed.
# The output is a gzip stream, not a ZIP archive, so the path gets a '.gz'
# suffix. The variable keeps its original name because later cells read it.
filename_fullpath_zip = filename_fullpath + '.gz'

# Compression settings for the writer.
options = tf.io.TFRecordOptions(compression_type='GZIP')

# The writer is used as a context manager so the file is closed on exit.
with tf.io.TFRecordWriter(filename_fullpath_zip, options=options) as writer:
    # Three copies of the same serialized Example, matching the uncompressed file.
    for i in range(3):
        writer.write(serialized_example)

In [31]:
# Read the compressed file.
# compression_type is set to the same value the file was written with.
dataset_zip = tf.data.TFRecordDataset([filename_fullpath_zip],
                                      compression_type='GZIP')

for serialized_example_tensor in dataset_zip:
    # Bound to parsed_example, not example, so the tf.train.Example message
    # named `example` is not replaced by a dict of tensors.
    parsed_example = tf.io.parse_single_example(
        serialized_example_tensor,
        expected_features)
    # 'favorite_books' is a SparseTensor; convert it to a dense tensor.
    books = tf.sparse.to_dense(parsed_example['favorite_books'],
                               default_value=b'')
    # Each element is a bytes scalar; decode it back to text for display.
    for book in books:
        print(book.numpy().decode('utf-8'))

machine learning
cc150
machine learning
cc150
machine learning
cc150
