# TextLineDataset 測試

In [1]:
import tensorflow as tf
import numpy as np

## 讀取三個語料庫檔案

In [2]:
# Fetch three translations of the Iliad (get_file returns the local file path).
directory_url = 'https://storage.googleapis.com/download.tensorflow.org/data/illiad/'
file_names = ['cowper.txt', 'derby.txt', 'butler.txt']

file_paths = [
    tf.keras.utils.get_file(fname, origin=directory_url + fname)
    for fname in file_names
]

# Merge all files into a single line-by-line dataset.
ds = tf.data.TextLineDataset(filenames=file_paths)

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/illiad/cowper.txt
[1m815980/815980[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1us/step
Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/illiad/derby.txt
[1m809730/809730[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1us/step
Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/illiad/butler.txt
[1m807992/807992[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1us/step


## 讀取5筆資料

In [3]:
# Show the first 5 lines of the combined dataset as raw bytes.
for line in ds.take(5).as_numpy_iterator():
    print(line)

b"\xef\xbb\xbfAchilles sing, O Goddess! Peleus' son;"
b'His wrath pernicious, who ten thousand woes'
b"Caused to Achaia's host, sent many a soul"
b'Illustrious into Ades premature,'
b'And Heroes gave (so stood the will of Jove)'


## 每個檔案輪流讀取

In [4]:
# interleave: read the files in round-robin order, one line from each in turn.
files_ds = tf.data.Dataset.from_tensor_slices(file_paths)

# Derive the cycle length from the file list instead of hard-coding 3, so the
# demo keeps cycling over every file if the list of file names changes.
n_files = len(file_paths)
lines_ds = files_ds.interleave(tf.data.TextLineDataset, cycle_length=n_files)

# Show 3 rounds, one line per file per round (9 lines for the three files).
n_rounds = 3
for i, line in enumerate(lines_ds.take(n_rounds * n_files)):
    if i % n_files == 0:
        print()  # blank line between rounds
    print(line.numpy())


b"\xef\xbb\xbfAchilles sing, O Goddess! Peleus' son;"
b"\xef\xbb\xbfOf Peleus' son, Achilles, sing, O Muse,"
b'\xef\xbb\xbfSing, O goddess, the anger of Achilles son of Peleus, that brought'

b'His wrath pernicious, who ten thousand woes'
b'The vengeance, deep and deadly; whence to Greece'
b'countless ills upon the Achaeans. Many a brave soul did it send'

b"Caused to Achaia's host, sent many a soul"
b'Unnumbered ills arose; which many a soul'
b'hurrying down to Hades, and many a hero did it yield a prey to dogs and'


## 建立測試資料

In [5]:
# Build synthetic columns for a tf.train.Example message with 4 features.

# Fixed seed so that Restart & Run All regenerates exactly the same arrays.
SEED = 42
rng = np.random.default_rng(SEED)

# The number of observations in the dataset.
n_observations = int(1e4)

# Boolean feature, encoded as False or True.
feature0 = rng.choice([False, True], n_observations)

# Integer feature, random from 0 to 4 (upper bound 5 is exclusive).
feature1 = rng.integers(0, 5, n_observations)

# String feature: look up an animal name for each integer in feature1.
strings = np.array([b'cat', b'dog', b'chicken', b'horse', b'goat'])
feature2 = strings[feature1]

# Float feature, from a standard normal distribution
feature3 = rng.standard_normal(n_observations)

## 定義tf.train.Example資料序列化函數

In [6]:
# The following functions can be used to convert a value to a type compatible
# with tf.train.Example.

def _bytes_feature(value):
    """Returns a bytes_list from a string / byte.

    Args:
        value: A bytes object, a Python str, or an eager tensor (detected by
            comparing against type(tf.constant(0))).

    Returns:
        A tf.train.Feature whose bytes_list holds the single value.
    """
    if isinstance(value, type(tf.constant(0))):
        value = value.numpy() # BytesList won't unpack a string from an EagerTensor.
    if isinstance(value, str):
        # BytesList expects bytes; encode text as UTF-8 so str callers work too.
        value = value.encode('utf-8')
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

def _float_feature(value):
    """Wrap a single float / double in a tf.train.Feature holding a float_list."""
    float_list = tf.train.FloatList(value=[value])
    return tf.train.Feature(float_list=float_list)

def _int64_feature(value):
    """Wrap a single bool / enum / int / uint in a tf.train.Feature holding an int64_list."""
    int64_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int64_list)

In [7]:
# Serialization
def serialize(feature0, feature1, feature2, feature3):
    """
    Build a tf.train.Example from four values and return it as a serialized
    byte string, ready to be written to a file.

    feature0 and feature1 go through _int64_feature, feature2 through
    _bytes_feature and feature3 through _float_feature.
    """
    # Map each feature name to its tf.train.Example-compatible wrapper.
    features = tf.train.Features(feature={
        'feature0': _int64_feature(feature0),
        'feature1': _int64_feature(feature1),
        'feature2': _bytes_feature(feature2),
        'feature3': _float_feature(feature3),
    })

    # Wrap the Features message in an Example and serialize it.
    return tf.train.Example(features=features).SerializeToString()

## 序列化(serialization)測試

In [8]:
# Serialize one hand-written observation and display the resulting bytes.
# (The unused `example_observation` list was removed; nothing read or filled it.)
serialized_example = serialize(False, 4, b'goat', 0.9876)
serialized_example

b'\nR\n\x11\n\x08feature0\x12\x05\x1a\x03\n\x01\x00\n\x11\n\x08feature1\x12\x05\x1a\x03\n\x01\x04\n\x14\n\x08feature2\x12\x08\n\x06\n\x04goat\n\x14\n\x08feature3\x12\x08\x12\x06\n\x04[\xd3|?'

## 由序列化的字串建立 tf.train.Example 訊息

In [9]:
# Rebuild a tf.train.Example message from the serialized bytes.
example_proto = tf.train.Example()
example_proto.ParseFromString(serialized_example)
example_proto

features {
  feature {
    key: "feature0"
    value {
      int64_list {
        value: 0
      }
    }
  }
  feature {
    key: "feature1"
    value {
      int64_list {
        value: 4
      }
    }
  }
  feature {
    key: "feature2"
    value {
      bytes_list {
        value: "goat"
      }
    }
  }
  feature {
    key: "feature3"
    value {
      float_list {
        value: 0.9876000285148621
      }
    }
  }
}

## 將一筆記錄寫入 TFRecord 檔案

In [10]:
# Write the single serialized record to a TFRecord file; the context manager
# closes the file on exit.
with tf.io.TFRecordWriter(path="test.tfrecords") as record_writer:
    record_writer.write(serialized_example)

In [11]:
# Open the TFRecord file as a dataset of serialized records.
filenames = ["test.tfrecords"]
raw_dataset = tf.data.TFRecordDataset(filenames=filenames)

# Print up to 10 records in their still-serialized form.
for raw_record in raw_dataset.take(10):
    print(f"{raw_record!r}")

<tf.Tensor: shape=(), dtype=string, numpy=b'\nR\n\x11\n\x08feature0\x12\x05\x1a\x03\n\x01\x00\n\x11\n\x08feature1\x12\x05\x1a\x03\n\x01\x04\n\x14\n\x08feature2\x12\x08\n\x06\n\x04goat\n\x14\n\x08feature3\x12\x08\x12\x06\n\x04[\xd3|?'>


## 取得反序列化(Deserialize)的資料

In [12]:
# Describe each stored field: scalar shape, dtype, and the default value used
# when the field is missing.
feature_description = {
    'feature0': tf.io.FixedLenFeature(shape=[], dtype=tf.int64, default_value=0),
    'feature1': tf.io.FixedLenFeature(shape=[], dtype=tf.int64, default_value=0),
    'feature2': tf.io.FixedLenFeature(shape=[], dtype=tf.string, default_value=''),
    'feature3': tf.io.FixedLenFeature(shape=[], dtype=tf.float32, default_value=0.0),
}

# Turn a serialized tf.train.Example message into a dictionary.
def _parse_function(example_proto):
    """Parse one serialized Example using the module-level feature_description."""
    parsed_features = tf.io.parse_single_example(
        serialized=example_proto,
        features=feature_description,
    )
    return parsed_features

In [13]:
# Deserialize: apply the parser to every raw record.
parsed_dataset = raw_dataset.map(_parse_function)

# Print the field dictionary of up to 10 parsed records.
for parsed_record in parsed_dataset.take(10):
    print("{!r}".format(parsed_record))

{'feature0': <tf.Tensor: shape=(), dtype=int64, numpy=0>, 'feature1': <tf.Tensor: shape=(), dtype=int64, numpy=4>, 'feature2': <tf.Tensor: shape=(), dtype=string, numpy=b'goat'>, 'feature3': <tf.Tensor: shape=(), dtype=float32, numpy=0.9876000285148621>}


## TFRecord 實例測試

In [14]:
# Download a sample TFRecord file from the official TensorFlow storage bucket.
file_path = (
    "https://storage.googleapis.com/download.tensorflow.org/"
    "data/fsns-20160927/testdata/fsns-00000-of-00001"
)
fsns_test_file = tf.keras.utils.get_file("fsns.tfrec", origin=file_path)

# Display the path returned by get_file.
fsns_test_file

'C:\\Users\\mikec\\.keras\\datasets\\fsns.tfrec'

In [15]:
# Open the downloaded TFRecord file.
dataset = tf.data.TFRecordDataset(filenames=[fsns_test_file])

# Take the first record and decode it into a tf.train.Example message.
raw_example = next(iter(dataset))
parsed = tf.train.Example()
parsed.ParseFromString(raw_example.numpy())

# Display the 'image/text' feature of that record.
parsed.features.feature['image/text']

bytes_list {
  value: "Rue Perreyon"
}

## 讀取鐵達尼文字檔案(.csv)，匯入至TextLineDataset

In [16]:
# Download the Titanic training CSV and expose it as a dataset of text lines.
file_path = "https://storage.googleapis.com/tf-datasets/titanic/train.csv"
titanic_file = tf.keras.utils.get_file("train.csv", origin=file_path)
titanic_lines = tf.data.TextLineDataset(filenames=titanic_file)

Downloading data from https://storage.googleapis.com/tf-datasets/titanic/train.csv
[1m30874/30874[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3us/step


In [17]:
# Show the first 10 lines of the CSV as raw bytes.
for line in titanic_lines.take(10).as_numpy_iterator():
    print(line)

b'survived,sex,age,n_siblings_spouses,parch,fare,class,deck,embark_town,alone'
b'0,male,22.0,1,0,7.25,Third,unknown,Southampton,n'
b'1,female,38.0,1,0,71.2833,First,C,Cherbourg,n'
b'1,female,26.0,0,0,7.925,Third,unknown,Southampton,y'
b'1,female,35.0,1,0,53.1,First,C,Southampton,n'
b'0,male,28.0,0,0,8.4583,Third,unknown,Queenstown,y'
b'0,male,2.0,3,1,21.075,Third,unknown,Southampton,n'
b'1,female,27.0,0,2,11.1333,Third,unknown,Southampton,n'
b'1,female,14.0,1,0,30.0708,Second,unknown,Cherbourg,n'
b'1,female,4.0,1,1,16.7,Third,G,Southampton,n'


## 篩選生存者的資料

In [18]:
# Predicate that selects survivor rows.
def survived(line):
    """Return a boolean tensor: True when the first character of `line` is not "0"."""
    first_char = tf.strings.substr(line, 0, 1)
    return tf.math.not_equal(first_char, "0")

# Drop the first line (the CSV header), then keep only rows the predicate accepts.
data_lines = titanic_lines.skip(1)
survivors = data_lines.filter(survived)

# Show the first 10 matching rows as raw bytes.
for line in survivors.take(10).as_numpy_iterator():
    print(line)

b'1,female,38.0,1,0,71.2833,First,C,Cherbourg,n'
b'1,female,26.0,0,0,7.925,Third,unknown,Southampton,y'
b'1,female,35.0,1,0,53.1,First,C,Southampton,n'
b'1,female,27.0,0,2,11.1333,Third,unknown,Southampton,n'
b'1,female,14.0,1,0,30.0708,Second,unknown,Cherbourg,n'
b'1,female,4.0,1,1,16.7,Third,G,Southampton,n'
b'1,male,28.0,0,0,13.0,Second,unknown,Southampton,y'
b'1,female,28.0,0,0,7.225,Third,unknown,Cherbourg,y'
b'1,male,28.0,0,0,35.5,First,A,Southampton,y'
b'1,female,38.0,1,5,31.3875,Third,unknown,Southampton,n'


## TextLineDataset 結合 DataFrame

In [19]:
import pandas as pd

# Load the same CSV into a DataFrame; pandas' default index_col (None) is used.
df = pd.read_csv(titanic_file)
df.head(5)

Unnamed: 0,survived,sex,age,n_siblings_spouses,parch,fare,class,deck,embark_town,alone
0,0,male,22.0,1,0,7.25,Third,unknown,Southampton,n
1,1,female,38.0,1,0,71.2833,First,C,Cherbourg,n
2,1,female,26.0,0,0,7.925,Third,unknown,Southampton,y
3,1,female,35.0,1,0,53.1,First,C,Southampton,n
4,0,male,28.0,0,0,8.4583,Third,unknown,Queenstown,y


In [20]:
# Convert the DataFrame to a Dataset: dict(df) maps each column name to its values.
ds = tf.data.Dataset.from_tensor_slices(dict(df))

# Print every field of the first element, with names padded to 20 characters.
for feature_batch in ds.take(1):
    for key, value in feature_batch.items():
        print("{:20s}: {}".format(key, value))

survived            : 0
sex                 : b'male'
age                 : 22.0
n_siblings_spouses  : 1
parch               : 0
fare                : 7.25
class               : b'Third'
deck                : b'unknown'
embark_town         : b'Southampton'
alone               : b'n'
