In [1]:
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf

from tensorflow import keras

print(tf.__version__)
print(sys.version_info)
for module in mpl, np, pd, sklearn, tf, keras:
    print(module.__name__, module.__version__)

2.2.0
sys.version_info(major=3, minor=8, micro=2, releaselevel='final', serial=0)
matplotlib 3.2.1
numpy 1.18.5
pandas 1.0.4
sklearn 0.23.1
tensorflow 2.2.0
tensorflow.keras 2.3.0-tf


In [21]:
source_dir = './generate_csv/'

def get_filenames_by_prefix(source_dir, prefix_name):
    all_files = os.listdir(source_dir)
    results = []
    for filename in all_files:
        if filename.startswith(prefix_name):
            results.append(os.path.join(source_dir, filename))
    return results

train_filenames = get_filenames_by_prefix(source_dir, 'train')
valid_filenames = get_filenames_by_prefix(source_dir, 'valid')
test_filenames = get_filenames_by_prefix(source_dir, 'test')

import pprint

pprint.pprint(train_filenames)
pprint.pprint(valid_filenames)
pprint.pprint(test_filenames)

print(len(train_filenames))
print(len(valid_filenames))
print(len(test_filenames))


['./generate_csv/train_15.csv',
 './generate_csv/train_01.csv',
 './generate_csv/train_00.csv',
 './generate_csv/train_14.csv',
 './generate_csv/train_02.csv',
 './generate_csv/train_16.csv',
 './generate_csv/train_17.csv',
 './generate_csv/train_03.csv',
 './generate_csv/train_07.csv',
 './generate_csv/train_13.csv',
 './generate_csv/train_12.csv',
 './generate_csv/train_06.csv',
 './generate_csv/train_10.csv',
 './generate_csv/train_04.csv',
 './generate_csv/train_05.csv',
 './generate_csv/train_11.csv',
 './generate_csv/train_08.csv',
 './generate_csv/train_09.csv',
 './generate_csv/train_19.csv',
 './generate_csv/train_18.csv']
['./generate_csv/valid_01.csv',
 './generate_csv/valid_00.csv',
 './generate_csv/valid_02.csv',
 './generate_csv/valid_03.csv',
 './generate_csv/valid_07.csv',
 './generate_csv/valid_06.csv',
 './generate_csv/valid_04.csv',
 './generate_csv/valid_05.csv',
 './generate_csv/valid_08.csv',
 './generate_csv/valid_09.csv']
[]
20
10
0


In [18]:
def parse_csv_line(line, n_fields=9):
    defs = [tf.constant(np.nan)] * n_fields
    parsed_fields = tf.io.decode_csv(line, record_defaults=defs)
    x = tf.stack(parsed_fields[0:-1])
    y = tf.stack(parsed_fields[-1:])
    return x, y

# 创建解析默认参考类型的列表，一一对应
# 使用tf.io.decode_csv()将每一行都进行解析，得到这些数据组成的列表
# 使用tf.stack()将特征值和目标值拼接起来得到ndarray

def csv_reader_dataset(filenames, n_readers=5,
                      batch_size=32, n_parse_threads=5,
                      shuffle_buffer_size=10000):
    dataset = tf.data.Dataset.list_files(filenames)
    dataset = dataset.repeat()
    dataset = dataset.interleave(
        lambda filename: tf.data.TextLineDataset(filename).skip(1),
        cycle_length=n_readers
    )
    dataset.shuffle(shuffle_buffer_size)
    dataset = dataset.map(parse_csv_line,
                         num_parallel_calls=n_parse_threads)
    dataset = dataset.batch(batch_size)
    return dataset

# 读取csv文件，获取其中的数据，filenames是不同数据集文件切片组成的文件名数据集
# n_readers是使用interleave()时，循环获取数据的个数，batch_size是每批次获取数据个数，
# n_parse_threads是每行获取数据的数量，shuffle_buffer_size是数据混洗的容量

batch_size = 32
train_set = csv_reader_dataset(train_filenames, batch_size=batch_size)
valid_set = csv_reader_dataset(valid_filenames, batch_size=batch_size)
test_set = csv_reader_dataset(test_filenames, batch_size=batch_size)


InvalidArgumentError: Expected 'tf.Tensor(False, shape=(), dtype=bool)' to be true. Summarized data: b'No files matched pattern: '

In [None]:
def serilizer_example(x, y):
    input_features = tf.train.FloatList(value=x)
    label = tf.train.FloatList(value=y)
    features = tf.train.Features(
        feature = {
            'input_features': tf.train.Feature(
                float_list=input_features),
            'lable': tf.train.Feature(float_list=label)
        }
    )
    
    example = tf.train.Example(features=features)
    return example.SerializerToString()


def csv_dataset_to_tfrecords(base_filename, dataset,
                            n_shard, steps_per_shard,
                            compression_type=None):
    options=tf.io.TFRecordOptions(
        compression_type=compression_type)
    all_filenames=[]
    
    for shard_id in range(n_shards):
        filename_fullpath = '{}_{:05d}-of-{:05d}'.format(
            base_filename, shard_id)
    
        with tf.io.TFRecordWriter(filename_fullpath, options) as writer:
            for x_batch, y_batch in dataset.skip(shard_id * steps_per_shard).take(steps_per_shard):
                for x_example, y_example in zip(x_batch, y_batch):
                    writer.write(
                        serialize_example(x_example, y_example))
        all_filenames.append(filename_fullpath)
    
    return all_filenames

n_shards = 20
train_steps_per_shard = 11610 // batch_size // n_shards
valid_steps_per_shard = 3880 // batch_size // n_shards
test_steps_per_shard = 5170 // batch_size // n_shards

output_dir = 'generate_tfrecords'
if not os.path.exists(output_dir):
    os.mkdir(output_dir)

train_basename = os.path.join(output_dir, "train")
valid_basename = os.path.join(output_dir, "valid")
test_basename = os.path.join(output_dir, "test")    
    
train_tfrecord_filenames = csv_dataset_to_tfrecords(
    train_basename, train_set, n_shards, train_steps_per_shard,
    compression_type = "GZIP")
valid_tfrecord_filenames = csv_dataset_to_tfrecords(
    valid_basename, valid_set, n_shards, valid_steps_per_shard,
    compression_type = "GZIP")
test_tfrecord_fielnames = csv_dataset_to_tfrecords(
    test_basename, test_set, n_shards, test_steps_per_shard,
    compression_type = "GZIP")
    
    

In [None]:
pprint.pprint(train_tfrecord_filenames)
pprint.pprint(valid_tfrecord_filenames)
pprint.pprint(test_tfrecord_fielnames)