In [1]:
import tensorflow as tf
import numpy as np
import os
import time

In [2]:
# Build a dataset with one element per entry of X (the integers 0..9).
X = tf.range(10)
dataset = tf.data.Dataset.from_tensor_slices(X)
print(dataset)

# Show each element both as a tensor and as its plain NumPy value.
for element in dataset:
    print(element, element.numpy())

<TensorSliceDataset shapes: (), types: tf.int32>
tf.Tensor(0, shape=(), dtype=int32) 0
tf.Tensor(1, shape=(), dtype=int32) 1
tf.Tensor(2, shape=(), dtype=int32) 2
tf.Tensor(3, shape=(), dtype=int32) 3
tf.Tensor(4, shape=(), dtype=int32) 4
tf.Tensor(5, shape=(), dtype=int32) 5
tf.Tensor(6, shape=(), dtype=int32) 6
tf.Tensor(7, shape=(), dtype=int32) 7
tf.Tensor(8, shape=(), dtype=int32) 8
tf.Tensor(9, shape=(), dtype=int32) 9


In [3]:
# NOTE: this cell rebinds `dataset` from its previous value, so re-running it
# without re-running the cell above stacks another repeat/batch on top.
repeated = dataset.repeat(3)
# drop_remainder=True discards the final batch if it has fewer than 7 elements.
dataset = repeated.batch(7, drop_remainder=True)
print(dataset)

for batch in dataset:
    print(batch, batch.numpy())

<BatchDataset shapes: (7,), types: tf.int32>
tf.Tensor([0 1 2 3 4 5 6], shape=(7,), dtype=int32) [0 1 2 3 4 5 6]
tf.Tensor([7 8 9 0 1 2 3], shape=(7,), dtype=int32) [7 8 9 0 1 2 3]
tf.Tensor([4 5 6 7 8 9 0], shape=(7,), dtype=int32) [4 5 6 7 8 9 0]
tf.Tensor([1 2 3 4 5 6 7], shape=(7,), dtype=int32) [1 2 3 4 5 6 7]


In [4]:
def _double(batch):
    """Return the given batch with every value multiplied by 2."""
    return batch * 2


# NOTE: rebinds `dataset`; re-running this cell alone doubles the values again.
dataset = dataset.map(_double, num_parallel_calls=4)
print(dataset)

for batch in dataset:
    print(batch, batch.numpy())

<ParallelMapDataset shapes: (7,), types: tf.int32>
tf.Tensor([ 0  2  4  6  8 10 12], shape=(7,), dtype=int32) [ 0  2  4  6  8 10 12]
tf.Tensor([14 16 18  0  2  4  6], shape=(7,), dtype=int32) [14 16 18  0  2  4  6]
tf.Tensor([ 8 10 12 14 16 18  0], shape=(7,), dtype=int32) [ 8 10 12 14 16 18  0]
tf.Tensor([ 2  4  6  8 10 12 14], shape=(7,), dtype=int32) [ 2  4  6  8 10 12 14]


In [5]:
# Flatten the batches back into individual elements. `unbatch` is a regular
# Dataset method, so call it directly instead of routing the unbound method
# through `apply`.
dataset = dataset.unbatch()
print(dataset)


<_UnbatchDataset shapes: (), types: tf.int32>


In [6]:
# Keep only the elements strictly greater than 10.
dataset = dataset.filter(lambda value: tf.greater(value, 10))
print(dataset)

for element in dataset:
    print(element, element.numpy())

<FilterDataset shapes: (), types: tf.int32>
tf.Tensor(12, shape=(), dtype=int32) 12
tf.Tensor(14, shape=(), dtype=int32) 14
tf.Tensor(16, shape=(), dtype=int32) 16
tf.Tensor(18, shape=(), dtype=int32) 18
tf.Tensor(12, shape=(), dtype=int32) 12
tf.Tensor(14, shape=(), dtype=int32) 14
tf.Tensor(16, shape=(), dtype=int32) 16
tf.Tensor(18, shape=(), dtype=int32) 18
tf.Tensor(12, shape=(), dtype=int32) 12
tf.Tensor(14, shape=(), dtype=int32) 14


In [7]:
# Look at just the first three elements; `dataset` itself is not rebound here.
first_three = dataset.take(3)
for element in first_three:
    print(element, element.numpy())

tf.Tensor(12, shape=(), dtype=int32) 12
tf.Tensor(14, shape=(), dtype=int32) 14
tf.Tensor(16, shape=(), dtype=int32) 16


In [8]:
# Fresh pipeline: 0..9 repeated three times, shuffled through a 5-element
# buffer with a fixed seed, then grouped into batches of up to 7 elements.
dataset = (
    tf.data.Dataset.range(10)
    .repeat(3)
    .shuffle(buffer_size=5, seed=42)
    .batch(7)
)
print(dataset)

for batch in dataset:
    print(batch, batch.numpy())

<BatchDataset shapes: (None,), types: tf.int64>
tf.Tensor([0 2 3 6 7 9 4], shape=(7,), dtype=int64) [0 2 3 6 7 9 4]
tf.Tensor([5 0 1 1 8 6 5], shape=(7,), dtype=int64) [5 0 1 1 8 6 5]
tf.Tensor([4 8 7 1 2 3 0], shape=(7,), dtype=int64) [4 8 7 1 2 3 0]
tf.Tensor([5 4 2 7 8 9 9], shape=(7,), dtype=int64) [5 4 2 7 8 9 9]
tf.Tensor([3 6], shape=(2,), dtype=int64) [3 6]


### Split the California housing dataset into multiple CSV files
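Let's start by loading and preparing the California housing dataset. We first load it, then split it into a training set, a validation set and a test set, and finally we fit a `StandardScaler` on the training set to compute the per-feature mean and standard deviation that will be used for scaling: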

In [9]:
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split

housing = fetch_california_housing()

# Hold out a test set first, then carve a validation set out of what remains.
# The target is reshaped into a single column before splitting.
X_train_all, X_test, y_train_all, y_test = train_test_split(
    housing.data, housing.target.reshape(-1, 1), random_state=42
)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_all, y_train_all, random_state=42
)

# Fit on the training features only and keep the per-feature statistics.
scaler = StandardScaler().fit(X_train)
X_mean, X_std = scaler.mean_, scaler.scale_
print('X_mean:', X_mean)
print('X_std:', X_std)

NameError: name 'y_train_all' is not defined

For a very large dataset that does not fit in memory, you will typically want to split it into many files first, then have TensorFlow read these files in parallel. To demonstrate this, let's start by splitting the housing dataset and saving it to multiple CSV files (20 for the training set, and 10 each for the validation and test sets):

In [None]:
def _format_csv_value(value):
    """Render a single cell value as plain CSV text.

    NumPy >= 2.0 changed the ``repr`` of its scalars to forms such as
    ``np.float64(1.5)``, which would corrupt the file. ``str()`` of a NumPy
    scalar still yields the bare value, so NumPy scalars go through ``str()``
    and everything else keeps using ``repr()``.
    """
    if isinstance(value, np.generic):
        return str(value)
    return repr(value)


def save_to_multiple_csv_files(data, name_prefix, header=None, n_parts=10):
    """Split `data` row-wise into `n_parts` CSV files and return their paths.

    Files are written under ``datasets/housing`` (created if missing) and are
    named ``my_<name_prefix>_<NN>.csv``, where ``NN`` is the zero-padded part
    index. Rows are distributed with ``np.array_split``, so the parts do not
    need to be of equal size.

    Args:
        data: Sized, indexable collection of rows (e.g. a 2-D NumPy array);
            ``data[i]`` must be iterable over the column values of row ``i``.
        name_prefix: Label embedded in every file name (e.g. ``"train"``).
        header: Optional header line written at the top of every file,
            without a trailing newline. Skipped when ``None``.
        n_parts: Number of files to produce.

    Returns:
        List of the file paths that were written, in part order.
    """
    housing_dir = os.path.join("datasets", "housing")
    os.makedirs(housing_dir, exist_ok=True)
    path_format = os.path.join(housing_dir, "my_{}_{:02d}.csv")

    filepaths = []
    m = len(data)
    for file_idx, row_indices in enumerate(np.array_split(np.arange(m), n_parts)):
        part_csv = path_format.format(name_prefix, file_idx)
        filepaths.append(part_csv)
        with open(part_csv, "wt", encoding="utf-8") as f:
            if header is not None:
                f.write(header)
                f.write("\n")
            for row_idx in row_indices:
                f.write(",".join(_format_csv_value(col) for col in data[row_idx]))
                f.write("\n")
    return filepaths

# Append the target column to each feature matrix so every CSV row carries
# both the inputs and the label.
train_data = np.c_[X_train, y_train]
valid_data = np.c_[X_valid, y_valid]
test_data = np.c_[X_test, y_test]

# Column names: the dataset's feature names followed by the target column.
header_cols = [*housing.feature_names, "MedianHouseValue"]
header = ",".join(header_cols)

# The training set is split into 20 files; validation and test into 10 each.
train_filepaths = save_to_multiple_csv_files(
    train_data, "train", header=header, n_parts=20
)
valid_filepaths = save_to_multiple_csv_files(
    valid_data, "valid", header=header, n_parts=10
)
test_filepaths = save_to_multiple_csv_files(
    test_data, "test", header=header, n_parts=10
)

In [None]:
import pandas as pd
# Preview the first rows of the second test file to check the CSV layout.
second_test_file = test_filepaths[1]
pd.read_csv(second_test_file).head()