# The Data API

In [None]:
X = tf.range(10)
dataset = tf.data.Dataset.from_tensor_slices(X)
dataset

In [None]:
for item in dataset:
  print(item)

## Chaining Transformations

In [None]:
# Repeat the source elements three times, then group them into batches of 7.
dataset = dataset.repeat(3)
dataset = dataset.batch(7)
for item in dataset:
  print(item)

In [None]:
dataset = dataset.map(lambda x: x * 2)

In [None]:
# Split every batch back into its individual elements. Dataset.unbatch() is
# the built-in method that supersedes tf.data.experimental.unbatch().
dataset = dataset.unbatch()

In [None]:
dataset = dataset.filter(lambda x: x < 10)

In [None]:
for item in dataset.take(3):
  print(item)

## Shuffling the Data

In [None]:
# Integers 0..9 repeated three times, shuffled through a 5-element buffer
# with a fixed seed, then grouped into batches of 7.
dataset = tf.data.Dataset.range(10)
dataset = dataset.repeat(3)
dataset = dataset.shuffle(buffer_size=5, seed=42)
dataset = dataset.batch(7)
for item in dataset:
  print(item)

### Interleaving Lines From Multiple Files

In [None]:
train_filepaths = ["..", ".."]

filepath_dataset = tf.data.Dataset.list_files(train_filepaths, seed=42)

In [None]:
n_readers = 5

# Read lines from `n_readers` files at a time; skip(1) drops the first line
# of every file (presumably a header row -- confirm against the CSV files).
dataset = filepath_dataset.interleave(
    lambda filepath: tf.data.TextLineDataset(filepath).skip(1),
    cycle_length=n_readers,
)

# Peek at the first five interleaved lines.
for line in dataset.take(5):
  print(line.numpy())

### Preprocessing the Data

In [None]:
X_mean, X_std = [...]
n_inputs = 8

def preprocess(line):
  """Parse one CSV line into a standardized feature vector and its target.

  Args:
    line: a single CSV record holding `n_inputs` feature fields followed by
      one target field.

  Returns:
    A pair `(x, y)`: `x` is the stacked feature fields standardized with the
    module-level `X_mean` and `X_std`; `y` is the last field stacked into a
    one-element tensor.
  """
  # Feature fields fall back to 0.0; the empty float32 constant supplies no
  # default for the last field (tf.io.decode_csv treats it as required).
  feature_defaults = [0.] * n_inputs
  target_default = tf.constant([], dtype=tf.float32)
  columns = tf.io.decode_csv(
      line, record_defaults=feature_defaults + [target_default])
  features = tf.stack(columns[:-1])
  target = tf.stack(columns[-1:])
  standardized = (features - X_mean) / X_std
  return standardized, target

In [None]:
preprocess(b"4.2083, 44.0, 5.3232, 0.9171, 846.0, 2.3370, 37.47, -122.2, 2.782")

### Putting Everything Together

In [None]:
def csv_reader_dataset(filepaths, repeat=None, n_readers=5,
                       n_read_thread=None, shuffle_buffer_size=10000,
                       n_parse_threads=5, batch_size=32):
  """Build a shuffled, preprocessed and batched dataset from CSV files.

  Args:
    filepaths: file paths or patterns passed to `tf.data.Dataset.list_files`.
    repeat: passed to `Dataset.repeat` on the list of files.
    n_readers: `cycle_length` of the interleave, i.e. how many files are
      read from at a time.
    n_read_thread: `num_parallel_calls` used by the interleave.
    shuffle_buffer_size: buffer size passed to `Dataset.shuffle`.
    n_parse_threads: `num_parallel_calls` used when mapping `preprocess`.
    batch_size: number of preprocessed records per batch.

  Returns:
    The batched dataset with `prefetch(1)` applied.
  """
  dataset = tf.data.Dataset.list_files(filepaths).repeat(repeat)
  # skip(1) drops the first line of every file before interleaving.
  dataset = dataset.interleave(
      lambda filepath: tf.data.TextLineDataset(filepath).skip(1),
      cycle_length=n_readers, num_parallel_calls=n_read_thread)
  dataset = dataset.shuffle(shuffle_buffer_size)
  dataset = dataset.map(preprocess, num_parallel_calls=n_parse_threads)
  dataset = dataset.batch(batch_size)
  return dataset.prefetch(1)

### Prefetching

### Using the Dataset With tf.keras

In [None]:
train_set = csv_reader_dataset(train_filepaths, repeat=None)
valid_set = csv_reader_dataset(valid_filepaths)
test_set = csv_reader_dataset(test_filepaths)

In [None]:
model = keras.models.Sequential([...])
model.compile([...])
model.fit(train_set, steps_per_epoch=len(X_train) // batch_size, epochs=10,
          validation_data = valid_set,
          validation_steps = len(X_valid) // batch_size)

In [None]:
# Evaluate on the test set, then predict on the new instances; `steps`
# covers only the whole batches in each set.
model.evaluate(test_set, steps=len(X_test) // batch_size)
model.predict(new_set, steps=len(X_new) // batch_size)

In [None]:
def train(model, optimizer, loss_fn, n_epochs, **reader_kwargs):
  """Run a custom training loop over the CSV training set.

  Args:
    model: callable model exposing `losses` and `trainable_variables`.
    optimizer: optimizer exposing `apply_gradients`.
    loss_fn: called as `loss_fn(y_batch, y_pred)`; its result is averaged
      to obtain the main loss.
    n_epochs: passed as `repeat` to `csv_reader_dataset`.
    **reader_kwargs: extra keyword arguments forwarded unchanged to
      `csv_reader_dataset` (for example `batch_size`).
  """
  train_set = csv_reader_dataset(train_filepaths, repeat=n_epochs,
                                 **reader_kwargs)
  for X_batch, y_batch in train_set:
    # Record the forward pass so the loss can be differentiated.
    with tf.GradientTape() as tape:
      y_pred = model(X_batch)
      main_loss = tf.reduce_mean(loss_fn(y_batch, y_pred))
      # Add the model's own extra losses (`model.losses`) to the main loss.
      loss = tf.add_n([main_loss] + model.losses)
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))

## The TFRecord Format

In [None]:
with tf.io.TFRecordWriter("my_data.tfrecord") as f:
  f.write(b"This is the first record")
  f.write(b"And this is the second record")

In [None]:
filepaths = ["my_data.tfrecord"]
dataset = tf.data.TFRecordDataset(filepaths)
for item in dataset:
  print(item)

### Compressed TFRecord Files

In [None]:
options = tf.io.TFRecordOptions(compression_type="GZIP")
with tf.io.TFRecordWriter("my_compressed.tfrecord", options) as f:
  [...]

In [None]:
dataset = tf.data.TFRecordDataset(["my_compressed.tfrecord"],
                                  compression_type="GZIP")

### A Brief Introduction to Protocol Buffers

In [None]:
# The protobuf definition below is not Python. It belongs in its own file
# (e.g. person.proto) and is compiled into the `person_pb2` module with:
#   protoc person.proto --python_out=.
#
#   syntax = "proto3";
#   message Person {
#       string name = 1;
#       int32 id = 2;
#       repeated string email = 3;
#   }

from person_pb2 import Person
person = Person(name="Al", id=123, email=["a@b.com"])
print(person)

In [None]:
person.name

In [None]:
person.name = "Alice"

In [None]:
person.email[0]

In [None]:
person.email.append("c@d.com")
s = person.SerializeToString()
s

In [None]:
person2 = Person()
person2.ParseFromString(s)

In [None]:
person == person2

## The Features API

In [None]:
housing_median_age = tf.feature_column.numeric_column("housing_median_age")

In [None]:
age_mean, age_std = X_mean[1], X_std[1]
housing_median_age = tf.feature_column.numeric_column(
    "housing_median_age", normalizer_fn=lambda x: (x - age_mean) / age_std)

In [None]:
median_income = tf.feature_column.numeric_column("median_income")
bucketized_income = tf.feature_column.bucketized_column(
    median_income, boundaries=[1.5, 3., 4.5, 6.]
)

## Categorical Features

In [None]:
ocean_prox_vocab = ["<1H OCEAN", "INLAND", "ISLAND", "NEAR BAY", "NEAR OCEAN"]
ocean_proximity = tf.feature_column.categorical_column_with_vocabulary_list(
    "ocean_proximity", ocean_prox_vocab
)

In [None]:
city_hash = tf.feature_column.categorical_column_with_hash_bucket(
    "city", hash_bucket_size=1000
)

## Crossed Categorical Features

In [None]:
bucketized_age = tf.feature_column.bucketized_column(
    housing_median_age, boundaries=[-1., -0.5, 0., 0.5, 1.])
age_and_ocean_proximity = tf.feature_column.crossed_column(
    [bucketized_age, ocean_proximity], hash_bucket_size=100)

In [None]:
latitude = tf.feature_column.numeric_column("latitude")
longitude = tf.feature_column.numeric_column("longitude")
bucketized_latitude = tf.feature_column.bucketized_column(
    latitude, boundaries=list(np.linspace(32., 42., 20 - 1))
)
bucketized_longitude = tf.feature_column.bucketized_column(
    longitude, boundaries=list(np.linspace(-125., -114., 20 - 1))
)
location = tf.feature_column.crossed_column(
    [bucketized_latitude, bucketized_longitude], hash_bucket_size=1000
)

### Encoding Categorical Features Using One-Hot Vectors

In [None]:
ocean_proximity_one_hot = tf.feature_column.indicator_column(ocean_proximity)

### Encoding Categorical Features Using Embeddings

In [None]:
ocean_proximity_embed = tf.feature_column.embedding_column(ocean_proximity,
                                                           dimension=2)

### Using Feature Columns for Parsing

In [None]:
columns = [bucketized_age, ......, median_house_value] # all features + target
feature_descriptions = tf.feature_column.make_parse_example_spec(columns)

In [None]:
def parse_examples(serialized_examples):
  """Parse serialized examples and split off the target.

  Args:
    serialized_examples: serialized examples to parse with the module-level
      `feature_descriptions` spec.

  Returns:
    A pair `(features, targets)`: the parsed dict with the
    "median_house_value" entry removed, and that removed entry.
  """
  features = tf.io.parse_example(serialized_examples, feature_descriptions)
  # pop() removes the target from the features dict and hands it back.
  targets = features.pop("median_house_value")
  return features, targets

In [None]:
batch_size = 32
# Read serialized records, repeat indefinitely, shuffle through a
# 10000-element buffer, batch, then parse each batch with parse_examples.
dataset = tf.data.TFRecordDataset(["my_data_with_features.tfrecords"])
dataset = dataset.repeat()
dataset = dataset.shuffle(10000)
dataset = dataset.batch(batch_size)
dataset = dataset.map(parse_examples)

### Using Feature Columns in Your Models

In [None]:
# The target column is the last entry of `columns`; drop it from the inputs.
columns_without_target = columns[:-1]
model = keras.models.Sequential([
    keras.layers.DenseFeatures(feature_columns=columns_without_target),
    keras.layers.Dense(1)
])
# A single linear output trained with MSE is a regression setup, so track
# mean absolute error; "accuracy" is a classification metric.
model.compile(loss="mse", optimizer="sgd", metrics=["mae"])
steps_per_epoch = len(X_train) // batch_size
history = model.fit(dataset, steps_per_epoch=steps_per_epoch, epochs=5)

In [None]:
some_columns = [ocean_proximity_embed, bucketized_income]
dense_features = keras.layers.DenseFeatures(some_columns)
dense_features({
    "ocean_proximity": [["NEAR OCEAN"], ["INLAND"], ["INLAND"]],
    "median_income": [[3.], [7.2], [1.]]
})

# TF Transform

In [None]:
import tensorflow_transform as tft

def preprocess(inputs): # inputs is a batch of input features
  """Define the TF Transform preprocessing for a batch of input features.

  Args:
    inputs: mapping that provides the "housing_median_age" and
      "ocean_proximity" features.

  Returns:
    A dict with "standardized_median_age" (the z-scored median age) and
    "ocean_proximity_id" (the result of `tft.compute_and_apply_vocabulary`
    on the ocean proximity feature).

  NOTE: this function shares its name with the CSV-line `preprocess` defined
  earlier in this file, so defining it rebinds that name.
  """
  median_age = inputs["housing_median_age"]
  ocean_proximity = inputs["ocean_proximity"]
  # scale_to_z_score centers on the mean itself, so the raw feature is
  # passed directly; subtracting tft.mean first is redundant.
  standardized_age = tft.scale_to_z_score(median_age)
  ocean_proximity_id = tft.compute_and_apply_vocabulary(ocean_proximity)
  return {
      "standardized_median_age": standardized_age,
      "ocean_proximity_id": ocean_proximity_id
  }

# The TensorFlow Datasets (TFDS) Project

In [None]:
import tensorflow_datasets as tfds

dataset = tfds.load(name="mnist")
mnist_train, mnist_test = dataset["train"], dataset["test"]

In [None]:
mnist_train = mnist_train.repeat(5).batch(32).prefetch(1)
for item in mnist_train:
  images = item["image"]
  labels = item["label"]
  ...

In [None]:
# Start from the raw training split: `mnist_train` was already repeated and
# batched by an earlier cell, and batching it again would batch batches.
mnist_train = dataset["train"].repeat(5).batch(32)
# Turn each feature dict into an (image, label) pair.
mnist_train = mnist_train.map(lambda items: (items["image"], items["label"]))
mnist_train = mnist_train.prefetch(1)

In [None]:
dataset = tfds.load(name="mnist", batch_size=32, as_supervised=True)
mnist_train = dataset["train"].repeat().prefetch(1)
model = keras.models.Sequential([...])
model.compile(loss="sparse_categorical_crossentropy", optimizer="sgd")
model.fit(mnist_train, steps_per_epoch=60000 // 32, epochs=5)