In [70]:
import os
import tarfile
import urllib

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras import models, optimizers, losses, layers, metrics, callbacks
from tensorflow.train import BytesList, FloatList, Int64List
from tensorflow.train import Features, Feature, Example

import tensorflow_datasets as tfds

# Loading and Preprocessing Data With Tensorflow

## The Data API

In [2]:
x = tf.range(10)
dataset = tf.data.Dataset.from_tensor_slices(x)

for item in dataset:
    print(item)

Metal device set to: Apple M1 Pro

systemMemory: 32.00 GB
maxCacheSize: 10.67 GB

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(5, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(7, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(9, shape=(), dtype=int32)


2022-01-21 18:07:49.623811: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-01-21 18:07:49.624171: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


### Chaining Transformations

In [3]:
dataset = dataset.repeat(3).batch(7, drop_remainder=True)

for item in dataset:
    print(item)

tf.Tensor([0 1 2 3 4 5 6], shape=(7,), dtype=int32)
tf.Tensor([7 8 9 0 1 2 3], shape=(7,), dtype=int32)
tf.Tensor([4 5 6 7 8 9 0], shape=(7,), dtype=int32)
tf.Tensor([1 2 3 4 5 6 7], shape=(7,), dtype=int32)


In [4]:
dataset = dataset.map(lambda x: x * 2)

for item in dataset:
    print(item)

tf.Tensor([ 0  2  4  6  8 10 12], shape=(7,), dtype=int32)
tf.Tensor([14 16 18  0  2  4  6], shape=(7,), dtype=int32)
tf.Tensor([ 8 10 12 14 16 18  0], shape=(7,), dtype=int32)
tf.Tensor([ 2  4  6  8 10 12 14], shape=(7,), dtype=int32)


2022-01-21 18:07:49.697011: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


In [5]:
# Unbatch the items
dataset = dataset.apply(tf.data.experimental.unbatch())

for item in dataset:
    print(item)

Instructions for updating:
Use `tf.data.Dataset.unbatch()`.
tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(10, shape=(), dtype=int32)
tf.Tensor(12, shape=(), dtype=int32)
tf.Tensor(14, shape=(), dtype=int32)
tf.Tensor(16, shape=(), dtype=int32)
tf.Tensor(18, shape=(), dtype=int32)
tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(10, shape=(), dtype=int32)
tf.Tensor(12, shape=(), dtype=int32)
tf.Tensor(14, shape=(), dtype=int32)
tf.Tensor(16, shape=(), dtype=int32)
tf.Tensor(18, shape=(), dtype=int32)
tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(10, shape=(), dtype=

In [6]:
dataset = dataset.filter(lambda x: x < 10)

for item in dataset.take(5):
    print(item)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)


### Shuffling the Data

In [7]:
dataset = tf.data.Dataset.range(10).repeat(5)
dataset = dataset.shuffle(buffer_size=7, seed=15).batch(7, drop_remainder=True)

for item in dataset:
    print(item)

tf.Tensor([4 3 7 5 8 0 2], shape=(7,), dtype=int64)
tf.Tensor([1 1 0 5 3 9 9], shape=(7,), dtype=int64)
tf.Tensor([0 4 8 6 2 4 7], shape=(7,), dtype=int64)
tf.Tensor([5 6 7 6 0 1 9], shape=(7,), dtype=int64)
tf.Tensor([2 4 6 8 5 3 0], shape=(7,), dtype=int64)
tf.Tensor([9 7 3 2 1 8 3], shape=(7,), dtype=int64)
tf.Tensor([1 7 6 8 4 5 2], shape=(7,), dtype=int64)


#### Interleaving lines from multiple files

In [8]:
# Let's Download the housing data such as chapter 2 
    
DOWNLOAD_ROOT = 'https://raw.githubusercontent.com/ageron/handson-ml2/master/'
HOUSING_PATH = os.path.join('datasets', 'housing')
HOUSING_URL = DOWNLOAD_ROOT + 'data/raw/housing/housing.tgz'
    
def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, 'housing.tgz')
    urllib.request.urlretrieve(housing_url, tgz_path)
    housing_tgz = tarfile.open(tgz_path)
    housing_tgz.extractall(path=housing_path)
    housing_tgz.close()

def load_housing_data(housing_path=HOUSING_PATH):
    csv_path = os.path.join(housing_path, 'housing.csv')
    return pd.read_csv(csv_path)

In [9]:
fetch_housing_data()
housing_data = load_housing_data()

housing_num_features = housing_data.drop(columns=['ocean_proximity'])

housing_train, housing_test = train_test_split(housing_num_features, test_size=0.2)
housing_train, housing_valid = train_test_split(housing_train, test_size=0.2)

In [10]:
housing_train.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
4048,-118.53,34.14,28.0,6920.0,906.0,2515.0,860.0,9.2189,500001.0
13766,-117.08,34.08,34.0,45.0,11.0,39.0,14.0,3.0625,500001.0
2442,-119.61,36.57,42.0,2242.0,521.0,1359.0,483.0,1.5833,65100.0
2279,-119.78,36.78,31.0,2164.0,456.0,959.0,463.0,2.3293,73400.0
4183,-118.23,34.13,48.0,737.0,166.0,462.0,131.0,3.5893,212500.0


In [11]:
def save_csv(dataset, delta=500, type_file='train'):
    
    dir_path = 'data/raw/housing'
    os.makedirs(dir_path, exist_ok=True)
    
    for i in range(0, dataset.shape[0], delta):
        
        data = dataset.iloc[i : i + delta]
        data.to_csv(f'{dir_path}/{type_file}_{int(i / 500)}.csv', header=False, index=False)

In [12]:
save_csv(housing_train)
save_csv(housing_valid, type_file='valid')
save_csv(housing_test, type_file='test')

In [13]:
train_filepath = 'data/raw/housing/train*.csv'
valid_filepath = 'data/raw/housing/valid*.csv'
test_filepath = 'data/raw/housing/test*.csv'

In [14]:
# Calculate the mean and the std of the features of x 

x_mean = np.mean(housing_num_features.iloc[:, :-1].to_numpy(), axis=0)
x_std = np.std(housing_num_features.iloc[:, :-1].to_numpy(), axis=0)
n_inputs = 8

def preprocess(line):
    defs = [0.] * n_inputs + [tf.constant([], dtype=tf.float32)]
    fields = tf.io.decode_csv(line, record_defaults=defs)
    x = tf.stack(fields[:-1])
    y = tf.stack(fields[-1:])
    
    return (x - x_mean) / x_std, y


### Putting Everything Together

In [15]:
def csv_reader_dataset(filepaths, repeat=1, n_readers=5, shuffle_buffer_size=10000, batch_size=32):
    
    dataset = tf.data.Dataset.list_files(filepaths)
    dataset = dataset.interleave(
        lambda filepath: tf.data.TextLineDataset(filepath),
        cycle_length=n_readers,
        num_parallel_calls=tf.data.AUTOTUNE)
    dataset = dataset.map(preprocess, num_parallel_calls=tf.data.AUTOTUNE).cache()
    dataset = dataset.shuffle(shuffle_buffer_size).repeat(repeat)
    
    return dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)

## Using the Dataset with tf.keras

In [16]:
# Load the data
train_set = csv_reader_dataset(train_filepath)
valid_set = csv_reader_dataset(valid_filepath)
test_set = csv_reader_dataset(test_filepath)

In [17]:
model = models.Sequential([
    layers.Flatten(),
    layers.Dense(30, activation='elu', kernel_initializer='he_normal'),
    layers.Dense(30, activation='elu', kernel_initializer='he_normal'),
    layers.Dense(1, activation='relu', kernel_initializer='he_normal')
])

optimizer = optimizers.SGD(learning_rate=0.01, momentum=0.99, nesterov=True)

model.compile(optimizer=optimizer,
              loss='mse',
              metrics = [metrics.mean_squared_error])

early_stopping_cb = callbacks.EarlyStopping(patience=15, restore_best_weights=True)
cb_list = [early_stopping_cb]

batch_size = 1500

history_model = model.fit(train_set,
                          validation_data=valid_set,
                          epochs=300,
                          batch_size=1500,
                          callbacks=cb_list)

Epoch 1/300
      1/Unknown - 0s 368ms/step - loss: 49549185024.0000 - mean_squared_error: 49549185024.0000

2022-01-21 18:07:50.861758: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


    411/Unknown - 3s 6ms/step - loss: 55755128832.0000 - mean_squared_error: 55755128832.0000

2022-01-21 18:07:53.930780: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300


In [18]:
evaluation = model.evaluate(test_set)

print(evaluation)

[56601210880.0, 56601210880.0]


## The TFRecord Format

In [19]:
# Create a tfrecord file

with tf.io.TFRecordWriter('data/raw/my_data.tfrecord') as f:
    f.write(b'This is the first record.')
    f.write(b'And this is the second record.')

In [20]:
filepaths = ['data/raw/my_data.tfrecord']
dataset = tf.data.TFRecordDataset(filepaths, num_parallel_reads=tf.data.AUTOTUNE)

for item in dataset:
    print(item)

tf.Tensor(b'This is the first record.', shape=(), dtype=string)
tf.Tensor(b'And this is the second record.', shape=(), dtype=string)


### Compressed TFRecord Files

In [21]:
options = tf.io.TFRecordOptions(compression_type='GZIP')
with tf.io.TFRecordWriter('data/raw/my_data.tfrecord', options) as f:
    f.write(b'This is the first record.')
    f.write(b'And this is the second record.')

dataset = tf.data.TFRecordDataset(filepaths, compression_type='GZIP')
for item in dataset:
    print(item)

tf.Tensor(b'This is the first record.', shape=(), dtype=string)
tf.Tensor(b'And this is the second record.', shape=(), dtype=string)


### Tensorflow Protobufs

In [22]:
# Create an example of protobuf in tensorflow

person_example = Example(
    features = Features(
        feature = {
            'name' : Feature(bytes_list=BytesList(value=[b'Marlon'])),
            'id' : Feature(int64_list = Int64List(value=[123])),
            'emails' : Feature(bytes_list=BytesList(value=[b'marlon.menendezg@gmail.com',
                                                           b'marlon.menendez1506@gmail.com']))
}))

# Serialize and write a TFRecord file
with tf.io.TFRecordWriter('data/raw/contats.tfrecord') as f:
    f.write(person_example.SerializeToString())

### Loading and Parsing Examples

In [23]:
# Parse example by example

feature_description = {
    'name' : tf.io.FixedLenFeature([], tf.string, default_value=''),
    'id' : tf.io.FixedLenFeature([], tf.int64, default_value=0),
    'emails' : tf.io.VarLenFeature(tf.string),
}

for serialized_example in tf.data.TFRecordDataset('data/raw/contats.tfrecord'):
    parsed_example = tf.io.parse_single_example(serialized_example, feature_description)
    

# Parse batch by batch 

dataset = tf.data.TFRecordDataset('data/raw/contats.tfrecord').batch(10)
for serialized_examples in dataset:
    parsed_examples = tf.io.parse_example(serialized_examples, feature_description)

## Preprocessing the Input Features

In [28]:
class Standardization(layers.Layer):
    
    def adapt(self, data_sample):
        self.means_ = np.mean(data_sample, axis=0, keepdims=True)
        self.stds_ = np.std(data_sample, axis=0, keepdims=True)
        self.epsilon_ = 1e-7
    
    def call(self, inputs):
        return (inputs - self.means_) / (self.stds_ + self.epsilon_)

In [29]:
std_layer = Standardization()
std_layer.adapt(housing_train.to_numpy())

### Encoding Categorical Features Using One-Hot Vectors

In [30]:
vocab = ['<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'NEAR BAY', 'ISLAND']
indices = tf.range(len(vocab), dtype=tf.int64)
table_init = tf.lookup.KeyValueTensorInitializer(vocab, indices)
num_oov_buckets = 2
table = tf.lookup.StaticVocabularyTable(table_init, num_oov_buckets)

In [46]:
categories = tf.constant(['NEAR BAY', 'DESERT', 'INLAND', 'INLAND', '<1H OCEAN', 'POLAR', 'FOREST', 'INLAND', 'POLAR', 'NEAR OCEAN'])
cat_indices = table.lookup(categories)
print(cat_indices)

cat_one_hot = tf.one_hot(cat_indices, depth=len(vocab) + num_oov_buckets)
print(cat_one_hot)

tf.Tensor([3 5 1 1 0 5 5 1 5 2], shape=(10,), dtype=int64)
tf.Tensor(
[[0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0.]
 [0. 1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 1. 0.]
 [0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 1. 0. 0. 0. 0.]], shape=(10, 7), dtype=float32)


In [53]:
text_vec_layer = layers.TextVectorization(output_mode='multi_hot')

text_vec_layer.adapt(vocab)
tensor = text_vec_layer(categories)
print(tensor)

tf.Tensor(
[[0. 0. 1. 0. 0. 1. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0.]
 [0. 1. 0. 0. 0. 0. 1.]
 [1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [0. 1. 1. 0. 0. 0. 0.]], shape=(10, 7), dtype=float32)


2022-01-21 20:17:21.835093: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


### Encoding Categorical Features Using Embeddings

In [54]:
# Manual encoding

embedding_dim = 2
embed_init = tf.random.uniform([len(vocab) + num_oov_buckets, embedding_dim])
embedding_matrix = tf.Variable(embed_init)
print(embedding_matrix)

<tf.Variable 'Variable:0' shape=(7, 2) dtype=float32, numpy=
array([[0.9632548 , 0.29368246],
       [0.3882507 , 0.04600167],
       [0.26282084, 0.794824  ],
       [0.9262692 , 0.6907414 ],
       [0.11559474, 0.3930539 ],
       [0.70262873, 0.61465704],
       [0.8635504 , 0.00665975]], dtype=float32)>


In [61]:
categories = tf.constant(['NEAR BAY', 'DESERT', 'INLAND', 'INLAND', '<1H OCEAN', 'POLAR', 'FOREST', 'INLAND', 'POLAR', 'NEAR OCEAN'])
cat_indices = table.lookup(categories)
cat_indices

<tf.Tensor: shape=(10,), dtype=int64, numpy=array([3, 5, 1, 1, 0, 5, 5, 1, 5, 2])>

In [62]:
tf.nn.embedding_lookup(embedding_matrix, cat_indices)

<tf.Tensor: shape=(10, 2), dtype=float32, numpy=
array([[0.9262692 , 0.6907414 ],
       [0.70262873, 0.61465704],
       [0.3882507 , 0.04600167],
       [0.3882507 , 0.04600167],
       [0.9632548 , 0.29368246],
       [0.70262873, 0.61465704],
       [0.70262873, 0.61465704],
       [0.3882507 , 0.04600167],
       [0.70262873, 0.61465704],
       [0.26282084, 0.794824  ]], dtype=float32)>

In [63]:
# Encoding with a layer

embedding_layer = layers.Embedding(input_dim=len(vocab) + num_oov_buckets,
                                   output_dim=embedding_dim)
embedding_layer(cat_indices)

<tf.Tensor: shape=(10, 2), dtype=float32, numpy=
array([[ 0.04262692,  0.01907415],
       [ 0.02026287,  0.01146571],
       [-0.01117493, -0.04539983],
       [-0.01117493, -0.04539983],
       [ 0.04632548, -0.02063175],
       [ 0.02026287,  0.01146571],
       [ 0.02026287,  0.01146571],
       [-0.01117493, -0.04539983],
       [ 0.02026287,  0.01146571],
       [-0.02371792,  0.0294824 ]], dtype=float32)>

In [69]:
regular_inputs = layers.Input(shape=[8])
categories = layers.Input(shape=[], dtype=tf.string)
cat_indices = layers.Lambda(lambda cats: table.lookup(cats))(categories)
cat_embed = layers.Embedding(input_dim=6, output_dim=2)(cat_indices)
encoded_inputs = layers.concatenate([regular_inputs, cat_embed])
outputs = layers.Dense(1)(encoded_inputs)

model = models.Model(inputs=[regular_inputs, categories],
                     outputs=[outputs])

## TF Transform

## The TensorFlow Datasets (TFDS) Project

In [71]:
dataset = tfds.load(name='mnist')
mnist_train, mnist_test = dataset['train'], dataset['test']

2022-01-21 21:08:10.626171: W tensorflow/core/platform/cloud/google_auth_provider.cc:184] All attempts to get a Google authentication bearer token failed, returning an empty token. Retrieving token from files failed with "NOT_FOUND: Could not locate the credentials file.". Retrieving token from GCE failed with "FAILED_PRECONDITION: Error executing an HTTP request: libcurl code 6 meaning 'Couldn't resolve host name', error details: Could not resolve host: metadata".


[1mDownloading and preparing dataset 11.06 MiB (download: 11.06 MiB, generated: 21.00 MiB, total: 32.06 MiB) to /Users/mmenendezg/tensorflow_datasets/mnist/3.0.1...[0m


Dl Completed...: 100%|██████████| 4/4 [00:03<00:00,  1.33 file/s]

[1mDataset mnist downloaded and prepared to /Users/mmenendezg/tensorflow_datasets/mnist/3.0.1. Subsequent calls will reuse this data.[0m





In [None]:
dataset = tfds.load(name='mnist', batch_size=32, as_supervised=True)
mnist_train = dataset['train'].prefetch(1)

## Exercises

1. Why would you want to use the Data API?

> The Data API allows to load and preprocess large datasets in an efficient way such as loading, repeating, batching, shuffling and prefetching the data. Also enables the multithreading when preprocessing the data. This allows the GPU to be used more efficiently while the CPU is working on the next batch. Furthermore, the Data API allows to read data from different sources.

2. What are the benefits of splitting a large dataset into multiple files?

> When splitting the dataset into multiple files, we can load from multiple files at the same time and interleave the records. This will shuffle more the data than being shuffled with the memory due to the limitations of the memory. This also breaks some patterns that are created by chance with consecutive instances. 

3. During training, how can you tell that your input pipeline is the bottleneck? What can you do to fix it?

> You can use TensorBoard to visualize profiling data: if the GPU is not fully utilized then your input pipeline is likely to be the bottleneck. You can fix it by making sure it reads and preprocesses the data in multiple threads in parallel, and ensuring it prefetches a few batches. If this is insufficient to get your GPU to 100% usage during training, make sure your preprocessing code is optimized. You can also try saving the dataset into multiple TFRecord files, and if necessary perform some of the preprocessing ahead of time so that it does not need to be done on the fly during training (TF Transform can help with this). If necessary, use a machine with more CPU and RAM, and ensure that the GPU bandwidth is large enough.

4. Can you save any binary data to a TFRecord file, or only serialized protocol buffers?

> Any binary format can be saved into a TFRecord file. The serialized protocol buffers (protobufs) are preferred due to its efficiency, portability and extensibility.

5. Why would you go through the hassle of converting all your data to the Example protobuf format? Why not use your own protobuf definition?

> The Example protobuf format has the advantage that TensorFlow provides some operations to parse it (the tf.io.parse*example() functions) without you having to define your own format.

6. When using TFRecords, when would you want to activate compression? Why not do it systematically?

> We would prefer to use compression when we will transfer the data through network or when the storage is very limited. Activating the compression sistematically when no needed may waste CPU processing to decompress the files.

7. Data can be preprocessed directly when writing the data files, or within the tf.data pipeline, or in preprocessing layers within your model, or using TF Transform. Can you list a few pros and cons of each option?

> 

8. Name a few common techniques you can use to encode categorical features. What about text?

> For categorical features one-hot encoding would be one great option when the total categories are just a few. However, when the categories are a lot, embeddings may be a better option. 
> For text the previous two options are still valid, but there may be another such as TF-IDF (that is the counting of words divided by the total of words), bag of words that is just the counting of the words in a corpus.

### Questions 9 and 10

9. Load the Fashion MNIST dataset (introduced in Chapter 10); split it into a train‐ ing set, a validation set, and a test set; shuffle the training set; and save each dataset to multiple TFRecord files. Each record should be a serialized Example protobuf with two features: the serialized image (use tf.io.serialize_tensor() to serialize each image), and the label.11 Then use tf.data to create an efficient dataset for each set. Finally, use a Keras model to train these datasets, including a preprocessing layer to standardize each input feature. Try to make the input pipeline as efficient as possible, using TensorBoard to visualize profiling data.

10. In this exercise you will download a dataset, split it, create a tf.data.Dataset to load it and preprocess it efficiently, then build and train a binary classification model containing an Embedding layer:
- Download the Large Movie Review Dataset, which contains 50,000 movies reviews from the Internet Movie Database. The data is organized in two direc‐ tories, train and test, each containing a pos subdirectory with 12,500 positive reviews and a neg subdirectory with 12,500 negative reviews. Each review is stored in a separate text file. There are other files and folders (including pre‐ processed bag-of-words), but we will ignore them in this exercise.

- Split the test set into a validation set (15,000) and a test set (10,000).

- Use tf.data to create an efficient dataset for each set.

- Create a binary classification model, using a TextVectorization layer to preprocess each review. If the TextVectorization layer is not yet available (or if you like a challenge), try to create your own custom preprocessing layer: you can use the functions in the tf.strings package, for example lower() to make everything lowercase, regex_replace() to replace punctuation with spaces, and split() to split words on spaces. You should use a lookup table to output word indices, which must be prepared in the adapt() method.

- Add an Embedding layer and compute the mean embedding for each review, multiplied by the square root of the number of words (see Chapter 16). This rescaled mean embedding can then be passed to the rest of your model.

- Train the model and see what accuracy you get. Try to optimize your pipelines to make training as fast as possible.

- Use TFDS to load the same dataset more easily: tfds.load("imdb_reviews").