# 13. Loading and Preprocessing Data with TensorFlow

In [1]:
import tensorflow as tf

tf.enable_eager_execution()

In [2]:
X = tf.range(10)  # any data tensor

dataset = tf.data.Dataset.from_tensor_slices(X)
dataset

<DatasetV1Adapter shapes: (), types: tf.int32>

In [3]:
for item in dataset:
    print(item)

Instructions for updating:
Colocations handled automatically by placer.
tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(5, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(7, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(9, shape=(), dtype=int32)


In [4]:
dataset = dataset.repeat(3).batch(7)
>>> for item in dataset:
...     print(item)

tf.Tensor([0 1 2 3 4 5 6], shape=(7,), dtype=int32)
tf.Tensor([7 8 9 0 1 2 3], shape=(7,), dtype=int32)
tf.Tensor([4 5 6 7 8 9 0], shape=(7,), dtype=int32)
tf.Tensor([1 2 3 4 5 6 7], shape=(7,), dtype=int32)
tf.Tensor([8 9], shape=(2,), dtype=int32)


In [5]:
dataset = dataset.map(lambda x: x * 2) # Items: [0,2,4,6,8,10,12]
dataset

<DatasetV1Adapter shapes: (?,), types: tf.int32>

In [6]:
>>> for item in dataset:
...     print(item)

tf.Tensor([ 0  2  4  6  8 10 12], shape=(7,), dtype=int32)
tf.Tensor([14 16 18  0  2  4  6], shape=(7,), dtype=int32)
tf.Tensor([ 8 10 12 14 16 18  0], shape=(7,), dtype=int32)
tf.Tensor([ 2  4  6  8 10 12 14], shape=(7,), dtype=int32)
tf.Tensor([16 18], shape=(2,), dtype=int32)


In [7]:
dataset = dataset.apply(tf.data.experimental.unbatch()) # Items: 0,2,4,...
# Each item in the new dataset will be a single integer tensor instead of a batch of 7 integers

In [8]:
>>> for item in dataset:
...     print(item)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(10, shape=(), dtype=int32)
tf.Tensor(12, shape=(), dtype=int32)
tf.Tensor(14, shape=(), dtype=int32)
tf.Tensor(16, shape=(), dtype=int32)
tf.Tensor(18, shape=(), dtype=int32)
tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(10, shape=(), dtype=int32)
tf.Tensor(12, shape=(), dtype=int32)
tf.Tensor(14, shape=(), dtype=int32)
tf.Tensor(16, shape=(), dtype=int32)
tf.Tensor(18, shape=(), dtype=int32)
tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(10, shape=(), dtype=int32)
tf.Tensor(12, shape=(), dtype=int32)
tf.Tensor(14, sh

In [9]:
dataset = dataset.filter(lambda x: x < 10) # Items: 0 2 4 6 8 0 2 4 6...

In [10]:
>>> for item in dataset:
...     print(item)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)


In [11]:
for item in dataset.take(3):
...     print(item)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)


In [12]:
dataset = tf.data.Dataset.range(10).repeat(3) # 0 to 9, three times

>>> dataset = dataset.shuffle(buffer_size=5, seed=42).batch(7)

>>> for item in dataset:
...     print(item)

tf.Tensor([0 2 3 6 7 9 4], shape=(7,), dtype=int64)
tf.Tensor([5 0 1 1 8 6 5], shape=(7,), dtype=int64)
tf.Tensor([4 8 7 1 2 3 0], shape=(7,), dtype=int64)
tf.Tensor([5 4 2 7 8 9 9], shape=(7,), dtype=int64)
tf.Tensor([3 6], shape=(2,), dtype=int64)


### INTERLEAVING LINES FROM MULTIPLE FILES

In [20]:
train_filepaths = [ 'datasets/housing/train/housingaa.csv',
                    'datasets/housing/train/housingab.csv',
                    'datasets/housing/test/housingac.csv',
                    'datasets/housing/test/housingad.csv',
                    'datasets/housing/valid/housingae.csv',
                    'datasets/housing/valid/housingaf.csv']

In [21]:
filepath_dataset = tf.data.Dataset.list_files(train_filepaths, seed=42)

In [22]:
n_readers = 5

dataset = filepath_dataset.interleave(
    lambda filepath: tf.data.TextLineDataset(filepath).skip(1),
    cycle_length=n_readers)


In [24]:
for line in dataset.take(5):
    print(line.numpy())

b'-117.56,33.94,6.0,575.0,73.0,318.0,88.0,7.0215,257100.0,INLAND'
b'-118.13,33.86,37.0,2259.0,425.0,1183.0,413.0,5.1805,201600.0,<1H OCEAN'
b'-118.62,34.17,34.0,3268.0,538.0,1463.0,519.0,6.8482,308300.0,<1H OCEAN'
b'-122.47,37.75,52.0,1598.0,285.0,689.0,265.0,4.6071,337400.0,NEAR BAY'
b'-119.31,36.06,20.0,2236.0,434.0,1405.0,412.0,1.8827,48700.0,INLAND'


### Preprocessing the Data


In [35]:
import pandas as pd

In [70]:
df = pd.read_csv('datasets/housing/housing.csv')

In [92]:
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [123]:
X_train = pd.concat([pd.read_csv('datasets/housing/train/housingaa.csv'), pd.read_csv('datasets/housing/train/housingab.csv')], axis=0)
X_train.shape

(7999, 10)

In [75]:
# mean and scale of each feature in the training set
X_mean = df.loc[:, 'longitude':'median_income'].mean(axis=0).tolist()
X_std = df.loc[:, 'longitude':'median_income'].std(axis=0).tolist()

In [103]:
n_inputs = 8

def preprocess(line):
    defs = [0.] * n_inputs + [tf.constant([], dtype=tf.float32), tf.constant([], dtype=tf.string)]
    fields = tf.io.decode_csv(line, record_defaults=defs)
    
    x = tf.stack(fields[:-2])
    y = tf.stack(fields[-2:-1])
    return (x - X_mean) / X_std, y

#### Putting Everything Together

In [104]:
def csv_reader_dataset(filepaths, repeat=None, n_readers=5,
                       n_read_threads=None, shuffle_buffer_size=10000,
                       n_parse_threads=5, batch_size=32):
    
    dataset = tf.data.Dataset.list_files(filepaths).repeat(repeat)
    
    dataset = dataset.interleave(
        lambda filepath: tf.data.TextLineDataset(filepath).skip(1),
        cycle_length=n_readers, num_parallel_calls=n_read_threads)
    
    dataset = dataset.shuffle(shuffle_buffer_size)
    dataset = dataset.map(preprocess, num_parallel_calls=n_parse_threads)
    dataset = dataset.batch(batch_size)
    return dataset.prefetch(1)

### Using the Dataset With tf.keras

In [105]:
train_filepaths = [ 'datasets/housing/train/housingaa.csv',
                    'datasets/housing/train/housingab.csv'
]

valid_filepaths = [ 'datasets/housing/valid/housingac.csv',
                    'datasets/housing/valid/housingad.csv'
]

test_filepaths = [ 'datasets/housing/test/housingae.csv',
                    'datasets/housing/test/housingaf.csv'
]

In [106]:
train_set = csv_reader_dataset(train_filepaths, repeat=None)
valid_set = csv_reader_dataset(valid_filepaths)
test_set = csv_reader_dataset(test_filepaths)

In [110]:
for x,y in train_set.take(1):
    print(x.numpy(), y)

[[-1.30784070e+00  9.96340394e-01  1.85613656e+00 -2.80417532e-01
  -3.48542362e-01 -3.49218458e-01 -3.91127497e-01 -7.39422560e-01]
 [-9.83412802e-01  1.81096601e+00  7.43750393e-01 -5.49025834e-01
  -5.40765584e-01 -5.70859432e-01 -5.87293267e-01 -1.33789968e+00]
 [ 5.48881233e-01 -7.40589261e-01 -1.30267277e-01  8.09142113e-01
   1.17500472e+00  2.64488578e-01  1.20958507e+00  1.60188273e-01]
 [-1.40766287e+00  1.10870326e+00  3.46469641e-01 -1.18800199e+00
  -1.26219594e+00 -1.23843133e+00 -1.27779663e+00  1.33869916e-01]
 [-1.30784070e+00  1.01506782e+00  1.37939966e+00 -3.19837868e-01
  -4.67198670e-01 -5.80572784e-01 -4.38207269e-01  2.11613953e-01]
 [-5.69143474e-01  1.26788282e+00 -1.00428498e+00 -6.31991923e-01
  -7.28242576e-01 -7.65126407e-01 -7.49456942e-01  3.06570441e-01]
 [-1.38769770e+00  1.08997571e+00  4.25925791e-01 -5.53609610e-01
  -4.81437415e-01 -1.35524780e-01 -4.72209334e-01 -6.97208107e-01]
 [-1.30784070e+00  1.01038682e+00  1.29994345e+00  3.76953818e-02
  -

In [121]:
batch_size = 32

In [None]:
model = keras.models.Sequential([
    ...
])

model.compile([
    ...
])

model.fit(train_set, steps_per_epoch=len(X_train) // batch_size, epochs=10,
          validation_data=valid_set,
          validation_steps=len(X_train) // batch_size)

In [None]:
model.evaluate(test_set, steps=len(X_test) // batch_size)
model.predict(new_set, steps=len(X_new) // batch_size)

In [None]:
@tf.function
def train(model, optimizer, loss_fn, n_epochs, [...]):
    train_set = csv_reader_dataset(train_filepaths, repeat=n_epochs, [...])
    for X_batch, y_batch in train_set:
        with tf.GradientTape() as tape:
            y_pred = model(X_batch)
            main_loss = tf.reduce_mean(loss_fn(y_batch, y_pred))
            loss = tf.add_n([main_loss] + model.losses)
        grads = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))

## The Features API

In [124]:
housing_median_age = tf.feature_column.numeric_column("housing_median_age")

In [125]:
age_mean, age_std = X_mean[1], X_std[1]  # The median age is column in 1

housing_median_age = tf.feature_column.numeric_column("housing_median_age", normalizer_fn=lambda x: (x - age_mean) / age_std)

In [126]:
median_income = tf.feature_column.numeric_column("median_income")

bucketized_income = tf.feature_column.bucketized_column(median_income, boundaries=[1.5, 3., 4.5, 6.])

#### Categorical Features

In [127]:
ocean_prox_vocab = ['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN']

ocean_proximity = tf.feature_column.categorical_column_with_vocabulary_list("ocean_proximity", ocean_prox_vocab)

If we had a "city" feature in the dataset, we could encode it like this:

In [128]:
city_hash = tf.feature_column.categorical_column_with_hash_bucket("city", hash_bucket_size=1000)
# This feature will compute a hash for each category (i.e., for each city), modulo the number of hash buckets (hash_bucket_size)

we need to set the number of buckets high enough to avoid getting too many collisions (i.e., different categories ending up in the same bucket), but the higher we set it, the more RAM will be used.

#### Crossed Categorical Features

In [129]:
bucketized_age = tf.feature_column.bucketized_column(housing_median_age, boundaries=[-1., -0.5, 0., 0.5, 1.]) # age was scaled

age_and_ocean_proximity = tf.feature_column.crossed_column([bucketized_age, ocean_proximity], hash_bucket_size=100)

In [131]:
import numpy as np

In [132]:
latitude = tf.feature_column.numeric_column("latitude")
longitude = tf.feature_column.numeric_column("longitude")

bucketized_latitude = tf.feature_column.bucketized_column(latitude, boundaries=list(np.linspace(32., 42., 20 - 1)))
bucketized_longitude = tf.feature_column.bucketized_column(longitude, boundaries=list(np.linspace(-125., -114., 20 - 1)))

location = tf.feature_column.crossed_column([bucketized_latitude, bucketized_longitude], hash_bucket_size=1000)

#### Encoding Categorical Features Using One-Hot Vectors

There are two options to encode a categorical feature: one-hot vectors or embeddings.

In [133]:
ocean_proximity_one_hot = tf.feature_column.indicator_column(ocean_proximity)

#### Encoding Categorical Features Using Embeddings

In [134]:
ocean_proximity_embed = tf.feature_column.embedding_column(ocean_proximity,dimension=2)

#### Using Feature Columns for Parsing

In [None]:
columns = [bucketized_age, ....., median_house_value] # all features + target
feature_descriptions = tf.feature_column.make_parse_example_spec(columns)

In [135]:
def parse_examples(serialized_examples):
    examples = tf.io.parse_example(serialized_examples, feature_descriptions)
    targets = examples.pop("median_house_value") # separate the targets
    return examples, targets

In [None]:
batch_size = 32
dataset = tf.data.TFRecordDataset(["my_data_with_features.tfrecords"])
dataset = dataset.repeat().shuffle(10000).batch(batch_size).map(parse_examples)

### The TensorFlow Datasets (TFDS) Project

In [None]:
!pip install tensorflow-datasets

In [1]:
import tensorflow_datasets as tfds

dataset = tfds.load(name="mnist")
mnist_train, mnist_test = dataset["train"], dataset["test"]


For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

[1mDownloading and preparing dataset mnist (11.06 MiB) to /Users/hakan/tensorflow_datasets/mnist/1.0.0...[0m


HBox(children=(IntProgress(value=1, bar_style='info', description='Dl Completed...', max=1, style=ProgressStyl…

HBox(children=(IntProgress(value=1, bar_style='info', description='Dl Size...', max=1, style=ProgressStyle(des…

HBox(children=(IntProgress(value=1, bar_style='info', description='Extraction completed...', max=1, style=Prog…







HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))



HBox(children=(IntProgress(value=0, description='Shuffling...', max=10, style=ProgressStyle(description_width=…

W0525 22:45:12.600831 4399105472 deprecation.py:323] From /Users/hakan/OneDrive/Deep-Learning/deep-learning-notes/Hands-On Machine Learning with Scikit-Learn and TensorFlow/venv/lib/python3.7/site-packages/tensorflow_datasets/core/file_format_adapter.py:247: tf_record_iterator (from tensorflow.python.lib.io.tf_record) is deprecated and will be removed in a future version.
Instructions for updating:
Use eager execution and: 
`tf.data.TFRecordDataset(path)`


HBox(children=(IntProgress(value=1, bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…

HBox(children=(IntProgress(value=0, description='Writing...', max=6000, style=ProgressStyle(description_width=…

HBox(children=(IntProgress(value=1, bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…

HBox(children=(IntProgress(value=0, description='Writing...', max=6000, style=ProgressStyle(description_width=…

HBox(children=(IntProgress(value=1, bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…

HBox(children=(IntProgress(value=0, description='Writing...', max=6000, style=ProgressStyle(description_width=…

HBox(children=(IntProgress(value=1, bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…

HBox(children=(IntProgress(value=0, description='Writing...', max=6000, style=ProgressStyle(description_width=…

HBox(children=(IntProgress(value=1, bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…

HBox(children=(IntProgress(value=0, description='Writing...', max=6000, style=ProgressStyle(description_width=…

HBox(children=(IntProgress(value=1, bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…

HBox(children=(IntProgress(value=0, description='Writing...', max=6000, style=ProgressStyle(description_width=…

HBox(children=(IntProgress(value=1, bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…

HBox(children=(IntProgress(value=0, description='Writing...', max=6000, style=ProgressStyle(description_width=…

HBox(children=(IntProgress(value=1, bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…

HBox(children=(IntProgress(value=0, description='Writing...', max=6000, style=ProgressStyle(description_width=…

HBox(children=(IntProgress(value=1, bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…

HBox(children=(IntProgress(value=0, description='Writing...', max=6000, style=ProgressStyle(description_width=…

HBox(children=(IntProgress(value=1, bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…

HBox(children=(IntProgress(value=0, description='Writing...', max=6000, style=ProgressStyle(description_width=…



HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))



HBox(children=(IntProgress(value=0, description='Shuffling...', max=1, style=ProgressStyle(description_width='…

HBox(children=(IntProgress(value=1, bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…

HBox(children=(IntProgress(value=0, description='Writing...', max=10000, style=ProgressStyle(description_width…

W0525 22:45:24.835080 4399105472 deprecation.py:323] From /Users/hakan/OneDrive/Deep-Learning/deep-learning-notes/Hands-On Machine Learning with Scikit-Learn and TensorFlow/venv/lib/python3.7/site-packages/tensorflow/python/ops/control_flow_ops.py:423: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.
Instructions for updating:
Colocations handled automatically by placer.


[1mDataset mnist downloaded and prepared to /Users/hakan/tensorflow_datasets/mnist/1.0.0. Subsequent calls will reuse this data.[0m


In [None]:
!pip install ipywidgets
!ip install widgetsnbextension
!pip install tqdm
!jupyter nbextension enable --py --sys-prefix widgetsnbextension

In [None]:
mnist_train = mnist_train.repeat(5).batch(32).prefetch(1)
for item in mnist_train:
    images = item["image"]
    labels = item["label"]
    [...]

In [None]:
mnist_train = mnist_train.repeat(5).batch(32)
mnist_train = mnist_train.map(lambda items: (items["image"], items["label"]))
mnist_train = mnist_train.prefetch(1)

or

In [None]:
dataset = tfds.load(name="mnist", batch_size=32, as_supervised=True)

mnist_train = dataset["train"].repeat().prefetch(1)

model = keras.models.Sequential([
    
])

model.compile(loss="sparse_categorical_crossentropy", optimizer="sgd")
model.fit(mnist_train, steps_per_epoch=60000 // 32, epochs=5)