In [3]:
import tensorflow as tf
from tensorflow import keras
import numpy as np

In [None]:
# Data API

In [15]:
# Convert tensor into a Dataset
X = tf.range(10)   # 1-D int32 tensor holding 0..9
dataset = tf.data.Dataset.from_tensor_slices(X)   # one dataset element per entry of X
dataset   # bare expression so the notebook displays the dataset's repr

<TensorSliceDataset shapes: (), types: tf.int32>

In [16]:
# Iterating the dataset yields its elements one at a time, each as a scalar tf.Tensor
for item in dataset:
    print(item)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(5, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(7, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(9, shape=(), dtype=int32)


In [17]:
# Chaining Transformation
# NOTE: this rebinds `dataset`, so re-running the cell stacks another repeat/batch on top of the previous result
dataset = dataset.repeat(3).batch(7)   # repeat the elements 3 times, then group them into batches of 7
for item in dataset:   # the last batch is shorter because 30 elements do not divide evenly by 7
    print(item)

tf.Tensor([0 1 2 3 4 5 6], shape=(7,), dtype=int32)
tf.Tensor([7 8 9 0 1 2 3], shape=(7,), dtype=int32)
tf.Tensor([4 5 6 7 8 9 0], shape=(7,), dtype=int32)
tf.Tensor([1 2 3 4 5 6 7], shape=(7,), dtype=int32)
tf.Tensor([8 9], shape=(2,), dtype=int32)


In [18]:
# map() method
#(used for preprocessing)
# At this point each element is a whole batch, so the lambda doubles every value in the batch
dataset = dataset.map(lambda x: x*2)
for item in dataset:
    print(item)

tf.Tensor([ 0  2  4  6  8 10 12], shape=(7,), dtype=int32)
tf.Tensor([14 16 18  0  2  4  6], shape=(7,), dtype=int32)
tf.Tensor([ 8 10 12 14 16 18  0], shape=(7,), dtype=int32)
tf.Tensor([ 2  4  6  8 10 12 14], shape=(7,), dtype=int32)
tf.Tensor([16 18], shape=(2,), dtype=int32)


In [19]:
# Split every batch back into individual scalar elements.
# Dataset.unbatch() is the built-in replacement for the deprecated
# dataset.apply(tf.data.experimental.unbatch()).
dataset = dataset.unbatch()

In [20]:
#(map() applies on each item, apply() applies on the dataset as a whole)

In [21]:
# Keep only the elements whose value is below 10
dataset = dataset.filter(lambda x: x<10)

In [22]:
for item in dataset.take(3):    # take(3) limits the iteration to the first 3 elements
    print(item)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)


In [23]:
# Shuffling data
#(draw samples from the dataset to a buffer, and then load randomly from the buffer)

In [24]:
# Start from a fresh dataset: the integers 0..9, repeated 3 times
dataset = tf.data.Dataset.range(10).repeat(3)
# Shuffle through a 5-element buffer (fixed seed), then group into batches of 7
dataset = dataset.shuffle(buffer_size=5, seed=42).batch(7)
for item in dataset:
    print(item)

tf.Tensor([0 2 3 6 7 9 4], shape=(7,), dtype=int64)
tf.Tensor([5 0 1 1 8 6 5], shape=(7,), dtype=int64)
tf.Tensor([4 8 7 1 2 3 0], shape=(7,), dtype=int64)
tf.Tensor([5 4 2 7 8 9 9], shape=(7,), dtype=int64)
tf.Tensor([3 6], shape=(2,), dtype=int64)


In [25]:
#(for a large dataset, shuffle it at the source, split it into multiple files, and load the files in random order)
#(for more shuffling, read several files at the same time, interleave their records, then add a shuffling buffer)

In [None]:
# Interleaving lines
#(doesn't run as-is: filepath_dataset is a hypothetical dataset of file paths)
n_readers = 5
dataset = filepath_dataset.interleave(
        lambda filepath: tf.data.TextLineDataset(filepath).skip(1),    # one line reader per file; skips header row
        cycle_length=n_readers)    # number of files read from at the same time

In [26]:
# Preprocessing data
n_inputs = 8   # number of features

def preprocess(line):
    """Parse one CSV line into a (scaled features, target) pair.

    The line is expected to hold ``n_inputs`` feature columns followed by one
    target column. Relies on the module-level ``X_mean`` and ``X_std``, which
    are not defined in the cells shown here and must exist before this runs.

    Returns:
        A tuple ``(x, y)``: the stacked features after ``(x - X_mean) / X_std``
        and the target stacked into a one-element tensor.
    """
    # decode_csv defaults: 0. for every feature column, and an empty float32
    # tensor for the last (target) column
    feature_defaults = [0.] * n_inputs
    target_default = tf.constant([], dtype=tf.float32)
    columns = tf.io.decode_csv(line, record_defaults=feature_defaults + [target_default])
    # decode_csv returns one scalar per column; stack them into 1-D tensors
    features = tf.stack(columns[:-1])
    target = tf.stack(columns[-1:])
    scaled_features = (features - X_mean) / X_std
    return scaled_features, target

In [27]:
# Everything
def csv_reader_dataset(filepaths, repeat=1, n_readers=5,
                      n_read_threads=None, shuffle_buffer_size=10000,
                      n_parse_threads=5, batch_size=32):
    """Build the full input pipeline over a set of CSV files.

    Stages, in order: list the files, interleave their lines, parse each line
    with ``preprocess``, shuffle, repeat, batch, prefetch.

    Args:
        filepaths: passed to ``tf.data.Dataset.list_files``.
        repeat: passed to ``Dataset.repeat`` after shuffling.
        n_readers: ``cycle_length`` of the interleave (files read together).
        n_read_threads: ``num_parallel_calls`` of the interleave.
        shuffle_buffer_size: buffer size passed to ``Dataset.shuffle``.
        n_parse_threads: ``num_parallel_calls`` of the ``preprocess`` map.
        batch_size: number of examples per batch.

    Returns:
        The batched dataset with a prefetch of 1 batch.
    """
    def read_lines(filepath):
        # line-by-line reader for one file; skip(1) drops its first line (the header)
        return tf.data.TextLineDataset(filepath).skip(1)

    file_ds = tf.data.Dataset.list_files(filepaths)
    line_ds = file_ds.interleave(read_lines,
                                 cycle_length=n_readers,
                                 num_parallel_calls=n_read_threads)
    example_ds = line_ds.map(preprocess, num_parallel_calls=n_parse_threads)
    shuffled_ds = example_ds.shuffle(shuffle_buffer_size)
    repeated_ds = shuffled_ds.repeat(repeat)
    batched_ds = repeated_ds.batch(batch_size)
    return batched_ds.prefetch(1)

In [None]:
# Example import
#(doesn't run: train_filepaths, valid_filepaths and test_filepaths are not defined in the cells shown here)
# Each call builds an independent pipeline with the default settings of csv_reader_dataset
train_set = csv_reader_dataset(train_filepaths)
valid_set = csv_reader_dataset(valid_filepaths)
test_set = csv_reader_dataset(test_filepaths)

In [28]:
# Whole pipeline
from tensorflow.keras.optimizers import Nadam
from tensorflow.keras.losses import mean_squared_error

# Module-level training configuration read by train() below
batch_size = 32
loss_fn = mean_squared_error
# `learning_rate` is the canonical argument name; `lr` is only a legacy alias
# that newer Keras releases no longer accept.
optimizer = Nadam(learning_rate=0.01)

@tf.function
def train(model, n_epochs, batch_size=32, n_readers=5, n_read_threads=5,
         shuffle_buffer_size=10000, n_parse_threads=5):
    """Run a custom training loop over the CSV training set.

    Reads the module-level ``train_filepaths``, ``loss_fn`` and ``optimizer``;
    ``train_filepaths`` is not defined in the cells shown here and must exist
    before this is called.

    Args:
        model: callable Keras model exposing ``losses`` and ``trainable_variables``.
        n_epochs: number of passes over the data (forwarded as ``repeat``).
        batch_size: number of examples per training step.
        n_readers: number of files interleaved together.
        n_read_threads: parallelism used when reading files.
        shuffle_buffer_size: size of the shuffle buffer.
        n_parse_threads: parallelism used when parsing lines.

    Returns:
        None. The model's trainable variables are updated in place.
    """
    train_set = csv_reader_dataset(train_filepaths, repeat=n_epochs, n_readers=n_readers,
                                  n_read_threads=n_read_threads,
                                  shuffle_buffer_size=shuffle_buffer_size,
                                  n_parse_threads=n_parse_threads, batch_size=batch_size)
    for X_batch, y_batch in train_set:
        with tf.GradientTape() as tape:
            y_pred = model(X_batch)
            main_loss = tf.reduce_mean(loss_fn(y_batch, y_pred))
            # add the model's extra losses (model.losses) to the main loss
            loss = tf.add_n([main_loss] + model.losses)
        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))

#train(model, n_epochs=50)

In [29]:
# TFRecord Format
#(tfrecord is more efficient than csv, and supports large or complex data like image or audio well)

In [3]:
# Write and read tfrecord
# Creates my_data.tfrecord relative to the current working directory
with tf.io.TFRecordWriter('my_data.tfrecord') as f:
    f.write(b'First record')   # each write() call stores one binary record
    f.write(b'Second record')

filepaths = ['my_data.tfrecord']
dataset = tf.data.TFRecordDataset(filepaths)   # reads the records back as scalar string tensors
for item in dataset:
    print(item)

tf.Tensor(b'First record', shape=(), dtype=string)
tf.Tensor(b'Second record', shape=(), dtype=string)


In [4]:
# Write and read compressed tfrecord
options = tf.io.TFRecordOptions(compression_type='GZIP')
with tf.io.TFRecordWriter('my_compressed.tfrecord', options) as f:
    f.write(b'First record')
    f.write(b'Second record')
# The reader is given the same compression_type that the writer used
dataset = tf.data.TFRecordDataset(['my_compressed.tfrecord'],
                                 compression_type='GZIP')

In [5]:
# Protocol Buffers
#()

In [6]:
################################################################

In [7]:
# To do

In [8]:
################################################################

In [9]:
# Preprocessing the Input Features

In [None]:
# Standardization layer
#(equivalent of sklearn StandardScaler())
#(equivalent of keras.layers.Normalization)
class Standardization(keras.layers.Layer):
    """Layer that standardizes its inputs with statistics learned by adapt().

    adapt() must be called before the layer is used, because call() reads the
    ``means_`` and ``stds_`` attributes that adapt() sets.
    """

    def adapt(self, data_sample):
        """Compute the mean and standard deviation of ``data_sample`` along axis 0."""
        # keepdims=True keeps a leading axis of size 1 so the statistics broadcast over a batch
        self.means_ = np.mean(data_sample, axis=0, keepdims=True)
        self.stds_ = np.std(data_sample, axis=0, keepdims=True)

    def call(self, inputs):
        """Return ``(inputs - mean) / (std + epsilon)``."""
        # epsilon guards against division by zero when a std is 0
        return (inputs - self.means_) / (self.stds_ + keras.backend.epsilon())

# NOTE: `data` is a placeholder for a sample of the training features; it is
# not defined in the cells shown here.
std_layer = Standardization()
std_layer.adapt(data)   # fit to the data

# `Sequential` is never imported on its own, so reach it through `keras`
model = keras.models.Sequential()
model.add(std_layer)   # use in model

In [5]:
# One hot encoding
#(useful for <10 categories)
# Creating the encoder
#(equivalent of keras.layers.TextVectorization)
vocab = ['OCEAN', 'INLAND', 'NEAR_OCEAN', 'NEAR_BAY', 'ISLAND']
indices = tf.range(len(vocab), dtype=tf.int64)   # one integer id per vocab entry, in list order
table_init = tf.lookup.KeyValueTensorInitializer(vocab, indices)   # pairs each category string with its id
num_oov_buckets = 2   # for categories not in vocab
table = tf.lookup.StaticVocabularyTable(table_init, num_oov_buckets)   # the look up table to be used
# Applying to dataset
categories = tf.constant(['NEAR_BAY', 'DESERT', 'INLAND', 'INLAND'])   # 'DESERT' is not in vocab
cat_id = table.lookup(categories)   # category string -> integer id
cat_one_hot = tf.one_hot(cat_id, depth=len(vocab) + num_oov_buckets)   # depth = known categories + oov buckets
cat_one_hot

<tf.Tensor: id=23, shape=(4, 7), dtype=float32, numpy=
array([[0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0.]], dtype=float32)>

In [6]:
# Embedding 
#(for >50 categories)
#(learn a target vector representation of n dimensions)

In [7]:
# Manual embedding layer
embedding_dim = 2   # length of the vector associated with each category id
# One randomly initialised row per category id (vocab entries + oov buckets)
embed_init = tf.random.uniform([len(vocab) + num_oov_buckets, embedding_dim])
embedding_matrix = tf.Variable(embed_init)   # wrapped in a Variable so the values can be updated
embedding_matrix

<tf.Variable 'Variable:0' shape=(7, 2) dtype=float32, numpy=
array([[0.35980713, 0.68890977],
       [0.32505107, 0.20529878],
       [0.57066727, 0.38657212],
       [0.49282324, 0.46532512],
       [0.21857   , 0.6337639 ],
       [0.19484985, 0.3122424 ],
       [0.75659215, 0.30861425]], dtype=float32)>

In [9]:
# Same lookup as in the one-hot cell; 'DESERT' is not in vocab, so it receives an out-of-vocabulary id
categories = tf.constant(['NEAR_BAY', 'DESERT', 'INLAND', 'INLAND'])
cat_id = table.lookup(categories)
cat_id

<tf.Tensor: id=45, shape=(4,), dtype=int64, numpy=array([3, 5, 1, 1], dtype=int64)>

In [10]:
# Returns the rows of embedding_matrix selected by the ids in cat_id
tf.nn.embedding_lookup(embedding_matrix, cat_id)

<tf.Tensor: id=47, shape=(4, 2), dtype=float32, numpy=
array([[0.49282324, 0.46532512],
       [0.19484985, 0.3122424 ],
       [0.32505107, 0.20529878],
       [0.32505107, 0.20529878]], dtype=float32)>

In [12]:
# Keras embedding layer
# The layer holds its own weight matrix: the values it returns differ from embedding_matrix above
embedding = keras.layers.Embedding(input_dim=len(vocab) + num_oov_buckets,
                                  output_dim=embedding_dim)
embedding(cat_id)

<tf.Tensor: id=61, shape=(4, 2), dtype=float32, numpy=
array([[-0.02672741,  0.0292397 ],
       [-0.03435816, -0.02178714],
       [ 0.0432584 , -0.03545972],
       [ 0.0432584 , -0.03545972]], dtype=float32)>

In [15]:
# Full model
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Lambda, Embedding, Dense, concatenate

# Two inputs: 8 numeric features, and one category string per example
regular_input = Input(shape=[8])
categories = Input(shape=[], dtype=tf.string)
# Map each category string to its integer id with the lookup table built earlier
cat_id = Lambda(lambda cats: table.lookup(cats))(categories)   # alternative = TextVectorization().adapt(categories)
# Size the embedding from the vocabulary instead of hard-coding 7 and 2, so it
# stays in sync with `table` if vocab or num_oov_buckets change
cat_embed = Embedding(input_dim=len(vocab) + num_oov_buckets,
                      output_dim=embedding_dim)(cat_id)
encoded_input = concatenate([regular_input, cat_embed])   # numeric features + learned category vector
outputs = Dense(1)(encoded_input)

model = Model(inputs=[regular_input, categories],
             outputs=[outputs])

In [17]:
# Print the layer-by-layer summary (output shapes, parameter counts, connections) of the model
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_6 (InputLayer)            [(None,)]            0                                            
__________________________________________________________________________________________________
lambda_2 (Lambda)               (None,)              0           input_6[0][0]                    
__________________________________________________________________________________________________
input_5 (InputLayer)            [(None, 8)]          0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 2)            14          lambda_2[0][0]                   
______________________________________________________________________________________________

In [18]:
# Keras Preprocessing Layers

In [None]:
# Chaining preprocessing layers
from tensorflow.keras.layers import Normalization, Discretization, PreprocessingStage

# NOTE: `data` is a placeholder; it is not defined in the cells shown here.
normalization = Normalization()
discretization = Discretization()   # binning continuous variable
# Chain the two layers so they are adapted and applied as one unit
pipeline = PreprocessingStage([normalization, discretization])
pipeline.adapt(data)

In [19]:
#(for a custom preprocessing class, subclass PreprocessingStage with an adapt() method that takes in 'data'
# and a boolean 'reset_state')

In [20]:
# TF Transform
#(takes a function written with tf transform functions, outputs an equivalent tensorflow 
#preprocessing function to plug into the model)

In [21]:
#todo: install tensorflow_transform