# tf.data API

In [1]:
import tensorflow as tf

In [2]:
daily_sales_numbers = [21,22,-108, 31, 1, 32,24,31]

# Simple tf.data.Dataset object

- **from_tensor_slices** creates a dataset with a separate element  for each row of the input tensor
- **from_tensors** Combines the input and returns a dataset with a single element


In [3]:
# from tensors
array1 = [21,22,-108, 31, 1, 32,24,31]
from_tensors_dataset = tf.data.Dataset.from_tensors(array1)
print(from_tensors_dataset)
array2 = [[21,22,-108, 31, 1, 32,24,31],[21,22,-108, 31, 1, 32,24,31]]
from_tensors_dataset2 = tf.data.Dataset.from_tensors(array2)
print(from_tensors_dataset2)

Metal device set to: Apple M1
<TensorDataset shapes: (8,), types: tf.int32>
<TensorDataset shapes: (2, 8), types: tf.int32>


2021-11-04 13:25:28.774458: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2021-11-04 13:25:28.774595: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [4]:
tf_dataset = tf.data.Dataset.from_tensor_slices(daily_sales_numbers)
tf_dataset

<TensorSliceDataset shapes: (), types: tf.int32>

In [5]:
print("you can iterate through out your dataset")
for sales in tf_dataset:
    print(sales.numpy())

print("you can use a numpy iterator")
for sales in tf_dataset.as_numpy_iterator():
    print(sales)

print("you can select only a number of elements")
for sales in tf_dataset.take(3):
    print(sales)
    


you can iterate through out your dataset
21
22
-108
31
1
32
24
31
you can use a numpy iterator
21
22
-108
31
1
32
24
31
you can select only a number of elements
tf.Tensor(21, shape=(), dtype=int32)
tf.Tensor(22, shape=(), dtype=int32)
tf.Tensor(-108, shape=(), dtype=int32)


## Filter values from your dataset

In [6]:
print("Filter negative values")
tf_dataset_non_negative = tf_dataset.filter(lambda x: x>0)
for sales in tf_dataset_non_negative:
    print(sales)

Filter negative values
tf.Tensor(21, shape=(), dtype=int32)
tf.Tensor(22, shape=(), dtype=int32)
tf.Tensor(31, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(32, shape=(), dtype=int32)
tf.Tensor(24, shape=(), dtype=int32)
tf.Tensor(31, shape=(), dtype=int32)


2021-11-04 13:25:28.808328: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)
2021-11-04 13:25:28.808451: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


# Map values from your dataset

In [7]:
def currency_mx(input_val):
    return input_val * 20

tf_dataset_mx = tf_dataset_non_negative.map(currency_mx)
for sales in tf_dataset_mx:
    print(sales)

tf.Tensor(420, shape=(), dtype=int32)
tf.Tensor(440, shape=(), dtype=int32)
tf.Tensor(620, shape=(), dtype=int32)
tf.Tensor(20, shape=(), dtype=int32)
tf.Tensor(640, shape=(), dtype=int32)
tf.Tensor(480, shape=(), dtype=int32)
tf.Tensor(620, shape=(), dtype=int32)


# Shuffle
how it works: https://stackoverflow.com/questions/53514495/what-does-batch-repeat-and-shuffle-do-with-tensorflow-dataset

In [8]:
tf_dataset_shuffled = tf_dataset_mx.shuffle(3)
for sales in tf_dataset_shuffled:
    print(sales)

tf.Tensor(620, shape=(), dtype=int32)
tf.Tensor(20, shape=(), dtype=int32)
tf.Tensor(640, shape=(), dtype=int32)
tf.Tensor(480, shape=(), dtype=int32)
tf.Tensor(420, shape=(), dtype=int32)
tf.Tensor(440, shape=(), dtype=int32)
tf.Tensor(620, shape=(), dtype=int32)


# Batching
Great for distributed environments

In [9]:
tf_dataset_batched = tf_dataset_mx.batch(3)
for sales_batch in tf_dataset_batched:
    print(sales_batch.numpy())

[420 440 620]
[ 20 640 480]
[620]


# Putting everything together

In [10]:
tf_dataset_processed = tf.data.Dataset.from_tensor_slices(daily_sales_numbers)
tf_dataset_processed = tf_dataset_processed.filter(lambda x: x>0).map(currency_mx).shuffle(2).batch(2)
for sales in tf_dataset_processed:
    print(sales.numpy())

[440 620]
[420  20]
[640 480]
[620]


# Prepare Images 

In [11]:
# Create image dataet without shuffle
image_ds = tf.data.Dataset.list_files("../datasets/pets/*/*", shuffle=False)
for file in image_ds.take(20):
    print(file.numpy())

b'../datasets/pets/cat/Cartoon Cat_ im\xc3\xa1genes_ fotos de stock y vectores_yy.jpg'
b'../datasets/pets/cat/Cat _ Traductor de ingl\xc3\xa9s a espa\xc3\xb1ol - ingl\xc3\xa9s.com.jpg'
b'../datasets/pets/cat/Cat definici\xc3\xb3n y significado _ Diccionario Ingl\xc3\xa9s C.jpg'
b'../datasets/pets/cat/Cat returns home after family believe they cremated.jpg'
b'../datasets/pets/cat/Grumpy Cat - Wikipedia_ la enciclopedia libre.jpg'
b'../datasets/pets/cat/Grumpy Cat_ muere la leyenda felina de internet_yyt.jpg'
b'../datasets/pets/cat/How the Cat Gets Its Stripes_ It_s Genetics__yythkg.jpg'
b'../datasets/pets/cat/cat en espa\xc3\xb1ol _ Traductor ingl\xc3\xa9s-espa\xc3\xb1ol _ Nglish_.jpg'
b'../datasets/pets/dog/45 Best Large Dog Breeds - Top Big Dogs_yyth.jpg'
b'../datasets/pets/dog/9 Reasons to Own a Dog.jpg'
b'../datasets/pets/dog/Dog _ Traductor de ingl\xc3\xa9s a espa\xc3\xb1ol - ingl\xc3\xa9s.com.jpg'
b'../datasets/pets/dog/Dog breeds_ Most popular in America.jpg'
b'../datasets/pets/d

In [12]:
# Shuffle
image_ds = image_ds.shuffle(200)
for file in image_ds.take(3):
    print(file.numpy())

b'../datasets/pets/dog/45 Best Large Dog Breeds - Top Big Dogs_yyth.jpg'
b'../datasets/pets/dog/Dog _ Traductor de ingl\xc3\xa9s a espa\xc3\xb1ol - ingl\xc3\xa9s.com.jpg'
b'../datasets/pets/dog/dog-puppy-on-garden-royalty-free-image-1586966191 - Soolide.jpg'


In [13]:
# Now less divide
class_names = ["cat", "dog"]
image_count = len(image_ds)
print("image_count", image_count)

# size of training dataset
train_size = int(image_count * 0.8)
print("train_size", train_size)

# Create the training dataset
train_ds = image_ds.take(train_size)
test_ds = image_ds.skip(train_size) #skip is the opposite of take

print(f'size of training dataset:{len(train_ds)}, size of test dataset:{len(test_ds)}' )

image_count 17
train_size 13
size of training dataset:13, size of test dataset:4


In [14]:
import os
def get_label(path): #path is a tensor
    return tf.strings.split(path, os.path.sep)[3]

s='../datasets/pets/cat/Cat returns home after family believe they cremated.jpg'
label = get_label(s)
label

<tf.Tensor: shape=(), dtype=string, numpy=b'cat'>

In [15]:
for item in train_ds.map(get_label):
    print(item)

tf.Tensor(b'dog', shape=(), dtype=string)
tf.Tensor(b'dog', shape=(), dtype=string)
tf.Tensor(b'cat', shape=(), dtype=string)
tf.Tensor(b'dog', shape=(), dtype=string)
tf.Tensor(b'dog', shape=(), dtype=string)
tf.Tensor(b'cat', shape=(), dtype=string)
tf.Tensor(b'dog', shape=(), dtype=string)
tf.Tensor(b'cat', shape=(), dtype=string)
tf.Tensor(b'cat', shape=(), dtype=string)
tf.Tensor(b'dog', shape=(), dtype=string)
tf.Tensor(b'dog', shape=(), dtype=string)
tf.Tensor(b'dog', shape=(), dtype=string)
tf.Tensor(b'cat', shape=(), dtype=string)


In [18]:
import numpy as np
def process_image(filepath):
    label = get_label(filepath)
    image = tf.io.read_file(filepath)
    #decode jpg image
    image = tf.image.decode_jpeg(image)
    #resize
    #mage = tf.image.resize(image, [128,128])
    return image, label

for (image, label) in train_ds.map(process_image):
    print(f'label: {label}')
    print(f'image_shape: {np.shape(image.numpy())}')


label: b'dog'
image_shape: (1000, 825, 3)
label: b'dog'
image_shape: (635, 640, 3)
label: b'cat'
image_shape: (280, 310, 3)
label: b'cat'
image_shape: (886, 1000, 3)
label: b'dog'
image_shape: (455, 600, 3)
label: b'dog'
image_shape: (1437, 2560, 3)
label: b'dog'
image_shape: (254, 380, 3)
label: b'dog'
image_shape: (1414, 2121, 3)
label: b'dog'
image_shape: (216, 324, 3)
label: b'cat'
image_shape: (360, 640, 3)
label: b'dog'
image_shape: (602, 1200, 3)
label: b'cat'
image_shape: (253, 380, 3)
label: b'dog'
image_shape: (825, 800, 3)


# Exercise with textfiles

In [75]:
# Create a dataset
texts_ds = tf.data.Dataset.list_files("../datasets/ex01/reviews/*/*.txt")
for text in texts_ds:
    print(text)


tf.Tensor(b'../datasets/ex01/reviews/positive/pos_3.txt', shape=(), dtype=string)
tf.Tensor(b'../datasets/ex01/reviews/negative/neg_1.txt', shape=(), dtype=string)
tf.Tensor(b'../datasets/ex01/reviews/negative/neg_3.txt', shape=(), dtype=string)
tf.Tensor(b'../datasets/ex01/reviews/positive/pos_1.txt', shape=(), dtype=string)
tf.Tensor(b'../datasets/ex01/reviews/positive/pos_2.txt', shape=(), dtype=string)
tf.Tensor(b'../datasets/ex01/reviews/negative/neg_2.txt', shape=(), dtype=string)


# Filter -> remove empty text datasets

In [62]:
# filtering has a problem: https://github.com/tensorflow/tensorflow/issues/46685
# Basically you can't transform to numpy()
def to_numpy(x):
    print(x)
    return x.numpy()

# You can't do this
#texts_ds = texts_ds.map(to_numpy)

In [89]:
def process_texts_dataset(filepath):    
    # get label
    label = tf.strings.split(filepath, os.path.sep)[4]
    #read file
    text = tf.io.read_file(filepath)    
    return (label, text)

# Extract: create labels and reviews
reviews_labels_ds = texts_ds.map(process_texts_dataset)    
for (label, review) in reviews_labels_ds:
    print(f'label----->{label}, size: {len(review.numpy())}')

print("AFTER FILTER")
# filter here
reviews_labels_ds = reviews_labels_ds.filter(lambda label, review : review!='')
for (label, review) in reviews_labels_ds:
    print(f'label----->{label}, size: {len(review.numpy())}')

label----->b'negative', size: 0
label----->b'positive', size: 999
label----->b'negative', size: 935
label----->b'positive', size: 0
label----->b'positive', size: 1762
label----->b'negative', size: 749
AFTER FILTER
label----->b'positive', size: 999
label----->b'negative', size: 935
label----->b'negative', size: 749
label----->b'positive', size: 1762


5