# tf.data API

In [19]:
import tensorflow as tf

In [20]:
daily_sales_numbers = [21,22,-108, 31, 1, 32,24,31]

# Simple tf.data.Dataset object

- **from_tensor_slices** creates a dataset with a separate element  for each row of the input tensor
- **from_tensors** Combines the input and returns a dataset with a single element


In [21]:
# from tensors
array1 = [21,22,-108, 31, 1, 32,24,31]
from_tensors_dataset = tf.data.Dataset.from_tensors(array1)
print(from_tensors_dataset)
array2 = [[21,22,-108, 31, 1, 32,24,31],[21,22,-108, 31, 1, 32,24,31]]
from_tensors_dataset2 = tf.data.Dataset.from_tensors(array2)
print(from_tensors_dataset2)

<TensorDataset shapes: (8,), types: tf.int32>
<TensorDataset shapes: (2, 8), types: tf.int32>


In [22]:
tf_dataset = tf.data.Dataset.from_tensor_slices(daily_sales_numbers)
tf_dataset

<TensorSliceDataset shapes: (), types: tf.int32>

In [23]:
print("you can iterate through out your dataset")
for sales in tf_dataset:
    print(sales.numpy())

print("you can use a numpy iterator")
for sales in tf_dataset.as_numpy_iterator():
    print(sales)

print("you can select only a number of elements")
for sales in tf_dataset.take(3):
    print(sales)
    


you can iterate through out your dataset
21
22
-108
31
1
32
24
31
you can use a numpy iterator
21
22
-108
31
1
32
24
31
you can select only a number of elements
tf.Tensor(21, shape=(), dtype=int32)
tf.Tensor(22, shape=(), dtype=int32)
tf.Tensor(-108, shape=(), dtype=int32)


## Filter values from your dataset

In [24]:
print("Filter negative values")
tf_dataset_non_negative = tf_dataset.filter(lambda x: x>0)
for sales in tf_dataset_non_negative:
    print(sales)

Filter negative values
tf.Tensor(21, shape=(), dtype=int32)
tf.Tensor(22, shape=(), dtype=int32)
tf.Tensor(31, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(32, shape=(), dtype=int32)
tf.Tensor(24, shape=(), dtype=int32)
tf.Tensor(31, shape=(), dtype=int32)


# Map values from your dataset

In [25]:
def currency_mx(input_val):
    return input_val * 20

tf_dataset_mx = tf_dataset_non_negative.map(currency_mx)
for sales in tf_dataset_mx:
    print(sales)

tf.Tensor(420, shape=(), dtype=int32)
tf.Tensor(440, shape=(), dtype=int32)
tf.Tensor(620, shape=(), dtype=int32)
tf.Tensor(20, shape=(), dtype=int32)
tf.Tensor(640, shape=(), dtype=int32)
tf.Tensor(480, shape=(), dtype=int32)
tf.Tensor(620, shape=(), dtype=int32)


# Shuffle
how it works: https://stackoverflow.com/questions/53514495/what-does-batch-repeat-and-shuffle-do-with-tensorflow-dataset

In [26]:
tf_dataset_shuffled = tf_dataset_mx.shuffle(3)
for sales in tf_dataset_shuffled:
    print(sales)

tf.Tensor(420, shape=(), dtype=int32)
tf.Tensor(20, shape=(), dtype=int32)
tf.Tensor(620, shape=(), dtype=int32)
tf.Tensor(480, shape=(), dtype=int32)
tf.Tensor(440, shape=(), dtype=int32)
tf.Tensor(620, shape=(), dtype=int32)
tf.Tensor(640, shape=(), dtype=int32)


# Batching
Great for distributed environments

In [27]:
tf_dataset_batched = tf_dataset_mx.batch(3)
for sales_batch in tf_dataset_batched:
    print(sales_batch.numpy())

[420 440 620]
[ 20 640 480]
[620]


# Putting everything together

In [28]:
tf_dataset_processed = tf.data.Dataset.from_tensor_slices(daily_sales_numbers)
tf_dataset_processed = tf_dataset_processed.filter(lambda x: x>0).map(currency_mx).shuffle(2).batch(2)
for sales in tf_dataset_processed:
    print(sales.numpy())

[420 620]
[440  20]
[480 640]
[620]


# Prepare Images 

In [42]:
# Create image dataet without shuffle
image_ds = tf.data.Dataset.list_files("../datasets/pets/*/*.jpg", shuffle=False)
for file in image_ds.take(20):
    print(file.numpy())

b'../datasets/pets/cat/cat_01.jpg'
b'../datasets/pets/cat/cat_02.jpg'
b'../datasets/pets/cat/cat_03.jpg'
b'../datasets/pets/cat/cat_04.jpg'
b'../datasets/pets/cat/cat_05.jpg'
b'../datasets/pets/cat/cat_06.jpg'
b'../datasets/pets/cat/cat_07.jpg'
b'../datasets/pets/cat/cat_08.jpg'
b'../datasets/pets/dog/dog_01.jpg'
b'../datasets/pets/dog/dog_02.jpg'
b'../datasets/pets/dog/dog_03.jpg'
b'../datasets/pets/dog/dog_04.jpg'
b'../datasets/pets/dog/dog_05.jpg'
b'../datasets/pets/dog/dog_06.jpg'
b'../datasets/pets/dog/dog_07.jpg'
b'../datasets/pets/dog/dog_08.jpg'
b'../datasets/pets/dog/dog_09.jpg'


In [43]:
# Shuffle
image_ds = image_ds.shuffle(200)
for file in image_ds.take(3):
    print(file.numpy())

b'../datasets/pets/dog/dog_07.jpg'
b'../datasets/pets/dog/dog_08.jpg'
b'../datasets/pets/dog/dog_02.jpg'


In [44]:
# Now less divide
class_names = ["cat", "dog"]
image_count = len(image_ds)
print("image_count", image_count)

# size of training dataset
train_size = int(image_count * 0.8)
print("train_size", train_size)

# Create the training dataset
train_ds = image_ds.take(train_size)
test_ds = image_ds.skip(train_size) #skip is the opposite of take

print(f'size of training dataset:{len(train_ds)}, size of test dataset:{len(test_ds)}' )

image_count 17
train_size 13
size of training dataset:13, size of test dataset:4


In [45]:
import os
def get_label(path): #path is a tensor
    return tf.strings.split(path, os.path.sep)[3]

s='../datasets/pets/cat/Cat returns home after family believe they cremated.jpg'
label = get_label(s)
label

<tf.Tensor: shape=(), dtype=string, numpy=b'cat'>

In [46]:
for item in train_ds.map(get_label):
    print(item)

tf.Tensor(b'dog', shape=(), dtype=string)
tf.Tensor(b'cat', shape=(), dtype=string)
tf.Tensor(b'cat', shape=(), dtype=string)
tf.Tensor(b'cat', shape=(), dtype=string)
tf.Tensor(b'dog', shape=(), dtype=string)
tf.Tensor(b'cat', shape=(), dtype=string)
tf.Tensor(b'dog', shape=(), dtype=string)
tf.Tensor(b'cat', shape=(), dtype=string)
tf.Tensor(b'dog', shape=(), dtype=string)
tf.Tensor(b'cat', shape=(), dtype=string)
tf.Tensor(b'dog', shape=(), dtype=string)
tf.Tensor(b'dog', shape=(), dtype=string)
tf.Tensor(b'cat', shape=(), dtype=string)


In [47]:
import numpy as np
def process_image(filepath):
    label = get_label(filepath)
    image = tf.io.read_file(filepath)
    #decode jpg image
    image = tf.image.decode_jpeg(image)
    #resize
    #mage = tf.image.resize(image, [128,128])
    return image, label

for (image, label) in train_ds.map(process_image):
    print(f'label: {label}')
    print(f'image_shape: {np.shape(image.numpy())}')


label: b'dog'
image_shape: (825, 800, 3)
label: b'dog'
image_shape: (254, 380, 3)
label: b'cat'
image_shape: (280, 310, 3)
label: b'cat'
image_shape: (253, 380, 3)
label: b'dog'
image_shape: (1414, 2121, 3)
label: b'cat'
image_shape: (726, 982, 3)
label: b'dog'
image_shape: (1437, 2560, 3)
label: b'dog'
image_shape: (602, 1200, 3)
label: b'cat'
image_shape: (886, 1000, 3)
label: b'cat'
image_shape: (360, 640, 3)
label: b'cat'
image_shape: (216, 324, 3)
label: b'dog'
image_shape: (216, 324, 3)
label: b'cat'
image_shape: (1546, 1118, 3)


# Exercise with textfiles

In [None]:
# Create a dataset
texts_ds = tf.data.Dataset.list_files("../datasets/ex01/reviews/*/*.txt")
for text in texts_ds:
    print(text)


# Filter -> remove empty text datasets

In [None]:
# filtering has a problem: https://github.com/tensorflow/tensorflow/issues/46685
# Basically you can't transform to numpy()
def to_numpy(x):
    print(x)
    return x.numpy()

# You can't do this
#texts_ds = texts_ds.map(to_numpy)

In [None]:
def process_texts_dataset(filepath):    
    # get label
    label = tf.strings.split(filepath, os.path.sep)[4]
    #read file
    text = tf.io.read_file(filepath)    
    return (label, text)

# Extract: create labels and reviews
reviews_labels_ds = texts_ds.map(process_texts_dataset)    
for (label, review) in reviews_labels_ds:
    print(f'label----->{label}, size: {len(review.numpy())}')

print("AFTER FILTER")
# filter here
reviews_labels_ds = reviews_labels_ds.filter(lambda label, review : review!='')
for (label, review) in reviews_labels_ds:
    print(f'label----->{label}, size: {len(review.numpy())}')