<a href="https://colab.research.google.com/github/nigoda/machine_learning/blob/main/05_BuildingPipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow as tf
import pathlib
import os
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [None]:
# Build a dataset from an in-memory list; each list entry becomes one scalar element.
dataset = tf.data.Dataset.from_tensor_slices([5,5,4,1,5,2])
dataset

<TensorSliceDataset shapes: (), types: tf.int32>

In [None]:
# A dataset is a Python iterable: loop over it and convert each tensor to NumPy for printing.
for elem in dataset:
  print(elem.numpy())

5
5
4
1
5
2


In [None]:
# Create an explicit iterator over the dataset and pull only its first element.
it = iter(dataset)
print(next(it).numpy())

5


In [None]:
# Fold all elements into a single value: start from 0 and add each element to the running state.
print(dataset.reduce(0, lambda state, value : state + value).numpy())

22


### **Dataset structure**

In [None]:
# Slice a random [4, 10] tensor along its first axis; element_spec describes one element.
dataset1 = tf.data.Dataset.from_tensor_slices(tf.random.uniform([4,10]))
dataset1.element_spec

TensorSpec(shape=(10,), dtype=tf.float32, name=None)

In [None]:
# Slicing a tuple of tensors gives elements that are tuples: here a float scalar
# paired with an int32 vector of length 100.
dataset2 = tf.data.Dataset.from_tensor_slices(
    (tf.random.uniform([4]),
    tf.random.uniform([4,100],maxval=100, dtype=tf.int32))
)
dataset2.element_spec

(TensorSpec(shape=(), dtype=tf.float32, name=None),
 TensorSpec(shape=(100,), dtype=tf.int32, name=None))

In [None]:
# Zip the two datasets; the resulting element spec nests the spec of each input.
dataset3 = tf.data.Dataset.zip((dataset1,dataset2))
dataset3.element_spec

(TensorSpec(shape=(10,), dtype=tf.float32, name=None),
 (TensorSpec(shape=(), dtype=tf.float32, name=None),
  TensorSpec(shape=(100,), dtype=tf.int32, name=None)))

In [None]:
# Dataset containing a single sparse tensor: a [3, 4] tensor with two non-zero entries.
dataset4 = tf.data.Dataset.from_tensors(tf.SparseTensor(indices=[[0,0],[1,2]], values=[1,2], dense_shape=[3,4]))

dataset4.element_spec


SparseTensorSpec(TensorShape([3, 4]), tf.int32)

In [None]:
# Use value_type to see the Python type of the value that the element spec represents.
dataset4.element_spec.value_type


tensorflow.python.framework.sparse_tensor.SparseTensor

In [None]:
# Rebind dataset1 to random int32 rows (minval=1, maxval=10) so the printed values are easy to read.
dataset1 = tf.data.Dataset.from_tensor_slices(
    tf.random.uniform([4,10], minval=1,maxval=10,dtype = tf.int32)
)
dataset1

<TensorSliceDataset shapes: (10,), types: tf.int32>

In [None]:
# Print each row of dataset1 as a NumPy array.
for z in dataset1:
  print(z.numpy())
 

[1 6 3 4 3 1 3 9 6 5]
[5 2 7 5 3 2 5 7 7 3]
[1 1 7 7 7 2 1 2 3 4]
[5 6 6 7 4 7 6 2 7 7]


In [None]:
# Rebuild dataset2 as (float scalar, int32 100-vector) pairs.
# The name must be the lowercase `dataset2`: the original cell assigned to
# `Dataset2`, so the display on the last line and the later
# tf.data.Dataset.zip((dataset1, dataset2)) call silently kept using the stale
# dataset from the earlier cell.
dataset2 = tf.data.Dataset.from_tensor_slices(
    (tf.random.uniform([4]),
    tf.random.uniform([4,100], maxval=100, dtype=tf.int32))
)
dataset2

<TensorSliceDataset shapes: ((), (100,)), types: (tf.float32, tf.int32)>

In [None]:
# Zip the integer-row dataset1 with dataset2 and display the combined dataset.
dataset3=tf.data.Dataset.zip((dataset1,dataset2))
dataset3

<ZipDataset shapes: ((10,), ((), (100,))), types: (tf.int32, (tf.float32, tf.int32))>

In [None]:
# The loop target mirrors the nested element structure: a from dataset1, (b, c) from dataset2.
for a, (b,c) in dataset3:
  print('shapes: {a.shape}, {b.shape}, {c.shape}'.format(a=a, b=b, c=c))

shapes: (10,), (), (100,)
shapes: (10,), (), (100,)
shapes: (10,), (), (100,)
shapes: (10,), (), (100,)


### **Reading input data**

Consuming NumPy arrays

In [None]:
# Load Fashion-MNIST; load_data() returns a pair that is unpacked into train and test.
train, test = tf.keras.datasets.fashion_mnist.load_data()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-images-idx3-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-images-idx3-ubyte.gz


In [None]:
# Split the training pair into images and labels, then scale the image values by 1/255.
images, labels = train
images = images/255

# Slice the (images, labels) tuple so each dataset element is one (image, label) pair.
dataset = tf.data.Dataset.from_tensor_slices((images, labels))
dataset

<TensorSliceDataset shapes: ((28, 28), ()), types: (tf.float64, tf.uint8)>

Consuming Python generators

In [None]:
def count(stop):
  """Yield 0, 1, 2, ... for as long as the value is strictly below `stop`.

  Nothing is yielded when `stop` is zero or negative.
  """
  current = 0
  while True:
    # Same test as a `while current < stop` loop, written as a guard clause.
    if not current < stop:
      return
    yield current
    current += 1

In [None]:
# Sanity-check the plain Python generator before wrapping it in a dataset.
for n in count(5):
  print(n)

0
1
2
3
4


In [None]:
# Wrap the generator function as a dataset of int32 scalars; args=[25] is forwarded to count().
ds_counter = tf.data.Dataset.from_generator(count, args=[25], output_types=tf.int32, output_shapes=(),)
ds_counter

<FlatMapDataset shapes: (), types: tf.int32>

In [None]:
# repeat() restarts the generator once it is exhausted, so batches of 10 wrap around
# from 24 back to 0; take(10) limits the otherwise endless stream to ten batches.
for count_batch in ds_counter.repeat().batch(10).take(10):
  print(count_batch.numpy())

[0 1 2 3 4 5 6 7 8 9]
[10 11 12 13 14 15 16 17 18 19]
[20 21 22 23 24  0  1  2  3  4]
[ 5  6  7  8  9 10 11 12 13 14]
[15 16 17 18 19 20 21 22 23 24]
[0 1 2 3 4 5 6 7 8 9]
[10 11 12 13 14 15 16 17 18 19]
[20 21 22 23 24  0  1  2  3  4]
[ 5  6  7  8  9 10 11 12 13 14]
[15 16 17 18 19 20 21 22 23 24]


In [None]:
def gen_series():
  """Endlessly yield (index, samples) pairs.

  `index` counts up from 0. `samples` is a 1-D array of normally distributed
  values whose length is drawn with np.random.randint(0, 10) on every step.
  """
  index = 0
  while True:
    length = np.random.randint(0, 10)
    samples = np.random.normal(size=(length,))
    yield index, samples
    index += 1

In [None]:
# gen_series() never terminates on its own, so break out explicitly once the index exceeds 5.
for i, series in gen_series():
  print(i, ":", str(series))
  if i >5:
    break

0 : [1.54419956 1.41655991]
1 : []
2 : [-1.47177731  1.28122225  0.47611254 -1.09857556  0.53695867  0.13464602]
3 : [0.86423432 1.49255668 2.31316157]
4 : [-0.43337892 -0.83019525 -1.19182    -0.15507808  0.1239392  -0.16081916]
5 : [-1.28066436  0.22036254  2.04078702  0.95411024 -1.06612376  1.30225528]
6 : [ 0.85285236  0.02749249  0.11952706  0.54629505 -0.07288232 -0.35385807]


In [None]:
# Wrap gen_series as a dataset of (int32 scalar, float32 vector) pairs.
# The vector shape is declared as (None,) because each series has a different length.
ds_series = tf.data.Dataset.from_generator(
    gen_series,
    output_types=(tf.int32, tf.float32),
    output_shapes = ((),(None,))  
)
ds_series

<FlatMapDataset shapes: ((), (None,)), types: (tf.int32, tf.float32)>

In [None]:
# Shuffle with a 20-element buffer, then batch 10 series at a time; padded_batch
# pads the variable-length series so they can be stacked into one batch.
ds_series_batch = ds_series.shuffle(20).padded_batch(10)

# Pull a single batch and show the ids alongside the padded sequences.
ids,sequence_batch = next(iter(ds_series_batch))
print(ids.numpy())
print()
print(sequence_batch.numpy())

[16  1  7  5 10 23  2  8 20 15]

[[ 0.3087899   1.8356315   1.577824   -0.3799068  -0.32352236  2.6610785
  -0.3367504   0.        ]
 [ 0.49712804  1.3229223   1.1679293   0.2135402   1.0732572  -0.34370953
  -0.6063715   0.        ]
 [ 0.03957643 -1.3862811  -0.5884637   0.2028329   1.0666182  -0.41139233
   0.          0.        ]
 [ 1.8360199   1.7281536  -0.53105974  0.43020478 -1.5893319   1.3242542
   0.          0.        ]
 [-0.4001853   0.01381919  0.          0.          0.          0.
   0.          0.        ]
 [ 0.87876946  0.5502658  -2.3261123   0.93581164 -1.008771   -0.98900694
   0.06739069  0.15198636]
 [ 1.9479853   0.1485789   1.0994276  -0.9851562   0.6651141   0.
   0.          0.        ]
 [-0.6084662  -1.4032997   0.65778726  0.          0.          0.
   0.          0.        ]
 [-0.8060331   1.156996   -1.3831948   0.          0.          0.
   0.          0.        ]
 [-0.6371768   2.2786148  -0.71292573 -0.58691925  2.0967684   0.17653255
   0.          0. 

In [None]:
# For a more realistic example, wrap preprocessing.image.ImageDataGenerator as a tf.data.Dataset.
# First download (and untar) the flower photos archive:
flowers = tf.keras.utils.get_file(
    'flower_photos',
    'https://storage.googleapis.com/download.tensorflow.org/example_images/flower_photos.tgz',
    untar=True)

Downloading data from https://storage.googleapis.com/download.tensorflow.org/example_images/flower_photos.tgz


In [None]:
# Image generator configured to rescale values by 1/255 and with rotation_range=20.
img_gen = tf.keras.preprocessing.image.ImageDataGenerator(rescale=1./255,rotation_range = 20)

In [None]:
# Pull one batch from the directory iterator to inspect what it yields.
# Note: this rebinds the notebook-level names `images` and `labels`.
images, labels = next(img_gen.flow_from_directory(flowers))

Found 3670 images belonging to 5 classes.


In [None]:
# Inspect dtype and shape of the batch; these values are reused for from_generator's
# output_types / output_shapes.
print(images.dtype, images.shape)
print(labels.dtype, labels.shape)

float32 (32, 256, 256, 3)
float32 (32, 5)


In [None]:
# Wrap the Keras directory iterator as a dataset. The lambda defers creating the
# iterator until the dataset is actually iterated.
# NOTE(review): output_shapes pins the batch dimension to 32, but the directory holds
# 3670 images, which is not a multiple of 32. The final batch of a pass is presumably
# smaller and would not match this declared shape; confirm before iterating beyond a
# few batches (a None batch dimension would avoid the mismatch).
ds = tf.data.Dataset.from_generator(
    lambda: img_gen.flow_from_directory(flowers),
    output_types = (tf.float32, tf.float32),
    output_shapes= ([32,256,256,3],[32,5]),
)
ds.element_spec

(TensorSpec(shape=(32, 256, 256, 3), dtype=tf.float32, name=None),
 TensorSpec(shape=(32, 5), dtype=tf.float32, name=None))

In [None]:
# Read a single batch through the dataset wrapper and confirm its shapes.
for images, labels in ds.take(1):
  print('images.shapes: ',images.shape)
  print('labels.shapes: ',labels.shape)

Found 3670 images belonging to 5 classes.
images.shapes:  (32, 256, 256, 3)
labels.shapes:  (32, 5)


Consuming TFRecord data

In [None]:
# Download a single TFRecord test file (fsns-00000-of-00001); it is read as a dataset in the next cell.
fsns_test_file = tf.keras.utils.get_file("fsns.tfrec", "https://storage.googleapis.com/download.tensorflow.org/data/fsns-20160927/testdata/fsns-00000-of-00001")

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/fsns-20160927/testdata/fsns-00000-of-00001


In [None]:
# Read the records of the downloaded file; filenames takes a list, here with one entry.
dataset = tf.data.TFRecordDataset(filenames=[fsns_test_file])
dataset

<TFRecordDatasetV2 shapes: (), types: tf.string>

In [None]:
# Take the first serialized record and decode its bytes as a tf.train.Example message.
raw_example = next(iter(dataset))
parsed = tf.train.Example.FromString(raw_example.numpy())

# Look up one feature of the decoded example by name.
parsed.features.feature['image/text']

bytes_list {
  value: "Rue Perreyon"
}

Consuming text data

In [None]:
# Download three text files that live under the same directory URL.
directory_url = 'https://storage.googleapis.com/download.tensorflow.org/data/illiad/'
file_names = ['cowper.txt','derby.txt','butler.txt']

# file_path is a list with one get_file() result per file name.
file_path = [tf.keras.utils.get_file(file_name, directory_url + file_name)
             for file_name in file_names ]

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/illiad/derby.txt
Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/illiad/butler.txt


In [None]:
# Build a line-by-line dataset over the list of downloaded text files.
dataset = tf.data.TextLineDataset(file_path)

In [None]:
# Here are the first few lines of the first file:
for line in dataset.take(5):
  print(line.numpy())

b"\xef\xbb\xbfAchilles sing, O Goddess! Peleus' son;"
b'His wrath pernicious, who ten thousand woes'
b"Caused to Achaia's host, sent many a soul"
b'Illustrious into Ades premature,'
b'And Heroes gave (so stood the will of Jove)'


In [None]:
# To alternate lines between files, use Dataset.interleave.
# This makes it easier to shuffle files together. Here are the first, second and third lines from each translation:

files_ds = tf.data.Dataset.from_tensor_slices(file_path)
lines_ds = files_ds.interleave(tf.data.TextLineDataset,cycle_length=3)

# Print a blank line before every group of three lines (one line per file).
for i, line in enumerate(lines_ds.take(9)):
  if i%3 == 0:
    print()
  print(line.numpy())


b"\xef\xbb\xbfAchilles sing, O Goddess! Peleus' son;"
b"\xef\xbb\xbfOf Peleus' son, Achilles, sing, O Muse,"
b'\xef\xbb\xbfSing, O goddess, the anger of Achilles son of Peleus, that brought'

b'His wrath pernicious, who ten thousand woes'
b'The vengeance, deep and deadly; whence to Greece'
b'countless ills upon the Achaeans. Many a brave soul did it send'

b"Caused to Achaia's host, sent many a soul"
b'Unnumbered ills arose; which many a soul'
b'hurrying down to Hades, and many a hero did it yield a prey to dogs and'


In [None]:
# Titanic data set: download the training CSV and read it as raw text lines.
titanic_file = tf.keras.utils.get_file("train.csv", "https://storage.googleapis.com/tf-datasets/titanic/train.csv")
titanic_lines = tf.data.TextLineDataset(titanic_file)


In [None]:
# Show the first ten raw lines; the first one is the CSV header.
for line in titanic_lines.take(10):
  print(line.numpy())

b'survived,sex,age,n_siblings_spouses,parch,fare,class,deck,embark_town,alone'
b'0,male,22.0,1,0,7.25,Third,unknown,Southampton,n'
b'1,female,38.0,1,0,71.2833,First,C,Cherbourg,n'
b'1,female,26.0,0,0,7.925,Third,unknown,Southampton,y'
b'1,female,35.0,1,0,53.1,First,C,Southampton,n'
b'0,male,28.0,0,0,8.4583,Third,unknown,Queenstown,y'
b'0,male,2.0,3,1,21.075,Third,unknown,Southampton,n'
b'1,female,27.0,0,2,11.1333,Third,unknown,Southampton,n'
b'1,female,14.0,1,0,30.0708,Second,unknown,Cherbourg,n'
b'1,female,4.0,1,1,16.7,Third,G,Southampton,n'


In [None]:
def survived(line):
  """Return a boolean tensor that is True when `line` does not start with "0"."""
  first_char = tf.strings.substr(line, 0, 1)
  return tf.not_equal(first_char, "0")

# Drop the first line of the file, then keep only the lines accepted by survived().
survivors = titanic_lines.skip(1).filter(survived)

In [None]:
# Show the first ten lines that passed the filter.
for line in survivors.take(10):
  print(line.numpy())

b'1,female,38.0,1,0,71.2833,First,C,Cherbourg,n'
b'1,female,26.0,0,0,7.925,Third,unknown,Southampton,y'
b'1,female,35.0,1,0,53.1,First,C,Southampton,n'
b'1,female,27.0,0,2,11.1333,Third,unknown,Southampton,n'
b'1,female,14.0,1,0,30.0708,Second,unknown,Cherbourg,n'
b'1,female,4.0,1,1,16.7,Third,G,Southampton,n'
b'1,male,28.0,0,0,13.0,Second,unknown,Southampton,y'
b'1,female,28.0,0,0,7.225,Third,unknown,Cherbourg,y'
b'1,male,28.0,0,0,35.5,First,A,Southampton,y'
b'1,female,38.0,1,5,31.3875,Third,unknown,Southampton,n'


Consuming CSV Data

In [None]:
# Fetch the Titanic training CSV (same file name and URL as in the text-line example).
titanic_file = tf.keras.utils.get_file("train.csv", "https://storage.googleapis.com/tf-datasets/titanic/train.csv")

In [None]:
# Load the CSV into a pandas DataFrame and preview the first rows.
df = pd.read_csv(titanic_file, index_col=None)
df.head()

Unnamed: 0,survived,sex,age,n_siblings_spouses,parch,fare,class,deck,embark_town,alone
0,0,male,22.0,1,0,7.25,Third,unknown,Southampton,n
1,1,female,38.0,1,0,71.2833,First,C,Cherbourg,n
2,1,female,26.0,0,0,7.925,Third,unknown,Southampton,y
3,1,female,35.0,1,0,53.1,First,C,Southampton,n
4,0,male,28.0,0,0,8.4583,Third,unknown,Queenstown,y


In [None]:
# dict(df) maps each column name to its column, so every dataset element is a
# dictionary holding one row's value per column.
titanic_slices = tf.data.Dataset.from_tensor_slices(dict(df))

# Print the first element, one "key: value" line per column.
for feature_batch in titanic_slices.take(1):
  for key,value in feature_batch.items():
    print(" {!r:20s}: {}".format(key, value))

 'survived'          : 0
 'sex'               : b'male'
 'age'               : 22.0
 'n_siblings_spouses': 1
 'parch'             : 0
 'fare'              : 7.25
 'class'             : b'Third'
 'deck'              : b'unknown'
 'embark_town'       : b'Southampton'
 'alone'             : b'n'


In [None]:
# Read the CSV directly as batches of 4 rows; label_name separates the "survived"
# column from the feature columns.
titanic_batches = tf.data.experimental.make_csv_dataset(
    titanic_file, batch_size = 4,
    label_name="survived"
)

In [None]:
# Each element is a (features dict, labels) pair; print one batch of each.
for feature_batch, label_batch in titanic_batches.take(1):
  print("'survived' : {}".format(label_batch))
  print("features:")
  for key, value in feature_batch.items():
    print(" {!r:20s}: {}".format(key, value))

'survived' : [0 0 0 0]
features:
 'sex'               : [b'male' b'male' b'male' b'male']
 'age'               : [34. 34. 46. 28.]
 'n_siblings_spouses': [0 1 0 0]
 'parch'             : [0 1 0 0]
 'fare'              : [ 8.05 14.4  26.    8.05]
 'class'             : [b'Third' b'Third' b'Second' b'Third']
 'deck'              : [b'unknown' b'unknown' b'unknown' b'unknown']
 'embark_town'       : [b'Southampton' b'Southampton' b'Southampton' b'Southampton']
 'alone'             : [b'y' b'n' b'y' b'y']


In [None]:
# Same as before, but select_columns restricts the columns that are read to
# 'class', 'fare' and the 'survived' label.
titanic_batches = tf.data.experimental.make_csv_dataset(
    titanic_file, batch_size=4,
    label_name = "survived", select_columns=['class','fare','survived']
)

In [None]:
# Print one batch; only the selected feature columns appear in the features dict.
for feature_batch, label_batch in titanic_batches.take(1):
  print("'survived :' {}".format(label_batch))
  for key, value in feature_batch.items():
    print(" {!r:20s}: {}".format(key, value))

'survived :' [0 0 1 0]
 'fare'              : [108.9    8.05  79.2   69.55]
 'class'             : [b'First' b'Third' b'First' b'Third']


In [None]:
# One dtype per CSV column, in file order; passed to CsvDataset as its record defaults.
titanic_types = [
    tf.int32, tf.string, tf.float32, tf.int32, tf.int32,
    tf.float32, tf.string, tf.string, tf.string, tf.string,
]
# header=True tells the reader that the first line of the file is a header row.
dataset = tf.data.experimental.CsvDataset(titanic_file, titanic_types, header=True)

# Each element is a tuple of per-column tensors; convert them to NumPy values for printing.
for line in dataset.take(10):
  print([field.numpy() for field in line])

[0, b'male', 22.0, 1, 0, 7.25, b'Third', b'unknown', b'Southampton', b'n']
[1, b'female', 38.0, 1, 0, 71.2833, b'First', b'C', b'Cherbourg', b'n']
[1, b'female', 26.0, 0, 0, 7.925, b'Third', b'unknown', b'Southampton', b'y']
[1, b'female', 35.0, 1, 0, 53.1, b'First', b'C', b'Southampton', b'n']
[0, b'male', 28.0, 0, 0, 8.4583, b'Third', b'unknown', b'Queenstown', b'y']
[0, b'male', 2.0, 3, 1, 21.075, b'Third', b'unknown', b'Southampton', b'n']
[1, b'female', 27.0, 0, 2, 11.1333, b'Third', b'unknown', b'Southampton', b'n']
[1, b'female', 14.0, 1, 0, 30.0708, b'Second', b'unknown', b'Cherbourg', b'n']
[1, b'female', 4.0, 1, 1, 16.7, b'Third', b'G', b'Southampton', b'n']
[0, b'male', 20.0, 0, 0, 8.05, b'Third', b'unknown', b'Southampton', b'y']


In [None]:
%%writefile missing.csv
1,2,3,4
,2,3,4
1,,3,4
1,2,,4
1,2,3,
,,,


Writing missing.csv


In [None]:
# Create a dataset that reads the records of missing.csv, a file with four
# columns in which some values are missing. The integer defaults in
# record_defaults make every column int32 and fill empty fields with 999.

record_defaults = [999,999,999,999]
dataset = tf.data.experimental.CsvDataset("missing.csv", record_defaults)
# Stack the four per-column scalars of each record into a single vector.
dataset = dataset.map(lambda*items: tf.stack(items))
dataset

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'ValueError' object has no attribute 'lineno'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'ValueError' object has no attribute 'lineno'


<MapDataset shapes: (4,), types: tf.int32>

In [None]:
# Print every record; missing fields show up as the default value 999.
for line in dataset:
  print(line.numpy())

[1 2 3 4]
[999   2   3   4]
[  1 999   3   4]
[  1   2 999   4]
[  1   2   3 999]
[999 999 999 999]


In [None]:
# Create a dataset that reads missing.csv but keeps only the columns at
# zero-based indices 1 and 3 (the second and fourth columns), as int32.
record_defaults = [999,999] # Only provide defaults for the selected columns
dataset = tf.data.experimental.CsvDataset("missing.csv", record_defaults, select_cols=[1,3])
# Stack the two selected scalars of each record into a single vector.
dataset = dataset.map(lambda *items: tf.stack(items))
dataset

<MapDataset shapes: (2,), types: tf.int32>

In [None]:
# Print every record of the two-column dataset.
for line in dataset:
  print(line.numpy())

[2 4]
[2 4]
[999   4]
[2 4]
[  2 999]
[999 999]


Consuming sets of files

In [None]:
# Fetch the flower photos archive (same name and URL as used for `flowers` earlier).
flowers_root = tf.keras.utils.get_file(
    'flower_photos',
    'https://storage.googleapis.com/download.tensorflow.org/example_images/flower_photos.tgz',
    untar=True)
# Wrap the result in pathlib.Path so glob() and the `/` operator can be used below.
flowers_root = pathlib.Path(flowers_root)

In [None]:
# List the top-level entries of the extracted directory.
for item in flowers_root.glob('*'):
  print(item.name)

daisy
roses
sunflowers
dandelion
tulips
LICENSE.txt


In [None]:
# Dataset of file paths matching <root>/*/*, i.e. the entries one level below each top-level directory.
list_ds = tf.data.Dataset.list_files(str(flowers_root/'*/*'))

# Show a few of the matched paths.
for f in list_ds.take(5):
  print(f.numpy())

b'/root/.keras/datasets/flower_photos/dandelion/5673112305_02fe19297b_n.jpg'
b'/root/.keras/datasets/flower_photos/sunflowers/184682320_73ccf74710.jpg'
b'/root/.keras/datasets/flower_photos/daisy/476856232_7c35952f40_n.jpg'
b'/root/.keras/datasets/flower_photos/dandelion/425800274_27dba84fac_n.jpg'
b'/root/.keras/datasets/flower_photos/dandelion/2596413098_7ef69b7e1d_m.jpg'


In [None]:
# Read the data using the tf.io.read_file function and extract the label from the
# path, returning (image, label) pairs:

def process_path(file_path):
  """Return (raw file contents, label) for one path.

  The label is the second-to-last component of `file_path` when it is split on os.sep.
  """
  path_parts = tf.strings.split(file_path, os.sep)
  contents = tf.io.read_file(file_path)
  return contents, path_parts[-2]

labeled_ds = list_ds.map(process_path)

In [None]:
# Show the first 100 raw bytes of one file together with its label.
for image_raw, label_text in labeled_ds.take(1):
  print(repr(image_raw.numpy()[:100]))
  print()
  print(label_text.numpy())


b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00\x00\x01\x00\x01\x00\x00\xff\xdb\x00C\x00\x03\x02\x02\x03\x02\x02\x03\x03\x03\x03\x04\x03\x03\x04\x05\x08\x05\x05\x04\x04\x05\n\x07\x07\x06\x08\x0c\n\x0c\x0c\x0b\n\x0b\x0b\r\x0e\x12\x10\r\x0e\x11\x0e\x0b\x0b\x10\x16\x10\x11\x13\x14\x15\x15\x15\x0c\x0f\x17\x18\x16\x14\x18\x12\x14\x15\x14\xff\xdb\x00C\x01\x03\x04\x04\x05\x04\x05'

b'roses'
