In [1]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import os

### Create a simple dataset

In [2]:
# 100 samples, each of shape (10, 2, 2); the first axis is the sample axis
x = np.zeros((100, 10, 2, 2))

In [3]:
# Create a dataset from the tensor x. from_tensor_slices slices along the
# first axis, so the dataset holds 100 elements of shape (10, 2, 2).

dataset1 = tf.data.Dataset.from_tensor_slices(x)

In [4]:
# Inspect the Dataset object and the spec (shape and dtype) of one element

print(dataset1)
print(dataset1.element_spec)

<TensorSliceDataset shapes: (10, 2, 2), types: tf.float64>
TensorSpec(shape=(10, 2, 2), dtype=tf.float64, name=None)


In [5]:
# Two arrays with different leading dimensions (10 vs 5): a ragged list
x2 = [np.zeros((10, 2, 2)), np.zeros((5, 2, 2))]

In [8]:
# Try creating a dataset from the ragged list x2. Its arrays do not share a
# common shape, so they cannot be stacked into a single tensor and
# from_tensor_slices raises ValueError. The error is the point of this cell,
# so catch and display it; that lets "Restart & Run All" continue past it.

try:
    dataset2 = tf.data.Dataset.from_tensor_slices(x2)
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: Can't convert non-rectangular Python sequence to Tensor.

In [9]:
# Three arrays of identical shape (10, 1), so the list is rectangular
x2 = [np.zeros((10, 1)), np.zeros((10, 1)), np.zeros((10, 1))]

In [10]:
# Create another dataset from the new x2: one element per array in the list,
# i.e. 3 elements of shape (10, 1)

dataset2 = tf.data.Dataset.from_tensor_slices(x2)

In [11]:
# Print the element_spec (shape and dtype of a single element of dataset2)

print(dataset2.element_spec)

TensorSpec(shape=(10, 1), dtype=tf.float64, name=None)


### Create a zipped dataset

In [12]:
# Combine the two datasets into one larger dataset whose elements are
# (dataset1 element, dataset2 element) pairs

dataset_zipped = tf.data.Dataset.zip((dataset1, dataset2))

In [13]:
# Print the element_spec: a tuple with one TensorSpec per zipped dataset

print(dataset_zipped.element_spec)

(TensorSpec(shape=(10, 2, 2), dtype=tf.float64, name=None), TensorSpec(shape=(10, 1), dtype=tf.float64, name=None))


In [14]:
# Define a function to find the number of batches in a dataset

def get_batches(dataset):
    """Count the elements produced by one full pass over ``dataset``.

    For a batched dataset every element is a batch, so the result is the
    number of batches; for an unbatched dataset it is the number of
    individual elements.

    Args:
        dataset: Any finite iterable, e.g. a ``tf.data.Dataset``.

    Returns:
        int: The number of elements yielded (0 for an empty iterable).

    Raises:
        Exception: Any error raised while iterating is propagated rather
            than being silently turned into a partial count.
    """
    # Iterate with a plain for-loop rather than ``while next(it):``. The old
    # form tested the truthiness of every element, so a falsy element (0, b'',
    # ...) ended the loop early and returned None, and a multi-element tensor
    # raised on truth-testing, which the bare ``except`` swallowed.
    count = 0
    for _ in dataset:
        count += 1
    return count

In [15]:
# Count the elements in the zipped Dataset. The dataset was never batched, so
# this counts individual elements; zip stops as soon as the shorter input
# (dataset2, built from a list of 3 arrays) is exhausted.

get_batches(dataset_zipped)

3

### Create a dataset from numpy arrays

In [16]:
# Load the MNIST dataset; load_data returns (features, labels) pairs for the
# train and test splits

(train_features, train_labels), (test_features, test_labels) = tf.keras.datasets.mnist.load_data()

# Confirm that the features and labels are plain NumPy arrays
print(type(train_features), type(train_labels))

<class 'numpy.ndarray'> <class 'numpy.ndarray'>


In [17]:
# Create a Dataset from the MNIST training data; passing a tuple makes each
# element a (features, label) pair

mnist_dataset = tf.data.Dataset.from_tensor_slices((train_features, train_labels))

In [18]:
# Inspect the Dataset object: one TensorSpec for the features, one for the label

print(mnist_dataset.element_spec)

(TensorSpec(shape=(28, 28), dtype=tf.uint8, name=None), TensorSpec(shape=(), dtype=tf.uint8, name=None))


In [19]:
# Pull a single element out with the take method and inspect its length
# (the element is a (features, label) tuple)

element = next(iter(mnist_dataset.take(1)))
print(len(element))

2


In [20]:
# Examine the shapes of the data: element[0] is the features tensor,
# element[1] is the label

print(element[0].shape)
print(element[1].shape)

(28, 28)
()


### Create a dataset from text data

In [25]:
# Collect the paths of every entry in the Shakespeare data directory, in
# sorted order, and print the resulting list

text_files = sorted(entry.path for entry in os.scandir('data/shakespeare-db'))

print(text_files)

["data/shakespeare-db/All's Well That Ends Well.txt", 'data/shakespeare-db/Antony and Cleopatra.txt', 'data/shakespeare-db/As You Like It.txt', 'data/shakespeare-db/Comedy of Errors.txt', 'data/shakespeare-db/Coriolanus.txt', 'data/shakespeare-db/Cymbeline.txt', 'data/shakespeare-db/Hamlet.txt', 'data/shakespeare-db/Henry IV, part 1.txt', 'data/shakespeare-db/Henry IV, part 2.txt', 'data/shakespeare-db/Henry V.txt', 'data/shakespeare-db/Henry VI, part 1.txt', 'data/shakespeare-db/Henry VI, part 2.txt', 'data/shakespeare-db/Henry VI, part 3.txt', 'data/shakespeare-db/Henry VIII.txt', 'data/shakespeare-db/Julius Caesar.txt', 'data/shakespeare-db/King John.txt', 'data/shakespeare-db/King Lear.txt', "data/shakespeare-db/Love's Labour's Lost.txt", 'data/shakespeare-db/Macbeth.txt', 'data/shakespeare-db/Measure for Measure.txt', 'data/shakespeare-db/Merchant of Venice.txt', 'data/shakespeare-db/Merry Wives of Windsor.txt', "data/shakespeare-db/Midsummer Night's Dream.txt", 'data/shakespeare-

In [27]:
# Load the first file using plain Python and print its first 10 lines.
# Each line keeps its trailing newline, so print() leaves a blank line after it.

with open(text_files[0], 'r') as fil:
    contents = [fil.readline() for _line_no in range(10)]

for line in contents:
    print(line)









                           All's Well That Ends Well

Shakespeare_homepage | All's_Well_That_Ends_Well | Entire play

**** ACT I ****

**** SCENE I. Rousillon. The COUNT's palace. ****

     Enter BERTRAM, the COUNTESS of Rousillon, HELENA, and LAFEU, all in

     black



In [28]:
# Load the lines from the files into a dataset using TextLineDataset;
# each element of the dataset is one line of text

shakespeare_dataset = tf.data.TextLineDataset(text_files)

In [30]:
# Use the take method to get and print the first 10 lines of the dataset.
# NOTE: the variable name says "5" but take(10) is what is actually requested;
# the name is kept as-is in case later cells refer to it.

first_5_lines_dataset = iter(shakespeare_dataset.take(10))
lines = list(first_5_lines_dataset)
for line in lines:
    print(line)

tf.Tensor(b'', shape=(), dtype=string)
tf.Tensor(b'', shape=(), dtype=string)
tf.Tensor(b'', shape=(), dtype=string)
tf.Tensor(b'', shape=(), dtype=string)
tf.Tensor(b"                           All's Well That Ends Well", shape=(), dtype=string)
tf.Tensor(b"Shakespeare_homepage | All's_Well_That_Ends_Well | Entire play", shape=(), dtype=string)
tf.Tensor(b'**** ACT I ****', shape=(), dtype=string)
tf.Tensor(b"**** SCENE I. Rousillon. The COUNT's palace. ****", shape=(), dtype=string)
tf.Tensor(b'     Enter BERTRAM, the COUNTESS of Rousillon, HELENA, and LAFEU, all in', shape=(), dtype=string)
tf.Tensor(b'     black', shape=(), dtype=string)


In [31]:
# Compute the number of lines in the first file by iterating over the file
# object, which yields one line at a time until end of file

lines = []
with open(text_files[0], 'r') as fil:
    lines.extend(fil)

print(len(lines))

4032


In [32]:
# Compute the number of lines in the shakespeare dataset we created by
# materialising every element into a list and taking its length

shakespeare_dataset_iterator = iter(shakespeare_dataset)
lines = list(shakespeare_dataset_iterator)
print(len(lines))

142180


### Interleave lines from the text data files

In [34]:
# Build a dataset whose elements are the text file path strings, then
# materialise it and print every path

text_files_dataset = tf.data.Dataset.from_tensor_slices(text_files)
files = list(text_files_dataset)
for file in files:
    print(file)

tf.Tensor(b"data/shakespeare-db/All's Well That Ends Well.txt", shape=(), dtype=string)
tf.Tensor(b'data/shakespeare-db/Antony and Cleopatra.txt', shape=(), dtype=string)
tf.Tensor(b'data/shakespeare-db/As You Like It.txt', shape=(), dtype=string)
tf.Tensor(b'data/shakespeare-db/Comedy of Errors.txt', shape=(), dtype=string)
tf.Tensor(b'data/shakespeare-db/Coriolanus.txt', shape=(), dtype=string)
tf.Tensor(b'data/shakespeare-db/Cymbeline.txt', shape=(), dtype=string)
tf.Tensor(b'data/shakespeare-db/Hamlet.txt', shape=(), dtype=string)
tf.Tensor(b'data/shakespeare-db/Henry IV, part 1.txt', shape=(), dtype=string)
tf.Tensor(b'data/shakespeare-db/Henry IV, part 2.txt', shape=(), dtype=string)
tf.Tensor(b'data/shakespeare-db/Henry V.txt', shape=(), dtype=string)
tf.Tensor(b'data/shakespeare-db/Henry VI, part 1.txt', shape=(), dtype=string)
tf.Tensor(b'data/shakespeare-db/Henry VI, part 2.txt', shape=(), dtype=string)
tf.Tensor(b'data/shakespeare-db/Henry VI, part 3.txt', shape=(), dtype=st

In [35]:
# Number of file paths held in the dataset
len(files)

36

In [36]:
# Interleave the lines from the text files.
# cycle_length is the number of input files that are read from concurrently.
# Deriving it from text_files (instead of hard-coding 36) keeps the cycle
# covering every file even if the contents of the data directory change.

interleaved_shakespeare_dataset = text_files_dataset.interleave(
    tf.data.TextLineDataset,
    cycle_length=len(text_files),
)

In [38]:
# Inspect the element spec of the interleaved dataset
print(interleaved_shakespeare_dataset.element_spec)

TensorSpec(shape=(), dtype=tf.string, name=None)


In [41]:
# Print the first 500 elements of the interleaved dataset.
# The lines come out cycling through the files: the 1st print is the 1st line
# of the 1st file, the 2nd print is the 1st line of the 2nd file, and so on.

lines = list(interleaved_shakespeare_dataset.take(500))
for line in lines:
    print(line)

tf.Tensor(b'', shape=(), dtype=string)
tf.Tensor(b'', shape=(), dtype=string)
tf.Tensor(b'', shape=(), dtype=string)
tf.Tensor(b'', shape=(), dtype=string)
tf.Tensor(b'', shape=(), dtype=string)
tf.Tensor(b'', shape=(), dtype=string)
tf.Tensor(b'', shape=(), dtype=string)
tf.Tensor(b'', shape=(), dtype=string)
tf.Tensor(b'', shape=(), dtype=string)
tf.Tensor(b'', shape=(), dtype=string)
tf.Tensor(b'', shape=(), dtype=string)
tf.Tensor(b'', shape=(), dtype=string)
tf.Tensor(b'', shape=(), dtype=string)
tf.Tensor(b'', shape=(), dtype=string)
tf.Tensor(b'', shape=(), dtype=string)
tf.Tensor(b'', shape=(), dtype=string)
tf.Tensor(b'', shape=(), dtype=string)
tf.Tensor(b'', shape=(), dtype=string)
tf.Tensor(b'', shape=(), dtype=string)
tf.Tensor(b'', shape=(), dtype=string)
tf.Tensor(b'', shape=(), dtype=string)
tf.Tensor(b'', shape=(), dtype=string)
tf.Tensor(b'', shape=(), dtype=string)
tf.Tensor(b'', shape=(), dtype=string)
tf.Tensor(b'', shape=(), dtype=string)
tf.Tensor(b'', shape=(), 