In [1]:
import os

import tensorflow as tf
import tensorflow_datasets as tfds

In [2]:
# Turn on memory growth for every visible GPU.
# Looping (instead of indexing [0]) keeps this cell working on CPU-only
# machines, where the device list is empty, and applies the same setting to
# all GPUs when more than one is present.
physical_devices = tf.config.list_physical_devices('GPU')
for gpu_device in physical_devices:
  tf.config.experimental.set_memory_growth(gpu_device, enable = True)

In [3]:
# Remote location and names of the three text files used in this notebook.
DIRECTORY_URL = 'https://storage.googleapis.com/download.tensorflow.org/data/illiad/'
FILE_NAMES = ['cowper.txt', 'derby.txt', 'butler.txt']

# Fetch every file through tf.keras.utils.get_file, collecting the returned paths.
downloaded_paths = [
    tf.keras.utils.get_file(remote_name, origin = DIRECTORY_URL + remote_name)
    for remote_name in FILE_NAMES
]

# The directory is derived from the path returned for the last file.
text_dir = downloaded_paths[-1]
parent_dir = os.path.dirname(text_dir)

parent_dir

'/home/tanmay/.keras/datasets'

Iterate through the files, loading each one into its own dataset. <br/>

Each example needs to be individually labeled, so use tf.data.Dataset.map to apply a labeler function to each one. <br/>
This will iterate over every example in the dataset, returning (example, label) pairs.

In [4]:
def labeler(example, index):
  """Return `(example, label)` where the label is `index` cast to tf.int64."""
  label = tf.cast(index, tf.int64)
  return example, label

def _labeled_lines(path, label_index):
  """Read `path` line by line and tag every line with `label_index` via `labeler`."""
  text_lines = tf.data.TextLineDataset(path)
  return text_lines.map(lambda line: labeler(line, label_index))

# One labeled dataset per file; the label is the file's position in FILE_NAMES.
labeled_data_sets = [
    _labeled_lines(os.path.join(parent_dir, file_name), file_index)
    for file_index, file_name in enumerate(FILE_NAMES)
]

In [5]:
# Params

# Buffer size passed to Dataset.shuffle.
BUFFER_SIZE = 50000
# Number of examples per padded batch.
BATCH_SIZE = 64
# Number of examples taken for the test split; the examples skipped past it form the training split.
TAKE_SIZE = 5000

In [6]:
# Chain the per-file datasets into one, starting from the first and
# appending the remaining ones in order.
all_labeled_data = labeled_data_sets[0]
for position in range(1, len(labeled_data_sets)):
  all_labeled_data = all_labeled_data.concatenate(labeled_data_sets[position])

# Shuffle with reshuffle_each_iteration disabled, so the order is not
# re-drawn on each pass over the dataset.
all_labeled_data = all_labeled_data.shuffle(
    buffer_size = BUFFER_SIZE,
    reshuffle_each_iteration = False,
)

In [7]:
# Peek at the first five (text, label) pairs.
for labeled_example in all_labeled_data.take(5):
  print(labeled_example)

(<tf.Tensor: shape=(), dtype=string, numpy=b"With haughty stride; a pond'rous boarding-pike,">, <tf.Tensor: shape=(), dtype=int64, numpy=1>)
(<tf.Tensor: shape=(), dtype=string, numpy=b"Approach'd the shore, to Ilium back he came;">, <tf.Tensor: shape=(), dtype=int64, numpy=1>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'On this his mind became clouded; his limbs failed him, and he stood as'>, <tf.Tensor: shape=(), dtype=int64, numpy=2>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'the others be, but took the cup offered her by lovely Themis, who was'>, <tf.Tensor: shape=(), dtype=int64, numpy=2>)
(<tf.Tensor: shape=(), dtype=string, numpy=b"The other, brave Menestheus' trusted friend.">, <tf.Tensor: shape=(), dtype=int64, numpy=1>)


In [8]:
# Tokenize

tokenizer = tfds.features.text.Tokenizer()

# Collect the unique tokens across every line of the labeled dataset.
vocabulary_set = set()
for labeled_example in all_labeled_data:
  line_bytes = labeled_example[0].numpy()
  vocabulary_set.update(tokenizer.tokenize(line_bytes))

vocab_size = len(vocabulary_set)
vocab_size

17178

In [9]:
# Encode examples
# Create an encoder by passing the vocabulary_set to tfds.features.text.TokenTextEncoder. 
# The encoder's encode method takes in a string of text and returns a list of integers.

In [10]:
# Text-to-integer encoder built from the collected vocabulary.
encoder = tfds.features.text.TokenTextEncoder(vocab_list = vocabulary_set)

In [11]:
# Pull the text of the first example out of the dataset as raw bytes.
first_example = next(iter(all_labeled_data))
example_text = first_example[0].numpy()
print(example_text)

b"With haughty stride; a pond'rous boarding-pike,"


In [12]:
# Encode the sample line into a list of integer token ids and show it.
encoded_example = encoder.encode(example_text)
print(encoded_example)

[7365, 1397, 8725, 5314, 1374, 17084, 10005, 12438]


In [13]:
def encode(text_tensor, label):
  """Encode an eager text tensor into token ids with `encoder`.

  `text_tensor` must expose `.numpy()`; `label` is passed through unchanged.
  Returns an `(encoded_text, label)` pair.
  """
  raw_text = text_tensor.numpy()
  return encoder.encode(raw_text), label

You want to use Dataset.map to apply this function to each element of the dataset. <br/>
Dataset.map runs in graph mode.<br/>
Graph tensors do not have a value.<br/>
In graph mode you can only use TensorFlow Ops and functions.<br/>
So you can't .map this function directly.<br/>
You need to wrap it in a `tf.py_function`, which passes regular tensors (each with a value and a `.numpy()` method to access it) to the wrapped Python function.

In [14]:
def encode_map_fn(text, label):
  """Graph-mode wrapper that runs `encode` through tf.py_function.

  Returns `(encoded_text, label)` as int64 tensors with their static shapes
  set explicitly.
  """
  # py_function does not set the shape of the tensors it returns.
  outputs = tf.py_function(
      func = encode,
      inp = [text, label],
      Tout = (tf.int64, tf.int64),
  )
  encoded_text, label = outputs

  # tf.data works best when every component has a shape, so set them by
  # hand: a scalar label and a 1-D sequence of unknown length.
  label.set_shape([])
  encoded_text.set_shape([None])

  return encoded_text, label


# Apply the py_function-backed encoder to every (text, label) pair.
all_encoded_data = all_labeled_data.map(encode_map_fn)

In [15]:
# Both splits are batched with the same padding spec: variable-length
# token sequences and scalar labels.
pad_shapes = ([None], [])

# Training split: everything after the first TAKE_SIZE examples, shuffled.
train_data = (
    all_encoded_data
    .skip(TAKE_SIZE)
    .shuffle(BUFFER_SIZE)
    .padded_batch(BATCH_SIZE, padded_shapes = pad_shapes)
)

# Test split: the first TAKE_SIZE examples.
test_data = (
    all_encoded_data
    .take(TAKE_SIZE)
    .padded_batch(BATCH_SIZE, padded_shapes = pad_shapes)
)

In [16]:
# Grab the first test batch and display its first (text, label) pair.
first_batch = next(iter(test_data))
sample_text, sample_labels = first_batch

sample_text[0], sample_labels[0]

(<tf.Tensor: shape=(15,), dtype=int64, numpy=
 array([ 7365,  1397,  8725,  5314,  1374, 17084, 10005, 12438,     0,
            0,     0,     0,     0,     0,     0])>,
 <tf.Tensor: shape=(), dtype=int64, numpy=1>)

In [17]:
# Padding introduced a new token encoding (the zero used for padding), so the
# vocabulary size is one larger than the number of distinct tokens.
# The value is derived from vocabulary_set instead of incremented in place,
# so re-running this cell does not keep growing vocab_size.

vocab_size = len(vocabulary_set) + 1

## Build Model

In [18]:
# Layers in order: embedding -> bidirectional LSTM -> two ReLU dense layers
# -> a 3-unit output layer with no activation.
layer_stack = [
    tf.keras.layers.Embedding(vocab_size, 64),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
]
layer_stack += [
    tf.keras.layers.Dense(units, activation='relu') for units in [64, 64]
]
layer_stack.append(tf.keras.layers.Dense(3))

model = tf.keras.Sequential(layer_stack)

In [19]:
# The loss is configured with from_logits=True, so it is fed the model's raw outputs.
logits_loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits = True)

model.compile(
    optimizer = 'adam',
    loss = logits_loss,
    metrics = ['accuracy'],
)

In [20]:
# Train for three epochs, validating against the test split after each one.
model.fit(
    train_data,
    validation_data = test_data,
    epochs = 3,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f2db056df10>

In [21]:
# Evaluate on the test split and report loss and accuracy to three decimals.
eval_results = model.evaluate(test_data)
eval_loss, eval_acc = eval_results

report_template = '\nEval loss: {:.3f}, Eval accuracy: {:.3f}'
print(report_template.format(eval_loss, eval_acc))

     79/Unknown - 3s 37ms/step - loss: 0.4332 - accuracy: 0.8338
Eval loss: 0.433, Eval accuracy: 0.834
