In [1]:
### LOAD text file INTO A tf.Dataset (TextLineDataset)
# (From a text file in which each example is a line of text)
# -> Usefull for any text data that is line-based

# Data: 3 different translations of Homer's Illiad
# 1st translation from William Cowper
# 2nd translation from Edward, Earl of Derby
# 3rd translation from Samuel Butler

# Goal: Identify the translator given a single line of text

In [2]:
from __future__ import absolute_import, division, print_function, unicode_literals

import tensorflow as tf

import tensorflow_datasets as tfds
import os

In [3]:
### Download the datasets

DIRECTORY_URL = 'https://storage.googleapis.com/download.tensorflow.org/data/illiad/'
FILE_NAMES = ['cowper.txt', 'derby.txt', 'butler.txt']

for name in FILE_NAMES:
    # Download the 3 files, keep track of the path to extract the parent path later on
    temp_path = tf.keras.utils.get_file(name, origin = DIRECTORY_URL + name)

parent_directory = os.path.dirname(temp_path)
print('Parent directory: {}'.format(parent_directory))

Parent directory: /Users/nicolas/.keras/datasets


In [4]:
### Load the text into datasets
# Load each file into its own dataset

# In each dataset, each example(line) needs to be labeled
# To do so, we'll use the Dataset.map function to apply a labeler function to each example.

def labeler(example, index):
    ''' Given an example, return a tuple (example, label)
    '''
    return example, tf.cast(index, tf.int64)

labeled_datasets = []

for index, file_name in enumerate(FILE_NAMES):
    text_dataset = tf.data.TextLineDataset(os.path.join(parent_directory, file_name)) # (Take the text file path as argurment)
    labeled_dataset = text_dataset.map(lambda example: labeler(example, index))
    labeled_datasets.append(labeled_dataset)
    
# .map vs .apply: The difference is that MAP will execute one function on every element of the Dataset separately,
# whereas APPLY will execute one function on the whole Dataset at once.
# The argument of APPLY is a function that takes a Dataset and returns a Dataset
# when the argument of MAP is a function that takes one element and returns one transformed element.

In [5]:
# Let's explore the labeled datasets
print('Number of labeled datasets: ', len(labeled_datasets))
print('-')
print('1st example of the 1st dataset: ', next(iter(labeled_datasets[1]))[0].numpy())
print('1st label of the 1st dataset: ', next(iter(labeled_datasets[1]))[1].numpy())

print('Number of example in the 1st dataset: ', len(list(labeled_datasets[0])))

Number of labeled datasets:  3
-
1st example of the 1st dataset:  b"\xef\xbb\xbfOf Peleus' son, Achilles, sing, O Muse,"
1st label of the 1st dataset:  1
Number of example in the 1st dataset:  19143


In [6]:
# Combine the 3 labeled dataset into a single one, and suffle it

BUFFER_SIZE = 50000
BATCH_SIZE = 64
TAKE_SIZE = 5000

In [7]:
# Concatenate the datasets with the 1st one
labeled_dataset = labeled_datasets[0]

for dataset in labeled_datasets[1:]:
    labeled_dataset = labeled_dataset.concatenate(dataset)
    
labeled_dataset = labeled_dataset.shuffle(BUFFER_SIZE, reshuffle_each_iteration = False)
# reshuffle_each_iteration -> indicates that the dataset should be reshuffled each time it is iterated over.

In [8]:
# Let's explore the labeled_dataset

for example in labeled_dataset.take(5):
    print(example)

(<tf.Tensor: id=38379, shape=(), dtype=string, numpy=b"Until it feel th' impelling blast from Heav'n;">, <tf.Tensor: id=38380, shape=(), dtype=int64, numpy=1>)
(<tf.Tensor: id=38381, shape=(), dtype=string, numpy=b"Kiss'd close at every nod, so wedged they stood;">, <tf.Tensor: id=38382, shape=(), dtype=int64, numpy=0>)
(<tf.Tensor: id=38383, shape=(), dtype=string, numpy=b'went spear in hand after Automedon, squire of the fleet descendant of'>, <tf.Tensor: id=38384, shape=(), dtype=int64, numpy=2>)
(<tf.Tensor: id=38385, shape=(), dtype=string, numpy=b"On Ilium's heights, with fat of choicest bulls">, <tf.Tensor: id=38386, shape=(), dtype=int64, numpy=1>)
(<tf.Tensor: id=38387, shape=(), dtype=string, numpy=b'And lustre adding to the arms of Troy.'>, <tf.Tensor: id=38388, shape=(), dtype=int64, numpy=1>)


In [9]:
### Encode text lines as numbers
# Input of NN model are numbers -> The string values need to be converted into lists of numbers
# To do so, we'll map each word to a unique integer (multi-hot encoding via dictionary)

## Create a vocabulary/dictionary
# Tokenize the text into colleciton of unique words

tokenizer = tfds.features.text.Tokenizer() # Splits a string into tokens, and joins them back.

vocabulary_set = set()
for example, _ in labeled_dataset:
    token = tokenizer.tokenize(example.numpy())
    vocabulary_set.update(token)
    
vocab_size = len(vocabulary_set)
print('Vocabulary size: {}'.format(vocab_size))

Vocabulary size: 17178


In [10]:
# Now that we have our dictionary, let's encode each example
# (String of text) -> (list of integer)

# Define the text encoder by passing the vocabulary list
encoder = tfds.features.text.TokenTextEncoder(vocabulary_set)

In [11]:
# Let's apply the encoder on a simple example as an illustration
example_text = next(iter(labeled_dataset))[0].numpy()
print('Original example: ', example_text)

encoded_example_text = encoder.encode(example_text)
print('Encoded example: ', encoded_example_text)

# See how to use word embedding rather than multi-hot encoding

Original example:  b"Until it feel th' impelling blast from Heav'n;"
Encoded example:  [897, 16183, 15796, 7670, 6811, 5331, 11721, 513, 4168]


In [19]:
# Let's encode our entire labeled_dataset using Dataset.map method

def encode(example, label):
    return encoder.encode(example.numpy()), label

def encode_map_function(text, label):
    # Wraps a python function into a TensorFlow op that executes it eagerly.
    
    # Operations inside `.map()` run in graph mode and receive a graph tensor that do not have a numpy attribute.
    # The `tokenizer` expects a string or Unicode symbol to encode it into integers.
    # Hence, you need to run the encoding inside a `tf.py_function`,
    # which receives an eager tensor having a numpy attribute that contains the string value.
    return tf.py_function(encode, inp = [text, label], Tout = (tf.int64, tf.int64))

encoded_dataset = labeled_dataset.map(encode_map_function)

In [20]:
# Now let's split the encoded_dataset into train/test batched

# The examples inside a batch need to be the same size and shape.
# It is not the case for the moment. We'll pad the examples so that they have the same length


# skip(count) -> skips count elements from this dataset.
train_set = encoded_dataset.skip(TAKE_SIZE).shuffle(BUFFER_SIZE)
# pad the examples and batch the dataset (Pad is performed BEFORE batch) 
# -1 value -> will be padded to the maximum size of that dimension in each batch.
train_set = train_set.padded_batch(BATCH_SIZE, padded_shapes = ([-1], []))

test_set = encoded_dataset.take(TAKE_SIZE)
test_set = test_set.padded_batch(BATCH_SIZE, padded_shapes = ([-1], []))

# Now train_set and test_set are collections of batches.
# Each batch is a pair of (BATCH_SIZE examples x BATCH_SIZE label) represented as arrays

In [24]:
# Let's explore the test_set

# sample_text is an array containing the examples in the 1st batch
# sample_label is an array containing the labels in the 1st batch
sample_text, sample_label = next(iter(test_set))

print('1st example in the test set: ', sample_text[0])
print('-')
print('1st label in the test set: ', sample_label[0])

1st example in the test set:  tf.Tensor(
[  897 16183 15796  7670  6811  5331 11721   513  4168     0     0     0
     0     0     0     0], shape=(16,), dtype=int64)
-
1st label in the test set:  tf.Tensor(1, shape=(), dtype=int64)


In [25]:
# Note that we have introduced a new 'word' or 'token encoding' for padding. Update the vocabulary size

vocab_size += 1

In [27]:
# The train_set and test_set are ready.
### Build a model

# EMBEDDING(64) -> BIDIRECTIONAL(LSTM, output size = 64) -> FC(64) -> RELU -> FC(64) -> RELU -> FC(3) -> SOFTMAX

model = tf.keras.Sequential()

In [28]:
# Convert the multi-hot representation to dense work embedding vectores

model.add(tf.keras.layers.Embedding(vocab_size, 64)) # (input dim = vocab_size, output dim = 64)

In [30]:
# Add a Bidirectional RRN layer with LSTM units

model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)))

In [31]:
# Add FC(64) layers

model.add(tf.keras.layers.Dense(64, activation = 'relu'))
model.add(tf.keras.layers.Dense(64, activation = 'relu'))

# Add the output layer

model.add(tf.keras.layers.Dense(3, activation = 'softmax'))


In [32]:
# Set the optimizer, loss and metrics

model.compile(optimizer = 'adam',
              loss = 'sparse_categorical_crossentropy',
              metrics = ['accuracy'])

In [33]:
# Train the model

model.fit(train_set, epochs = 3, validation_data = test_set)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x128179dd8>

In [34]:
# Evalute the model on the test_set (which is also the dev set)

test_loss, test_acc = model.evaluate(test_set)

print('TEST SET: ')
print('LOSS: {:.3f}'.format(test_loss))
print('ACCURACY: {:.3f}'.format(test_acc))

TEST SET: 
LOSS: 0.364
ACCURACY: 0.838
