# Method to load text data into a TF Dataset

Similar to how pandas `read_csv()` loads text data into a DataFrame-like structure, `tf.data.TextLineDataset` loads lines of text into a TF Dataset.


# TextLineDataset

* [tf.data.TextLineDataset](https://www.tensorflow.org/api_docs/python/tf/data/TextLineDataset)

It is not limited to a single file; it can also read from multiple files.

> A Dataset comprising lines from one or more text file**s**.

In [2]:
import sys
import pathlib
import itertools
import numpy as np
import tensorflow as tf

# Print NumPy arrays in full (no "..." summarisation) and allow 200-character rows.
np.set_printoptions(threshold=sys.maxsize, linewidth=200)

## PTB text

In [3]:
# Fetch the Penn Treebank training split; `path_to_ptb` is the local file path
# returned by get_file().
f = "ptb.train.txt"
ptb_url = f"https://raw.githubusercontent.com/tomsercu/lstm/master/data/{f}"
path_to_ptb = tf.keras.utils.get_file(fname=f, origin=ptb_url)

## Load text into dataset

In [5]:
# Read the PTB file line by line, drop empty lines, then shuffle.
#
# `num_parallel_reads` expects an integer number of files to read in parallel
# (or None for sequential reads), not a boolean. Only a single file is read
# here, so sequential reading (None) is the correct setting.
ptb_ds = (
    tf.data.TextLineDataset(
        filenames=path_to_ptb,
        compression_type=None,
        buffer_size=None,
        num_parallel_reads=None,
    )
    # Keep only non-empty lines (string length 0 casts to False).
    .filter(lambda x: tf.cast(tf.strings.length(x), bool))
    # Shuffle using a buffer of 10,000 lines.
    .shuffle(10000)
)

In [6]:
# Pull a single element out of the dataset and show it as a Python bytes value.
iterator = iter(ptb_ds)
first_tensor = next(iterator)
first = first_tensor.numpy()
print(first)

b" in fact he <unk> the u.s. from one of the world 's most corrupt organizations unesco "


---
# Example

Generate a set of word indices from PTB, then generate word index sequences from shakespeare.txt.

## Generate the word indices from PTB

In [7]:
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

In [8]:
# Fit the vectorizer to the words in the PTB corpus.
#   output_mode="int" : each token is emitted as an integer index
#   ngrams=None       : no n-grams are generated, i.e. 1 word = 1 token
vectorizer = TextVectorization(output_mode="int", ngrams=None)
vectorizer.adapt(ptb_ds)

## Shakespeare text

In [9]:
# Fetch the Shakespeare corpus; `path_to_shakespeare` is the local file path
# returned by get_file().
path_to_shakespeare = tf.keras.utils.get_file(
    fname='shakespeare.txt',
    origin='https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt',
)

In [50]:
# One dataset element per line of the Shakespeare file, with empty lines removed.
shakespeare_text_ds = (
    tf.data.TextLineDataset(path_to_shakespeare)
    .filter(lambda text_line: tf.strings.length(text_line) > 0)
)

In [51]:
# Preview the first ten non-empty lines as raw bytes.
shakespeare_iterator = shakespeare_text_ds.as_numpy_iterator()
first_ten_lines = itertools.islice(shakespeare_iterator, 10)
for line in first_ten_lines:
    print(line)

b'First Citizen:'
b'Before we proceed any further, hear me speak.'
b'All:'
b'Speak, speak.'
b'First Citizen:'
b'You are all resolved rather to die than to famish?'
b'All:'
b'Resolved. resolved.'
b'First Citizen:'
b'First, you know Caius Marcius is chief enemy to the people.'


## Convert Shakespeare dataset into word index sequence

In [52]:
# Vectorize the Shakespeare lines in batches of 1024 (the vectorizer is applied
# to a whole batch at a time), then split the batches back into one element per
# line. `prefetch` is placed last so that the whole pipeline, including the
# vectorization step, can run ahead of the consumer.
shakespeare_vector_ds = (
    shakespeare_text_ds
    .batch(1024)
    .map(vectorizer)
    .unbatch()
    .prefetch(tf.data.AUTOTUNE)
)

In [77]:
# Take the first three vectorized lines in a single pass over the dataset.
# Iterating the dataset again restarts the pipeline from the beginning, so
# fetching each line with its own islice() would redo the same work three times.
first_line, second_line, third_line = itertools.islice(shakespeare_vector_ds, 3)
print(f"word index sequence of the corpus:\n{first_line.numpy()}\n")

word index sequence of the corpus:
[  75 6613    0    0    0    0    0    0    0    0    0    0]



In [86]:
# Map the word indices back to words to check the vectorization.
vocabulary = vectorizer.get_vocabulary()
for index_sequence in (first_line, second_line, third_line):
    # Shape the indices as (1, sequence_length, 1) so gather_nd looks up one
    # vocabulary entry per index and returns a (1, sequence_length) result.
    lookup_indices = index_sequence.numpy()[np.newaxis, :, np.newaxis]
    print(tf.gather_nd(vocabulary, indices=lookup_indices).numpy())

[[b'first' b'citizen' b'' b'' b'' b'' b'' b'' b'' b'' b'' b'']]
[[b'before' b'we' b'proceed' b'any' b'further' b'hear' b'me' b'speak' b'' b'' b'' b'']]
[[b'all' b'' b'' b'' b'' b'' b'' b'' b'' b'' b'' b'']]
