# 11.2 Preparing Text Data
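- Keras provides the `TextVectorization` layer for turning raw text into integer sequences.
- Vectorization happens in 3 steps:
    - Standardization: normalize the text (e.g. lowercase it and strip punctuation).
    - Splitting (tokenization): break the text into tokens (e.g. on whitespace).
    - Indexing: map each token to an integer index in the vocabulary.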

In [1]:
import re
import string
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization

In [2]:
def custom_standardization_fn(string_tensor):
    """Standardize text by lowercasing it and deleting punctuation.

    Args:
        string_tensor: the string tensor to normalize.

    Returns:
        The lowercased input with every character listed in
        ``string.punctuation`` removed.
    """
    # Regex character class matching any single punctuation character;
    # re.escape keeps characters such as "]" or "\" from breaking the class.
    punctuation_class = "[" + re.escape(string.punctuation) + "]"

    lowered = tf.strings.lower(string_tensor)
    return tf.strings.regex_replace(lowered, punctuation_class, "")

def custom_split_fn(string_tensor):
    """Tokenize text by splitting it on whitespace.

    Args:
        string_tensor: the string tensor to split.

    Returns:
        The result of ``tf.strings.split`` called with its default separator.
    """
    tokens = tf.strings.split(string_tensor)
    return tokens

In [3]:
# Build the vectorization layer: it standardizes and tokenizes text with the
# custom callables defined above and emits integer token indices.
text_vectorization = TextVectorization(
    standardize=custom_standardization_fn,
    split=custom_split_fn,
    output_mode="int",
)

2023-02-22 03:38:07.620965: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/cuda/lib:/usr/local/lib/x86_64-linux-gnu:/usr/local/nvidia/lib:/usr/local/nvidia/lib64::/opt/conda/lib
2023-02-22 03:38:07.621043: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)


In [4]:
# Tiny three-sentence corpus from which the vocabulary is built.
dataset = [
    "I write, erase, rewrite",
    "Erase again, and then",
    "A poppy blooms.",
]

In [5]:
# Adapt the layer to the dataset: this builds the vocabulary
# (the token -> integer index mapping) from the tokens found in `dataset`.
text_vectorization.adapt(dataset)

In [6]:
# Display the vocabulary learned by adapt(); each token's position in the
# list is the integer index it is encoded as.
text_vectorization.get_vocabulary()

['',
 '[UNK]',
 'erase',
 'write',
 'then',
 'rewrite',
 'poppy',
 'i',
 'blooms',
 'and',
 'again',
 'a']

In [7]:
# let's go thru example of encode and decode a sentence
vocabulary = text_vectorization.get_vocabulary()
test_sentence = "I write, rewrite, and still rewrite again"
encoded_sentence = text_vectorization(test_sentence)
print(encoded_sentence)

tf.Tensor([ 7  3  5  9  1  5 10], shape=(7,), dtype=int64)


In [8]:
# Decode: look up the token stored at each integer index and rebuild the text.
inverse_vocab = {index: token for index, token in enumerate(vocabulary)}
decoded_sentence = " ".join(
    inverse_vocab[int(token_id)] for token_id in encoded_sentence
)
print(decoded_sentence)

i write rewrite and [UNK] rewrite again
