In [1]:
import tensorflow as tf
import numpy as np

In [5]:
# Toy corpus used to fit and exercise the text vectorizer: four short
# sentences plus one longer one, so the examples differ in token count.
sentences = [
    "I love my dog",
    "I love my cat",
    "You love my dog!",
    "Do you think my dog is amazing?",
    "I love to drink coffee and read books in the morning",
]

In [6]:
# Create a TextVectorization layer with its default settings, then fit its
# vocabulary on the example corpus.
vectorize_layer = tf.keras.layers.TextVectorization()
vectorize_layer.adapt(data=sentences)

In [7]:
# Fetch the vocabulary learned by adapt() and list it as "<index>: <token>",
# one entry per line.
vocabolary = vectorize_layer.get_vocabulary()

print(*(f"{position}: {token}" for position, token in enumerate(vocabolary)), sep="\n")

0: 
1: [UNK]
2: my
3: love
4: i
5: dog
6: you
7: to
8: think
9: the
10: read
11: morning
12: is
13: in
14: drink
15: do
16: coffee
17: cat
18: books
19: and
20: amazing


In [8]:
# Encode a single sentence. It is wrapped in a list so the layer receives a
# batch containing one string.
sample_input = "I love to drink coffee and read books in the morning"
sequence = vectorize_layer([sample_input])

print(sequence)

tf.Tensor([[ 4  3  7 14 16 19 10 18 13  9 11]], shape=(1, 11), dtype=int64)


In [9]:
# Encode the whole corpus in one call. The layer returns a dense tensor, so
# shorter sentences end up with trailing zeros (post-padding).
sequence_post = vectorize_layer(sentences)
print(sequence_post)


tf.Tensor(
[[ 4  3  2  5  0  0  0  0  0  0  0]
 [ 4  3  2 17  0  0  0  0  0  0  0]
 [ 6  3  2  5  0  0  0  0  0  0  0]
 [15  6  8  2  5 12 20  0  0  0  0]
 [ 4  3  7 14 16 19 10 18 13  9 11]], shape=(5, 11), dtype=int64)


In [15]:
# Vectorize sentence by sentence through a tf.data pipeline so that each
# element is encoded on its own rather than as part of a padded batch.
sequence_dataset = tf.data.Dataset.from_tensor_slices(sentences)
sequences = sequence_dataset.map(vectorize_layer)

# pad_sequences is documented to take a list of sequences. Materialize the
# dataset into a list of NumPy arrays once instead of passing the Dataset
# object itself, which only works through duck typing.
# NOTE(review): the two "End of sequence" log lines emitted by the previous
# version suggest the Dataset was being traversed more than once - confirm.
sequence_pre = tf.keras.utils.pad_sequences(
    list(sequences.as_numpy_iterator()),
    padding='pre',
)

sequence_pre

2026-02-03 08:28:06.493640: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2026-02-03 08:28:06.503761: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


array([[ 0,  0,  0,  0,  0,  0,  0,  4,  3,  2,  5],
       [ 0,  0,  0,  0,  0,  0,  0,  4,  3,  2, 17],
       [ 0,  0,  0,  0,  0,  0,  0,  6,  3,  2,  5],
       [ 0,  0,  0,  0, 15,  6,  8,  2,  5, 12, 20],
       [ 4,  3,  7, 14, 16, 19, 10, 18, 13,  9, 11]], dtype=int32)

In [16]:
# Second vectorizer configured with ragged=True, fitted on the same corpus.
ragged_vectorize = tf.keras.layers.TextVectorization(ragged=True)
ragged_vectorize.adapt(data=sentences)

# With ragged output every row keeps its own length; no padding is added.
ragged_sequences = ragged_vectorize(sentences)
ragged_sequences

<tf.RaggedTensor [[4, 3, 2, 5], [4, 3, 2, 17], [6, 3, 2, 5], [15, 6, 8, 2, 5, 12, 20],
 [4, 3, 7, 14, 16, 19, 10, 18, 13, 9, 11]]>

In [19]:
# Convert the ragged batch to NumPy first, then left-pad every row with
# zeros so all rows share the same length.
ragged_rows = ragged_sequences.numpy()
sequences_pre = tf.keras.utils.pad_sequences(ragged_rows, padding='pre')

sequences_pre

array([[ 0,  0,  0,  0,  0,  0,  0,  4,  3,  2,  5],
       [ 0,  0,  0,  0,  0,  0,  0,  4,  3,  2, 17],
       [ 0,  0,  0,  0,  0,  0,  0,  6,  3,  2,  5],
       [ 0,  0,  0,  0, 15,  6,  8,  2,  5, 12, 20],
       [ 4,  3,  7, 14, 16, 19, 10, 18, 13,  9, 11]], dtype=int32)