In [1]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [6]:
# Tokenizer with the vocabulary size capped via num_words=100 and
# "<OOV>" reserved as the placeholder for out-of-vocabulary words
tokenizer = Tokenizer(num_words=100, oov_token="<OOV>")

# Small corpus with mixed casing and punctuation
sentences = [
    'i love my dog',
    'I, love my cat',
    'You love my dog!',
    'Do you think my dog is amazing?',
]

# Learn the vocabulary from the corpus
tokenizer.fit_on_texts(sentences)

# Word -> integer index mapping built while fitting
word_index = tokenizer.word_index
print(word_index)

{'<OOV>': 1, 'my': 2, 'love': 3, 'dog': 4, 'i': 5, 'you': 6, 'cat': 7, 'do': 8, 'think': 9, 'is': 10, 'amazing': 11}


In [7]:
# Replace every word of each sentence by its index from the fitted vocabulary,
# giving one integer list per sentence
sequences = tokenizer.texts_to_sequences(sentences)
print(sequences)

[[5, 3, 2, 4], [5, 3, 2, 7], [6, 3, 2, 4], [8, 6, 9, 2, 4, 10, 11]]


### Padding
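Uniform input size is an important requirement when training neural networks on sequences of text. The encoded sentences above have different lengths, so shorter sequences are padded with zeros until they all have the same length.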

In [8]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [9]:
# Zero-pad every sequence at the front ('pre' is the default side)
# so that all rows match the length of the longest sequence
padded = pad_sequences(sequences, padding='pre')
print(padded)

[[ 0  0  0  5  3  2  4]
 [ 0  0  0  5  3  2  7]
 [ 0  0  0  6  3  2  4]
 [ 8  6  9  2  4 10 11]]


In [10]:
# Same padding as before, but the zeros are appended after the tokens
# instead of being placed in front of them
post_padding = pad_sequences(
    sequences,
    padding="post",
)
print(post_padding)

[[ 5  3  2  4  0  0  0]
 [ 5  3  2  7  0  0  0]
 [ 6  3  2  4  0  0  0]
 [ 8  6  9  2  4 10 11]]


In [11]:
# Force every sequence to exactly 5 tokens:
#   - shorter sequences get zeros appended at the end (padding='post')
#   - longer sequences lose tokens from the front (truncating='pre', the default)
post_padding_max_len = pad_sequences(
    sequences,
    maxlen=5,
    padding='post',
    truncating='pre',
)
print(post_padding_max_len)

[[ 5  3  2  4  0]
 [ 5  3  2  7  0]
 [ 6  3  2  4  0]
 [ 9  2  4 10 11]]
