In [1]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Toy corpus: two lines that differ only in the capitalisation of "It"/"it"
# and in the word best/worst.
texts = ["It was the best of times,", "it was the worst of times,"]

# Learn the vocabulary from the corpus.
tokenizer = Tokenizer(num_words=50)
tokenizer.fit_on_texts(texts)

# Print both lookup directions (word -> id, then id -> word), one per line.
# In the recorded output the tokens are lower-cased, the trailing comma is
# gone, and the words that occur in both lines hold the lowest ids.
print(
    "word_index = {}".format(tokenizer.word_index),
    "index_word = {}".format(tokenizer.index_word),
    sep="\n",
)

word_index = {'it': 1, 'was': 2, 'the': 3, 'of': 4, 'times': 5, 'best': 6, 'worst': 7}
index_word = {1: 'it', 2: 'was', 3: 'the', 4: 'of', 5: 'times', 6: 'best', 7: 'worst'}


In [2]:
# Three-sentence corpus; the first two sentences differ by a single word.
texts = [
    "we had everything before us,",
    "we had nothing before us,",
    "we were all going direct to Heaven,",
]

# Fresh tokenizer fitted on the new corpus (replaces any earlier `tokenizer`).
tokenizer = Tokenizer(num_words=50)
tokenizer.fit_on_texts(texts)

# Each formatted string already ends in "\n", so joining them with a newline
# leaves a blank line after each mapping in the printed output.
print(
    "word_index = {}\n".format(tokenizer.word_index),
    "index_word = {}\n".format(tokenizer.index_word),
    sep="\n",
)

# Encode every sentence as a list of integer word ids.
seq = tokenizer.texts_to_sequences(texts)
print("seq = {}".format(seq))

word_index = {'we': 1, 'had': 2, 'before': 3, 'us': 4, 'everything': 5, 'nothing': 6, 'were': 7, 'all': 8, 'going': 9, 'direct': 10, 'to': 11, 'heaven': 12}

index_word = {1: 'we', 2: 'had', 3: 'before', 4: 'us', 5: 'everything', 6: 'nothing', 7: 'were', 8: 'all', 9: 'going', 10: 'direct', 11: 'to', 12: 'heaven'}

seq = [[1, 2, 5, 3, 4], [1, 2, 6, 3, 4], [1, 7, 8, 9, 10, 11, 12]]


In [3]:
# Encode a sentence that contains words missing from the fitted vocabulary
# ("the", "other", "way"). The tokenizer in use here was built without an
# oov_token, and the recorded output holds only five ids for the eight words:
# the unseen words are left out of the sequence.
unseen_sentence = "we were all going direct the other way"
tokenizer.texts_to_sequences([unseen_sentence])

[[1, 7, 8, 9, 10]]

In [4]:
# Same three-sentence corpus as before, restated so this cell runs on its own.
texts = [
    "we had everything before us,",
    "we had nothing before us,",
    "we were all going direct to Heaven,",
]

# This time reserve a placeholder token for out-of-vocabulary words.
# In the recorded output '<OOV>' takes id 1 and the real words start at 2.
tokenizer = Tokenizer(oov_token="<OOV>", num_words=50)
tokenizer.fit_on_texts(texts)

# Encode the training sentences with the new id assignment.
seq = tokenizer.texts_to_sequences(texts)

# Report the vocabulary first, then the encoded sentences.
print(
    "word_index = {}".format(tokenizer.word_index),
    "seq = {}".format(seq),
    sep="\n",
)

word_index = {'<OOV>': 1, 'we': 2, 'had': 3, 'before': 4, 'us': 5, 'everything': 6, 'nothing': 7, 'were': 8, 'all': 9, 'going': 10, 'direct': 11, 'to': 12, 'heaven': 13}
seq = [[2, 3, 6, 4, 5], [2, 3, 7, 4, 5], [2, 8, 9, 10, 11, 12, 13]]


In [5]:
# Encode a sentence containing words absent from the fitted vocabulary
# ("the", "other", "way") with the tokenizer that has oov_token="<OOV>".
# The recorded output keeps all eight positions and shows id 1 (the '<OOV>'
# entry of word_index) in place of each unseen word.
unseen_sentence = "we were all going direct the other way"
tokenizer.texts_to_sequences([unseen_sentence])

[[2, 8, 9, 10, 11, 1, 1, 1]]

In [6]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Re-encode the corpus, then pad with pad_sequences' default settings.
seq = tokenizer.texts_to_sequences(texts)
padded = pad_sequences(seq)

# With no extra arguments the recorded output shows the shorter rows filled
# with zeros on the left, up to the length of the longest row (7 ids).
print(
    "seq = {}".format(seq),
    "padded = \n{}".format(padded),
    sep="\n",
)

seq = [[2, 3, 6, 4, 5], [2, 3, 7, 4, 5], [2, 8, 9, 10, 11, 12, 13]]
padded = 
[[ 0  0  2  3  6  4  5]
 [ 0  0  2  3  7  4  5]
 [ 2  8  9 10 11 12 13]]


In [7]:
# Pad at the end of each row and cap every row at six ids.
# `truncating` is not specified here; in the recorded output the 7-id row
# loses its first id (2) to fit the cap.
padded = pad_sequences(
    seq,
    maxlen=6,
    padding='post',
)
print("padded = \n{}".format(padded))

padded = 
[[ 2  3  6  4  5  0]
 [ 2  3  7  4  5  0]
 [ 8  9 10 11 12 13]]


In [8]:
# Pad at the end, cap every row at six ids, and also cut over-long rows at
# the end: in the recorded output the 7-id row keeps its first six ids and
# loses the last one (13).
padded = pad_sequences(
    seq,
    maxlen=6,
    padding='post',
    truncating='post',
)
print("padded = \n{}".format(padded))

padded = 
[[ 2  3  6  4  5  0]
 [ 2  3  7  4  5  0]
 [ 2  8  9 10 11 12]]
