# Import Libraries

In [1]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Create a Few Sentences

In [2]:
# Toy corpus: three short sentences of different lengths (3, 5 and 5 words),
# so the later cells have something to pad and truncate.
sentences = [
    "He is happy",
    "He is learning LSTM Network",
    "He passes his exam easily"
]

# Tokenization

In [3]:
# Cap on the vocabulary size handed to the tokenizer. This toy corpus has only
# 10 distinct words, so the cap is never reached here.
vocab_size = 20000 # 3000 words cover ~95% of English text, so vocab_size = 20000 is more than enough
tokenizer = Tokenizer(num_words=vocab_size)
# Build the word -> index vocabulary from the corpus.
tokenizer.fit_on_texts(sentences)
# Convert every sentence into a list of integer word indices.
sequences = tokenizer.texts_to_sequences(sentences)

In [4]:
# One list of integer word indices per sentence, in the same order as `sentences`.
print(sequences)

[[1, 2, 3], [1, 2, 4, 5, 6], [1, 7, 8, 9, 10]]


# Word to Index Mapping

In [5]:
# Lower-cased word -> integer index. Indices start at 1; 0 does not appear
# here and is the value used for padding in the cells below.
tokenizer.word_index

{'he': 1,
 'is': 2,
 'happy': 3,
 'learning': 4,
 'lstm': 5,
 'network': 6,
 'passes': 7,
 'his': 8,
 'exam': 9,
 'easily': 10}

# Pad the Sequences

In [6]:
# With no extra arguments every row is padded with 0s to the length of the
# longest sequence (5 here), and the 0s go at the front of the row.
data = pad_sequences(sequences) # use the defaults; by default padding is 'pre'
print(data)

[[ 0  0  1  2  3]
 [ 1  2  4  5  6]
 [ 1  7  8  9 10]]


In [7]:
# Explicit target length. 5 equals the length of the longest sequence, so the
# result is identical to calling pad_sequences with its defaults.
max_seq_len = 5
data = pad_sequences(sequences, maxlen = max_seq_len )
print(data)

[[ 0  0  1  2  3]
 [ 1  2  4  5  6]
 [ 1  7  8  9 10]]


In [8]:
# padding='post' appends the 0s after the tokens instead of prepending them.
data = pad_sequences(sequences, maxlen = max_seq_len , padding='post')
print(data)

[[ 1  2  3  0  0]
 [ 1  2  4  5  6]
 [ 1  7  8  9 10]]


In [9]:
# maxlen (6) is larger than the longest sequence (5), so every row, including
# the longest ones, gets at least one leading 0.
data = pad_sequences(sequences, maxlen = 6) # more padding
print(data)

[[ 0  0  0  1  2  3]
 [ 0  1  2  4  5  6]
 [ 0  1  7  8  9 10]]


# Data Truncation

In [10]:
# maxlen (4) is shorter than the longest sequence (5), so long rows are cut.
# By default the cut happens at the front: the first token of each 5-token
# sequence is dropped, while the 3-token sequence is still padded to 4.
data = pad_sequences(sequences, maxlen=4)
print(data)

[[ 0  1  2  3]
 [ 2  4  5  6]
 [ 7  8  9 10]]


In [11]:
# truncating='post' drops tokens from the end of over-long rows instead of
# the start. Padding is unaffected and still goes at the front.
data = pad_sequences(sequences, maxlen=4, truncating='post')
print(data)

[[0 1 2 3]
 [1 2 4 5]
 [1 7 8 9]]
