In [1]:
from tensorflow.keras.preprocessing.text import Tokenizer 

# Tiny training corpus: three sentences that share most of their words.
sentences = ['I love my dog', 'I love my cat', 'You love my dog!']

# The Tokenizer normalises text before splitting it into words: everything is
# lowercased and punctuation is stripped, so 'dog!' and 'dog' become one token.
# num_words limits encoding to the most frequent words; 100 is far above the
# vocabulary size of this corpus, so nothing is dropped here.
tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(sentences)

# word_index maps each word to its integer id (a plain dict of word -> id).
# In the printed result the most frequent words receive the smallest ids.
word_index = tokenizer.word_index
print(word_index)

{'love': 1, 'my': 2, 'i': 3, 'dog': 4, 'cat': 5, 'you': 6}


In [2]:
# Sentences to encode. The last one contains words that do not occur in
# `sentences`, the corpus the tokenizer is fitted on below.
sentences2 = [
    'I love my dog', 'I love my cat', 'You love my dog!',
    'Do you think my dog is amazing?',
]

# Deliberately fit on `sentences` (defined in an earlier cell), not on
# sentences2, so the vocabulary lacks the new words of the fourth sentence.
tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index
print(word_index)

# Turn every sentence of sentences2 into a list of word ids.
# Words that were not seen by fit_on_texts (here: do, think, is, amazing) are
# silently skipped, so the last sequence is shorter than its sentence.
sequences = tokenizer.texts_to_sequences(sentences2)
print(sequences)

{'love': 1, 'my': 2, 'i': 3, 'dog': 4, 'cat': 5, 'you': 6}
[[3, 1, 2, 4], [3, 1, 2, 5], [6, 1, 2, 4], [6, 2, 4]]


In [3]:
# Sentences to encode. The last one contains words that do not occur in
# `sentences`, the corpus the tokenizer is fitted on below.
sentences2 = [
    'I love my dog', 'I love my cat', 'You love my dog!',
    'Do you think my dog is amazing?',
]

# OOV = out of vocabulary. With oov_token set, the placeholder "<OOV>" gets its
# own entry in the word index (id 1 in the printed result) and every unseen
# word is encoded as that id instead of being dropped.
tokenizer = Tokenizer(num_words=100, oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index
print(word_index)

# Turn every sentence of sentences2 into a list of word ids; each sequence now
# keeps the same length as its sentence because unseen words map to "<OOV>".
sequences = tokenizer.texts_to_sequences(sentences2)
print(sequences)

{'<OOV>': 1, 'love': 2, 'my': 3, 'i': 4, 'dog': 5, 'cat': 6, 'you': 7}
[[4, 2, 3, 5], [4, 2, 3, 6], [7, 2, 3, 5], [1, 7, 1, 3, 5, 1, 1]]


In [6]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Corpus used both for fitting and for encoding in this cell.
sentences = [
    'I love my dog', 'I love my cat', 'You love my dog!',
    'Do you think my dog is amazing?',
]

# "<OOV>" (out of vocabulary) is still reserved as a placeholder id, but since
# the tokenizer is fitted on the very sentences it encodes, no word falls back
# to it in this cell.
tokenizer = Tokenizer(num_words=100, oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index
print(word_index)

# Encode every sentence as a list of word ids (lists of differing lengths).
sequences = tokenizer.texts_to_sequences(sentences)
print(sequences)

# Bring all sequences to the same length so they form a rectangular array:
#   maxlen=5          -> every row ends up with exactly 5 ids
#   padding="post"    -> shorter rows get zeros appended at the end
#   truncating="pre"  -> longer rows lose ids from the front (the default,
#                        spelled out because it differs from the padding side)
padded = pad_sequences(sequences, maxlen=5, padding="post", truncating="pre")
print(padded)


{'<OOV>': 1, 'my': 2, 'love': 3, 'dog': 4, 'i': 5, 'you': 6, 'cat': 7, 'do': 8, 'think': 9, 'is': 10, 'amazing': 11}
[[5, 3, 2, 4], [5, 3, 2, 7], [6, 3, 2, 4], [8, 6, 9, 2, 4, 10, 11]]
[[ 5  3  2  4  0]
 [ 5  3  2  7  0]
 [ 6  3  2  4  0]
 [ 9  2  4 10 11]]
