In [1]:
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# Toy corpus used to fit the tokenizer below.
sentences = [
    "I watch anime",
    "Deep Learning is part of AI",
    "OnePiece is my favourite Anime",
    "Generative AI is new trend in the market",
]

In [14]:
# num_words caps how many words are used when text is encoded (only the
# num_words - 1 most frequent ones are kept). It does not shrink word_index,
# which still lists every word seen during fitting.
# oov_token is the placeholder for out-of-vocabulary words. Without it, an
# unknown word is not encoded at all and is simply dropped from the sequence.
tokenizer = Tokenizer(
    num_words=50,
    oov_token="<UNK>",
)

In [15]:
# Build the tokenizer's vocabulary (word counts and word index) from the corpus.
tokenizer.fit_on_texts(sentences)

In [16]:
# word_index maps each word (lower-cased by the tokenizer) to an integer id.
# Ids follow word frequency, most frequent first; id 1 is taken by the OOV token.
index = tokenizer.word_index

In [17]:
# Display the learned word -> id mapping.
index

{'<UNK>': 1,
 'is': 2,
 'anime': 3,
 'ai': 4,
 'i': 5,
 'watch': 6,
 'deep': 7,
 'learning': 8,
 'part': 9,
 'of': 10,
 'onepiece': 11,
 'my': 12,
 'favourite': 13,
 'generative': 14,
 'new': 15,
 'trend': 16,
 'in': 17,
 'the': 18,
 'market': 19}

In [18]:
# Encode every training sentence as a list of word ids taken from the word index.
seq = tokenizer.texts_to_sequences(sentences)
print(seq)

[[5, 6, 3], [7, 8, 2, 9, 10, 4], [11, 2, 12, 13, 3], [14, 4, 2, 15, 16, 17, 18, 19]]


In [19]:
# An unseen sentence mixing known words with words outside the fitted vocabulary.
new = [
    "Generative AI and Deep Learning is way to upskill in AI",
]

In [20]:
# Encode the unseen sentence; words missing from the vocabulary become the OOV id (1).
find_new_seq = tokenizer.texts_to_sequences(new)

In [21]:
# "and", "way", "to" and "upskill" were not in the fitted corpus, so each prints as 1.
print(find_new_seq)

[[14, 4, 1, 7, 8, 2, 1, 1, 1, 17, 4]]


In [28]:
# Pad with trailing zeros to a fixed length of 30. truncating="post" would drop
# ids from the end of any sequence longer than that (none is, here).
padded = pad_sequences(
    find_new_seq,
    maxlen=30,
    padding="post",
    truncating="post",
)

In [29]:
# Show the padded array (zeros fill the tail up to length 30).
padded

array([[14,  4,  1,  7,  8,  2,  1,  1,  1, 17,  4,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]],
      dtype=int32)

In [31]:
# Two sentences made up mostly of words the tokenizer never saw while fitting.
test_data = [
    "i really love my dog",
    "my dog loves my manatee",
]

# Encode them; every unknown word is replaced by the OOV id.
test_seq = tokenizer.texts_to_sequences(test_data)

# Show the vocabulary next to the encoded sentences for comparison.
print("\nWord Index = ", index)
print("\nTest Sequence = ", test_seq)

# Pad to length 20. No `padding` argument is passed here, so the zeros end up
# in front of each sequence (unlike the padding="post" call earlier).
padded = pad_sequences(test_seq, maxlen=20, truncating="post")
print("\nPadded Test Sequence: ", padded, sep="\n")


Word Index =  {'<UNK>': 1, 'is': 2, 'anime': 3, 'ai': 4, 'i': 5, 'watch': 6, 'deep': 7, 'learning': 8, 'part': 9, 'of': 10, 'onepiece': 11, 'my': 12, 'favourite': 13, 'generative': 14, 'new': 15, 'trend': 16, 'in': 17, 'the': 18, 'market': 19}

Test Sequence =  [[5, 1, 1, 12, 1], [12, 1, 1, 12, 1]]

Padded Test Sequence: 
[[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  5  1  1 12  1]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 12  1  1 12  1]]
