In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer

In [2]:
# Minimal two-sentence corpus used to build the first vocabulary.
sentences = ["I Love my dog", "I Love my cat"]

In [3]:
sentences

['I Love my dog', 'I Love my cat']

In [4]:
# Create the tokenizer. num_words=100 is well above the five distinct words
# this corpus produces (see the word_index output further down).
tokenizer = Tokenizer(num_words=100)

In [5]:
# Build the tokenizer's vocabulary from the corpus.
tokenizer.fit_on_texts(sentences)

In [6]:
# Grab the word -> integer index mapping produced by fitting.
word_index = tokenizer.word_index

In [7]:
word_index

{'i': 1, 'love': 2, 'my': 3, 'dog': 4, 'cat': 5}

# Sequences

In [8]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [9]:
# Larger corpus with sentences of different lengths (4, 4, 6 and 7 words).
sentences = [
    "I Love my dog",
    "I Love my cat",
    "You Love my dog very much",
    "Do you think my dog is awesome",
]

In [10]:
# Fresh tokenizer for the four-sentence corpus; no OOV token is configured here.
tokenizer = Tokenizer(num_words=100)

In [11]:
# Build the vocabulary from the four-sentence corpus.
tokenizer.fit_on_texts(sentences)

In [12]:
# Word -> index mapping; the keys come out lowercased (see the output below).
word_index = tokenizer.word_index

In [13]:
word_index

{'my': 1,
 'love': 2,
 'dog': 3,
 'i': 4,
 'you': 5,
 'cat': 6,
 'very': 7,
 'much': 8,
 'do': 9,
 'think': 10,
 'is': 11,
 'awesome': 12}

In [14]:
# Encode every sentence as a list of the integer indices found in word_index.
sequences = tokenizer.texts_to_sequences(sentences)

In [15]:
sentences

['I Love my dog',
 'I Love my cat',
 'You Love my dog very much',
 'Do you think my dog is awesome']

In [16]:
# Show the encoded sentences as the cell's result (bare expression rather than
# print), matching how the other sequence cells in this notebook display them.
sequences

[[4, 2, 1, 3], [4, 2, 1, 6], [5, 2, 1, 3, 7, 8], [9, 5, 10, 1, 3, 11, 12]]


In [17]:
# Held-out sentences; "really" and "brother" do not occur in the training corpus.
test_data = ["i really love my dog", "my dog love my brother"]

In [18]:
test_data

['i really love my dog', 'my dog love my brother']

In [19]:
# Encode the held-out sentences with the already-fitted tokenizer. As the
# output below shows, words missing from the vocabulary ("really", "brother")
# are simply left out, so each 5-word sentence yields only 4 indices.
test_seq = tokenizer.texts_to_sequences(test_data)

In [20]:
test_seq

[[4, 2, 1, 3], [1, 3, 2, 1]]

# Out of Vocabulary

In [21]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [22]:
# Same four-sentence corpus, redefined so this section starts from its own data.
sentences = [
    "I Love my dog",
    "I Love my cat",
    "You Love my dog very much",
    "Do you think my dog is awesome",
]

In [23]:
sentences

['I Love my dog',
 'I Love my cat',
 'You Love my dog very much',
 'Do you think my dog is awesome']

In [24]:
# Tokenizer with an explicit out-of-vocabulary token; "<oov>" ends up at
# index 1 of word_index (see the output further down).
tokenizer = Tokenizer(
    num_words=100,
    oov_token="<oov>",
)

In [25]:
# Build the vocabulary with the OOV-aware tokenizer.
tokenizer.fit_on_texts(sentences)

In [26]:
# Word -> index mapping, now including the "<oov>" entry.
word_index = tokenizer.word_index

In [27]:
word_index

{'<oov>': 1,
 'my': 2,
 'love': 3,
 'dog': 4,
 'i': 5,
 'you': 6,
 'cat': 7,
 'very': 8,
 'much': 9,
 'do': 10,
 'think': 11,
 'is': 12,
 'awesome': 13}

In [28]:
# Encode the training sentences; every word is known, so index 1 ("<oov>")
# does not appear in the output below.
sequences = tokenizer.texts_to_sequences(sentences)

In [29]:
sequences

[[5, 3, 2, 4], [5, 3, 2, 7], [6, 3, 2, 4, 8, 9], [10, 6, 11, 2, 4, 12, 13]]

In [30]:
# NOTE: test_data is not redefined in this section; it is the list created in
# the earlier "Sequences" section, so that cell must have been run first.
test_data

['i really love my dog', 'my dog love my brother']

In [31]:
# Encode the held-out sentences again. With oov_token set, the unknown words
# ("really", "brother") become index 1 instead of disappearing, so each
# sequence keeps the sentence's full length of 5 (see the output below).
test_seq = tokenizer.texts_to_sequences(test_data)

In [32]:
test_seq

[[5, 1, 3, 2, 4], [2, 4, 3, 2, 1]]

# Padding

In [33]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [34]:
# Corpus for the padding demo; the differing sentence lengths are what make
# padding necessary.
sentences = [
    "I Love my dog",
    "I Love my cat",
    "You Love my dog very much",
    "Do you think my dog is awesome",
]

In [35]:
sentences

['I Love my dog',
 'I Love my cat',
 'You Love my dog very much',
 'Do you think my dog is awesome']

In [36]:
# OOV-aware tokenizer, configured as in the previous section.
tokenizer = Tokenizer(
    num_words=100,
    oov_token="<oov>",
)

In [37]:
# Build the vocabulary used by the padding examples.
tokenizer.fit_on_texts(sentences)

In [38]:
# Word -> index mapping, kept for inspection in the next cell.
word_index = tokenizer.word_index

In [39]:
word_index

{'<oov>': 1,
 'my': 2,
 'love': 3,
 'dog': 4,
 'i': 5,
 'you': 6,
 'cat': 7,
 'very': 8,
 'much': 9,
 'do': 10,
 'think': 11,
 'is': 12,
 'awesome': 13}

In [40]:
# Variable-length integer sequences (lengths 4, 4, 6, 7) that are padded below.
sequences = tokenizer.texts_to_sequences(sentences)

In [41]:
sequences

[[5, 3, 2, 4], [5, 3, 2, 7], [6, 3, 2, 4, 8, 9], [10, 6, 11, 2, 4, 12, 13]]

In [42]:
# Pad with the default settings. The output below shows zeros added at the
# front of the shorter rows so that every row has the longest length (7).
padded = pad_sequences(sequences)

In [43]:
padded

array([[ 0,  0,  0,  5,  3,  2,  4],
       [ 0,  0,  0,  5,  3,  2,  7],
       [ 0,  6,  3,  2,  4,  8,  9],
       [10,  6, 11,  2,  4, 12, 13]], dtype=int32)

In [44]:
# padding='post' moves the zero fill to the end of each short row
# (compare with the default output above).
padded1 = pad_sequences(sequences, padding='post')

In [45]:
padded1

array([[ 5,  3,  2,  4,  0,  0,  0],
       [ 5,  3,  2,  7,  0,  0,  0],
       [ 6,  3,  2,  4,  8,  9,  0],
       [10,  6, 11,  2,  4, 12, 13]], dtype=int32)

In [46]:
# Cap every row at 5 entries. truncating="pre" discards values from the start
# of rows that are too long; shorter rows are still zero-padded at the front.
padded2 = pad_sequences(sequences, maxlen=5, truncating="pre")

In [47]:
padded2

array([[ 0,  5,  3,  2,  4],
       [ 0,  5,  3,  2,  7],
       [ 3,  2,  4,  8,  9],
       [11,  2,  4, 12, 13]], dtype=int32)

In [48]:
# Cap every row at 5 entries, but with truncating="post" the values are
# discarded from the end of rows that are too long (see the output below).
padded3 = pad_sequences(sequences, maxlen=5, truncating="post")

In [49]:
padded3

array([[ 0,  5,  3,  2,  4],
       [ 0,  5,  3,  2,  7],
       [ 6,  3,  2,  4,  8],
       [10,  6, 11,  2,  4]], dtype=int32)