# Tokenization
## fit_on_texts()

In [24]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
# Display the TensorFlow version this notebook was run with.
tf.__version__

'2.0.0'

In [34]:
# Toy corpus with mixed case and stray punctuation; the printed vocabulary
# below shows that words are lower-cased and punctuation is stripped.
sentences = [
    'i love my dog',
    'I, love my cat',
    'Do you think my ? dog is amazing',
]

# num_words is far larger than this corpus' vocabulary, so no word is cut off here.
tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(sentences)

# word_index maps each word to its integer id; the most frequent word ('my') gets id 1.
word_index = tokenizer.word_index
print(word_index)

{'my': 1, 'i': 2, 'love': 3, 'dog': 4, 'cat': 5, 'do': 6, 'you': 7, 'think': 8, 'is': 9, 'amazing': 10}


## texts_to_sequences()

In [35]:
# Encode each training sentence as the list of integer ids of its words.
train_sequences = tokenizer.texts_to_sequences(sentences)
train_sequences

[[2, 3, 1, 4], [2, 3, 1, 5], [6, 7, 8, 1, 4, 9, 10]]

In [36]:
# The already-fitted tokenizer can also encode sentences it was not fitted on.
# Words missing from its vocabulary (here "really" and "manatee") are
# silently dropped from the resulting sequences.
test_sentences = [
    "i really love my manatee",
    "i love my cat",
]
tokenizer.texts_to_sequences(test_sentences)

[[2, 3, 1], [2, 3, 1, 5]]

### out of vocabulary
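To account for words in `test_sentences` that the tokenizer did not see when it was fitted, give it an out-of-vocabulary (OOV) token; unknown words are then mapped to that token instead of being dropped.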

In [47]:
# With oov_token set, out-of-vocabulary words are replaced by that token
# instead of being dropped; the token itself takes id 1 in the vocabulary.
tokenizer2 = Tokenizer(num_words=100, oov_token="UNK")
tokenizer2.fit_on_texts(sentences)

oov_word_index = tokenizer2.word_index
print(oov_word_index)

{'UNK': 1, 'my': 2, 'i': 3, 'love': 4, 'dog': 5, 'cat': 6, 'do': 7, 'you': 8, 'think': 9, 'is': 10, 'amazing': 11}


In [48]:
# Unknown words ("really", "manatee") now show up as the OOV id 1.
test_sequences_oov = tokenizer2.texts_to_sequences(test_sentences)
print(test_sequences_oov)

[[3, 1, 4, 2, 1], [3, 4, 2, 6]]


## Padding
Sequences are padded because the inputs to a model eventually need to have the same shape.

In [45]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# With no extra arguments, shorter sequences are padded with zeros at the
# front (padding defaults to "pre") up to the length of the longest sequence.
# Optional arguments: padding="post" pads at the end instead, and maxlen=5
# would set the target length explicitly.
padded_default = pad_sequences(tokenizer2.texts_to_sequences(test_sentences))
padded_default

array([[3, 1, 4, 2, 1],
       [0, 3, 4, 2, 6]], dtype=int32)

In [49]:
# Force every sequence to a specific length so it can be fed to a model.
# Sequences longer than maxlen are cut from the front: the separate
# `truncating` argument (not `padding`) controls this and also defaults to
# "pre", which is why the first sentence loses its leading token.
# Usually maxlen is set to the length of the longest sentence so that
# nothing is removed.
padded_len4 = pad_sequences(tokenizer2.texts_to_sequences(test_sentences), maxlen=4)
padded_len4

array([[1, 4, 2, 1],
       [3, 4, 2, 6]], dtype=int32)