<a href="https://colab.research.google.com/github/martin-fabbri/colab-notebooks/blob/master/deeplearning.ai/tf/c3_w1_padding_oov_sequences.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Text Tokenizer

In [2]:
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Display the TensorFlow version this notebook was executed with.
tf.__version__

'2.3.0'

In [3]:
# Training corpus. Casing and punctuation vary between sentences; the
# vocabulary shown below is lower-cased with punctuation stripped.
sentences = [
    'I love my dog',
    'Cats are more independent than dogs',
    'I, love my cat',
    'You love my dog!',
    'Do you think my dog is amazing?',
    'I am constantly thinking about my dog.',
    'I am loving my dog.',
    'She loves her cat',
]

# '<oob>' is the placeholder for words missing from the vocabulary; it is
# assigned index 1. num_words=100 is well above this corpus's 25 entries.
tokenizer = Tokenizer(
    oov_token='<oob>',
    num_words=100,
)
tokenizer.fit_on_texts(sentences)

# Mapping of word -> integer index learned from the corpus.
word_index = tokenizer.word_index
word_index

{'<oob>': 1,
 'about': 21,
 'am': 8,
 'amazing': 18,
 'are': 10,
 'cat': 6,
 'cats': 9,
 'constantly': 19,
 'do': 15,
 'dog': 3,
 'dogs': 14,
 'her': 25,
 'i': 4,
 'independent': 12,
 'is': 17,
 'love': 5,
 'loves': 24,
 'loving': 22,
 'more': 11,
 'my': 2,
 'she': 23,
 'than': 13,
 'think': 16,
 'thinking': 20,
 'you': 7}

In [4]:
# Encode every training sentence as a list of vocabulary indices
# (one inner list per sentence, lengths differ).
sequences = tokenizer.texts_to_sequences(texts=sentences)
sequences

[[4, 5, 2, 3],
 [9, 10, 11, 12, 13, 14],
 [4, 5, 2, 6],
 [7, 5, 2, 3],
 [15, 7, 16, 2, 3, 17, 18],
 [4, 8, 19, 20, 21, 2, 3],
 [4, 8, 22, 2, 3],
 [23, 24, 25, 6]]

In [5]:
# Bring every training sequence to exactly 5 tokens. The defaults are spelled
# out: shorter sequences get leading zeros, longer ones lose leading tokens.
padded = pad_sequences(
    sequences,
    maxlen=5,
    padding='pre',
    truncating='pre',
)
padded

array([[ 0,  4,  5,  2,  3],
       [10, 11, 12, 13, 14],
       [ 0,  4,  5,  2,  6],
       [ 0,  7,  5,  2,  3],
       [16,  2,  3, 17, 18],
       [19, 20, 21,  2,  3],
       [ 4,  8, 22,  2,  3],
       [ 0, 23, 24, 25,  6]], dtype=int32)

In [6]:
# Held-out sentences containing words the tokenizer never saw
# (e.g. 'really', 'new', 'puppy', 'manatee', 'cool').
test_data = [
    'i really love my new puppy',
    'my dog loves my manatee',
    'cats are cool',
]

# Unknown words are encoded as 1, the index of the '<oob>' token.
test_seq = tokenizer.texts_to_sequences(texts=test_data)
test_seq

[[4, 1, 5, 2, 1, 1], [2, 3, 24, 2, 1], [9, 10, 1]]

In [7]:
# Pad the test sequences to length 10 with leading zeros (default 'pre').
# NOTE: this rebinds `padded`, which previously held the padded training
# sequences.
padded = pad_sequences(
    test_seq,
    maxlen=10,
    padding='pre',
    truncating='pre',
)
padded

array([[ 0,  0,  0,  0,  4,  1,  5,  2,  1,  1],
       [ 0,  0,  0,  0,  0,  2,  3, 24,  2,  1],
       [ 0,  0,  0,  0,  0,  0,  0,  9, 10,  1]], dtype=int32)

In [9]:
# Same target length, but the zeros are appended after the tokens.
post_padded = pad_sequences(
    test_seq,
    maxlen=10,
    padding='post',
    truncating='pre',  # default; no sequence here exceeds 10 tokens
)
post_padded

array([[ 4,  1,  5,  2,  1,  1,  0,  0,  0,  0],
       [ 2,  3, 24,  2,  1,  0,  0,  0,  0,  0],
       [ 9, 10,  1,  0,  0,  0,  0,  0,  0,  0]], dtype=int32)

In [11]:
# maxlen=5 is shorter than the first test sequence (6 tokens), so it is
# truncated from the front; shorter sequences are still zero-padded in front.
pre_padding_truncating = pad_sequences(
    test_seq,
    maxlen=5,
    padding='pre',
    truncating='pre',
)
pre_padding_truncating

array([[ 1,  5,  2,  1,  1],
       [ 2,  3, 24,  2,  1],
       [ 0,  0,  9, 10,  1]], dtype=int32)

In [12]:
# Truncate overlong sequences from the end instead of the front.
# Despite the variable name, padding itself is still 'pre' (leading zeros);
# only the truncation side is 'post'.
post_padding_truncating = pad_sequences(
    test_seq,
    maxlen=5,
    padding='pre',
    truncating='post',
)
post_padding_truncating

array([[ 4,  1,  5,  2,  1],
       [ 2,  3, 24,  2,  1],
       [ 0,  0,  9, 10,  1]], dtype=int32)