<a href="https://colab.research.google.com/github/lmoroney/tfbook/blob/master/chapter5/first-sequence.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
#@title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

In [0]:
# Colab-only setup: ask for the TensorFlow 2.x runtime through an IPython line magic.
try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  # Deliberate best-effort: if the request above raises, carry on with
  # whatever TensorFlow the next cell manages to import.
  pass


In [0]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer

# Initial Tokenization

In [0]:
# Three short sentences used as the training corpus for the tokenizer.
sentences = [
    "Today is a sunny day",
    "Today is a rainy day",
    "Is it sunny today?",
]

In [0]:
# First pass over the corpus: the tokenizer is created without an
# out-of-vocabulary (OOV) token and fitted on `sentences`.
tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(sentences)

# Keep the fitted vocabulary and the integer-encoded sentences for later cells.
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(sentences)

# One value per output line: the vocabulary first, then the encoded corpus.
print(word_index, sequences, sep='\n')

# Exploring Test Data with unseen words

In [0]:
# Held-out sentences; per the section heading they include words the
# tokenizer has not seen.
test_data = [
    "Today is a snowy day",
    "Will it be rainy tomorrow?",
]

In [0]:
# Encode the test sentences with the tokenizer currently in scope (created
# without an OOV token earlier in the notebook when run top to bottom).
test_sequences = tokenizer.texts_to_sequences(test_data)

# Show the known vocabulary next to the encoded test sentences for comparison.
print(word_index, test_sequences, sep='\n')

# Adding OOV to improve test data sequences

In [0]:
# Rebuild the tokenizer, this time declaring "<OOV>" as the out-of-vocabulary
# token, and fit it on the same training sentences.
tokenizer = Tokenizer(num_words=100, oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index

# Re-encode both the training corpus and the test sentences with the new tokenizer.
sequences = tokenizer.texts_to_sequences(sentences)
test_sequences = tokenizer.texts_to_sequences(test_data)

# Vocabulary on the first output line, encoded test sentences on the second.
print(word_index, test_sequences, sep='\n')

# Exploring Padding

In [0]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [0]:
# Extended corpus: the original three sentences plus a longer fourth one,
# giving sequences of unequal length for the padding examples.
sentences = [
    "Today is a sunny day",
    "Today is a rainy day",
    "Is it sunny today?",
    "I really enjoyed walking in the snow today",
]

In [0]:
# Fit a fresh OOV-aware tokenizer on the extended list of sentences.
tokenizer = Tokenizer(num_words=100, oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index

# Encode the extended corpus; the padding cells below consume `sequences`.
sequences = tokenizer.texts_to_sequences(sentences)

print(sequences)

In [0]:
# Baseline padding. padding='pre' is the documented Keras default, written out
# here so the contrast with the 'post' variants below is visible in the code.
padded = pad_sequences(sequences, padding='pre')
print(padded)

In [0]:
# Same call, but with padding placed after each sequence ('post').
padded = pad_sequences(
    sequences,
    padding='post',
)
print(padded)

In [0]:
# Post-padding with maxlen=6. `truncating` is not passed, so the library
# default applies to longer sequences (Keras documents it as 'pre' - confirm
# against the installed version).
padded = pad_sequences(
    sequences,
    maxlen=6,
    padding='post',
)
print(padded)

In [0]:
# Post-padding with maxlen=6, and truncating='post' passed explicitly for
# sequences longer than maxlen.
padded = pad_sequences(
    sequences,
    maxlen=6,
    padding='post',
    truncating='post',
)
print(padded)