In [1]:
import tensorflow_datasets as tfds

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

## Tokenizing
- Assigns each word a unique integer index.

**`tf.keras.preprocessing.text.Tokenizer()`**
- By default, removes all punctuation.
- Converts all words to lowercase.
- [Tensorflow doc](https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/text/Tokenizer).

**Transform words**

In [2]:
# two toy sentences; they differ in letter case, punctuation and the last word
sentences = ['I love my dog', 'i, love my cat']

# create the tokenizer and learn the vocabulary from the sentences
tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(texts=sentences)

# word -> integer index mapping learned so far
word_index = tokenizer.word_index
print(word_index)

{'i': 1, 'love': 2, 'my': 3, 'dog': 4, 'cat': 5}


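**Note**: Calling `fit_on_texts()` on new texts will append new words to the old vocabulary. Indices are re-assigned by overall word frequency, so existing words may change their index (compare `love` and `i` before and after the update).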

In [3]:
# an extra sentence containing one word the tokenizer has not seen yet ('You')
new_sentences = ['You love dog!']

# fitting again extends the vocabulary the tokenizer already holds
tokenizer.fit_on_texts(texts=new_sentences)

# display the updated word -> index mapping
tokenizer.word_index

{'love': 1, 'i': 2, 'my': 3, 'dog': 4, 'cat': 5, 'you': 6}

**Transform sequence of words (a.k.a. text or sentence)**

In [4]:
# encode every sentence as a list of word indices
sequences = tokenizer.texts_to_sequences(texts=sentences)
print(sequences)

[[2, 1, 3, 4], [2, 1, 3, 5]]


**Note**: Words that are not in the vocabulary are silently dropped from the sequence!

In [5]:
# 'She' and 'loves' were never seen while fitting the tokenizer
tokenizer.texts_to_sequences(texts=['She loves dog!'])

[[4]]

**Note**: Use the `oov_token` parameter in `Tokenizer()` to map out-of-vocabulary words to a dedicated token instead of dropping them.

In [6]:
# new tokenizer that reserves the '<OOV>' token for out-of-vocabulary words
tokenizer = Tokenizer(oov_token='<OOV>', num_words=100)

# learn the vocabulary from both lists of texts, in the same order as before
for texts in (sentences, new_sentences):
    tokenizer.fit_on_texts(texts)

# word -> integer index mapping
word_index = tokenizer.word_index
print(word_index)

{'<OOV>': 1, 'love': 2, 'i': 3, 'my': 4, 'dog': 5, 'cat': 6, 'you': 7}


In [7]:
# same sentence as before, now encoded by the tokenizer that has an OOV token
tokenizer.texts_to_sequences(texts=['She loves dog!'])

[[1, 1, 5]]

## Padding


**`tf.keras.preprocessing.sequence.pad_sequences()`**
- Pad sequences by adding a value (param: `value`) at the beginning (`padding='pre'`) or ending (`padding='post'`).
- Makes all sequences the same length (param: `maxlen`).
- If `maxlen` is not provided, sequences will be padded to the length of the longest individual sequence.  
- If a sequence is longer than the `maxlen`, truncates the beginning (`truncating='pre'`) or ending (`truncating='post'`).
- [Tensorflow doc](https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/sequence/pad_sequences)

In [8]:
# encode the sentences as integer sequences
sequences = tokenizer.texts_to_sequences(texts=sentences)
print(sequences, end='\n\n')  # the extra newline leaves a blank line before the next output

# pad at the end so that every sequence has length 6
padded = pad_sequences(sequences, maxlen=6, padding='post')
print(padded)

[[3, 2, 4, 5], [3, 2, 4, 6]]

[[3 2 4 5 0 0]
 [3 2 4 6 0 0]]


# **EXAMPLE**
**Sarcasm Dataset**

In [9]:
import json
import os
import shutil
import urllib.request

DATA_URL = 'https://storage.googleapis.com/laurencemoroney-blog.appspot.com/sarcasm.json'
PATH_OUT = '../.tmp/sarcasm.json'

# make sure the target directory exists before writing into it
os.makedirs(os.path.dirname(PATH_OUT), exist_ok=True)

# download the dataset
# - pure Python, so no external `wget` binary is needed
# - TLS certificates are verified (the old command disabled verification)
# - a failed request raises here instead of being silently ignored;
#   the URL is opened first, so an existing file is not truncated when the request fails
with urllib.request.urlopen(DATA_URL) as response, open(PATH_OUT, 'wb') as f:
    shutil.copyfileobj(response, f)

# read the file (JSON is UTF-8 by specification; do not rely on the platform default)
with open(PATH_OUT, 'r', encoding='utf-8') as f:
    data = json.load(f)

In [10]:
# inspect the first record to see which fields each entry provides
data[0]

{'article_link': 'https://www.huffingtonpost.com/entry/versace-black-code_us_5861fbefe4b0de3a08f600d5',
 'headline': "former versace store clerk sues over secret 'black code' for minority shoppers",
 'is_sarcastic': 0}

In [11]:
# collect the 'headline' field of every record
headlines = [record['headline'] for record in data]

# peek at the first headline
headlines[0]

"former versace store clerk sues over secret 'black code' for minority shoppers"

In [12]:
# tokenizer with an '<OOV>' token; no num_words limit is passed here
tokenizer = Tokenizer(oov_token='<OOV>')

# learn the vocabulary from all headlines
tokenizer.fit_on_texts(texts=headlines)

vocabulary = tokenizer.word_index
print('# words in the vocabulary:', len(vocabulary))

# words in the vocabulary: 29657


In [13]:
# encode every headline as a sequence of word indices
sequences = tokenizer.texts_to_sequences(texts=headlines)

# no maxlen given: pad at the end up to the length of the longest sequence
padded = pad_sequences(sequences, padding='post')

print('padded sequences:', padded, sep='\n')
print('shape', padded.shape)

padded sequences:
[[  308 15115   679 ...     0     0     0]
 [    4  8435  3338 ...     0     0     0]
 [  145   838     2 ...     0     0     0]
 ...
 [10735     9    68 ...     0     0     0]
 [ 1541   392  4164 ...     0     0     0]
 [29656  1647     6 ...     0     0     0]]
shape (26709, 40)


## Pre-tokenized Dataset

**IMDB movie review dataset** ([tfds webpage](https://www.tensorflow.org/datasets/catalog/imdb_reviews)) 

In [14]:
# load the 'subwords8k' configuration of the IMDB reviews together with its metadata
imdb, info = tfds.load(
    name='imdb_reviews/subwords8k',
    shuffle_files=False,
    as_supervised=True,
    with_info=True,
)

In [15]:
# take the encoder attached to the dataset's 'text' feature and use it as the tokenizer
text_feature = info.features['text']
tokenizer = text_feature.encoder

In [16]:
# sentence used for an encode -> decode round trip
sample_string = 'I love to watch movies. IMDb is a good source to check movie rating!'

# text -> list of token ids
tokenized_string = tokenizer.encode(sample_string)
print('Tokenized string is {}'.format(tokenized_string))

# token ids -> text
original_string = tokenizer.decode(tokenized_string)
print('The original string: {}'.format(original_string))

Tokenized string is [12, 174, 7, 163, 323, 3, 2939, 9, 74, 5038, 7961, 7, 1843, 27, 2296, 7962]
The original string: I love to watch movies. IMDb is good source to check movie rating!
