In [1]:
import re
import string
import tensorflow as tf
from tensorflow import keras

In [2]:
# Where the labelled training reviews live (one sub-directory per class).
TRAIN_DIR = 'aclImdb/train'

# Data-loading knobs, shared by the training and validation splits.
SEED = 42          # passed as `seed` to both dataset loaders
VAL_SPLIT = 0.2    # fraction of TRAIN_DIR held out for validation
BATCH_SIZE = 1024  # number of reviews per dataset batch

In [3]:
# Text-vectorization knobs.
MAX_SEQUENCE_LENGTH = 100  # every review is padded/truncated to this many token ids
VOCAB_SIZE = 10000         # upper bound on the vocabulary learned by the vectorizer

In [4]:
# Options common to both splits. The training and validation loaders are given
# the same seed and validation_split so that they describe the same partition
# of TRAIN_DIR (see the Keras docs for text_dataset_from_directory).
split_options = dict(
    batch_size=BATCH_SIZE,
    validation_split=VAL_SPLIT,
    seed=SEED,
)

# Training portion of TRAIN_DIR.
train_ds = keras.preprocessing.text_dataset_from_directory(
    TRAIN_DIR, subset='training', **split_options
)

# Held-out validation portion of TRAIN_DIR.
val_ds = keras.preprocessing.text_dataset_from_directory(
    TRAIN_DIR, subset='validation', **split_options
)

Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Found 25000 files belonging to 2 classes.
Using 5000 files for validation.


In [5]:
# Pull a single (texts, labels) batch out of the training set for inspection.
x_batch, y_batch = next(iter(train_ds.take(1)))

# Show the first raw review of that batch.
print(x_batch[0])

tf.Tensor(b"I married a Japanese woman 14 years ago. We're still together.<br /><br />However in the 1950's it would never have been as easy.<br /><br />Life in the military had been mined for action, drama, and comedy for years by this point. Mined to death. The mixed relationships gave it new ground to cover. This is old hat today, but then...? Marrying an Asian back then meant you either owed somebody something or you were a freak of some sort. This touched on both possibilities along with the third. Maybe it IS love? <br /><br />Brando did his usual good job. Garner did a better job than he usually does. He's good, but this showed how good he could be. Umecki-chan had a helluva debut here and while I think she earned her statue, she didn't really stretch. It was a role that no one who hadn't been overseas would have recognized and the newness was the corker.<br /><br />The real scene stealer was Red Buttons. Red was the best thing in this film. Bank on it. And the Japanese lifestyl

In [6]:
# Look at another raw review from the same batch; note the literal
# "<br />" markup that the standardization step has to deal with.
x_batch[4]

<tf.Tensor: shape=(), dtype=string, numpy=b"Reese Witherspoon plays Dani, a young country girl that falls madly in love with the new 17 year old neighbor, Court, played by Jason London. Court tries his best to make Dani realize that the difference in their ages would make a love relationship improbable. Soon the nubile charm of Dani starts winning over Court's will. Next enters the meeting of Dani's older sister, played by Emily Warfield, and the beginning of a short lived love/jealousy problem.<br /><br />Tess Harper and Sam Waterston round out the cast. This is a fresh, free spirited; but heartbreaking drama that touches down deep. Feel free to cry.">

In [7]:
def custom_standardization(input_data):
    """Normalize raw review text before tokenization.

    Applies three steps in order: lowercase the text, replace each literal
    ``<br />`` tag with a single space, then delete every character listed in
    ``string.punctuation``.

    Args:
        input_data: String tensor (or value accepted by ``tf.strings`` ops)
            holding the raw text.

    Returns:
        The standardized string tensor.
    """
    # Character class matching any single ASCII punctuation character.
    punctuation_pattern = f'[{re.escape(string.punctuation)}]'

    text = tf.strings.lower(input_data)
    # Tags become a space (not '') so the words on either side stay separate.
    text = tf.strings.regex_replace(text, '<br />', ' ')
    text = tf.strings.regex_replace(text, punctuation_pattern, '')
    return text

# Layer that turns raw review strings into fixed-length rows of integer token
# ids, using custom_standardization for text clean-up.
vectorize_layer = keras.layers.experimental.preprocessing.TextVectorization(
    max_tokens=VOCAB_SIZE,
    standardize=custom_standardization,
    output_sequence_length=MAX_SEQUENCE_LENGTH,
    output_mode='int',
)

# The vocabulary is learned from the training texts only, so drop the labels
# before handing the dataset to adapt().
text_ds = train_ds.map(lambda text, label: text)
vectorize_layer.adapt(text_ds)

In [8]:
# Smoke test: vectorize a short string. The result is one row of
# MAX_SEQUENCE_LENGTH ids; positions after the real tokens are filled with 0.
vectorize_layer(['hello world'])

<tf.Tensor: shape=(1, 100), dtype=int64, numpy=
array([[5128,  182,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0]], dtype=int64)>

In [9]:
# Vocabulary learned by adapt(): a list whose position is the token id.
vocab = vectorize_layer.get_vocabulary()

# Report its size and the first few entries.
print(
    f"len(vocab) = {len(vocab)}",
    f"vocab[:10] = {vocab[:10]}",
    sep="\n",
)

len(vocab) = 10000
vocab[:10] = ['', '[UNK]', 'the', 'and', 'a', 'of', 'to', 'is', 'in', 'it']


In [10]:
def decode(sequence, vocabulary=None):
    """Turn a sequence of token ids back into a space-separated string.

    Args:
        sequence: Iterable of integer token ids, e.g. one row produced by
            ``vectorize_layer``. Each element must be convertible with
            ``int()`` (Python ints and scalar tensors both are).
        vocabulary: Optional list mapping token id -> token. When omitted,
            the notebook-level ``vocab`` is used.

    Returns:
        The decoded tokens joined by single spaces. Ids whose token is the
        empty string (the padding entry at index 0 of the vectorizer's
        vocabulary) are skipped, so padded rows do not decode to a string
        with a long run of trailing spaces.

    Raises:
        IndexError: If an id is larger than the last vocabulary index.
    """
    if vocabulary is None:
        # Fall back to the vocabulary extracted from vectorize_layer.
        vocabulary = vocab
    tokens = (vocabulary[int(token_id)] for token_id in sequence)
    # Drop empty (padding) tokens instead of joining them as blanks.
    return " ".join(token for token in tokens if token)

In [11]:
# Vectorize the whole sample batch: one row of token ids per review.
x_batch_processed = vectorize_layer(x_batch)
# Expect (number of reviews in the batch, MAX_SEQUENCE_LENGTH).
print(f"x_batch_processed.shape = {x_batch_processed.shape}")

x_batch_processed.shape = (1024, 100)


In [12]:
# Round-trip check: map the first review's token ids back to words to see the
# effect of standardization, truncation and out-of-vocabulary ([UNK]) tokens.
decode(x_batch_processed[0])

'i married a japanese woman 14 years ago were still together however in the 1950s it would never have been as easy life in the military had been [UNK] for action drama and comedy for years by this point [UNK] to death the mixed relationships gave it new ground to cover this is old hat today but then marrying an asian back then meant you either [UNK] somebody something or you were a freak of some sort this touched on both possibilities along with the third maybe it is love brando did his usual good job garner did a better'