In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [2]:
TRAIN_DIR = 'aclImdb/train'
VAL_SPLIT = 0.2
# BATCH_SIZE = 1024
SEED = 42

In [3]:
VOCAB_SIZE = 10000
OOV_TOKEN = '<OOV>'
MAX_SEQUENCE_LENGTH = 100
TRUNCATING = 'post'
PADDING = 'post'

In [4]:
train_ds = keras.preprocessing.text_dataset_from_directory(
    TRAIN_DIR, 
    batch_size=1, 
    validation_split=VAL_SPLIT, 
    subset='training',
    seed=SEED
)

val_ds = keras.preprocessing.text_dataset_from_directory(
    TRAIN_DIR, 
    batch_size=1, 
    validation_split=VAL_SPLIT, 
    subset='validation', 
    seed=SEED
)

Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Found 25000 files belonging to 2 classes.
Using 5000 files for validation.


In [5]:
train_sentences = []
train_labels = []

val_sentences = []
val_labels = []

for sentence, label in train_ds:
    train_sentences.append(sentence[0].numpy().decode('utf8'))
    train_labels.append(label[0].numpy())

for sentence, label in val_ds:
    val_sentences.append(sentence[0].numpy().decode('utf8'))
    val_labels.append(label[0].numpy())

train_labels = np.array(train_labels)
val_labels = np.array(val_labels)

In [6]:
tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token=OOV_TOKEN)
tokenizer.fit_on_texts(train_sentences)
word_index = tokenizer.word_index
reverse_word_index = dict([(idx, word) for (word, idx) in word_index.items()])

In [7]:
[reverse_word_index[i] for i in range(1, 11)]

['<OOV>', 'the', 'and', 'a', 'of', 'to', 'is', 'br', 'in', 'it']

In [8]:
train_sequences = tokenizer.texts_to_sequences(train_sentences)
print(f"type(train_sequences) = {type(train_sequences)}")
print(f"train_sequences[0] = {train_sequences[0]}")

type(train_sequences) = <class 'list'>
train_sequences[0] = [2, 324, 1, 4, 53, 350, 20, 17, 4, 174, 5, 1612, 224, 431, 3, 894, 2, 529, 1772, 51, 2, 1341, 45, 1065, 4, 350, 20, 5, 316, 485, 13, 553, 27, 1054, 6, 98, 82, 350, 20, 13, 11, 26, 108, 11, 26, 1135, 12, 20, 31, 2, 3844, 623, 686, 11, 79, 380, 13, 10, 7, 601, 6, 1087, 1, 3, 960, 80, 9, 98, 6917, 49, 264, 44, 5, 10, 7, 2, 189, 13, 80, 17, 1008, 24, 114, 7812, 6, 65, 541, 7, 114, 557, 10, 7, 4, 2030, 6, 27, 2108, 32, 98, 1087, 13, 488, 6, 137, 930, 11, 435, 13, 2, 1065, 78, 2157, 4, 332, 173, 6, 12, 1, 11, 166, 930, 6, 843, 2, 730, 277, 170, 1, 6, 1, 1753, 9, 3845, 4, 20, 5, 7675, 3, 485, 17, 4, 2030, 9, 328]


In [9]:
train_padded = pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH, padding=PADDING, truncating=TRUNCATING)
print(f"type(train_padded) = {type(train_padded)}")
print(f"train_padded.shape = {train_padded.shape}")
print(f"train_padded[0] = \n{train_padded[0]}")

type(train_padded) = <class 'numpy.ndarray'>
train_padded.shape = (20000, 100)
train_padded[0] = 
[   2  324    1    4   53  350   20   17    4  174    5 1612  224  431
    3  894    2  529 1772   51    2 1341   45 1065    4  350   20    5
  316  485   13  553   27 1054    6   98   82  350   20   13   11   26
  108   11   26 1135   12   20   31    2 3844  623  686   11   79  380
   13   10    7  601    6 1087    1    3  960   80    9   98 6917   49
  264   44    5   10    7    2  189   13   80   17 1008   24  114 7812
    6   65  541    7  114  557   10    7    4 2030    6   27 2108   32
   98 1087]


In [10]:
def decode_sequence(sequence):
    return ' '.join([reverse_word_index.get(i, '?') for i in sequence])

In [11]:
print(f"{'-' * 16} decoded {'-' * 16} \n{decode_sequence(train_padded[0])}")
print(f"{'-' * 16} original {'-' * 16} \n{}")

SyntaxError: f-string: empty expression not allowed (<ipython-input-11-ace1c8d0e59d>, line 2)