In [1]:
import tensorflow_datasets as tfds

train_data, test_data = tfds.load("imdb_reviews", as_supervised=True, split=["train", "test"])
n_features = 1000  # Number of unique words to consider
max_length = 75   # CRIT:  len of review to consider

In [2]:
from tensorflow.keras.layers import TextVectorization

v = TextVectorization(output_sequence_length=max_length, output_mode='int')

# Smaller batch size may decresase vocab size
batch = train_data.batch(1000).as_numpy_iterator()
for texts, labels in batch:
    v.adapt(texts)

In [3]:
# Detrermine n_features and not just fix it at 1000 arbitrarily
n_features = v.vocabulary_size()
print("Vocab size:", n_features)

Vocab size: 20463


In [4]:
def prepare_dataset(dataset):
    dataset = dataset.map(lambda text, label: (v(text), label))
    # dataset = dataset.map(lambda text, label: (tf.one_hot(text, depth=n_features), label))
    return dataset


# Prepare the training and testing datasets
pro_train = prepare_dataset(train_data).batch(128)
pro_test = prepare_dataset(test_data).batch(128)

In [5]:
from tensorflow.keras.layers import LSTM, Dense, Input, Embedding, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping

early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)


inputs = Input(shape=(max_length, ))

x = Embedding(input_dim=n_features, output_dim=32)(inputs)

x = LSTM(64, recurrent_dropout=0.3, return_sequences=True)(x)
x = LSTM(32, recurrent_dropout=0.3)(x)

outputs = Dense(1, activation='sigmoid',)(x)

model = Model(inputs, outputs)

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

In [6]:
model.fit(pro_train, validation_data=pro_test,
          epochs=15, callbacks=early_stopping)

Epoch 1/15
[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 154ms/step - accuracy: 0.6139 - loss: 0.6310 - val_accuracy: 0.7801 - val_loss: 0.4798
Epoch 2/15
[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 153ms/step - accuracy: 0.8160 - loss: 0.4301 - val_accuracy: 0.7765 - val_loss: 0.5083
Epoch 3/15
[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 153ms/step - accuracy: 0.8489 - loss: 0.3794 - val_accuracy: 0.7732 - val_loss: 0.5205
Epoch 4/15
[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 153ms/step - accuracy: 0.8727 - loss: 0.3242 - val_accuracy: 0.7715 - val_loss: 0.5872
Epoch 5/15
[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 154ms/step - accuracy: 0.8913 - loss: 0.2904 - val_accuracy: 0.7570 - val_loss: 0.6410
Epoch 6/15
[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 89ms/step - accuracy: 0.9045 - loss: 0.2581 - val_accuracy: 0.7494 - val_loss: 0.7062


<keras.src.callbacks.history.History at 0x2ce1b294850>

In [9]:
loss, acc = model.evaluate(pro_test)

print("Test Loss:", loss)
print("Test accuracy:", acc*100,"%")

[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 25ms/step - accuracy: 0.7831 - loss: 0.4781
Test Loss: 0.4798335134983063
Test accuracy: 78.00800204277039 %
