## Bag Of Words Baseline

In [1]:
import tensorflow as tf
import tensorflow_datasets as tfds

train_data, test_data = tfds.load("imdb_reviews", as_supervised=True,
                                  split=["train", "test"], shuffle_files=True)

In [2]:
x = train_data.batch(10).as_numpy_iterator()
x.next()

(array([b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it.",
        b'I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on the sette and having just eaten a lot. However on this occasion I fell asleep because the film was rubbish. The plo

### Vectorization

In [3]:
from tensorflow.keras.layers import TextVectorization

In [4]:
# Number of words to consider
n_features = 1000

# CRITICAL: output_mode=binary or output_mode=count, rather than default output_mode=int
# default int index was making the model accuracy stuck at 0.5 or overfit with this setup
v = TextVectorization(max_tokens=n_features, output_mode='count')

# Build the vocabulary
batch = train_data.batch(64).as_numpy_iterator()
for texts, y in batch:
    v.adapt(texts)

In [5]:
# n_features = 1000
# v = TextVectorization(max_tokens=n_features, output_mode='count')
# texts = train_data.map(lambda x, y: x)
# v.adapt(texts)

In [6]:
len(v.get_vocabulary())

1000

In [7]:
batch = train_data.batch(1000).as_numpy_iterator()
texts = batch.next()[0]

v(texts)

<tf.Tensor: shape=(1000, 1000), dtype=int64, numpy=
array([[ 37,   2,   2, ...,   0,   0,   0],
       [ 24,   5,   2, ...,   0,   0,   0],
       [ 53,  10,   5, ...,   0,   0,   0],
       ...,
       [ 43,   5,   3, ...,   0,   0,   0],
       [ 26,   6,   1, ...,   0,   0,   0],
       [297,  71,  20, ...,   0,   0,   0]])>

In [8]:
def prep_dataset(dataset, vectorizer):
    data = dataset.map(lambda texts, labels: (vectorizer(texts), labels))
    return data


pro_train = prep_dataset(train_data, v).batch(64)
pro_test = prep_dataset(test_data, v).batch(64)

### Model

In [11]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input, Dropout
from tensorflow.keras.callbacks import EarlyStopping

early_stopping = EarlyStopping(
    monitor='val_loss', 
    patience=3, 
    restore_best_weights=True
)

model = Sequential([
    Input(shape=(n_features,)),
    Dense(100, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

### Train

In [12]:
model.fit(
    pro_train,
    validation_data=pro_test,
    epochs=10,
    callbacks=early_stopping
)

Epoch 1/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 56ms/step - accuracy: 0.6919 - loss: 0.6426 - val_accuracy: 0.8334 - val_loss: 0.3934
Epoch 2/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 63ms/step - accuracy: 0.8198 - loss: 0.4141 - val_accuracy: 0.8410 - val_loss: 0.3700
Epoch 3/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 53ms/step - accuracy: 0.8373 - loss: 0.3749 - val_accuracy: 0.8438 - val_loss: 0.3633
Epoch 4/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 53ms/step - accuracy: 0.8494 - loss: 0.3533 - val_accuracy: 0.8465 - val_loss: 0.3562
Epoch 5/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 51ms/step - accuracy: 0.8533 - loss: 0.3366 - val_accuracy: 0.8476 - val_loss: 0.3550
Epoch 6/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 56ms/step - accuracy: 0.8593 - loss: 0.3245 - val_accuracy: 0.8480 - val_loss: 0.3543
Epoch 7/10
[1m3

<keras.src.callbacks.history.History at 0x23365e7c690>

### Evaulate

In [13]:
loss, accuracy = model.evaluate(pro_test)

print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")

[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 28ms/step - accuracy: 0.8454 - loss: 0.3569
Test Loss: 0.3543
Test Accuracy: 0.8480
