In [21]:
import tensorflow

In [22]:
from tensorflow import keras

In [23]:
from keras import layers, models, optimizers, losses, metrics

In [24]:
!tar xf aclImdb_v1.tar.gz  

In [25]:
!rm -rf aclImdb/train/unsup

In [26]:
!ls aclImdb/train/

labeledBow.feat  pos		urls_neg.txt  urls_unsup.txt
neg		 unsupBow.feat	urls_pos.txt


In [27]:
import re
import string

def standardize(text):
    text = tensorflow.strings.lower(text)
    text = tensorflow.strings.regex_replace(text, '<br />', '')
    text = tensorflow.strings.regex_replace(text, '[%s]' % re.escape(string.punctuation), '')
    return text

In [28]:
standardize('aAbcd')

<tf.Tensor: shape=(), dtype=string, numpy=b'aabcd'>

In [29]:
'[%s]' % re.escape(string.punctuation)

'[!"\\#\\$%\\&\'\\(\\)\\*\\+,\\-\\./:;<=>\\?@\\[\\\\\\]\\^_`\\{\\|\\}\\~]'

In [30]:
re.escape(string.punctuation)

'!"\\#\\$%\\&\'\\(\\)\\*\\+,\\-\\./:;<=>\\?@\\[\\\\\\]\\^_`\\{\\|\\}\\~'

In [31]:
batch_size=50
seed=50

raw_train_ds = keras.utils.text_dataset_from_directory('aclImdb/train/', batch_size=batch_size, validation_split=0.2, seed=seed, subset='training')
raw_val_ds = keras.utils.text_dataset_from_directory('aclImdb/train/', batch_size=batch_size, validation_split=0.2, seed=seed, subset='validation')
raw_test_ds = keras.utils.text_dataset_from_directory('aclImdb/test/', batch_size=batch_size, validation_split=0.2, seed=seed, subset='validation')

Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Found 25000 files belonging to 2 classes.
Using 5000 files for validation.
Found 25000 files belonging to 2 classes.
Using 5000 files for validation.


In [32]:
max_features=10000
seq_len = 250

vec_layer = layers.TextVectorization(standardize=standardize, max_tokens=max_features, output_mode='int', output_sequence_length=seq_len)

In [33]:
def vec_text(text, label):
    text = tensorflow.expand_dims(text, -1)
    return vec_layer(text), label

In [34]:
train_text = raw_train_ds.map(lambda x, y: x)
vec_layer.adapt(train_text)

In [35]:
AUTOTUNE=tensorflow.data.AUTOTUNE

train_ds = raw_train_ds.map(vec_text).cache().prefetch(buffer_size=AUTOTUNE)
val_ds = raw_val_ds.map(vec_text).cache().prefetch(buffer_size=AUTOTUNE)
test_ds = raw_test_ds.map(vec_text).cache().prefetch(buffer_size=AUTOTUNE)

In [36]:
embedding_dim = 16
model = models.Sequential([
  layers.Embedding(max_features, embedding_dim),
  layers.Dropout(0.2),
  layers.GlobalAveragePooling1D(),
  layers.Dropout(0.2),
  layers.Dense(1)])

model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, None, 16)          160000    
                                                                 
 dropout_2 (Dropout)         (None, None, 16)          0         
                                                                 
 global_average_pooling1d_1  (None, 16)                0         
  (GlobalAveragePooling1D)                                       
                                                                 
 dropout_3 (Dropout)         (None, 16)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 17        
                                                                 
Total params: 160017 (625.07 KB)
Trainable params: 160017 (625.07 KB)
Non-trainable params: 0 (0.00 Byte)
______________

In [37]:
model.compile(loss=losses.BinaryCrossentropy(from_logits=True),
              optimizer='adam',
              metrics=metrics.BinaryAccuracy(threshold=0.0))

In [38]:
epochs = 10
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=epochs)

Epoch 1/10


2026-01-02 11:16:11.035808: I external/local_xla/xla/service/service.cc:168] XLA service 0x71577c0c12d0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2026-01-02 11:16:11.035831: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA GeForce RTX 2060, Compute Capability 7.5
2026-01-02 11:16:11.040934: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2026-01-02 11:16:11.060164: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8906
I0000 00:00:1767352571.117180    2704 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [39]:
model.evaluate(test_ds)



[0.33049070835113525, 0.8658000230789185]

In [46]:
model2 = models.Sequential([
    layers.Embedding(max_features, embedding_dim),
    layers.Bidirectional(layers.LSTM(32)), 
    layers.Dropout(0.5),
    layers.Dense(1)
])

model2.compile(loss=losses.BinaryCrossentropy(from_logits=True),
              optimizer='adam',
              metrics=metrics.BinaryAccuracy(threshold=0.0))

In [47]:
history = model2.fit(
    train_ds,
    validation_data=val_ds,
    epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [48]:
model2.evaluate(test_ds)



[0.6912481784820557, 0.8313999772071838]

In [51]:
model3 = models.Sequential([
    layers.Embedding(max_features, embedding_dim),
    layers.Conv1D(32, 7, activation='relu'), # Fast like DNN, smart like RNN
    layers.GlobalMaxPooling1D(),
    layers.Dense(1)
])

model3.compile(loss=losses.BinaryCrossentropy(from_logits=True),
              optimizer='adam',
              metrics=metrics.BinaryAccuracy(threshold=0.0))

history = model3.fit(
    train_ds,
    validation_data=val_ds,
    epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [52]:
model3.evaluate(test_ds)



[0.37092533707618713, 0.86080002784729]