In [1]:
import os

os.environ["KERAS_BACKEND"] = "tensorflow"

import keras
import tensorflow as tf
import numpy as np
from keras import layers
from keras import ops
import tensorflow_datasets as tfds

import string
import re

In [2]:
# chatgpt suggested function, see saved chat
def dataset_to_numpy(dataset):
    xs = []
    ys = []

    for x_batch, y_batch in dataset:
        xs.extend(x_batch.numpy())
        ys.extend(y_batch.numpy())

    return (
        np.array(xs, dtype=object),
        np.array(ys)
    )

In [None]:
keras.saving.get_custom_objects().clear() 

In [None]:
@keras.saving.register_keras_serializable()
class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        #   Multi-head attention layer
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        
        #   Feed forward network layer
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
        )

        #   Layer normalization layers
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)

        #   Dropout layers
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

        #   Embedding dimension, number of heads and feed forward dimension as parameters
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim

    def call(self, inputs):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output)
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        return {
                'embed_dim': self.embed_dim,
                'num_heads': self.num_heads,
                'ff_dim': self.ff_dim
            }

In [None]:
# Token and positional embedding class

@keras.saving.register_keras_serializable()
class TokenAndPositionEmbedding(layers.Layer):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.maxlen = maxlen

        #   Token embedding layer
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)

        #   Positional embedding layer
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

        self.vocab_size = vocab_size
        self.embed_dim = embed_dim

    def call(self, x):
        # maxlen = ops.shape(x)[-1]
        positions = ops.arange(start=0, stop=maxlen, step=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

    def get_config(self):
        return {
            'maxlen': self.maxlen,
            'vocab_size': self.vocab_size,
            'embed_dim': self.embed_dim
        }

In [None]:
#   Text Preprocessing Functions
#   Custom text standardization for preprocessing. 

def custom_standardization(input_data):
    lowercase = tf.strings.lower(input_data)
    stripped_html = tf.strings.regex_replace(lowercase, "<br />", " ")
    return tf.strings.regex_replace(
        stripped_html, f"[{re.escape(string.punctuation)}]", ""
    )

def vectorize_text(text, label):
    text = tf.expand_dims(text, -1)
    return vectorize_layer(text), label

In [None]:
#   Function to create, compile, train and return a transformer model

def generate_model(epochs):
    model = keras.Model(inputs=inputs, outputs=outputs)
    model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
    model.fit(
        train_x, train_y, batch_size=32, epochs=epochs, validation_data=(test_x, test_y)
    )

    return model

In [None]:
batch_size = 32

#   Build the training and test datasets
raw_train_ds, raw_test_ds = keras.utils.text_dataset_from_directory(
        "../data_transformer/unbalanced",
        batch_size=batch_size,
        seed=1337,
        subset="both",
        validation_split=0.2,
        labels="inferred"
    )

Found 382046 files belonging to 2 classes.
Using 305637 files for training.
Using 76409 files for validation.


In [None]:
#   Text vectorization

max_features = 20000
embedding_dim = 128
sequence_length = 500

vectorize_layer = keras.layers.TextVectorization(
    standardize=custom_standardization,
    max_tokens=max_features,
    output_mode="int",
    output_sequence_length=sequence_length,
)

text_ds = raw_train_ds.map(lambda x, y: x)

vectorize_layer.adapt(text_ds)

train_ds = raw_train_ds.map(vectorize_text)
test_ds = raw_test_ds.map(vectorize_text)

2025-12-15 18:15:38.364563: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [None]:
train_x, train_y = dataset_to_numpy(train_ds) # Convert the training dataset to numpy arrays

2025-12-15 18:15:52.205938: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [None]:
test_x, test_y = dataset_to_numpy(test_ds) # Convert the test dataset to numpy arrays

In [None]:
vocab_size = 20000  # Only consider the top 20k words
maxlen = 200  # Only consider the first 200 words of each movie review

#   Pad the sequences to the same length
train_x = keras.utils.pad_sequences(train_x, maxlen=maxlen)
test_x = keras.utils.pad_sequences(test_x, maxlen=maxlen)

In [None]:
# Build the transformer model

embed_dim = 32  # Embedding size for each token
num_heads = 2  # Number of attention heads
ff_dim = 32  # Hidden layer size in feed forward network inside transformer

inputs = layers.Input(shape=(maxlen,))
embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
x = embedding_layer(inputs)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
x = transformer_block(x)
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dropout(0.1)(x)
x = layers.Dense(20, activation="relu")(x)
x = layers.Dropout(0.1)(x)
outputs = layers.Dense(2, activation="softmax")(x)

In [None]:
# Train the model in different epochs

for ep in [6, 10 , 12]:
    print(f'Training transformer with {ep} epochs...')
    model = generate_model(ep)
    model.save(f'../model/transformer_unbalanced_{ep}-epochs.keras')

Training transformer with 6 epochs...
Epoch 1/6
[1m9552/9552[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m347s[0m 36ms/step - accuracy: 0.7947 - loss: 0.4995 - val_accuracy: 0.7939 - val_loss: 0.4959
Epoch 2/6
[1m9552/9552[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1113s[0m 117ms/step - accuracy: 0.7948 - loss: 0.4864 - val_accuracy: 0.7942 - val_loss: 0.4968
Epoch 3/6
[1m9552/9552[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m523s[0m 55ms/step - accuracy: 0.7968 - loss: 0.4775 - val_accuracy: 0.7912 - val_loss: 0.5001
Epoch 4/6
[1m9552/9552[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m652s[0m 68ms/step - accuracy: 0.7997 - loss: 0.4686 - val_accuracy: 0.7881 - val_loss: 0.5042
Epoch 5/6
[1m9552/9552[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m654s[0m 68ms/step - accuracy: 0.8028 - loss: 0.4606 - val_accuracy: 0.7908 - val_loss: 0.5046
Epoch 6/6
[1m9552/9552[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m699s[0m 73ms/step - accuracy: 0.8060 - loss: 0.4526 - val_