In [1]:
import os

os.environ["KERAS_BACKEND"] = "tensorflow"

import keras
import tensorflow as tf
import numpy as np
from keras import layers
from keras import ops

import string
import re

import pandas as pd
import random

In [2]:
vocab_size = 20000  # Only consider the top 20k words
maxlen = 200  # Only consider the first 200 words of each movie review

In [3]:
# chatgpt suggested function, see saved chat
def dataset_to_numpy(dataset):
    xs = []
    ys = []

    for x_batch, y_batch in dataset:
        xs.extend(x_batch.numpy())
        ys.extend(y_batch.numpy())

    return (
        np.array(xs, dtype=object),
        np.array(ys)
    )

In [None]:
#   Text Preprocessing Functions
#   Custom text standardization for preprocessing. 

def custom_standardization(input_data):
    lowercase = tf.strings.lower(input_data)
    stripped_html = tf.strings.regex_replace(lowercase, "<br />", " ")
    return tf.strings.regex_replace(
        stripped_html, f"[{re.escape(string.punctuation)}]", ""
    )

def vectorize_text(text, label):
    text = tf.expand_dims(text, -1)
    return vectorize_layer(text), label

In [None]:
@keras.saving.register_keras_serializable()
class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        #   Multi-head attention layer
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        #   Feed forward network layer
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        #   Layer normalization layers
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        #   Dropout layers
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output)
        return self.layernorm2(out1 + ffn_output)

In [None]:
# Token and positional embedding class
@keras.saving.register_keras_serializable()
class TokenAndPositionEmbedding(layers.Layer):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.maxlen = maxlen
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        # maxlen = ops.shape(x)[-1]
        positions = ops.arange(start=0, stop=maxlen, step=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

In [7]:
test_set_size = 5000

In [None]:
# Functions to manually predict given a specific text

def manual_predict(man_text, model):
    try: 
        vec_text = vectorize_layer(tf.constant([man_text]))
        return model.predict(vec_text, verbose=0)
    except:
        print(f'Prediction failed on {man_text}')
        return None

def manual_odds(man_text, model):
    result = manual_predict(man_text, model)
    if result is None:
        return None
    else:
        return result.tolist()[0][0]

# to be filled in with our appropriate labels
def manual_bin(man_text):
    if manual_odds(man_text) >= 0.5:
        return 'positive'
    else:
        return 'negative'

In [None]:
# test the model on test set that has been loaded into the df dataframe
def test_model(test_set, model):
    df['predicted_odds'] = df['raw_text'].apply(lambda text: manual_odds(text, model))
    df['prediction'] = df['predicted_odds'].apply(lambda x: 'TA' if x >= 0.5 else 'NTA')
    df['is_correct'] = df['prediction'] == df['correct']
    return len(df[df['is_correct']]) / len(df)

In [10]:
def read_file(file):
    try:
        with open(file, 'r') as in_file:
            text = in_file.read()
        return text
    except:
        return None

In [None]:
# prepare the vectorization step

batch_size = 32
raw_train_ds, raw_test_ds = keras.utils.text_dataset_from_directory(
        "../data_transformer/unbalanced",
        batch_size=batch_size,
        seed=1337,
        subset="both",
        validation_split=0.2,
        labels="inferred"
    )

Found 382046 files belonging to 2 classes.
Using 305637 files for training.
Using 76409 files for validation.


In [12]:
max_features = 20000
embedding_dim = 128
sequence_length = 200

vectorize_layer = keras.layers.TextVectorization(
    standardize=custom_standardization,
    max_tokens=max_features,
    output_mode="int",
    output_sequence_length=sequence_length,
)

text_ds = raw_train_ds.map(lambda x, y: x)

vectorize_layer.adapt(text_ds)

train_ds = raw_train_ds.map(vectorize_text)
test_ds = raw_test_ds.map(vectorize_text)

2025-12-16 10:06:59.049582: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [13]:
train_x, train_y = dataset_to_numpy(train_ds)

2025-12-16 10:07:12.595862: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [14]:
test_x, test_y = dataset_to_numpy(test_ds)

In [15]:
vocab_size = 20000  # Only consider the top 20k words
maxlen = 200  # Only consider the first 200 words of each movie review

train_x = keras.utils.pad_sequences(train_x, maxlen=maxlen)
test_x = keras.utils.pad_sequences(test_x, maxlen=maxlen)

In [16]:
embed_dim = 32  # Embedding size for each token
num_heads = 2  # Number of attention heads
ff_dim = 32  # Hidden layer size in feed forward network inside transformer

inputs = layers.Input(shape=(maxlen,))
embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
x = embedding_layer(inputs)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
x = transformer_block(x)
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dropout(0.1)(x)
x = layers.Dense(20, activation="relu")(x)
x = layers.Dropout(0.1)(x)
outputs = layers.Dense(2, activation="softmax")(x)

In [None]:
# load and clean the test data

pos_files = [f'../data_formatted/balanced/test/pos/{file}' for file in os.listdir('../data_formatted/balanced/test/pos')]
neg_files = [f'../data_formatted/balanced/test/neg/{file}' for file in os.listdir('../data_formatted/balanced/test/neg')]

In [18]:
neg_df = pd.DataFrame({
    'file': neg_files,
    'correct': 'NTA'
})

pos_df = pd.DataFrame({
    'file': pos_files,
    'correct': 'TA'
})

df = pd.concat([neg_df, pos_df]).reset_index(drop=True)

In [19]:
df['raw_text'] = [read_file(file) for file in df['file']]

In [20]:
# quick validation
# remove for full validation
sample_rows = random.sample(list(df.index), test_set_size)
df = df.iloc[sample_rows]

In [21]:
models = [f'../model/{model}' for model in os.listdir('../model') if 'transformer' in model]

In [22]:
model_stats = pd.DataFrame({
    'model': models,
    'accuracy_rate': [0] * len(models)
})

In [None]:
for i in range(0, len(model_stats)):
    curr_model = model_stats['model'][i]

    model = keras.models.load_model(curr_model)
    accuracy_rate = test_model(df, model)
    print(f'Model {curr_model} has accuracy {accuracy_rate}')
    model_stats.loc[i, 'accuracy_rate'] = accuracy_rate



Prediction failed on None
Model ../model/transformer_balanced_3-epochs.keras has accuracy 0.49598700554175423


  model_stats.loc[i, 'accuracy_rate'] = accuracy_rate


Prediction failed on For context I was in a band, (I was given the cold shoulder because of this and left) it consisted of three guys I didn't know to well, me and another girl who I knew from school bands we had been in together. She was a singer and they wanted something that would make them stand out (I play the flute), so they asked me to kinda just see if I fit. I said sure but a classic sting instrument would be easier to incorporate with a wind.  Long story short I'm in and it's okay we all get along. (Music wasn't great but not the issue.) One day the singer I'll call T came up to me and said, "you know it makes me feel really insecure about my chest when you wear things like that." I wear corsets or corset like tops or a regular top but with my waist trainer which honestly looks great, lots of chains or tight skirts and pants. Its just how I dress normally. Now I know I have very large beasts and I'm the opposite of a petite woman so the waist training helped me start to feel 

In [35]:
model_stats.sort_values('accuracy_rate', ascending=False)

Unnamed: 0,model,accuracy_rate
0,../model/transformer_balanced_3-epochs.keras,0.515
8,../model/transformer_balanced_6-epochs.keras,0.504
5,../model/transformer_balanced_12-epochs.keras,0.5
2,../model/transformer_balanced_10-epochs.keras,0.492
1,../model/transformer_unbalanced_6-epochs.keras,0.483
4,../model/transformer_balanced_20-epochs.keras,0.483
6,../model/transformer_unbalanced_3-epochs.keras,0.476
3,../model/transformer_unbalanced_10-epochs.keras,0.466
7,../model/transformer_unbalanced_12-epochs.keras,0.465


In [36]:
model_stats.to_csv(f'../validation-stats/transformer-stats_test-set-{test_set_size}.csv', index=False)