In [None]:
import os

os.environ["KERAS_BACKEND"] = "tensorflow"

import keras
import tensorflow as tf
import numpy as np
from keras import layers

import string
import re

In [None]:
#   Cell 2: Text Preprocessing Functions
#   Custom text standardization for preprocessing. 
def custom_standardization(input_data):
    """Normalize raw text before it is tokenized.

    Three steps are applied in order: the input is lowercased, every literal
    ``<br />`` tag is replaced by a single space, and every character listed
    in ``string.punctuation`` is deleted.

    Args:
        input_data: String input accepted by ``tf.strings.lower``.

    Returns:
        The output of the final ``tf.strings.regex_replace`` call.
    """
    # Regex character class matching any one character of string.punctuation.
    punctuation_pattern = "[" + re.escape(string.punctuation) + "]"

    text = tf.strings.lower(input_data)
    # The tag is matched after lowercasing, so only the lowercase form is needed.
    text = tf.strings.regex_replace(text, "<br />", " ")
    return tf.strings.regex_replace(text, punctuation_pattern, "")

#   Function to convert text to vector
def vectorize_text(text, label):
    """Map a ``(text, label)`` pair to ``(vectorize_layer(text), label)``.

    Reads the module-level ``vectorize_layer``, which must be defined (and
    adapted) before this function is called or mapped over a dataset.

    Args:
        text: Text input; a trailing axis is added before vectorization.
        label: Passed through unchanged.

    Returns:
        A tuple of the vectorized text and the original label.
    """
    # Add a trailing axis before handing the text to the layer.
    expanded = tf.expand_dims(text, axis=-1)
    vectorized = vectorize_layer(expanded)
    return vectorized, label

In [None]:
#   Cell 3: Model Generation Function
def generate_model(epochs):
    """Build, train, and evaluate the convolutional binary text classifier.

    This function reads notebook globals instead of taking them as arguments:
    ``max_features`` and ``embedding_dim`` size the embedding layer, while
    ``train_ds``, ``val_ds`` and ``test_ds`` supply the data. All of them
    must be defined before the call.

    Args:
        epochs: Number of epochs passed to ``model.fit``.

    Returns:
        The trained ``keras.Model``. The test loss and accuracy are printed
        as a side effect.
    """
    token_ids = keras.Input(shape=(None,), dtype="int64")

    #   Embedding followed by dropout
    features = layers.Embedding(max_features, embedding_dim)(token_ids)
    features = layers.Dropout(0.5)(features)

    #   Two identically configured strided convolutions, stacked
    for _ in range(2):
        features = layers.Conv1D(
            128, 7, padding="valid", activation="relu", strides=3
        )(features)

    #   Global max pooling over the sequence axis
    features = layers.GlobalMaxPooling1D()(features)

    #   Dense hidden layer with dropout
    features = layers.Dense(128, activation="relu")(features)
    features = layers.Dropout(0.5)(features)

    #   Single sigmoid unit for binary classification
    predictions = layers.Dense(1, activation="sigmoid", name="predictions")(
        features
    )

    model = keras.Model(token_ids, predictions)
    model.compile(
        loss="binary_crossentropy",
        optimizer="adam",
        metrics=["accuracy"],
    )
    model.fit(train_ds, validation_data=val_ds, epochs=epochs)

    #   Report loss/accuracy on the held-out test set
    test_metrics = model.evaluate(test_ds)
    print("test loss, test acc:", test_metrics)

    return model

In [None]:
#   Cell 4: Main training loop
#
#   For each balance type: load the raw text datasets, fit a TextVectorization
#   layer on the training split, vectorize all splits, then train and save one
#   model per epoch count. generate_model() and vectorize_text() read the
#   globals assigned here (max_features, embedding_dim, vectorize_layer,
#   train_ds, val_ds, test_ds), so those names must not be changed.

#   Hyperparameters are identical for every run, so they are set once here
#   instead of being reassigned on every loop iteration.
batch_size = 32
max_features = 20000    #   Maximum number of tokens in the vocabulary
embedding_dim = 128     #   Dimension of the embedding vectors
sequence_length = 500   #   Maximum length of input sequences

#   Make sure the output directory exists before any training starts, so the
#   save at the end of a long run does not depend on it having been created
#   by hand.
os.makedirs("../model", exist_ok=True)

#   Train the models for different balance types in the datasets
for balance_type in ['unbalanced', 'balanced']:
    #   Load training, validation and test data from directory structure.
    #   Training and validation come from the same directory; the shared seed
    #   keeps the two subsets consistent with each other.
    raw_train_ds = keras.utils.text_dataset_from_directory(
        f"../data_formatted/{balance_type}/train",
        batch_size=batch_size,
        validation_split=0.2,
        subset="training",
        seed=1337,
    )
    raw_val_ds = keras.utils.text_dataset_from_directory(
        f"../data_formatted/{balance_type}/train",
        batch_size=batch_size,
        validation_split=0.2,
        subset="validation",
        seed=1337,
    )
    raw_test_ds = keras.utils.text_dataset_from_directory(
        f"../data_formatted/{balance_type}/test", batch_size=batch_size
    )

    #   Create a vectorization layer
    vectorize_layer = keras.layers.TextVectorization(
        standardize=custom_standardization,
        max_tokens=max_features,
        output_mode="int",
        output_sequence_length=sequence_length,
    )

    #   Text-only view of the training data (labels dropped)
    text_ds = raw_train_ds.map(lambda x, y: x)

    #   Adapt the vectorization layer to the training text only
    vectorize_layer.adapt(text_ds)

    #   Apply vectorization to all datasets
    train_ds = raw_train_ds.map(vectorize_text)
    val_ds = raw_val_ds.map(vectorize_text)
    test_ds = raw_test_ds.map(vectorize_text)

    #   Optimize by caching and prefetching
    train_ds = train_ds.cache().prefetch(buffer_size=10)
    val_ds = val_ds.cache().prefetch(buffer_size=10)
    test_ds = test_ds.cache().prefetch(buffer_size=10)

    #   Train a fresh model for each epoch count and save it
    for epochs in [3, 6, 10, 12, 20]:
        print(f'Training {balance_type} model with {epochs} epochs...')
        model = generate_model(epochs)
        model.save(f'../model/ahole-model_{balance_type}_{epochs}-epochs.keras')


Found 305636 files belonging to 2 classes.
Using 244509 files for training.
Found 305636 files belonging to 2 classes.
Using 61127 files for validation.
Found 76410 files belonging to 2 classes.
Training unbalanced model with 6 epochs...
Epoch 1/6
[1m7641/7641[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m233s[0m 30ms/step - accuracy: 0.7954 - loss: 0.4883 - val_accuracy: 0.7959 - val_loss: 0.4737
Epoch 2/6
[1m7641/7641[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m232s[0m 30ms/step - accuracy: 0.7986 - loss: 0.4642 - val_accuracy: 0.7967 - val_loss: 0.4712
Epoch 3/6
[1m7641/7641[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m243s[0m 32ms/step - accuracy: 0.8028 - loss: 0.4463 - val_accuracy: 0.7970 - val_loss: 0.4772
Epoch 4/6
[1m3211/7641[0m [32m━━━━━━━━[0m[37m━━━━━━━━━━━━[0m [1m2:24[0m 33ms/step - accuracy: 0.8075 - loss: 0.4313

KeyboardInterrupt: 

In [None]:
#   Cell 5: Model Evaluation
#
#   For each balance type, rebuild the test dataset for that balance type and
#   evaluate every saved model on it, collecting the metrics in `add`.

import csv

#   Rows for the CSV export cell; the first row is the header.
add = []
add.append(["Balance Type", "epochs", "loss", "accuracy"])

#   Same settings as the training cell; set once because they do not vary
#   between balance types.
batch_size = 32
max_features = 20000
embedding_dim = 128
sequence_length = 500

for balance_type in ['unbalanced', 'balanced']:
    #   The training split is loaded only to rebuild the vocabulary. The same
    #   split arguments and seed as in training are used so the adapted
    #   vocabulary is built from the same files.
    #   NOTE(review): this assumes adapt() reproduces the training-time
    #   vocabulary exactly -- confirm, or persist the vocabulary with the model.
    raw_train_ds = keras.utils.text_dataset_from_directory(
        f"../data_formatted/{balance_type}/train",
        batch_size=batch_size,
        validation_split=0.2,
        subset="training",
        seed=1337,
    )
    raw_test_ds = keras.utils.text_dataset_from_directory(
        f"../data_formatted/{balance_type}/test", batch_size=batch_size
    )

    vectorize_layer = keras.layers.TextVectorization(
        standardize=custom_standardization,
        max_tokens=max_features,
        output_mode="int",
        output_sequence_length=sequence_length,
    )

    text_ds = raw_train_ds.map(lambda x, y: x)
    vectorize_layer.adapt(text_ds)

    #   Build the test set from THIS balance type's raw test data. Previously
    #   this cell re-wrapped whatever `test_ds` an earlier cell had left in
    #   the kernel, so it failed on a fresh kernel and otherwise scored every
    #   model against a stale test set.
    test_ds = raw_test_ds.map(vectorize_text)
    test_ds = test_ds.cache().prefetch(buffer_size=10)

    #   Evaluate each saved model
    for epochs in [3, 6, 10, 12, 20]:
        print(f'Testing {balance_type} model with {epochs} epochs...')

        model = tf.keras.models.load_model(f'../model/ahole-model_{balance_type}_{epochs}-epochs.keras')
        #   Evaluate the model on the test dataset: results is [loss, accuracy]
        results = model.evaluate(test_ds)
        add.append([balance_type, epochs, results[0], results[1]])
        print("test loss, test acc:", results)

Found 305636 files belonging to 2 classes.
Using 244509 files for training.
Found 305636 files belonging to 2 classes.
Using 61127 files for validation.
Found 76410 files belonging to 2 classes.
Testing unbalanced model with 3 epochs...
[1m2388/2388[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 6ms/step - accuracy: 0.7964 - loss: 0.4846
test loss, test acc: [0.4845568537712097, 0.7964010238647461]
Testing unbalanced model with 6 epochs...
[1m2388/2388[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 6ms/step - accuracy: 0.7944 - loss: 0.5257
test loss, test acc: [0.5256925821304321, 0.7944117188453674]
Testing unbalanced model with 10 epochs...
[1m2388/2388[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 6ms/step - accuracy: 0.7828 - loss: 0.5828
test loss, test acc: [0.582832396030426, 0.7827901840209961]
Testing unbalanced model with 12 epochs...
[1m2388/2388[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 6ms/step - accuracy: 0.7861 - loss: 0.7414
te

In [None]:
#   Cell 6: Export Results to CSV
#
#   Appends the rows collected in `add` (built by the evaluation cell; its
#   first row is the header) to the stats file. The file is created if it is
#   missing, and the header is written only when the file is new or empty.
#   Previously results were silently dropped when the file did not exist, and
#   the header was appended again on every export.
#   Re-running this cell appends the same data rows again.

csv_path = 'model-stats_loss_accuracy.csv'
needs_header = not os.path.exists(csv_path) or os.path.getsize(csv_path) == 0

with open(csv_path, 'a', newline='') as csvfile:
    writer = csv.writer(csvfile)
    #   add[0] is the header row; skip it when the file already has content.
    writer.writerows(add if needs_header else add[1:])

#   Spot check: reload one saved model and evaluate it.
#   NOTE(review): `test_ds` is not built in this cell; it is whatever the most
#   recently executed cell left behind, which may not be the unbalanced test
#   set this model belongs to -- confirm before trusting this number.
model = tf.keras.models.load_model('../model/ahole-model_unbalanced_3-epochs.keras')
#Calculate loss/accuracy and print result
results = model.evaluate(test_ds)
print("test loss, test acc:", results)