# Assignment 2: Comparing Tokenization Methods and using Hyperparameter Optimization for IMDB Classification

### Maxim Ryabinov (U02204083)
### CAP4641: Natural Language Processing 
### Instructor: Dr. Ankur Mali 
### University of South Florida (Spring 2025)

---

# 1. Initial Setup
- Imports the necessary libraries and sets random seeds to ensure reproducibility later on.
- Defines a set of storage classes that will be helpful during model testing.
- A few tokenizer and preprocessing utility functions are also defined here.


In [4]:
import os # Suppress TensorFlow warnings.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

# Imports all libraries used within the notebook.
import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score
import random

# Sets the seeds for reproducibility.
tf.random.set_seed(1234)
np.random.seed(1234)


# --------------------
# Data Storage Classes
# --------------------
# Stores a set of hyperparameters, as well as creates random sample hyperparameters.
class Hyperparameters():    
    def __init__(self, learning_rate=None, hidden_layers=None, hidden_sizes=None, batch_size=None, optimizer=None, activation_function=None):
        self.learning_rate = learning_rate
        self.hidden_layers = hidden_layers
        self.hidden_sizes = hidden_sizes
        self.batch_size = batch_size
        self.optimizer = optimizer
        self.activation_function = activation_function
    
    def __str__(self):
        return (f"Hyperparameters:\n"
                f"Learning Rate: {self.learning_rate:.4f} | Num Layers: {self.hidden_layers} | "
                f"Hidden Sizes: {self.hidden_sizes} | Batch Size: {self.batch_size} | "
                f"Optimizer: {self.optimizer.__name__} | Activation Function: {self.activation_function.__name__}\n")
    
    @staticmethod
    def get_hyperparameter_choices():
        # Returns a dictionary of possible hyperparameter values.
        return {
            "learning_rates": [0.001, 0.0005, 0.0001],
            "hidden_layers": [1, 2, 3],
            "hidden_sizes": [128, 256, 512],
            "batch_sizes": [32, 64, 128],
            "optimizers": [tf.keras.optimizers.Adam, tf.keras.optimizers.SGD, tf.keras.optimizers.RMSprop],
            "activation_functions": [tf.nn.relu, tf.nn.tanh, tf.nn.leaky_relu]
        }
        
    def sample_random(self):        
        choices = self.get_hyperparameter_choices()
        self.learning_rate = random.choice(choices["learning_rates"])
        self.hidden_layers = random.choice(choices["hidden_layers"])
        picked_size = random.choice(choices["hidden_sizes"])
        self.hidden_sizes = [picked_size] * self.hidden_layers
        self.batch_size = random.choice(choices["batch_sizes"])
        self.optimizer = random.choice(choices["optimizers"])
        self.activation_function = random.choice(choices["activation_functions"])

# Stores the performance metrics of a model.
class PerformanceMetrics():
    def __init__(self, loss, accuracy, precision, recall):
        self.loss = loss
        self.accuracy = accuracy
        self.precision = precision
        self.recall = recall
    
    def __str__(self):
        return (f"Loss: {self.loss:.4f} | Accuracy: {self.accuracy:.4f} | "
                f"Precision: {self.precision:.4f} | Recall: {self.recall:.4f}\n")

# Stores the hyperparameters and performance metrics of a model.
class Result():
    def __init__(self, hyperparameters, performance_metrics):
        self.hyperparameters = hyperparameters
        self.performance_metrics = performance_metrics
        
    def __str__(self):
        return (f"Hyperparameters:\n"
                f"Learning Rate: {self.hyperparameters.learning_rate:.4f} | Num Layers: {self.hyperparameters.hidden_layers} | "
                f"Hidden Sizes: {self.hyperparameters.hidden_sizes} | Batch Size: {self.hyperparameters.batch_size} | "
                f"Optimizer: {self.hyperparameters.optimizer.__name__} | Activation Function: {self.hyperparameters.activation_function.__name__}\n"
                f"\nPerformance Metrics:\n"
                f"Loss: {self.performance_metrics.loss:.4f} | Accuracy: {self.performance_metrics.accuracy:.4f} | "
                f"Precision: {self.performance_metrics.precision:.4f} | Recall: {self.performance_metrics.recall:.4f}\n")


# ------------------------------------------
# Tokenizer and Preprocessing Util Functions
# ------------------------------------------
# Creates a fitted tokenizer object, can either be character or word level.
def create_tokenizer(texts, is_char_level, num_words=None):
    tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=num_words, char_level=is_char_level, lower=True)
    tokenizer.fit_on_texts(texts)
    return tokenizer

# Converts texts to a bag-of-words representation.
def texts_to_bow(tokenizer, texts):
    # Texts_to_matrix with mode 'binary' produces a fixed-length binary vector per text.
    matrix = tokenizer.texts_to_matrix(texts, mode='binary')
    return matrix # Binary bag of characters/words.

# Converts texts to a sequence of integers.
def one_hot_encode(labels, num_classes=2):
    return np.eye(num_classes)[labels]

# 2. Defining the MLP Model
This is where the main MLP model is defined for text classification on the IMDB dataset.

It is able to take a range of parameters including:
- The number of features: `size_input`
- The number of hidden layers: `num_hidden`
- A list of sizes (# of neurons) in each layer: `hidden_sizes`
- The size of the output layer: `output_size`
- Amount of training samples used: `batch_size`
- The optimizer's learning rate: `learning_rate`
- Optimizer being used: `optimizer`
- Activation Function used: `act_func`
- (Optionally) Device being used: `device`

Additionally, the model contains methods for forward and back propagation, calculating the loss, and computing the final output.

In [53]:
# --------------------
# MLP Class Definition
# --------------------
class MLP(object):
    def __init__(self, size_input, num_hidden, hidden_sizes, size_output, batch_size, optimizer, learning_rate, act_func, device=None):
        self.size_input = size_input # Number of features
        self.num_hidden = num_hidden # Number of hidden layers
        self.hidden_sizes = hidden_sizes # List of hidden layer sizes
        self.size_output = size_output # Number of classes (Binary Classification)
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.optimizer = optimizer # Optimizer class (e.g. Adam, SGD, RMSprop)
        self.act_func = act_func # Activation function to use (e.g. ReLU, Tanh, Leaky ReLU)
        self.device = device # Device to run the model on (cpu or gpu)
        
        # Holds the model's weights and biases.
        self.variables = []
        
        # Initializes the model's weights and biases.
        w = tf.Variable(tf.random.normal([self.size_input, self.hidden_sizes[0]], stddev=0.1))
        b = tf.Variable(tf.zeros([1, self.hidden_sizes[0]]))
        self.variables.append((w, b))
        
        # Initializes the weights and biases for the hidden layers.
        for i in range(1, self.num_hidden):
            w = tf.Variable(tf.random.normal([self.hidden_sizes[i-1], self.hidden_sizes[i]], stddev=0.1))
            b = tf.Variable(tf.zeros([1, self.hidden_sizes[i]]))
            self.variables.append((w, b))
        
        # Initializes the weights and biases for the output layer.
        w = tf.Variable(tf.random.normal([self.hidden_sizes[-1], self.size_output], stddev=0.1))
        b = tf.Variable(tf.zeros([1, self.size_output]))
        self.variables.append((w, b))

    # Computes the output tensor during the forward pass. X is the input tensor.
    def forward(self, X):
        if self.device is not None:
            with tf.device('gpu:0' if self.device == 'gpu' else 'cpu'):
                self.y = self.compute_output(X)
        else:
            self.y = self.compute_output(X)
        return self.y

    # Computes the loss value given the predicted and actual output tensors.
    def loss(self, y_pred, y_true):
        # Tensor of shape (batch_size, size_output)
        y_true_tf = tf.cast(y_true, dtype=tf.float32)
        y_pred_tf = tf.cast(y_pred, dtype=tf.float32)
        cce = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
        loss_x = cce(y_true_tf, y_pred_tf)
        return loss_x

    # Computes and applies gradients of the loss with respect to the variables (weights and bias).
    def backward(self, X_train, y_train):
        with tf.GradientTape() as tape:
            predicted = self.forward(X_train)
            current_loss = self.loss(predicted, y_train)
        
        trainable_vars = [var for w, b in self.variables for var in (w, b)]    
        
        grads = tape.gradient(current_loss, trainable_vars)
        
        optimizer = self.optimizer(learning_rate=self.learning_rate)
        optimizer.apply_gradients(zip(grads, trainable_vars))

    # Custom method to compute the output tensor during the forward pass.
    def compute_output(self, X):  
        # Cast X to float32
        X_tf = tf.cast(X, dtype=tf.float32)
        z = X_tf
        
        # Goes through each hidden layer and applies the weights and biases
        for w, b in self.variables:
            z = tf.matmul(z, w) + b
            
            # If the current layer is the last layer, then its the output layer.
            if (w, b) == self.variables[-1]:
                return z
            
            z = self.act_func(z)

# 3. Training the Model (Train Loop)

This function simply defines the training process for the model. It will train the model multiple times depending on the number of epochs (three by default).

The function also outputs relevant data to the training process:
- Current Epoch
- Training Loss
- Validation Loss
- Accuracy
- Precision
- Recall

In [3]:
def train_model(model, X_train, X_val, y_train, y_val, epochs = 3):
    num_batches = int(np.ceil(X_train.shape[0] / model.batch_size))

    print("\nStarting training...\n")
    for epoch in range(epochs):
        # Shuffle training data at the start of each epoch.
        indices = np.arange(X_train.shape[0])
        np.random.shuffle(indices)
        X_train = X_train[indices]
        y_train = y_train[indices]

        epoch_loss = 0
        for i in range(num_batches):
            start = i * model.batch_size
            end = min((i+1) * model.batch_size, X_train.shape[0])
            X_batch = X_train[start:end]
            y_batch = y_train[start:end]

            # Compute gradients and update weights.
            predictions = model.forward(X_batch)
            loss_value = model.loss(predictions, y_batch)
            model.backward(X_batch, y_batch)

            epoch_loss += loss_value.numpy() * (end - start)

        epoch_loss /= X_train.shape[0]

        # Evaluate on validation set.
        val_logits = model.forward(X_val)
        val_loss = model.loss(val_logits, y_val).numpy()
        val_preds = np.argmax(val_logits.numpy(), axis=1)
        true_val = np.argmax(y_val, axis=1)
        accuracy = np.mean(val_preds == true_val)
        precision = precision_score(true_val, val_preds)
        recall = recall_score(true_val, val_preds)

        print(f"Epoch {epoch+1:02d} | Training Loss: {epoch_loss:.4f} | Val Loss: {val_loss:.4f} | "
            f"Accuracy: {accuracy:.4f} | Precision: {precision:.4f} | Recall: {recall:.4f}")

# 4. Evaluation on Test Set (Test Loop)

This function takes a trained model and evaluates its performance based on a test portion of the IMDB dataset.

The function also outputs the following performance metrics:
- Loss
- Accuracy
- Precision
- Recall

In [4]:
def test_model(model, X_test, y_test):
    # -------------------------------
    # Final Evaluation on Test Set
    # -------------------------------
    print("\nEvaluating on test set...")
    test_logits = model.forward(X_test)
    test_loss = model.loss(test_logits, y_test).numpy()
    test_preds = np.argmax(test_logits.numpy(), axis=1)
    true_test = np.argmax(y_test, axis=1)
    test_accuracy = np.mean(test_preds == true_test)
    test_precision = precision_score(true_test, test_preds)
    test_recall = recall_score(true_test, test_preds)

    print(f"Test Loss: {test_loss:.4f} | Test Accuracy: {test_accuracy:.4f} | "
        f"Test Precision: {test_precision:.4f} | Test Recall: {test_recall:.4f}")
    
    return test_loss, test_accuracy, test_precision, test_recall

# 5. Loading IMDB Dataset and Tokenizing
This cell loads the IMDB dataset into the notebook.

It performs both character-level tokenization and word-level tokenization.

The output below logs data allocation for the training and test datasets, as well as information on vocabulary sizes.

In [5]:
# ---------------------------------
# Load and Prepare the IMDB Dataset
# ---------------------------------
print("Loading IMDB dataset...\n")
# Load the IMDB reviews dataset with the 'as_supervised' flag so that we get (text, label) pairs.
(ds_train, ds_test), ds_info = tfds.load('imdb_reviews',
                                           split=['train', 'test'],
                                           as_supervised=True,
                                           with_info=True)

# Convert training dataset to lists.
train_texts = []
train_labels = []
for text, label in tfds.as_numpy(ds_train):
    # Decode byte strings to utf-8 strings.
    train_texts.append(text.decode('utf-8'))
    train_labels.append(label)
train_labels = np.array(train_labels)

# Create a validation set from the training data (20% for validation).
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts, train_labels, test_size=0.2, random_state=42)

# Convert test dataset to lists.
test_texts = []
test_labels = []
for text, label in tfds.as_numpy(ds_test):
    test_texts.append(text.decode('utf-8'))
    test_labels.append(label)
test_labels = np.array(test_labels)

print("IMBD Dataset loaded:")
print(f"Train samples: {len(train_texts)}, Validation samples: {len(val_texts)}, Test samples: {len(test_texts)}\n")


# ---------------------------------------------
# Preprocessing: Tokenization and Vectorization
# ---------------------------------------------
print("Tokenizing texts...")
# Build the character-level tokenizer on the training texts.
char_tokenizer = create_tokenizer(train_texts, is_char_level=True)
print("Char tokenizer vocabulary size:", len(char_tokenizer.word_index) + 1)

word_tokenizer = create_tokenizer(train_texts, is_char_level=False, num_words=10000) # Limited to the top 10,000 common words.
print("Word tokenizer vocabulary size:", len(word_tokenizer.word_index) + 1, "\n")

print("Converting texts to bag-of-characters...")
# Convert texts to bag-of-characters representation.
X_train_char = texts_to_bow(char_tokenizer, train_texts)
X_val_char   = texts_to_bow(char_tokenizer, val_texts)
X_test_char  = texts_to_bow(char_tokenizer, test_texts)

print("Converting texts to bag-of-words...")
# Convert texts to bag-of-words representation.
X_train_word = texts_to_bow(word_tokenizer, train_texts)
X_val_word   = texts_to_bow(word_tokenizer, val_texts)
X_test_word  = texts_to_bow(word_tokenizer, test_texts)

print("Performing one-hot encoding on labels...\n")
# Convert labels to one-hot encoding.
y_train = one_hot_encode(train_labels)
y_val   = one_hot_encode(val_labels)
y_test  = one_hot_encode(test_labels)
print("Preprocessing complete.")


Loading IMDB dataset...

IMBD Dataset loaded:
Train samples: 20000, Validation samples: 5000, Test samples: 25000

Tokenizing texts...
Char tokenizer vocabulary size: 134
Word tokenizer vocabulary size: 80169 

Converting texts to bag-of-characters...
Converting texts to bag-of-words...
Performing one-hot encoding on labels...

Preprocessing complete.


# Character Tokenization Vs. Word Tokenization
This is where character-level and word-level tokenization is compared.

Both models are created, and then trained three times each for robustness. Validation metrics are also logged.

Based on the performance of both models (accuracy), word-level tokenization performs marginally better than character-level tokenization.

In [52]:
# Set seeds for reproducibility.
tf.random.set_seed(1234)
np.random.seed(1234)

# -----------
# Model Setup
# -----------
# The input size is determined by the dimension of the bag-of-characters vector.
char_size_input = X_train_char.shape[1]
word_size_input = X_train_word.shape[1]

# Parameters used for both models.
num_hidden = 3 # Number of hidden layers
hidden_sizes = [128, 64, 32] # Size of each hidden layer
batch_size = 128
optimizer = tf.keras.optimizers.Adam
learning_rate = 0.001
act_func = tf.nn.relu # Activation function used
size_output  = 2 # Binary classification

# Create the character-level and word-level models.
char_model = MLP(size_input=char_size_input,
            num_hidden=num_hidden,
            hidden_sizes=hidden_sizes,
            size_output=size_output,
            batch_size=batch_size,
            optimizer=optimizer,
            learning_rate=learning_rate,
            act_func=act_func,
            device=None)

word_model = MLP(size_input=word_size_input,
            num_hidden=num_hidden,
            hidden_sizes=hidden_sizes,
            size_output=size_output,
            batch_size=batch_size,
            optimizer=optimizer,
            learning_rate=learning_rate,
            act_func=act_func,
            device=None)

# --------------------------------
# Training and Evaluation (Testing)
# --------------------------------

print("\nTraining the character-level model...")
train_model(char_model, X_train_char, X_val_char, y_train, y_val)
test_model(char_model, X_test_char, y_test)

print("\n-----------------------------------------------------------------------------------------------------------------------")

print("\nTraining the word-level model...")
train_model(word_model, X_train_word, X_val_word, y_train, y_val)
test_model(word_model, X_test_word, y_test)


Training the character-level model...

Starting training...

Epoch 01 | Training Loss: 0.6755 | Val Loss: 0.6853 | Accuracy: 0.5690 | Precision: 0.5351 | Recall: 0.8449
Epoch 02 | Training Loss: 0.6696 | Val Loss: 0.6654 | Accuracy: 0.6040 | Precision: 0.6076 | Recall: 0.5173
Epoch 03 | Training Loss: 0.6660 | Val Loss: 0.6642 | Accuracy: 0.6020 | Precision: 0.6073 | Recall: 0.5066

Evaluating on test set...
Test Loss: 0.6656 | Test Accuracy: 0.6004 | Test Precision: 0.6217 | Test Recall: 0.5126

-----------------------------------------------------------------------------------------------------------------------

Training the word-level model...

Starting training...

Epoch 01 | Training Loss: 0.4478 | Val Loss: 0.3239 | Accuracy: 0.8728 | Precision: 0.8449 | Recall: 0.9035
Epoch 02 | Training Loss: 0.2638 | Val Loss: 0.3049 | Accuracy: 0.8806 | Precision: 0.9069 | Recall: 0.8399
Epoch 03 | Training Loss: 0.2221 | Val Loss: 0.3365 | Accuracy: 0.8764 | Precision: 0.9228 | Recall: 0.8

(np.float32(0.35722524), np.float64(0.86696), 0.9303809345092888, 0.79328)

# Hyper-parameter Optimization

This is where the process of hyper-parameter optimization occurs. The model is run through 50 different hyper-parameter configuations, each one being trained multiple times for robustness.

The results for each set of hyper-parameters is then saved into the data storage classes defined during the intial setup so they can be more easily processed.

In [8]:
# Set seeds for reproducibility.
tf.random.set_seed(1234)
np.random.seed(1234)


# -----------
# Model Setup
# -----------
# The input size is determined by the dimension of the bag-of-characters vector.
size_input = X_train_word.shape[1]
size_output  = 2 # Binary Classification
results = [] # Will contain the results of each run.

combinations = 50 # Number of random hyperparameter combinations to try.

for i in range(combinations):
    # Gets a new set of random parameters.
    params = Hyperparameters()
    params.sample_random()
    
    # Logs the parameters
    print(f"\nRun {i+1}/{combinations}")
    print(params)

    # Creates a new model with the random parameters.
    model = MLP(size_input = size_input,
                num_hidden = params.hidden_layers,
                hidden_sizes = params.hidden_sizes,
                size_output = size_output,
                batch_size = params.batch_size,
                optimizer = params.optimizer,
                learning_rate = params.learning_rate,
                act_func = params.activation_function,
                device=None)

    # Trains the model, tests it, and logs the results.
    train_model(model, X_train_word, X_val_word, y_train, y_val, epochs=3)
    pm = PerformanceMetrics(*test_model(model, X_test_word, y_test))
    results.append(Result(params, pm))

    print("\n-----------------------------------------------------------------------------------------------------------------------")
    
# Displays the best performing model (based on accuracy).
best_result = max(results, key=lambda r: r.performance_metrics.accuracy)

print("\nBest performing model:")
print(best_result)


Run 1/50
Hyperparameters:
Learning Rate: 0.0005 | Num Layers: 3 | Hidden Sizes: [256, 256, 256] | Batch Size: 32 | Optimizer: Adam | Activation Function: relu


Starting training...

Epoch 01 | Training Loss: 0.5107 | Val Loss: 0.4811 | Accuracy: 0.8502 | Precision: 0.8042 | Recall: 0.9134
Epoch 02 | Training Loss: 0.4700 | Val Loss: 0.5042 | Accuracy: 0.8736 | Precision: 0.8604 | Recall: 0.8824
Epoch 03 | Training Loss: 0.5285 | Val Loss: 0.7956 | Accuracy: 0.8618 | Precision: 0.8083 | Recall: 0.9373

Evaluating on test set...
Test Loss: 0.7898 | Test Accuracy: 0.8595 | Test Precision: 0.8157 | Test Recall: 0.9289

-----------------------------------------------------------------------------------------------------------------------

Run 2/50
Hyperparameters:
Learning Rate: 0.0001 | Num Layers: 1 | Hidden Sizes: [128] | Batch Size: 128 | Optimizer: SGD | Activation Function: relu


Starting training...

Epoch 01 | Training Loss: 0.7807 | Val Loss: 0.7620 | Accuracy: 0.5352 | Precisio

# Checking Robustness of Model with Best Parameters

So using the technique above for hyper-parameter optimization, I have found that the following are the best parameters for the word-tokenization model:
- **Learning Rate**: `0.001`
- **Number of Layers**: `1`
- **Hidden Layer Size**: `256`
- **Batch Size**: `32`
- **Optimizer**: `Adam`
- **Activation Function**: `Tanh`

In order to further confirm this however, a final robustness check can be done.

The following code below runs the model with the best found parameters multiple times, and reports the mean accuracy and standard error across all runs:

In [49]:
# Ensures seeds are now random.
tf.random.set_seed(None)
np.random.seed(None)


size_input = X_train_word.shape[1]

# Best parameters found from random search.
num_hidden = 1 # Number of hidden layers
hidden_sizes = [256] # Size of each hidden layer
batch_size = 32
optimizer = tf.keras.optimizers.Adam
learning_rate = 0.001
act_func = tf.nn.tanh # Activation function used
size_output  = 2 # Binary classification

results = []
experiment_runs = 5

for i in range(experiment_runs):
    print(f"\nExperiment Run {i+1}/{experiment_runs}")
    
    model = MLP(size_input=size_input,
                num_hidden=num_hidden,
                hidden_sizes=hidden_sizes,
                size_output=size_output,
                batch_size=batch_size,
                optimizer=optimizer,
                learning_rate=learning_rate,
                act_func=act_func,
                device=None)
    
    #Trains the model, tests it, and logs the results.
    train_model(model, X_train_word, X_val_word, y_train, y_val, epochs=3)
    pm = PerformanceMetrics(*test_model(model, X_test_word, y_test))
    results.append(Result(params, pm))

    print("\n-----------------------------------------------------------------------------------------------------------------------")

# Logs a summary of the accuracy from each run.
print("\nSummary of Experiment Runs:")

accuracies = [r.performance_metrics.accuracy for r in results]
for i, result in enumerate(results):
    print(f"Run {i+1}: Accuracy = {accuracies[i]:.4f}")

# Computes mean accuracy and standard error.
mean_accuracy = np.mean(accuracies)
standard_error = np.std(accuracies) / np.sqrt(len(accuracies))

print(f"\nMean Accuracy: {mean_accuracy:.4f}")
print(f"Standard Error: {standard_error:.4f}")


Experiment Run 1/5

Starting training...

Epoch 01 | Training Loss: 0.3613 | Val Loss: 0.2988 | Accuracy: 0.8766 | Precision: 0.8944 | Recall: 0.8453
Epoch 02 | Training Loss: 0.2540 | Val Loss: 0.2844 | Accuracy: 0.8860 | Precision: 0.8768 | Recall: 0.8899
Epoch 03 | Training Loss: 0.2398 | Val Loss: 0.2890 | Accuracy: 0.8816 | Precision: 0.8494 | Recall: 0.9187

Evaluating on test set...
Test Loss: 0.2832 | Test Accuracy: 0.8840 | Test Precision: 0.8645 | Test Recall: 0.9106

-----------------------------------------------------------------------------------------------------------------------

Experiment Run 2/5

Starting training...

Epoch 01 | Training Loss: 0.3575 | Val Loss: 0.2950 | Accuracy: 0.8802 | Precision: 0.8591 | Recall: 0.9006
Epoch 02 | Training Loss: 0.2551 | Val Loss: 0.2900 | Accuracy: 0.8788 | Precision: 0.8451 | Recall: 0.9183
Epoch 03 | Training Loss: 0.2411 | Val Loss: 0.2801 | Accuracy: 0.8858 | Precision: 0.8840 | Recall: 0.8800

Evaluating on test set...
Te

# Defining the Random Model

In [54]:
# ---------------------------
# Random MLP Class Definition
# ---------------------------
class MLP_rnd(object):
    def __init__(self, size_input, num_hidden, hidden_sizes, size_output, batch_size, optimizer, learning_rate, act_func, device=None):
        self.size_input = size_input # Number of features
        self.num_hidden = num_hidden # Number of hidden layers
        self.hidden_sizes = hidden_sizes # List of hidden layer sizes
        self.size_output = size_output # Number of classes (Binary Classification)
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.optimizer = optimizer # Optimizer class (e.g. Adam, SGD, RMSprop)
        self.act_func = act_func # Activation function to use (e.g. ReLU, Tanh, Leaky ReLU)
        self.device = device # Device to run the model on (cpu or gpu)
        
        # Holds the model's weights and biases.
        self.variables = []
        
        # Initializes the model's weights and biases.
        w = tf.Variable(tf.random.normal([self.size_input, self.hidden_sizes[0]], stddev=0.1))
        b = tf.Variable(tf.zeros([1, self.hidden_sizes[0]]))
        self.variables.append((w, b))
        
        # Initializes the weights and biases for the hidden layers.
        for i in range(1, self.num_hidden):
            w = tf.Variable(tf.random.normal([self.hidden_sizes[i-1], self.hidden_sizes[i]], stddev=0.1))
            b = tf.Variable(tf.zeros([1, self.hidden_sizes[i]]))
            self.variables.append((w, b))
        
        # Initializes the weights and biases for the output layer.
        w = tf.Variable(tf.random.normal([self.hidden_sizes[-1], self.size_output], stddev=0.1))
        b = tf.Variable(tf.zeros([1, self.size_output]))
        self.variables.append((w, b))
        
        # Output layer weights and bias used specifically for back propagation.
        self.output_variables = [w, b]

    # Computes the output tensor during the forward pass. X is the input tensor.
    def forward(self, X):
        if self.device is not None:
            with tf.device('gpu:0' if self.device == 'gpu' else 'cpu'):
                self.y = self.compute_output(X)
        else:
            self.y = self.compute_output(X)
        return self.y

    # Computes the loss value given the predicted and actual output tensors.
    def loss(self, y_pred, y_true):
        # Tensor of shape (batch_size, size_output)
        y_true_tf = tf.cast(y_true, dtype=tf.float32)
        y_pred_tf = tf.cast(y_pred, dtype=tf.float32)
        cce = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
        loss_x = cce(y_true_tf, y_pred_tf)
        return loss_x

    # Computes and applies gradients of the loss with respect to the output variables (weights and bias).
    def backward(self, X_train, y_train):
        with tf.GradientTape() as tape:
            predicted = self.forward(X_train)
            current_loss = self.loss(predicted, y_train)  
        
        grads = tape.gradient(current_loss, self.output_variables)
        
        optimizer = self.optimizer(learning_rate=self.learning_rate)
        optimizer.apply_gradients(zip(grads, self.output_variables))

    # Custom method to compute the output tensor during the forward pass.
    def compute_output(self, X):  
        # Cast X to float32
        X_tf = tf.cast(X, dtype=tf.float32)
        z = X_tf
        
        # Goes through each hidden layer and applies the weights and biases
        for w, b in self.variables:
            z = tf.matmul(z, w) + b
            
            # If the current layer is the last layer, then its the output layer.
            if (w, b) == self.variables[-1]:
                return z
            
            z = self.act_func(z)

# Testing the Random Model with Best Parameters

Below, I now run the same set of experiements I did on best model, but with the random model this time (using the best parameters found through hyper-parameter optimization):

In [55]:
# Ensures seeds are now random again.
tf.random.set_seed(None)
np.random.seed(None)


size_input = X_train_word.shape[1]

# Best parameters found from random search.
num_hidden = 1 # Number of hidden layers
hidden_sizes = [256] # Size of each hidden layer
batch_size = 32
optimizer = tf.keras.optimizers.Adam
learning_rate = 0.001
act_func = tf.nn.tanh # Activation function used
size_output  = 2 # Binary classification

results = []
experiment_runs = 5

for i in range(experiment_runs):
    print(f"\nExperiment Run {i+1}/{experiment_runs}")
    
    model = MLP_rnd(size_input=size_input,
                num_hidden=num_hidden,
                hidden_sizes=hidden_sizes,
                size_output=size_output,
                batch_size=batch_size,
                optimizer=optimizer,
                learning_rate=learning_rate,
                act_func=act_func,
                device=None)
    
    #Trains the model, tests it, and logs the results.
    train_model(model, X_train_word, X_val_word, y_train, y_val, epochs=3)
    pm = PerformanceMetrics(*test_model(model, X_test_word, y_test))
    results.append(Result(params, pm))

    print("\n-----------------------------------------------------------------------------------------------------------------------")

# Logs a summary of the accuracy from each run.
print("\nSummary of Experiment Runs:")

accuracies = [r.performance_metrics.accuracy for r in results]
for i, result in enumerate(results):
    print(f"Run {i+1}: Accuracy = {accuracies[i]:.4f}")

# Computes mean accuracy and standard error.
mean_accuracy = np.mean(accuracies)
standard_error = np.std(accuracies) / np.sqrt(len(accuracies))

print(f"\nMean Accuracy: {mean_accuracy:.4f}")
print(f"Standard Error: {standard_error:.4f}")


Experiment Run 1/5

Starting training...

Epoch 01 | Training Loss: 0.6670 | Val Loss: 0.5726 | Accuracy: 0.7024 | Precision: 0.6854 | Recall: 0.7137
Epoch 02 | Training Loss: 0.5539 | Val Loss: 0.5541 | Accuracy: 0.7204 | Precision: 0.7137 | Recall: 0.7067
Epoch 03 | Training Loss: 0.5447 | Val Loss: 0.5540 | Accuracy: 0.7130 | Precision: 0.7166 | Recall: 0.6749

Evaluating on test set...
Test Loss: 0.5588 | Test Accuracy: 0.7168 | Test Precision: 0.7391 | Test Recall: 0.6702

-----------------------------------------------------------------------------------------------------------------------

Experiment Run 2/5

Starting training...

Epoch 01 | Training Loss: 0.6439 | Val Loss: 0.5671 | Accuracy: 0.7094 | Precision: 0.6816 | Recall: 0.7517
Epoch 02 | Training Loss: 0.5474 | Val Loss: 0.5501 | Accuracy: 0.7238 | Precision: 0.7272 | Recall: 0.6885
Epoch 03 | Training Loss: 0.5390 | Val Loss: 0.5484 | Accuracy: 0.7228 | Precision: 0.7021 | Recall: 0.7438

Evaluating on test set...
Te