# Assignment 01: [creative title name]

### Maxim Ryabinov (U02204083)
### CAP4641: Natural Language Processing 
### Instructor: Dr. Ankur Mali 
### University of South Florida (Spring 2025)

---

# 1. Initial Setup
- Sets random seeds to ensure reproducibility.
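- Creates the helper functions used later (tokenizer, bag-of-words vectorization, one-hot encoding, hyper-parameter sampling, and logging).
- Loads the IMDB reviews dataset and splits it into training, validation, and test sets.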


In [16]:
import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score
import random
import datetime


# Seed the TensorFlow, NumPy, and Python standard-library random number
# generators so that repeated runs draw the same random values.
# NOTE(review): the Python `random` seed (123) differs from the TensorFlow /
# NumPy seed (1234) — confirm this is intentional.
tf.random.set_seed(1234)
np.random.seed(1234)
random.seed(123)

# -------------------------------
# Tokenizer (Character- or Word-Level) and Preprocessing Functions
# -------------------------------
def create_tokenizer(texts, is_char_level, num_words=None):
    """
    Build a Keras Tokenizer and fit it on the given texts.

    Args:
        texts (list of str): Texts used to fit the tokenizer vocabulary.
        is_char_level (bool): Passed to the Tokenizer as ``char_level``;
            True treats every character as a token, False tokenizes by word.
        num_words (int or None): Passed to the Tokenizer as ``num_words``
            (maximum number of tokens to keep).

    Returns:
        tokenizer: The Tokenizer instance after ``fit_on_texts(texts)``.
    """
    # Text is lower-cased before tokenization (lower=True).
    fitted = tf.keras.preprocessing.text.Tokenizer(
        lower=True,
        char_level=is_char_level,
        num_words=num_words,
    )
    fitted.fit_on_texts(texts)
    return fitted

def texts_to_bow(tokenizer, texts):
    """
    Vectorize texts as binary bag-of-tokens rows.

    Args:
        tokenizer: A fitted Tokenizer exposing ``texts_to_matrix``.
        texts (list of str): Texts to vectorize.

    Returns:
        Whatever ``tokenizer.texts_to_matrix(texts, mode='binary')`` returns:
        one fixed-length binary vector per text, marking which tokens occur.
    """
    # 'binary' mode records presence/absence of each token, not its count.
    return tokenizer.texts_to_matrix(texts, mode='binary')

def one_hot_encode(labels, num_classes=2):
    """
    Turn integer class labels into one-hot rows.

    Args:
        labels: Integer array of class indices, each in [0, num_classes).
        num_classes (int): Length of each one-hot vector.

    Returns:
        Float array with one row per label; row k has a 1 in column labels[k].
    """
    # Row i of the identity matrix is the one-hot vector for class i, so
    # indexing it with the label array picks the right row for every sample.
    identity = np.eye(num_classes)
    return identity[labels]

# Hyper-Parameter Random Search
def sample_hyperparameters():
    """
    Draw one random hyper-parameter configuration for the MLP.

    Returns:
        Tuple ``(learning_rate, num_layers, hidden_sizes, batch_size,
        optimizer, act_func)`` where ``hidden_sizes`` is a list repeating one
        sampled width ``num_layers`` times, ``optimizer`` is an optimizer
        class (not an instance), and ``act_func`` is an activation function.
    """
    # Candidate values for each hyper-parameter.
    lr_options = (0.001, 0.0005, 0.0001)
    depth_options = (1, 2, 3)
    width_options = (128, 256, 512)
    batch_options = (32, 64, 128)
    optimizer_options = (tf.keras.optimizers.Adam, tf.keras.optimizers.SGD, tf.keras.optimizers.RMSprop)
    activation_options = (tf.nn.relu, tf.nn.tanh, tf.nn.leaky_relu)

    # The draws happen in a fixed order so a given `random` seed always
    # yields the same configuration.
    learning_rate = random.choice(lr_options)
    num_layers = random.choice(depth_options)
    width = random.choice(width_options)
    batch_size = random.choice(batch_options)
    optimizer = random.choice(optimizer_options)
    act_func = random.choice(activation_options)

    # Every hidden layer shares the single sampled width.
    hidden_sizes = [width] * num_layers

    return learning_rate, num_layers, hidden_sizes, batch_size, optimizer, act_func
    

# -------------------------------
# For logging purposes
# -------------------------------
def log_training_info(epoch, epoch_loss, val_loss, accuracy, precision, recall, log_filename="training_logs.txt"):
    """
    Append one timestamped line of per-epoch metrics to a log file.

    Args:
        epoch (int): Zero-based epoch index; written one-based and
            zero-padded to two digits.
        epoch_loss, val_loss, accuracy, precision, recall (float): Metrics
            written with four decimal places.
        log_filename (str): File to append to.
    """
    stamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # Assemble the metric fields first, then wrap them with the timestamp.
    fields = " | ".join([
        f"Epoch {epoch+1:02d}",
        f"Training Loss: {epoch_loss:.4f}",
        f"Val Loss: {val_loss:.4f}",
        f"Accuracy: {accuracy:.4f}",
        f"Precision: {precision:.4f}",
        f"Recall: {recall:.4f}",
    ])
    line = f"\n[{stamp}] - {fields}\n"

    # Append so earlier epochs and runs are preserved.
    with open(log_filename, 'a') as sink:
        sink.write(line)

def log_final_evaluation(test_loss, test_accuracy, test_precision, test_recall, log_filename="training_logs.txt"):
    """
    Append the final test-set metrics and a separator line to a log file.

    Args:
        test_loss, test_accuracy, test_precision, test_recall (float):
            Metrics written with four decimal places.
        log_filename (str): File to append to.
    """
    # The entry is not timestamped; the original computed a timestamp here
    # but never wrote it, so that dead local has been removed.
    final_log_entry = (f"\nFinal Evaluation on Test Set:\n"
                       f"Test Loss: {test_loss:.4f} | Test Accuracy: {test_accuracy:.4f} | "
                       f"Test Precision: {test_precision:.4f} | Test Recall: {test_recall:.4f}\n"
                       f"\n=======================================================================================================================\n")

    # Append so the per-epoch entries written earlier are preserved; the
    # trailing separator line marks the end of this run in the file.
    with open(log_filename, 'a') as log_file:
        log_file.write(final_log_entry)



# -------------------------------
# Load and Prepare the IMDB Dataset
# -------------------------------
print("Loading IMDB dataset...")
# Load the 'train' and 'test' splits of the IMDB reviews dataset. With
# as_supervised=True each example is a (text, label) pair; with_info=True
# additionally returns the dataset metadata as ds_info.
(ds_train, ds_test), ds_info = tfds.load('imdb_reviews',
                                           split=['train', 'test'],
                                           as_supervised=True,
                                           with_info=True)

# Materialize the training split into a Python list of strings and a NumPy
# array of labels.
train_texts = []
train_labels = []
for text, label in tfds.as_numpy(ds_train):
    # tfds.as_numpy yields the review text as bytes; decode it to str.
    train_texts.append(text.decode('utf-8'))
    train_labels.append(label)
train_labels = np.array(train_labels)

# Hold out 20% of the training data as a validation set. random_state=42
# makes the split reproducible.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts, train_labels, test_size=0.2, random_state=42)

# Materialize the test split the same way as the training split.
test_texts = []
test_labels = []
for text, label in tfds.as_numpy(ds_test):
    test_texts.append(text.decode('utf-8'))
    test_labels.append(label)
test_labels = np.array(test_labels)

# Report the size of each split.
print(f"Train samples: {len(train_texts)}, Validation samples: {len(val_texts)}, Test samples: {len(test_texts)}")









Loading IMDB dataset...
Train samples: 20000, Validation samples: 5000, Test samples: 25000


# 2. Tokenizing

In [6]:
# -------------------------------
# Preprocessing: Tokenization and Vectorization
# -------------------------------
# Fit a character-level tokenizer on the training texts only, so the
# vocabulary does not depend on validation or test data.
char_tokenizer = create_tokenizer(train_texts, True)
# The printed size is len(word_index) + 1.
print("Char tokenizer vocabulary size:", len(char_tokenizer.word_index) + 1)

# Fit a word-level tokenizer on the same texts. In this cell it is only used
# to report its vocabulary size; the features below come from char_tokenizer.
word_tokenizer = create_tokenizer(train_texts, False)
print("Word tokenizer vocabulary size:", len(word_tokenizer.word_index) + 1)


# Convert every split to a binary bag-of-characters matrix using the
# character-level tokenizer.
X_train = texts_to_bow(char_tokenizer, train_texts)
X_val   = texts_to_bow(char_tokenizer, val_texts)
X_test  = texts_to_bow(char_tokenizer, test_texts)

# Convert the integer labels to one-hot vectors (two classes by default).
y_train = one_hot_encode(train_labels)
y_val   = one_hot_encode(val_labels)
y_test  = one_hot_encode(test_labels)


Char tokenizer vocabulary size: 134
Word tokenizer vocabulary size: 80169


# 3. Defining the MLP Model

In [7]:
# -------------------------------
# Original MLP Class Definition
# -------------------------------
class MLP(object):
    """
    Multi-layer perceptron with a configurable number of hidden layers.

    Weights and biases are plain ``tf.Variable`` objects; gradients are
    computed with ``tf.GradientTape`` and applied by a Keras optimizer.
    The network outputs raw logits (no softmax).
    """

    def __init__(self, size_input, num_hidden, hidden_sizes, size_output, batch_size, optimizer, learning_rate, act_func, device=None):
        """
        size_input: int, size of input layer
        num_hidden: int, number of hidden layers to build
        hidden_sizes: list of int, width of each hidden layer; expected to
            hold one entry per hidden layer (the output layer is sized from
            hidden_sizes[-1])
        size_output: int, size of output layer
        batch_size: int, stored on the model for the training loop to read
        optimizer: optimizer class (not an instance), called as
            optimizer(learning_rate=...)
        learning_rate: float, learning rate passed to the optimizer
        act_func: callable applied after every hidden layer
        device: str or None; 'gpu' runs the forward pass on 'gpu:0', any
            other non-None value runs it on 'cpu', None leaves placement
            to TensorFlow.
        """
        self.size_input = size_input
        self.num_hidden = num_hidden
        self.hidden_sizes = hidden_sizes
        self.size_output = size_output
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.optimizer = optimizer
        self.device = device
        self.act_func = act_func

        # Optimizer instance, created on the first backward() call and then
        # reused so stateful optimizers keep their accumulated statistics.
        self._optimizer_instance = None

        # (weight, bias) pairs in forward order. Despite the name, the LAST
        # entry is the output layer.
        self.hidden_layers = []

        # First hidden layer: input -> hidden_sizes[0].
        self.hidden_layers.append(self._make_layer(self.size_input, self.hidden_sizes[0]))

        # Remaining hidden layers: hidden_sizes[i-1] -> hidden_sizes[i].
        for i in range(1, self.num_hidden):
            self.hidden_layers.append(self._make_layer(self.hidden_sizes[i-1], self.hidden_sizes[i]))

        # Output layer: last hidden width -> size_output.
        self.hidden_layers.append(self._make_layer(self.hidden_sizes[-1], self.size_output))

    @staticmethod
    def _make_layer(fan_in, fan_out):
        """Create one (weight, bias) pair: N(0, 0.1) weights and zero biases."""
        w = tf.Variable(tf.random.normal([fan_in, fan_out], stddev=0.1))
        b = tf.Variable(tf.zeros([1, fan_out]))
        return w, b

    def forward(self, X):
        """
        Forward pass.
        X: Tensor, inputs.
        Returns the output logits; they are also stored on self.y.
        """
        if self.device is not None:
            with tf.device('gpu:0' if self.device == 'gpu' else 'cpu'):
                self.y = self.compute_output(X)
        else:
            self.y = self.compute_output(X)
        return self.y

    def loss(self, y_pred, y_true):
        """
        Computes the categorical cross-entropy between predicted logits and
        one-hot targets.
        y_pred: Tensor of shape (batch_size, size_output), raw logits
        y_true: Tensor of shape (batch_size, size_output), one-hot targets
        """
        y_true_tf = tf.cast(y_true, dtype=tf.float32)
        y_pred_tf = tf.cast(y_pred, dtype=tf.float32)
        # from_logits=True: the softmax is applied inside the loss.
        cce = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
        loss_x = cce(y_true_tf, y_pred_tf)
        return loss_x

    def backward(self, X_train, y_train):
        """
        Backward pass: compute the gradients of the loss with respect to all
        weights and biases and apply one optimizer step.

        Returns the loss tensor computed on this batch before the update.
        """
        with tf.GradientTape() as tape:
            predicted = self.forward(X_train)
            current_loss = self.loss(predicted, y_train)

        # Flatten the (w, b) pairs into [w0, b0, w1, b1, ...].
        trainable_vars = [var for w, b in self.hidden_layers for var in (w, b)]

        grads = tape.gradient(current_loss, trainable_vars)

        # Create the optimizer once and reuse it. Building a fresh optimizer
        # on every step would discard the running statistics that Adam and
        # RMSprop accumulate between steps.
        if self._optimizer_instance is None:
            self._optimizer_instance = self.optimizer(learning_rate=self.learning_rate)
        self._optimizer_instance.apply_gradients(zip(grads, trainable_vars))

        return current_loss

    def compute_output(self, X):
        """
        Custom method to compute the output tensor during the forward pass.

        Applies act_func after every hidden layer and returns the raw logits
        of the output layer.
        """
        # Cast X to float32
        z = tf.cast(X, dtype=tf.float32)

        # Split by position rather than comparing (w, b) tuples with ==,
        # which can fall back to element-wise tensor comparison.
        *hidden, (w_out, b_out) = self.hidden_layers

        for w, b in hidden:
            z = self.act_func(tf.matmul(z, w) + b)

        # Output layer (logits): no activation.
        return tf.matmul(z, w_out) + b_out

# 4. Running the Model

In [17]:


# -------------------------------
# Model Setup
# -------------------------------
# The input size is determined by the dimension of the bag-of-characters vector.
size_input = X_train.shape[1]

# Draw one random hyper-parameter configuration and build the model from it.
learning_rate, num_layers, hidden_sizes, batch_size, optimizer, act_func = sample_hyperparameters()
size_output  = 2

print(learning_rate, num_layers, hidden_sizes, batch_size, optimizer, act_func, sep="\n")

model = MLP(size_input=size_input,
            num_hidden=num_layers,
            hidden_sizes=hidden_sizes,
            size_output=size_output,
            batch_size=batch_size,
            optimizer=optimizer,
            learning_rate=learning_rate,
            act_func=act_func,
            device=None)

# -------------------------------
# Training Parameters and Loop
# -------------------------------
# These values do not change between runs, so compute them once.
epochs = 3
num_samples = X_train.shape[0]
num_batches = int(np.ceil(num_samples / model.batch_size))

# NOTE(review): the model and hyper-parameters are created once above, so
# every pass of this outer loop keeps training the SAME model for another
# `epochs` epochs and re-evaluates it on the test set. If each run was meant
# to be an independent trial, the sampling and model construction need to
# move inside the loop — confirm the intent.
for run in range(3):
    print("\nStarting training...\n")
    for epoch in range(epochs):
        # Shuffle training data at the start of each epoch.
        indices = np.arange(num_samples)
        np.random.shuffle(indices)
        X_train = X_train[indices]
        y_train = y_train[indices]

        epoch_loss = 0
        # Distinct loop names: the batch loop previously reused `i` and
        # shadowed the outer run index.
        for batch_idx in range(num_batches):
            start = batch_idx * model.batch_size
            end = min((batch_idx + 1) * model.batch_size, num_samples)
            X_batch = X_train[start:end]
            y_batch = y_train[start:end]

            # Record the loss on this batch before the weights are updated,
            # then take one optimizer step.
            predictions = model.forward(X_batch)
            loss_value = model.loss(predictions, y_batch)
            model.backward(X_batch, y_batch)

            # Weight by batch size so the final (possibly smaller) batch
            # contributes proportionally to the epoch average.
            epoch_loss += loss_value.numpy() * (end - start)

        epoch_loss /= num_samples

        # Evaluate on validation set.
        val_logits = model.forward(X_val)
        val_loss = model.loss(val_logits, y_val).numpy()
        val_preds = np.argmax(val_logits.numpy(), axis=1)
        true_val = np.argmax(y_val, axis=1)
        accuracy = np.mean(val_preds == true_val)
        precision = precision_score(true_val, val_preds)
        recall = recall_score(true_val, val_preds)

        print(f"Epoch {epoch+1:02d} | Training Loss: {epoch_loss:.4f} | Val Loss: {val_loss:.4f} | "
            f"Accuracy: {accuracy:.4f} | Precision: {precision:.4f} | Recall: {recall:.4f}")

        log_training_info(epoch, epoch_loss, val_loss, accuracy, precision, recall)

    # -------------------------------
    # Final Evaluation on Test Set
    # -------------------------------
    print("\nEvaluating on test set...")
    test_logits = model.forward(X_test)
    test_loss = model.loss(test_logits, y_test).numpy()
    test_preds = np.argmax(test_logits.numpy(), axis=1)
    true_test = np.argmax(y_test, axis=1)
    test_accuracy = np.mean(test_preds == true_test)
    test_precision = precision_score(true_test, test_preds)
    test_recall = recall_score(true_test, test_preds)

    print(f"Test Loss: {test_loss:.4f} | Test Accuracy: {test_accuracy:.4f} | "
        f"Test Precision: {test_precision:.4f} | Test Recall: {test_recall:.4f}")

    log_final_evaluation(test_loss, test_accuracy, test_precision, test_recall)

0.001
2
[128, 128]
64
<class 'keras.src.optimizers.sgd.SGD'>
<function relu at 0x707dd859b910>

Starting training...

Epoch 01 | Training Loss: 0.6992 | Val Loss: 0.6939 | Accuracy: 0.5052 | Precision: 0.4888 | Recall: 0.4517
Epoch 02 | Training Loss: 0.6929 | Val Loss: 0.6922 | Accuracy: 0.5160 | Precision: 0.5008 | Recall: 0.5281
Epoch 03 | Training Loss: 0.6910 | Val Loss: 0.6906 | Accuracy: 0.5294 | Precision: 0.5128 | Recall: 0.5862

Evaluating on test set...
Test Loss: 0.6906 | Test Accuracy: 0.5322 | Test Precision: 0.5295 | Test Recall: 0.5778

Starting training...

Epoch 01 | Training Loss: 0.6892 | Val Loss: 0.6885 | Accuracy: 0.5388 | Precision: 0.5259 | Recall: 0.4934
Epoch 02 | Training Loss: 0.6876 | Val Loss: 0.6868 | Accuracy: 0.5486 | Precision: 0.5368 | Recall: 0.5025
Epoch 03 | Training Loss: 0.6860 | Val Loss: 0.6858 | Accuracy: 0.5620 | Precision: 0.5421 | Recall: 0.6221

Evaluating on test set...
Test Loss: 0.6857 | Test Accuracy: 0.5636 | Test Precision: 0.5589 |