# Import

In [57]:
import numpy as np

In [58]:
SAVE_OUTPUT = True

In [59]:
class Layer:
    def __init__(self):
        self.input = None
        self.output = None

    # reinitialize the layer
    def reset(self):
        self.input = None
        self.output = None
        
    def restore(self, input, output):
        self.input = input
        self.output = output
        
    def save(self):
        return (self.input, self.output)
        
    def forward(self, input, training=True):
        pass

    def backward(self, d_output, learning_rate):
        pass

In [60]:
class Flatten(Layer):
    def __init__(self, input_shape):
        self.input_shape = input_shape
        
    def reset(self):
        pass
    
    def save(self):
        return {}
    
    def restore(self):
        pass
    
    def forward(self, inputs, training=True):
        if len(inputs.shape) <= 1:
           raise ValueError(f"Flatten layer requires input with more than 1 dimension. Received shape: {inputs.shape}.")

        if inputs.shape[1:] != self.input_shape:
            raise ValueError(f"Input shape {inputs.shape[1:]} does not match expected shape {self.input_shape}.")
        
        return inputs.reshape(inputs.shape[0], -1)
    
    def backward(self, d_output, optimizer=None):
        batch_size = d_output.shape[0]
        return d_output.reshape(batch_size, *self.input_shape)

# Optimization: Adaptive Moment Estimation (Adam)

In [61]:
class Optimizer:
    def __init__(self, params):
        pass
    
    def update(self, layer):
        pass

In [62]:
class Adam(Optimizer):
    def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, eps=1e-8):
        self.lr = learning_rate
        self.beta1 = beta1
        self.beta2 = beta2
        self.eps = eps
            
    def update(self, param, gradient, m, v, t):
        m = self.beta1 * m + (1 - self.beta1) * gradient
        v = self.beta2 * v + (1 - self.beta2) * np.power(gradient, 2)
        m_hat = m / (1 - np.power(self.beta1, t))
        v_hat = v / (1 - np.power(self.beta2, t))
        param = param - self.lr * m_hat / (np.sqrt(v_hat) + self.eps)
        return param, m, v

# Dense Layer: A fully connected layer, defined by the dimensions of its input and output.

In [65]:
class Dense(Layer):
    def __init__(self, input_size, output_size, activation=None):
        # Initialize weights and biases with float32
        self.weights = np.random.randn(input_size, output_size).astype(np.float32) * 0.01
        self.bias = np.zeros((1, output_size), dtype=np.float32)
        self.activation = activation
        
        # For optimizer
        self.m_w = np.zeros((input_size, output_size), dtype=np.float32)  # momentum for weights
        self.v_w = np.zeros((input_size, output_size), dtype=np.float32)  # velocity for weights
        self.m_b = np.zeros((1, output_size), dtype=np.float32)  # momentum for bias
        self.v_b = np.zeros((1, output_size), dtype=np.float32)  # velocity for bias
        self.t = 0 # step counter for bias correction

    def reset(self):
        self.weights = np.random.randn(self.weights.shape[0], self.weights.shape[1]).astype(np.float32) * 0.01
        self.bias = np.zeros((1, self.bias.shape[1]), dtype=np.float32)
        
        self.m_w = np.zeros_like(self.m_w)
        self.v_w = np.zeros_like(self.v_w)
        self.m_b = np.zeros_like(self.m_b)
        self.v_b = np.zeros_like(self.v_b)
        self.t = 0
        
    def restore(self, weights, bias, m_w, v_w, m_b, v_b, t):
        self.weights = weights.astype(np.float32)
        self.bias = bias.astype(np.float32)
        
        self.m_w = m_w.astype(np.float32)
        self.v_w = v_w.astype(np.float32)
        self.m_b = m_b.astype(np.float32)
        self.v_b = v_b.astype(np.float32)
        self.t = t
        
    def save(self):
        return {
            'weights': self.weights.astype(np.float32),
            'bias': self.bias.astype(np.float32),
            'm_w': self.m_w.astype(np.float32),
            'v_w': self.v_w.astype(np.float32),
            'm_b': self.m_b.astype(np.float32),
            'v_b': self.v_b.astype(np.float32),
            't': self.t
        }
        
    def forward(self, X, training=True):
        self.input = X  # Save input for backpropagation
        self.output = np.dot(X, self.weights) + self.bias
            # if self.activation == 'relu':
            #     self.output = np.maximum(0, self.output)
                
        return self.output

    def backward(self, d_output, optimizer):
        # Compute gradients for weights and bias
        # Output, O = XW + b 
        # d/dW (O) = X
        # d/dW (L) = d/dW (O) * d/dO (L) = X * d/dO (L) = X * d_output
        d_weights = np.dot(self.input.T, d_output)
        d_bias = np.sum(d_output, axis=0, keepdims=True)

        self.t += 1
        self.weights, self.m_w, self.v_w = optimizer.update(self.weights, d_weights, self.m_w, self.v_w, self.t)
        self.bias, self.m_b, self.v_b = optimizer.update(self.bias, d_bias, self.m_b, self.v_b, self.t)

        # Compute gradient with respect to the input
        d_input = np.dot(d_output, self.weights.T)
        return d_input

# Batch Normalization

In [66]:
class BatchNormalization(Layer):
    def __init__(self, input_size, learning_rate=0.001, eps=1e-5, momentum=0.1):
        self.eps = eps
        self.momentum = momentum
        self.gamma =  np.ones((1, input_size), dtype=np.float32)
        self.beta = np.zeros((1, input_size), dtype=np.float32)
        self.moving_mean = np.zeros((1, input_size), dtype=np.float32)
        self.moving_var = np.ones((1, input_size), dtype=np.float32)
        self.lr = learning_rate
        
    def reset(self):
        self.gamma = np.ones_like(self.gamma, dtype=np.float32)
        self.beta = np.zeros_like(self.beta, dtype=np.float32)
        self.moving_mean = np.zeros_like(self.moving_mean, dtype=np.float32)
        self.moving_var = np.ones_like(self.moving_var, dtype=np.float32)
        
    def restore(self, gamma, beta, moving_mean, moving_var):
        self.gamma = gamma.astype(np.float32)
        self.beta = beta.astype(np.float32)
        self.moving_mean = moving_mean.astype(np.float32)
        self.moving_var = moving_var.astype(np.float32)
        
    def save(self): 
        return {
            'gamma': self.gamma.astype(np.float32),
            'beta': self.beta.astype(np.float32),
            'moving_mean': self.moving_mean.astype(np.float32), 
            'moving_var': self.moving_var.astype(np.float32)
        }

    def forward(self, X, training=True):
        self.input = X
        if training:
            # When using a fully connected layer, calculate the mean and
            # variance on the feature dimension
            batch_mean = np.mean(X, axis=0, keepdims=True).astype(np.float32)
            batch_var = np.var(X, axis=0, keepdims=True).astype(np.float32)
            
            # In training mode, the current mean and variance are used
            self.X_centered = X - batch_mean
            self.stddev_inv = 1.0 / np.sqrt(batch_var + self.eps)
            self.X_hat = self.X_centered * self.stddev_inv

            # Update the mean and variance using moving average - Needed for inference
            self.moving_mean = (1.0 - self.momentum) * self.moving_mean + self.momentum * batch_mean
            self.moving_var = (1.0 - self.momentum) * self.moving_var + self.momentum * batch_var
        else:
            self.X_hat = (X - self.moving_mean) / np.sqrt(self.moving_var + self.eps)

        self.output = self.gamma * self.X_hat + self.beta
        return self.output.astype(np.float32)

    def backward(self, d_output, optimizer=None):
        N, D = d_output.shape

        self.d_gamma = np.sum(d_output * self.X_hat, axis=0)
        self.d_beta = np.sum(d_output, axis=0)

        d_X_norm = d_output * self.gamma
        d_var = np.sum(d_X_norm * self.X_centered * -0.5 * np.power(self.stddev_inv,3), axis=0)
        d_mean = np.sum(d_X_norm * -self.stddev_inv, axis=0) + d_var * np.mean(-2. * self.X_centered, axis=0)

        d_input = (d_X_norm * self.stddev_inv) + (d_var * 2 * self.X_centered / N) + (d_mean / N)

        # Update parameters
        self.gamma -= self.lr * self.d_gamma
        self.beta -= self.lr * self.d_beta

        return d_input

# Activation: ReLU

In [68]:
class ReLU(Layer):
    def __init__(self):
        pass
    
    def reset(self):
        pass
    
    def restore(self):
        pass
    
    def save(self):
        return {}
        
    def forward(self, X, training=True):
        self.input = X
        return np.maximum(0, self.input)

    def backward(self, d_output, optimizer=None):
        # For positive values of input, the gradient is 1.
        # For non-positive values (0 or negative) of input, the gradient is 0.
        return d_output * (self.input > 0)

# Regularization: Dropout

In [69]:
class Dropout(Layer):
    def __init__(self, dropout_rate):
        assert 0 <= dropout_rate <= 1
        self.dropout_rate = dropout_rate
        
    def reset(self):
        pass
    
    def restore(self):
        pass
    
    def save(self):
        return {}

    def forward(self, X, training=True):
        self.input = X
        if training: # Only apply dropout during training!!
            if self.dropout_rate == 1.0:
                return np.zeros_like(X)
                
            self.mask = np.random.binomial(1, 1 - self.dropout_rate, size=X.shape) 
            
            # Need to normalize the values to keep the expected value the same
            return (X * self.mask) / (1 - self.dropout_rate)
        else:
            return X

    def backward(self, d_output, optimizer=None):
        return d_output * self.mask

# Regression: Softmax for Multi-class Classification

In [70]:
class Softmax(Layer):
    def __init__(self):
        pass
    
    def reset(self):
        pass
    
    def restore(self):
        pass
    
    def save(self):
        return {}
    
    def forward(self, X, training=True):
        self.input = X
        exps = np.exp(X - np.max(X, axis=1, keepdims=True))
        self.output = exps / np.sum(exps, axis=1, keepdims=True)
        return self.output

    def backward(self, d_output, optimizer=None):
        d_input = self.output * (d_output - np.sum(self.output * d_output, axis=-1, keepdims=True))
        return d_input
    
    # def backward(self, d_output, optimizer=None):
    #     d_input = d_output
    #     return d_input
    

# Cross-Entropy Loss

In [72]:
class Loss:
    def __init__(self):
        pass
    
    def forward(self, y_true, y_pred):
        pass
    
    def backward(self, y_true, y_pred):
        pass

In [73]:
class CategoricalCrossEntropyLoss(Loss):
    def forward(self, y_pred, y_true):
        # Clip predictions to avoid log(0) or log(negative)
        epsilon = 1e-15
        y_pred = np.clip(y_pred, epsilon, 1 - epsilon)

        n_samples = len(y_true)
        
        log_likelihood = y_true * np.log(y_pred)

        # Compute the loss
        loss = -(1 / n_samples) * np.sum(log_likelihood)
        
        return loss

    def backward(self, y_pred, y_true):
        epsilon = 1e-15    
        n_samples = len(y_true)
        
        y_pred = np.maximum(y_pred, epsilon)
        d_input = -y_true / y_pred
        
        return (1 / n_samples) * d_input

    # https://stackoverflow.com/a/76532286: The usual workaround to prevent computing the Jacobian of the softmax, is to simply take derivatives of the Loss function, with respect to the inputs passed to softmax, instead of the outputs passed by softmax. 
    # def backward(self, y_pred, y_true):
    #     d_input_of_softmax = y_pred - y_true
    #     return d_input_of_softmax

# Neural Network

In [74]:
from sklearn.metrics import accuracy_score, f1_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
import pandas as pd
import os
import shutil
import pickle

class NeuralNetwork:
    def __init__(self, layers):
        self.layers = layers

    def forward(self, X, training=True):
        """Perform the forward pass through all layers."""
        for layer in self.layers:
            X = layer.forward(X, training)

        return X

    def backward(self, d_output):
        """Perform the backward pass through all layers."""
        for layer in reversed(self.layers):
            d_output = layer.backward(d_output, self.optimizer)

    def save(self, file_path):
        state = {'layers': [layer.save() for layer in self.layers]}
        
        with open(file_path, 'wb') as f:
            pickle.dump(state, f)

    def restore(self, file_path):
        with open(file_path, 'rb') as f:
            state = pickle.load(f)
        for layer, layer_state in zip(self.layers, state['layers']):
            # print(layer_state)
            layer.restore(**layer_state)
            
    def compile(self, optimizer, loss):
        self.optimizer = optimizer
        self.loss = loss
            
    def train(self, X_train, y_train, model_id, lr_id, X_val = None, y_val = None, epochs = 25, batch_size = 32, DEBUG=False):
        """Train the network using mini-batch gradient descent."""
        n_samples = X_train.shape[0]
        train_losses = []
        val_losses = []
        train_acc = []
        val_acc = []
        val_f1_scores = []

        if SAVE_OUTPUT and DEBUG:
            output_dir = f'report/logs/{model_id}/{lr_id}'
            
            # Clear the folder if it exists
            if os.path.exists(output_dir):
                shutil.rmtree(output_dir)
                
            os.makedirs(output_dir, exist_ok=True)
            
        for epoch in range(epochs):
            # Shuffle data
            indices = np.arange(n_samples)
            np.random.shuffle(indices)
            
            X_train = X_train[indices]
            y_train = y_train[indices]

            epoch_loss = 0
            correct_predictions = 0

            # Mini-batch gradient descent
            for start_idx in range(0, n_samples, batch_size):
                end_idx = start_idx + batch_size
                X_batch = X_train[start_idx:end_idx]
                y_batch = y_train[start_idx:end_idx]

                # Forward pass
                predictions = self.forward(X_batch)
                # Compute the loss
                epoch_loss +=  self.loss.forward(predictions, y_batch)

                # Backward pass (backpropagation)
                d_output = self.loss.backward(predictions, y_batch)
                self.backward(d_output)
                
                # Calculate training accuracy
                pred_labels = np.argmax(predictions, axis=1)
                true_labels = np.argmax(y_batch, axis=1)
                correct_predictions += np.sum(pred_labels == true_labels)

            # Average loss over the epoch
            avg_loss = epoch_loss / (n_samples // batch_size)
            train_losses.append(avg_loss)
            
            train_accuracy = correct_predictions / n_samples
            train_acc.append(train_accuracy)

            
            # Add validation loss 
            if X_val is not None and y_val is not None:
                val_predictions = self.forward(X_val)
                val_loss = self.loss.forward(val_predictions, y_val)
                val_losses.append(val_loss)

                # Save in log file
                if SAVE_OUTPUT and DEBUG:
                    with open(os.path.join(output_dir, 'log.txt'), 'a') as f:
                        f.write(f"Epoch {epoch+1}/{epochs}, Training Loss: {avg_loss}, Validation Loss: {val_loss}\n")
                        
                if epoch % 5 == 0:
                    print(f"Epoch {epoch+1}/{epochs}, Training Loss: {avg_loss}, Validation Loss: {val_loss}")

                    
                        
                # Calculate validation accuracy
                val_pred_labels = np.argmax(val_predictions, axis=1)
                val_true_labels = np.argmax(y_val, axis=1)
                val_accuracy = accuracy_score(val_true_labels, val_pred_labels)
                val_acc.append(val_accuracy)

                f1 = f1_score(val_true_labels,
                              val_pred_labels, average='macro')
                val_f1_scores.append(f1)
        
        if X_val is not None and y_val is not None:
            if SAVE_OUTPUT and DEBUG:
                output_dir = f'report/images/{model_id}/{lr_id}'
                os.makedirs(output_dir, exist_ok=True)
            # Plotting the training and validation loss
            plt.figure(figsize=(10, 5))
            plt.plot(train_losses, label='Training Loss')
            plt.plot(val_losses, label='Validation Loss')
            plt.xlabel('Epochs')
            plt.ylabel('Loss')
            plt.title('Training and Validation Loss')
            plt.legend()

            if SAVE_OUTPUT and DEBUG:
                plt.savefig(os.path.join(output_dir, 'training_validation_loss.png'))
            plt.show()

            # Plotting the training and validation acc
            plt.figure(figsize=(10, 5))
            plt.plot(train_acc, label='Training Accuracy')
            plt.plot(val_acc, label='Validation Accuracy')
            plt.xlabel('Epochs')
            plt.ylabel('Accuracy')
            plt.title('Training and Validation Accuracy')
            plt.legend()

            if SAVE_OUTPUT and DEBUG:
                plt.savefig(os.path.join(output_dir, 'training_validation_accuracy.png'))
            plt.show()

            # Plotting the validation F1 score
            plt.figure(figsize=(10, 5))
            plt.plot(val_f1_scores, label='Validation F1 Score')
            plt.xlabel('Epochs')
            plt.ylabel('F1 Score')
            plt.title('Validation F1 Score')
            plt.legend()

            if SAVE_OUTPUT and DEBUG:
                plt.savefig(os.path.join(output_dir, 'validation_f1_score.png'))
            plt.show()

            # Plot confusion matrix
            # val_predictions = self.forward(X_val)
            val_pred_labels = np.argmax(val_predictions, axis=1)
            val_true_labels = np.argmax(y_val, axis=1)
            conf_matrix = confusion_matrix(val_true_labels, val_pred_labels)
            plt.figure(figsize=(10, 7))
            sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=range(10), yticklabels=range(10))
            plt.xlabel('Predicted Labels')
            plt.ylabel('True Labels')
            plt.title('Confusion Matrix')

            if SAVE_OUTPUT and DEBUG:
                plt.savefig(os.path.join(output_dir, 'confusion_matrix.png'))
            plt.show()

            return val_f1_scores[-1]

In [75]:
def one_hot_encode(y, num_classes=10):
    return np.eye(num_classes)[y]

# Load Dataset

In [None]:

from torchvision import datasets, transforms
from sklearn.model_selection import train_test_split
import pickle

np.random.seed(1905072)
transform = transforms.ToTensor()
train_dataset = datasets.FashionMNIST(root='./data', train=True, download=True, transform=transform)

# Initialize empty lists to store X_train and y_train
X = []
y = []

# Iterate over the dataset
for img, label in train_dataset:
    X.append(img.numpy().squeeze(0))  # Convert the tensor to NumPy array
    y.append(label)        # Labels are already integers, so just append them

# Convert the list of arrays to NumPy arrays
X = np.array(X)
y = np.array(y)

# Apply one-hot encoding to y_train
y_one_hot = one_hot_encode(y)

X_train, X_val, y_train, y_val = train_test_split(X, y_one_hot, test_size=0.15, random_state=42)

# Print the shapes of the datasets
print("Training set shape:", X_train.shape, y_train.shape)
print("Validation set shape:", X_val.shape, y_val.shape)

learning_rates = [5e-3,1e-3,5e-4,1e-4]

best_model = None
best_f1 = 0.0
best_lr = 0.0
best_idx = -1

def get_model(model_id, lr):
    if model_id == 0:
        return [
            Flatten(input_shape=(28,28)), # Dropout(0.2), 
            Dense(784, 256), BatchNormalization(256, lr), ReLU(), Dropout(0.2), 
            Dense(256, 10), Softmax()
        ]
    elif model_id == 1: 
        return [
            Flatten(input_shape=(28,28)), # Dropout(0.2), 
            Dense(784, 180), BatchNormalization(180, lr), ReLU(), Dropout(0.2), 
            Dense(180, 80), BatchNormalization(80), ReLU(), Dropout(0.2),
            Dense(80, 10), Softmax()
        ]
    elif model_id == 2:
        return [
            Flatten(input_shape=(28,28)), # Dropout(0.2), 
            Dense(784, 150), BatchNormalization(150, lr), ReLU(), Dropout(0.2),
            Dense(150, 100), BatchNormalization(100, lr), ReLU(), Dropout(0.2),
            Dense(100, 50), BatchNormalization(50, lr), ReLU(), Dropout(0.2),
            Dense(50, 10), Softmax()
        ]
        
        
# Best Model
best_idx = 0
best_lr = 2

# Train

In [None]:
# for j in range(len(learning_rates) - 1, -1, -1):
for j in range(0,len(learning_rates)):
    learning_rate = learning_rates[j]
    optimizer = Adam(learning_rate=learning_rate)
    loss = CategoricalCrossEntropyLoss()
    networks = [
        get_model(0, learning_rate), 
        get_model(1, learning_rate),
        get_model(2, learning_rate),
    ]
    
    # for i in range(len(networks) - 1, -1, -1):
    for i in range(0,len(networks)):
        for layer in networks[i]:
            layer.reset()
        
        print("Training Model:", i, "Learning Rate:", learning_rate)
        
        layers = networks[i]
        nn = NeuralNetwork(layers)
        nn.compile(optimizer, loss)
        f1 = nn.train(X_train, y_train, X_val = X_val, y_val = y_val, epochs=25, batch_size=32, model_id=i, lr_id=j, DEBUG=True)

        print("Model:", i, "Learning Rate:", learning_rate, "F1 Score:", f1)

        if SAVE_OUTPUT:
            # Save F1 score
            output_dir = f'report/results/{i}/{j}'
            os.makedirs(output_dir, exist_ok=True)
            with open(os.path.join(output_dir, "f1_scores.log"), 'w') as file:
                file.write(f"{f1}")
       
            # Save the model
            output_dir = f'models/{i}/{j}'
            os.makedirs(output_dir, exist_ok=True)
            with open(os.path.join(output_dir, 'trained_model.pkl'), 'wb') as f:
                pickle.dump(nn, f)

            # Save the model
            nn.save(os.path.join(output_dir, 'model_params.pkl'))

        if f1 >= best_f1:
            best_f1 = f1
            best_lr = j
            best_idx = i
            
print("Best F1 Score:", best_f1) 
print("Best Model:", best_idx)
print("Best Learning Rate:", learning_rates[best_lr])


# Save the Best Model

In [55]:
model = NeuralNetwork(get_model(best_idx, learning_rates[best_lr]))
model.restore(f'models/{best_idx}/{best_lr}/model_params.pkl')
model.save('model_1905072.pickle')

# Test

In [77]:
import pickle
import numpy as np
from torchvision import datasets, transforms

# For reproducibility - Comment if retrains


model = NeuralNetwork(get_model(best_idx, learning_rates[best_lr]))
model.restore(f'model_1905072.pickle')

# Transform to convert PIL image to tensor
transform = transforms.ToTensor()

# Assuming you will be given a dataset for testing
test_dataset = datasets.FashionMNIST(
    root='./data', train=False, download=True, transform=transform)

# Get test images and labels (this is provided, and you should load it appropriately)
X_test = []
y_test = []

for img, label in test_dataset:
    X_test.append(img.numpy().squeeze(0))
    y_test.append(label)

# Convert the list of arrays to NumPy arrays
X_test = np.array(X_test)
y_test = np.array(y_test)

# Flatten the input if it's not already flattened
# if len(X_test.shape) > 2:
#     X_test = X_test.reshape(X_test.shape[0], -1)

# Fit to validation data
optimizer = Adam(learning_rate=learning_rates[best_lr])
loss = CategoricalCrossEntropyLoss()

# Forward pass through the trained model to get predictions
predictions = model.forward(X_test, training=False)

# Find the predicted class for each sample (assuming the last layer is softmax)
predicted_labels = np.argmax(predictions, axis=1)

# Output the predictions
# print(predicted_labels)

# If you need to compute the accuracy against ground truth
accuracy = np.mean(predicted_labels == y_test)
print(f"Test Accuracy: {accuracy * 100:.3f}%")

# Find f1 macro
from sklearn.metrics import f1_score
f1 = f1_score(y_test, predicted_labels, average='macro')
print(f"F1 Score: {f1}")

Test Accuracy: 89.240%
F1 Score: 0.8919878734738692


In [None]:
import pickle
import numpy as np
from torchvision import datasets, transforms

# For reproducibility - Comment if retrains
# best_idx = 0
# best_lr = 2

# Iterate for best_idx and best_lr

# for best_idx in range(3):
#     for best_lr in range(4):
#         model = NeuralNetwork(get_model(best_idx, learning_rates[best_lr]))
#         model.restore(f'models/{best_idx}/{best_lr}/model_params.pkl')

#         # Transform to convert PIL image to tensor
#         transform = transforms.ToTensor()

#         # Assuming you will be given a dataset for testing
#         test_dataset = datasets.FashionMNIST(
#             root='./data', train=False, download=True, transform=transform)

#         # Get test images and labels (this is provided, and you should load it appropriately)
#         X_test = []
#         y_test = []

#         for img, label in test_dataset:
#             X_test.append(img.numpy().squeeze(0))
#             y_test.append(label)

#         # Convert the list of arrays to NumPy arrays
#         X_test = np.array(X_test)
#         y_test = np.array(y_test)

#         # Flatten the input if it's not already flattened
#         # if len(X_test.shape) > 2:
#         #     X_test = X_test.reshape(X_test.shape[0], -1)

#         # Fit to validation data
#         optimizer = Adam(learning_rate=learning_rates[best_lr])
#         loss = CategoricalCrossEntropyLoss()

#         # Forward pass through the trained model to get predictions
#         predictions = model.forward(X_test, training=False)

#         # Find the predicted class for each sample (assuming the last layer is softmax)
#         predicted_labels = np.argmax(predictions, axis=1)

#         # Output the predictions
#         # print(predicted_labels)

#         # If you need to compute the accuracy against ground truth
#         accuracy = np.mean(predicted_labels == y_test)
#         print(f"Test Accuracy: {accuracy * 100:.3f}%")

#         # Find f1 macro
#         from sklearn.metrics import f1_score
#         f1 = f1_score(y_test, predicted_labels, average='macro')
#         print(f"F1 Score: {f1}")
        
#         output_dir = f'report/results/{best_idx}/{best_lr}'
#         os.makedirs(output_dir, exist_ok=True)
#         with open(os.path.join(output_dir, "test_acc_scores.log"), 'w') as file:
#             file.write(f"{accuracy}")
            
#         with open(os.path.join(output_dir, "test_f1_scores.log"), 'w') as file:
#             file.write(f"{f1}")