In [40]:
import numpy as np
from nnfs.datasets import spiral_data
import nnfs
import matplotlib.pyplot as plt

In [41]:
class Layer:
    """Dense-layer skeleton: trainable parameters plus regularization strengths."""

    def __init__(self, n_inputs, n_neurons, weight_regularizer_l1=0, weight_regularizer_l2=0, bias_regularizer_l1=0, bias_regularizer_l2=0):
        """Create the parameters and remember the regularization factors.

        Args:
            n_inputs: number of input features (rows of the weight matrix).
            n_neurons: number of neurons (columns of the weight matrix).
            weight_regularizer_l1: L1 penalty factor for the weights.
            weight_regularizer_l2: L2 penalty factor for the weights.
            bias_regularizer_l1: L1 penalty factor for the biases.
            bias_regularizer_l2: L2 penalty factor for the biases.
        """
        # Penalty factors are stored as given; all of them default to 0
        self.weight_regularizer_l1 = weight_regularizer_l1
        self.weight_regularizer_l2 = weight_regularizer_l2
        self.bias_regularizer_l1 = bias_regularizer_l1
        self.bias_regularizer_l2 = bias_regularizer_l2

        # Small random weights of shape (n_inputs, n_neurons); biases are a (1, n_neurons) row of zeros
        self.weights = np.random.randn(n_inputs, n_neurons) * 0.01
        self.biases = np.zeros(shape=(1, n_neurons))

## Regularization Loss Calculation

In [42]:
class Loss:
    """Base loss class; at this point it only knows how to compute regularization penalties."""

    def regularization_loss(self, layer):
        """Return the summed L1/L2 penalty of one layer's weights and biases.

        Each term is included only when its factor on ``layer`` is strictly
        positive, so a layer with all factors at 0 contributes exactly 0.
        """
        penalty = 0

        # Weights: L1 = lambda * sum(|w|), L2 = lambda * sum(w^2)
        l1_weights = layer.weight_regularizer_l1
        if l1_weights > 0:
            penalty += l1_weights * np.abs(layer.weights).sum()
        l2_weights = layer.weight_regularizer_l2
        if l2_weights > 0:
            penalty += l2_weights * (layer.weights * layer.weights).sum()

        # Biases: same two penalties, applied to the bias row
        l1_biases = layer.bias_regularizer_l1
        if l1_biases > 0:
            penalty += l1_biases * np.abs(layer.biases).sum()
        l2_biases = layer.bias_regularizer_l2
        if l2_biases > 0:
            penalty += l2_biases * (layer.biases * layer.biases).sum()

        return penalty

### Backward Pass with Regularization

In [52]:
class Layer_Dense:
    """Sketch of the dense layer showing only the regularization-aware backward pass."""

    def backward(self, dvalues):
        """Compute gradients w.r.t. weights, biases and inputs.

        Reads ``self.inputs``, ``self.weights``, ``self.biases`` and the four
        regularization factors; writes ``self.dweights``, ``self.dbiases`` and
        ``self.dinputs``. Regularization terms are added only for factors > 0.
        """
        # Gradient flowing back to the previous layer
        self.dinputs = np.dot(dvalues, self.weights.T)

        # Data-loss gradients of the parameters
        self.dweights = np.dot(self.inputs.T, dvalues)
        self.dbiases = dvalues.sum(axis=0, keepdims=True)

        # Weight penalties
        if self.weight_regularizer_l1 > 0:
            # d|w|/dw: -1 for negative weights, +1 otherwise (including w == 0)
            weight_signs = np.where(self.weights < 0, -1, 1).astype(self.weights.dtype)
            self.dweights += self.weight_regularizer_l1 * weight_signs
        if self.weight_regularizer_l2 > 0:
            # d(lambda * w^2)/dw = 2 * lambda * w
            self.dweights += 2 * self.weight_regularizer_l2 * self.weights

        # Bias penalties, same derivatives applied to the bias row
        if self.bias_regularizer_l1 > 0:
            bias_signs = np.where(self.biases < 0, -1, 1).astype(self.biases.dtype)
            self.dbiases += self.bias_regularizer_l1 * bias_signs
        if self.bias_regularizer_l2 > 0:
            self.dbiases += 2 * self.bias_regularizer_l2 * self.biases

### Creating a Layer and Loss Class with Regularization

### Complete Layer Dense Class

In [53]:
class Layer_Dense:
    """Fully connected layer with optional L1/L2 regularization of weights and biases."""

    def __init__(self, n_inputs, n_neurons, weight_regularizer_l1=0, weight_regularizer_l2=0, bias_regularizer_l1=0, bias_regularizer_l2=0):
        """Create the parameters and store the regularization factors.

        Weights have shape (n_inputs, n_neurons) and are standard-normal draws
        scaled by 0.01; biases are a (1, n_neurons) row of zeros. All four
        regularization factors default to 0.
        """
        # Regularization strengths, stored exactly as passed in
        self.weight_regularizer_l1 = weight_regularizer_l1
        self.weight_regularizer_l2 = weight_regularizer_l2
        self.bias_regularizer_l1 = bias_regularizer_l1
        self.bias_regularizer_l2 = bias_regularizer_l2

        # Trainable parameters
        self.weights = np.random.randn(n_inputs, n_neurons) * 0.01
        self.biases = np.zeros(shape=(1, n_neurons))

    def forward(self, inputs):
        """Store ``inputs`` for the backward pass and compute ``inputs . weights + biases``."""
        self.inputs = inputs
        weighted_sum = np.dot(inputs, self.weights)
        self.output = weighted_sum + self.biases

    def backward(self, dvalues):
        """Compute ``dweights``, ``dbiases`` and ``dinputs`` from the upstream gradient.

        Regularization gradients are added to the parameter gradients only for
        factors that are strictly positive.
        """
        # Gradient passed on to the previous layer
        self.dinputs = np.dot(dvalues, self.weights.T)

        # Data-loss gradients of the parameters
        self.dweights = np.dot(self.inputs.T, dvalues)
        self.dbiases = dvalues.sum(axis=0, keepdims=True)

        # L1 on weights: derivative of |w| is -1 below zero and +1 otherwise
        if self.weight_regularizer_l1 > 0:
            weight_signs = np.where(self.weights < 0, -1, 1).astype(self.weights.dtype)
            self.dweights += self.weight_regularizer_l1 * weight_signs
        # L2 on weights: derivative of lambda * w^2 is 2 * lambda * w
        if self.weight_regularizer_l2 > 0:
            self.dweights += 2 * self.weight_regularizer_l2 * self.weights

        # L1 on biases
        if self.bias_regularizer_l1 > 0:
            bias_signs = np.where(self.biases < 0, -1, 1).astype(self.biases.dtype)
            self.dbiases += self.bias_regularizer_l1 * bias_signs
        # L2 on biases
        if self.bias_regularizer_l2 > 0:
            self.dbiases += 2 * self.bias_regularizer_l2 * self.biases

### Complete Loss Class

In [54]:
class Loss:
    """Base class for losses: regularization penalty plus mean data loss.

    Subclasses provide ``forward(output, y)`` returning per-sample losses.
    """

    def regularization_loss(self, layer):
        """Return the summed L1/L2 penalty of one layer's weights and biases.

        A term is included only when the corresponding factor on ``layer`` is
        strictly positive.
        """
        penalty = 0

        # Weights: L1 = lambda * sum(|w|), L2 = lambda * sum(w^2)
        l1_weights = layer.weight_regularizer_l1
        if l1_weights > 0:
            penalty += l1_weights * np.abs(layer.weights).sum()
        l2_weights = layer.weight_regularizer_l2
        if l2_weights > 0:
            penalty += l2_weights * (layer.weights * layer.weights).sum()

        # Biases: same two penalties, applied to the bias row
        l1_biases = layer.bias_regularizer_l1
        if l1_biases > 0:
            penalty += l1_biases * np.abs(layer.biases).sum()
        l2_biases = layer.bias_regularizer_l2
        if l2_biases > 0:
            penalty += l2_biases * (layer.biases * layer.biases).sum()

        return penalty

    def calculate(self, output, y):
        """Return the mean of the per-sample losses for model output vs. ground truth.

        Only the data loss is returned; the regularization penalty has to be
        obtained separately through ``regularization_loss``.
        """
        per_sample = self.forward(output, y)
        return np.mean(per_sample)

### ReLU Activation Class

In [55]:
class Activation_ReLU:
    """Rectified linear unit activation: passes positive values, zeroes the rest."""

    def forward(self, inputs):
        """Remember ``inputs`` and store ``max(0, inputs)`` element-wise in ``self.output``."""
        self.inputs = inputs
        self.output = np.maximum(0, inputs)

    def backward(self, dvalues):
        """Store in ``self.dinputs`` the upstream gradient with inactive positions zeroed.

        ``dvalues`` itself is left untouched; the masking is done on a copy.
        """
        # Positions whose forward input was zero or negative pass no gradient
        inactive = self.inputs <= 0
        gradient = dvalues.copy()
        gradient[inactive] = 0
        self.dinputs = gradient

## Softmax Activation

In [56]:
class Activation_Softmax:
    """Softmax activation turning each row of scores into a probability distribution."""

    def forward(self, inputs):
        """Compute row-wise softmax of ``inputs`` and store it in ``self.output``."""
        # Subtracting the row maximum keeps the exponentials from overflowing
        row_max = np.max(inputs, axis=1, keepdims=True)
        unnormalized = np.exp(inputs - row_max)
        # Divide by the row sums so every row adds up to 1
        row_totals = np.sum(unnormalized, axis=1, keepdims=True)
        self.output = unnormalized / row_totals

## Categorical Cross Entropy

In [57]:
class Loss_CategoricalCrossEntropy(Loss):
    """Categorical cross-entropy loss for sparse or one-hot encoded targets."""

    def forward(self, y_pred, y_true):
        """Return the per-sample negative log-likelihoods.

        Args:
            y_pred: predicted class probabilities, one row per sample.
            y_true: either a 1-D array of class indices or a 2-D array with
                the same layout as ``y_pred`` (e.g. one-hot rows).

        Returns:
            1-D array with ``-log(confidence of the correct class)`` per sample.

        Raises:
            ValueError: if ``y_true`` is neither 1-D nor 2-D.
        """
        # Clip both ends so log(0) can never occur and the range stays symmetric
        y_pred_clipped = np.clip(y_pred, 1e-7, 1 - 1e-7)

        label_dims = len(y_true.shape)
        if label_dims == 1:
            # Sparse labels: pick the predicted probability at each target index
            samples = len(y_pred)
            correct_confidences = y_pred_clipped[range(samples), y_true]
        elif label_dims == 2:
            # One-hot labels: the mask multiplication keeps only the target column
            correct_confidences = np.sum(y_pred_clipped * y_true, axis=1)
        else:
            # Previously this fell through and failed with an UnboundLocalError
            raise ValueError(
                f"y_true must be 1-D (class indices) or 2-D (one-hot), got {label_dims} dimensions"
            )

        return -np.log(correct_confidences)

## Combined Softmax activation and cross-entropy loss for faster backward step

In [58]:
class Activation_Softmax_Loss_CategoricalCrossEntropy:
    """Softmax activation fused with categorical cross-entropy for a cheaper backward step."""

    def __init__(self):
        """Instantiate the softmax activation and the cross-entropy loss objects."""
        self.loss = Loss_CategoricalCrossEntropy()
        self.activation = Activation_Softmax()

    def forward(self, inputs, y_true):
        """Run softmax on ``inputs``, expose it as ``self.output`` and return the mean loss."""
        softmax = self.activation
        softmax.forward(inputs)
        self.output = softmax.output
        return self.loss.calculate(self.output, y_true)

    def backward(self, dvalues, y_true):
        """Store the combined softmax/cross-entropy gradient in ``self.dinputs``.

        ``dvalues`` is treated as the softmax output: 1 is subtracted at each
        sample's true class and the result is divided by the number of samples.
        ``dvalues`` itself is not modified.
        """
        sample_count = len(dvalues)

        # One-hot targets are reduced to class indices first
        if len(y_true.shape) == 2:
            class_indices = np.argmax(y_true, axis=1)
        else:
            class_indices = y_true

        gradient = dvalues.copy()
        gradient[np.arange(sample_count), class_indices] -= 1
        # Normalize so the gradient does not grow with the batch size
        self.dinputs = gradient / sample_count

## Optimizer -- ADAM

In [59]:
class Optimizer_Adam:
    """Adam optimizer with optional inverse-time learning-rate decay."""

    def __init__(self, learning_rate=0.001, decay=0, epsilon=1e-7, beta_1=0.9, beta_2=0.999):
        """Store the hyperparameters; the default learning rate is 0.001."""
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.epsilon = epsilon
        self.beta_1 = beta_1
        self.beta_2 = beta_2
        self.iterations = 0

    def pre_update_params(self):
        """Refresh ``current_learning_rate``; call once before any parameter updates."""
        if not self.decay:
            return
        decay_factor = 1. / (1. + self.decay * self.iterations)
        self.current_learning_rate = self.learning_rate * decay_factor

    def update_params(self, layer):
        """Apply one Adam step to ``layer.weights`` and ``layer.biases``.

        Momentum and cache arrays are created on the layer (zero-filled) the
        first time it is seen, then updated from ``layer.dweights`` and
        ``layer.dbiases``. The parameter arrays are modified in place.
        """
        if not hasattr(layer, 'weight_cache'):
            layer.weight_momentums = np.zeros_like(layer.weights)
            layer.weight_cache = np.zeros_like(layer.weights)
            layer.bias_momentums = np.zeros_like(layer.biases)
            layer.bias_cache = np.zeros_like(layer.biases)

        # Bias-correction denominators; iterations starts at 0, hence the +1
        step = self.iterations + 1
        momentum_correction = 1 - self.beta_1 ** step
        cache_correction = 1 - self.beta_2 ** step

        parameter_groups = (
            ('weights', 'dweights', 'weight_momentums', 'weight_cache'),
            ('biases', 'dbiases', 'bias_momentums', 'bias_cache'),
        )
        for param_name, grad_name, momentum_name, cache_name in parameter_groups:
            gradient = getattr(layer, grad_name)

            # Exponential moving averages of the gradient and the squared gradient
            momentum = self.beta_1 * getattr(layer, momentum_name) + (1 - self.beta_1) * gradient
            cache = self.beta_2 * getattr(layer, cache_name) + (1 - self.beta_2) * gradient**2
            setattr(layer, momentum_name, momentum)
            setattr(layer, cache_name, cache)

            # Corrected estimates compensate for the zero initialization
            momentum_corrected = momentum / momentum_correction
            cache_corrected = cache / cache_correction

            # Step scaled by the square root of the corrected cache
            values = getattr(layer, param_name)
            values += -self.current_learning_rate * momentum_corrected / (np.sqrt(cache_corrected) + self.epsilon)
            setattr(layer, param_name, values)

    def post_update_params(self):
        """Advance the iteration counter; call once after all parameter updates."""
        self.iterations += 1

## Training a simple NN with and without Regularization

### Without Regularization

In [65]:
# Baseline experiment: train a 2-64-3 network on the spiral dataset with no
# regularization (both Layer_Dense instances keep their default factors of 0).
# NOTE(review): no random seed is set in the visible cells, so the printed
# numbers will presumably differ from run to run - confirm if reproducibility matters.

# Create the dataset
X, y = spiral_data(samples=100, classes=3)

# Create Dense Layer with 2 inputs features and 64 output values
dense1 = Layer_Dense(2, 64)

# Create ReLU activation
activation1 = Activation_ReLU()

# Create 2nd Dense layer with 64 input features (previous layer has 64 outputs) and 3 output values (spiral dataset has 3 categories)
dense2 = Layer_Dense(64,3)

# Create Softmax classifier's combined loss and activation
loss_activation = Activation_Softmax_Loss_CategoricalCrossEntropy()

# Create Optimizer
optimizer = Optimizer_Adam(learning_rate=0.02, decay=5e-7)

# Train in loop (full batch: every epoch runs on all of X)
for epoch in range(10001):
    # Forward Pass through first layer
    dense1.forward(X)
    activation1.forward(dense1.output)
    
    # Forward Pass through second layer; the combined object applies softmax and returns the mean data loss
    dense2.forward(activation1.output)
    data_loss = loss_activation.forward(dense2.output, y)

    # Calculate regularization penalty (0 for both layers here, since no factor is > 0)
    regularization_loss = loss_activation.loss.regularization_loss(dense1) + loss_activation.loss.regularization_loss(dense2)

    # Calculate overall loss
    loss = data_loss + regularization_loss
    
    # Calculate accuracy from the softmax output (loss_activation.output) and targets
    predictions = np.argmax(loss_activation.output, axis=1)
    # One-hot targets are converted to class indices before comparing
    if len(y.shape) == 2:
        y = np.argmax(y, axis=1)
    accuracy = np.mean(predictions == y)
    
    # Report progress every 1000 epochs
    if not epoch % 1000:
        print(f"Epoch: {epoch}, " + f"Accuracy: {accuracy:.3f}, " + f"Loss: {loss:.3f} "  + f"Data Loss: {data_loss:.3f} "  + 
              f"Regularizatio Loss: {regularization_loss:.3f} " + f"Learning Rate: {optimizer.current_learning_rate:.3f}")
    
    # Backward Pass, in reverse order of the forward pass
    loss_activation.backward(loss_activation.output, y)
    dense2.backward(loss_activation.dinputs)
    activation1.backward(dense2.dinputs)
    dense1.backward(activation1.dinputs)

    # Update weights and biases
    optimizer.pre_update_params()
    optimizer.update_params(dense1)
    optimizer.update_params(dense2)
    optimizer.post_update_params()

# Validate the model
# Create the test dataset
X_test, y_test = spiral_data(samples=100, classes=3)

# Forward pass only - no backward pass or parameter update on the test data
dense1.forward(X_test)
activation1.forward(dense1.output)

dense2.forward(activation1.output)
# Note: this overwrites the training `loss` with the validation data loss
loss = loss_activation.forward(dense2.output, y_test)

# Calculate accuracy from the softmax output (loss_activation.output) and targets
predictions = np.argmax(loss_activation.output, axis=1)
if len(y_test.shape) == 2:
    y_test = np.argmax(y_test, axis=1)
accuracy = np.mean(predictions == y_test)
print(f'Validation Accuracy: {accuracy:.3f}, Loss :{loss:.3f}')

Epoch: 0, Accuracy: 0.343, Loss: 1.099 Data Loss: 1.099 Regularizatio Loss: 0.000 Learning Rate: 0.020
Epoch: 1000, Accuracy: 0.857, Loss: 0.375 Data Loss: 0.375 Regularizatio Loss: 0.000 Learning Rate: 0.020
Epoch: 2000, Accuracy: 0.900, Loss: 0.252 Data Loss: 0.252 Regularizatio Loss: 0.000 Learning Rate: 0.020
Epoch: 3000, Accuracy: 0.917, Loss: 0.209 Data Loss: 0.209 Regularizatio Loss: 0.000 Learning Rate: 0.020
Epoch: 4000, Accuracy: 0.913, Loss: 0.188 Data Loss: 0.188 Regularizatio Loss: 0.000 Learning Rate: 0.020
Epoch: 5000, Accuracy: 0.923, Loss: 0.172 Data Loss: 0.172 Regularizatio Loss: 0.000 Learning Rate: 0.020
Epoch: 6000, Accuracy: 0.923, Loss: 0.160 Data Loss: 0.160 Regularizatio Loss: 0.000 Learning Rate: 0.020
Epoch: 7000, Accuracy: 0.933, Loss: 0.152 Data Loss: 0.152 Regularizatio Loss: 0.000 Learning Rate: 0.020
Epoch: 8000, Accuracy: 0.940, Loss: 0.145 Data Loss: 0.145 Regularizatio Loss: 0.000 Learning Rate: 0.020
Epoch: 9000, Accuracy: 0.940, Loss: 0.143 Data Lo

### With Regularization

In [66]:
# Same experiment as the unregularized run, but the first dense layer now
# carries an L2 penalty of 5e-4 on both its weights and its biases.
# NOTE(review): no random seed is set in the visible cells, so the printed
# numbers will presumably differ from run to run - confirm if reproducibility matters.

# Create the dataset
X, y = spiral_data(samples=100, classes=3)

# Create Dense Layer with 2 inputs features and 64 output values, L2-regularized
dense1 = Layer_Dense(2, 64, weight_regularizer_l2=5e-4, bias_regularizer_l2=5e-4)

# Create ReLU activation
activation1 = Activation_ReLU()

# Create 2nd Dense layer with 64 input features (previous layer has 64 outputs) and 3 output values (spiral dataset has 3 categories)
# No regularization factors are passed, so this layer is not regularized
dense2 = Layer_Dense(64,3)

# Create Softmax classifier's combined loss and activation
loss_activation = Activation_Softmax_Loss_CategoricalCrossEntropy()

# Create Optimizer
optimizer = Optimizer_Adam(learning_rate=0.02, decay=5e-7)

# Train in loop (full batch: every epoch runs on all of X)
for epoch in range(10001):
    # Forward Pass through first layer
    dense1.forward(X)
    activation1.forward(dense1.output)
    
    # Forward Pass through second layer; the combined object applies softmax and returns the mean data loss
    dense2.forward(activation1.output)
    data_loss = loss_activation.forward(dense2.output, y)

    # Calculate regularization penalty (only dense1 contributes; dense2's factors are all 0)
    regularization_loss = loss_activation.loss.regularization_loss(dense1) + loss_activation.loss.regularization_loss(dense2)

    # Calculate overall loss
    loss = data_loss + regularization_loss
    
    # Calculate accuracy from the softmax output (loss_activation.output) and targets
    predictions = np.argmax(loss_activation.output, axis=1)
    # One-hot targets are converted to class indices before comparing
    if len(y.shape) == 2:
        y = np.argmax(y, axis=1)
    accuracy = np.mean(predictions == y)
    
    # Report progress every 1000 epochs
    if not epoch % 1000:
        print(f"Epoch: {epoch}, " + f"Accuracy: {accuracy:.3f}, " + f"Loss: {loss:.3f} "  + f"Data Loss: {data_loss:.3f} "  + 
              f"Regularizatio Loss: {regularization_loss:.3f} " + f"Learning Rate: {optimizer.current_learning_rate:.3f}")
    
    # Backward Pass, in reverse order of the forward pass
    # (dense1.backward adds the L2 gradient terms to dweights/dbiases)
    loss_activation.backward(loss_activation.output, y)
    dense2.backward(loss_activation.dinputs)
    activation1.backward(dense2.dinputs)
    dense1.backward(activation1.dinputs)

    # Update weights and biases
    optimizer.pre_update_params()
    optimizer.update_params(dense1)
    optimizer.update_params(dense2)
    optimizer.post_update_params()

# Validate the model
# Create the test dataset
X_test, y_test = spiral_data(samples=100, classes=3)

# Forward pass only - no backward pass or parameter update on the test data
dense1.forward(X_test)
activation1.forward(dense1.output)

dense2.forward(activation1.output)
# Validation loss is the data loss only (no regularization penalty is added here);
# it also overwrites the training `loss` variable
loss = loss_activation.forward(dense2.output, y_test)

# Calculate accuracy from the softmax output (loss_activation.output) and targets
predictions = np.argmax(loss_activation.output, axis=1)
if len(y_test.shape) == 2:
    y_test = np.argmax(y_test, axis=1)
accuracy = np.mean(predictions == y_test)
print(f'Validation Accuracy: {accuracy:.3f}, Loss :{loss:.3f}')

Epoch: 0, Accuracy: 0.340, Loss: 1.099 Data Loss: 1.099 Regularizatio Loss: 0.000 Learning Rate: 0.020
Epoch: 1000, Accuracy: 0.860, Loss: 0.466 Data Loss: 0.371 Regularizatio Loss: 0.095 Learning Rate: 0.020
Epoch: 2000, Accuracy: 0.867, Loss: 0.397 Data Loss: 0.307 Regularizatio Loss: 0.090 Learning Rate: 0.020
Epoch: 3000, Accuracy: 0.913, Loss: 0.346 Data Loss: 0.263 Regularizatio Loss: 0.083 Learning Rate: 0.020
Epoch: 4000, Accuracy: 0.920, Loss: 0.320 Data Loss: 0.243 Regularizatio Loss: 0.077 Learning Rate: 0.020
Epoch: 5000, Accuracy: 0.930, Loss: 0.305 Data Loss: 0.229 Regularizatio Loss: 0.076 Learning Rate: 0.020
Epoch: 6000, Accuracy: 0.933, Loss: 0.288 Data Loss: 0.216 Regularizatio Loss: 0.072 Learning Rate: 0.020
Epoch: 7000, Accuracy: 0.923, Loss: 0.272 Data Loss: 0.204 Regularizatio Loss: 0.068 Learning Rate: 0.020
Epoch: 8000, Accuracy: 0.933, Loss: 0.261 Data Loss: 0.194 Regularizatio Loss: 0.067 Learning Rate: 0.020
Epoch: 9000, Accuracy: 0.937, Loss: 0.251 Data Lo

## Effect of More Training Data with Regularization

In [67]:
# Same L2-regularized setup as the previous experiment, but trained on a larger
# dataset (samples=1000 instead of 100); the test set below still uses samples=100.
# NOTE(review): no random seed is set in the visible cells, so the printed
# numbers will presumably differ from run to run - confirm if reproducibility matters.

# Create the dataset
X, y = spiral_data(samples=1000, classes=3)

# Create Dense Layer with 2 inputs features and 64 output values, L2-regularized
dense1 = Layer_Dense(2, 64, weight_regularizer_l2=5e-4, bias_regularizer_l2=5e-4)

# Create ReLU activation
activation1 = Activation_ReLU()

# Create 2nd Dense layer with 64 input features (previous layer has 64 outputs) and 3 output values (spiral dataset has 3 categories)
# No regularization factors are passed, so this layer is not regularized
dense2 = Layer_Dense(64,3)

# Create Softmax classifier's combined loss and activation
loss_activation = Activation_Softmax_Loss_CategoricalCrossEntropy()

# Create Optimizer
optimizer = Optimizer_Adam(learning_rate=0.02, decay=5e-7)

# Train in loop (full batch: every epoch runs on all of X)
for epoch in range(10001):
    # Forward Pass through first layer
    dense1.forward(X)
    activation1.forward(dense1.output)
    
    # Forward Pass through second layer; the combined object applies softmax and returns the mean data loss
    dense2.forward(activation1.output)
    data_loss = loss_activation.forward(dense2.output, y)

    # Calculate regularization penalty (only dense1 contributes; dense2's factors are all 0)
    regularization_loss = loss_activation.loss.regularization_loss(dense1) + loss_activation.loss.regularization_loss(dense2)

    # Calculate overall loss
    loss = data_loss + regularization_loss
    
    # Calculate accuracy from the softmax output (loss_activation.output) and targets
    predictions = np.argmax(loss_activation.output, axis=1)
    # One-hot targets are converted to class indices before comparing
    if len(y.shape) == 2:
        y = np.argmax(y, axis=1)
    accuracy = np.mean(predictions == y)
    
    # Report progress every 1000 epochs
    if not epoch % 1000:
        print(f"Epoch: {epoch}, " + f"Accuracy: {accuracy:.3f}, " + f"Loss: {loss:.3f} "  + f"Data Loss: {data_loss:.3f} "  + 
              f"Regularizatio Loss: {regularization_loss:.3f} " + f"Learning Rate: {optimizer.current_learning_rate:.3f}")
    
    # Backward Pass, in reverse order of the forward pass
    loss_activation.backward(loss_activation.output, y)
    dense2.backward(loss_activation.dinputs)
    activation1.backward(dense2.dinputs)
    dense1.backward(activation1.dinputs)

    # Update weights and biases
    optimizer.pre_update_params()
    optimizer.update_params(dense1)
    optimizer.update_params(dense2)
    optimizer.post_update_params()

# Validate the model
# Create the test dataset
X_test, y_test = spiral_data(samples=100, classes=3)

# Forward pass only - no backward pass or parameter update on the test data
dense1.forward(X_test)
activation1.forward(dense1.output)

dense2.forward(activation1.output)
# Validation loss is the data loss only (no regularization penalty is added here);
# it also overwrites the training `loss` variable
loss = loss_activation.forward(dense2.output, y_test)

# Calculate accuracy from the softmax output (loss_activation.output) and targets
predictions = np.argmax(loss_activation.output, axis=1)
if len(y_test.shape) == 2:
    y_test = np.argmax(y_test, axis=1)
accuracy = np.mean(predictions == y_test)
print(f'Validation Accuracy: {accuracy:.3f}, Loss :{loss:.3f}')

Epoch: 0, Accuracy: 0.335, Loss: 1.099 Data Loss: 1.099 Regularizatio Loss: 0.000 Learning Rate: 0.020
Epoch: 1000, Accuracy: 0.860, Loss: 0.503 Data Loss: 0.409 Regularizatio Loss: 0.094 Learning Rate: 0.020
Epoch: 2000, Accuracy: 0.872, Loss: 0.430 Data Loss: 0.349 Regularizatio Loss: 0.081 Learning Rate: 0.020
Epoch: 3000, Accuracy: 0.878, Loss: 0.399 Data Loss: 0.327 Regularizatio Loss: 0.072 Learning Rate: 0.020
Epoch: 4000, Accuracy: 0.885, Loss: 0.373 Data Loss: 0.306 Regularizatio Loss: 0.066 Learning Rate: 0.020
Epoch: 5000, Accuracy: 0.889, Loss: 0.358 Data Loss: 0.296 Regularizatio Loss: 0.062 Learning Rate: 0.020
Epoch: 6000, Accuracy: 0.888, Loss: 0.345 Data Loss: 0.287 Regularizatio Loss: 0.057 Learning Rate: 0.020
Epoch: 7000, Accuracy: 0.887, Loss: 0.336 Data Loss: 0.283 Regularizatio Loss: 0.053 Learning Rate: 0.020
Epoch: 8000, Accuracy: 0.890, Loss: 0.328 Data Loss: 0.278 Regularizatio Loss: 0.050 Learning Rate: 0.020
Epoch: 9000, Accuracy: 0.890, Loss: 0.323 Data Lo

In [78]:
# Demo of the sampler used by the dropout layer below: draw 10 values from a
# binomial distribution with n=3 trials and success probability p=0.8
# (each value is the number of successes out of 3 trials)
np.random.binomial(3, 0.8, size=10)

array([3, 3, 2, 3, 2, 2, 3, 3, 3, 2])

## Dropout Layer

In [79]:
class Layer_Dropout:
    """Inverted dropout layer: randomly zeroes inputs and rescales the survivors."""

    def __init__(self, rate):
        """Store the keep probability derived from the dropout ``rate``.

        Args:
            rate: fraction of inputs to drop, in the half-open range [0, 1).
                For example, a dropout of 0.1 gives a keep probability of 0.9.

        Raises:
            ValueError: if ``rate`` is outside [0, 1). A rate of 1 would make
                the keep probability 0 and turn the scaled mask into 0/0 = NaN.
        """
        if not 0 <= rate < 1:
            raise ValueError(f"dropout rate must be in the range [0, 1), got {rate!r}")
        # Note: self.rate holds the inverted value, i.e. the success (keep) rate
        self.rate = 1 - rate

    def forward(self, inputs):
        """Sample a fresh mask, save it, and store the masked inputs in ``self.output``."""
        self.inputs = inputs  # Save the input values
        # Bernoulli keep-mask (binomial with one trial), divided by the keep rate
        # so the expected sum of the outputs matches the sum of the inputs
        self.binary_mask = np.random.binomial(1, self.rate, size=inputs.shape) / self.rate
        self.output = inputs * self.binary_mask

    def backward(self, dvalues):
        """Store the upstream gradient multiplied by the mask from the last forward pass."""
        self.dinputs = dvalues * self.binary_mask

In [82]:
# Experiment combining L2 regularization on the first dense layer with a
# dropout layer (rate 0.1) between the ReLU activation and the second dense layer.
# The optimizer settings differ from the earlier cells (learning_rate=0.05, decay=5e-5).
# NOTE(review): no random seed is set in the visible cells, so the printed
# numbers will presumably differ from run to run - confirm if reproducibility matters.

# Create the dataset
X, y = spiral_data(samples=100, classes=3)

# Create Dense Layer with 2 inputs features and 64 output values, L2-regularized
dense1 = Layer_Dense(2, 64, weight_regularizer_l2=5e-4, bias_regularizer_l2=5e-4)

# Create ReLU activation
activation1 = Activation_ReLU()

# Create a dropout layer
dropout1 = Layer_Dropout(0.1)

# Create 2nd Dense layer with 64 input features (previous layer has 64 outputs) and 3 output values (spiral dataset has 3 categories)
# No regularization factors are passed, so this layer is not regularized
dense2 = Layer_Dense(64,3)

# Create Softmax classifier's combined loss and activation
loss_activation = Activation_Softmax_Loss_CategoricalCrossEntropy()

# Create Optimizer
optimizer = Optimizer_Adam(learning_rate=0.05, decay=5e-5)

# Train in loop (full batch: every epoch runs on all of X)
for epoch in range(10001):
    # Forward Pass through first layer
    dense1.forward(X)
    activation1.forward(dense1.output)

    # Perform a forward pass through Dropout layer (a new random mask is drawn every epoch)
    dropout1.forward(activation1.output)
    
    # Forward Pass through second layer; the combined object applies softmax and returns the mean data loss
    dense2.forward(dropout1.output)
    data_loss = loss_activation.forward(dense2.output, y)

    # Calculate regularization penalty (only dense1 contributes; dense2's factors are all 0)
    regularization_loss = loss_activation.loss.regularization_loss(dense1) + loss_activation.loss.regularization_loss(dense2)

    # Calculate overall loss
    loss = data_loss + regularization_loss
    
    # Calculate accuracy from the softmax output (loss_activation.output) and targets
    # (this training accuracy is measured with dropout applied)
    predictions = np.argmax(loss_activation.output, axis=1)
    # One-hot targets are converted to class indices before comparing
    if len(y.shape) == 2:
        y = np.argmax(y, axis=1)
    accuracy = np.mean(predictions == y)
    
    # Report progress every 1000 epochs
    if not epoch % 1000:
        print(f"Epoch: {epoch}, " + f"Accuracy: {accuracy:.3f}, " + f"Loss: {loss:.3f} "  + f"Data Loss: {data_loss:.3f} "  + 
              f"Regularizatio Loss: {regularization_loss:.3f} " + f"Learning Rate: {optimizer.current_learning_rate:.3f}")
    
    # Backward Pass, in reverse order of the forward pass (dropout reuses the mask from this epoch)
    loss_activation.backward(loss_activation.output, y)
    dense2.backward(loss_activation.dinputs)
    dropout1.backward(dense2.dinputs)
    activation1.backward(dropout1.dinputs)
    dense1.backward(activation1.dinputs)

    # Update weights and biases
    optimizer.pre_update_params()
    optimizer.update_params(dense1)
    optimizer.update_params(dense2)
    optimizer.post_update_params()

# Validate the model
# Create the test dataset
X_test, y_test = spiral_data(samples=100, classes=3)

# Forward pass only; dropout1 is skipped here, so activation1 feeds dense2 directly
dense1.forward(X_test)
activation1.forward(dense1.output)

dense2.forward(activation1.output)
# Validation loss is the data loss only (no regularization penalty is added here);
# it also overwrites the training `loss` variable
loss = loss_activation.forward(dense2.output, y_test)

# Calculate accuracy from the softmax output (loss_activation.output) and targets
predictions = np.argmax(loss_activation.output, axis=1)
if len(y_test.shape) == 2:
    y_test = np.argmax(y_test, axis=1)
accuracy = np.mean(predictions == y_test)
print(f'Validation Accuracy: {accuracy:.3f}, Loss :{loss:.3f}')

Epoch: 0, Accuracy: 0.390, Loss: 1.099 Data Loss: 1.099 Regularizatio Loss: 0.000 Learning Rate: 0.050
Epoch: 1000, Accuracy: 0.690, Loss: 0.721 Data Loss: 0.672 Regularizatio Loss: 0.049 Learning Rate: 0.048
Epoch: 2000, Accuracy: 0.700, Loss: 0.705 Data Loss: 0.657 Regularizatio Loss: 0.048 Learning Rate: 0.045
Epoch: 3000, Accuracy: 0.700, Loss: 0.697 Data Loss: 0.648 Regularizatio Loss: 0.048 Learning Rate: 0.043
Epoch: 4000, Accuracy: 0.770, Loss: 0.678 Data Loss: 0.628 Regularizatio Loss: 0.049 Learning Rate: 0.042
Epoch: 5000, Accuracy: 0.740, Loss: 0.683 Data Loss: 0.634 Regularizatio Loss: 0.048 Learning Rate: 0.040
Epoch: 6000, Accuracy: 0.743, Loss: 0.631 Data Loss: 0.581 Regularizatio Loss: 0.050 Learning Rate: 0.038
Epoch: 7000, Accuracy: 0.747, Loss: 0.608 Data Loss: 0.557 Regularizatio Loss: 0.051 Learning Rate: 0.037
Epoch: 8000, Accuracy: 0.753, Loss: 0.603 Data Loss: 0.550 Regularizatio Loss: 0.052 Learning Rate: 0.036
Epoch: 9000, Accuracy: 0.733, Loss: 0.611 Data Lo