### Model class

In [14]:
import matplotlib.pyplot as plt
import nnfs
from nnfs.datasets import sine_data
import numpy as np
# import cupy as np
import time

nnfs.init()

### Dense Layer

In [15]:
class Layer_Dense:
    """Fully connected layer with optional L1/L2 penalties on weights and biases."""

    def __init__(self, n_inputs, n_neurons,
                 weight_regularizer_l1=0., weight_regularizer_l2=0.,
                 bias_regularizer_l1=0., bias_regularizer_l2=0.):
        """Create an (n_inputs x n_neurons) weight matrix and a (1 x n_neurons) bias row.

        The four regularizer arguments are the penalty strengths; a value
        that is not greater than zero disables the corresponding term.
        """
        # Regularization strengths
        self.weight_regularizer_l1 = weight_regularizer_l1
        self.weight_regularizer_l2 = weight_regularizer_l2
        self.bias_regularizer_l1 = bias_regularizer_l1
        self.bias_regularizer_l2 = bias_regularizer_l2

        # Parameters: small random weights, zero biases
        self.weights = 0.01 * np.random.randn(n_inputs, n_neurons)
        self.biases = np.zeros((1, n_neurons))

        # Placeholder; replaced by an array once backward() has run
        self.dinputs = 0

    @staticmethod
    def _l1_gradient(params):
        """Return an array shaped like `params` holding -1 where params < 0, else 1."""
        signs = np.ones_like(params)
        signs[params < 0] = -1
        return signs

    def forward(self, inputs):
        """Store `inputs` and compute `output = inputs . weights + biases`."""
        self.inputs = inputs
        self.output = np.dot(inputs, self.weights) + self.biases

    def backward(self, dvalues):
        """Compute `dweights`, `dbiases` and `dinputs` from the upstream gradient."""
        # Gradient flowing back to the previous layer
        self.dinputs = np.dot(dvalues, self.weights.T)

        # Parameter gradients from the data loss
        weight_grad = np.dot(self.inputs.T, dvalues)
        bias_grad = np.sum(dvalues, axis=0, keepdims=True)

        # Add the regularization terms that are switched on
        if self.weight_regularizer_l1 > 0:
            weight_grad += self.weight_regularizer_l1 * self._l1_gradient(self.weights)
        if self.weight_regularizer_l2 > 0:
            weight_grad += 2 * self.weight_regularizer_l2 * self.weights
        if self.bias_regularizer_l1 > 0:
            bias_grad += self.bias_regularizer_l1 * self._l1_gradient(self.biases)
        if self.bias_regularizer_l2 > 0:
            bias_grad += 2 * self.bias_regularizer_l2 * self.biases

        self.dweights = weight_grad
        self.dbiases = bias_grad

### Input 'Layer' - Dummy layer to make the programming easier

In [16]:
class Layer_Input:
    """Pass-through pseudo-layer that exposes the raw data as `output`."""

    def forward(self, inputs):
        """Publish `inputs` unchanged (same object, no copy) as `self.output`."""
        self.output = inputs

### ReLU activation

In [17]:
class Activation_ReLU:
    """Rectified linear unit: element-wise max(0, x)."""

    def forward(self, inputs):
        """Store `inputs`, reset `dinputs` to 0 and compute the rectified output."""
        self.inputs = inputs
        self.dinputs = 0
        self.output = np.maximum(0, inputs)

    def backward(self, dvalues):
        """Pass the upstream gradient through wherever the stored input was positive."""
        # Work on a copy so the caller's gradient array is left untouched
        gradient = dvalues.copy()

        # No gradient flows where the input was zero or negative
        blocked = self.inputs <= 0
        gradient[blocked] = 0

        self.dinputs = gradient

### Optimizer Adam - [Adaptive Moment Estimation]

In [18]:
class Optimizer_Adam:
    """Adam optimizer with optional inverse-time learning-rate decay.

    Expected call order per training step: `pre_update_params()`, then
    `update_params(layer)` for each layer, then `post_update_params()`.
    """

    def __init__(self, learning_rate=0.001, decay=0.0, epsilon=1e-7, beta_1=0.9, beta_2=0.999):
        """Store the hyper-parameters and start the step counter at zero."""
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.epsilon = epsilon
        self.beta_1 = beta_1
        self.beta_2 = beta_2

    def pre_update_params(self):
        """Refresh `current_learning_rate` from the decay schedule (no-op if decay is falsy)."""
        if not self.decay:
            return
        self.current_learning_rate = \
            self.learning_rate * (1.0 / (1.0 + self.decay * self.iterations))

    def update_params(self, layer):
        """Apply one Adam step to `layer.weights` and `layer.biases`.

        Reads `layer.dweights` / `layer.dbiases` and keeps the running
        momentum and cache arrays as attributes on the layer itself.
        """
        # First visit to this layer: create zero-filled state arrays
        if not hasattr(layer, 'weight_cache'):
            for state_name, template in (
                ('weight_momentums', layer.weights),
                ('weight_cache', layer.weights),
                ('bias_momentums', layer.biases),
                ('bias_cache', layer.biases),
            ):
                setattr(layer, state_name, np.zeros_like(template))

        self._adam_step(layer, 'weights', 'dweights', 'weight_momentums', 'weight_cache')
        self._adam_step(layer, 'biases', 'dbiases', 'bias_momentums', 'bias_cache')

    def _adam_step(self, layer, param_name, grad_name, momentum_name, cache_name):
        """Update one parameter array of `layer` using its gradient and Adam state."""
        gradient = getattr(layer, grad_name)

        # self.iterations is 0 on the first pass, but the bias correction
        # needs the exponent to start at 1
        step = self.iterations + 1

        # Running average of the gradient and its bias-corrected version
        momentum = self.beta_1 * getattr(layer, momentum_name) + (1 - self.beta_1) * gradient
        setattr(layer, momentum_name, momentum)
        momentum_corrected = momentum / (1 - self.beta_1 ** step)

        # Running average of the squared gradient and its bias-corrected version
        cache = self.beta_2 * getattr(layer, cache_name) + (1 - self.beta_2) * gradient ** 2
        setattr(layer, cache_name, cache)
        cache_corrected = cache / (1 - self.beta_2 ** step)

        # SGD-style step, normalized by the square root of the corrected cache
        values = getattr(layer, param_name)
        values += -self.current_learning_rate * momentum_corrected / (np.sqrt(cache_corrected) + self.epsilon)
        setattr(layer, param_name, values)

    def post_update_params(self):
        """Advance the step counter; call once after all layers were updated."""
        self.iterations += 1

### Linear Activation

In [19]:
# Linear Activation
class Activation_Linear:
    """Identity activation: the output is the input."""

    def forward(self, inputs):
        """Store `inputs` and expose the same object as `output`."""
        self.output = self.inputs = inputs

    def backward(self, dvalues):
        """Set `dinputs` to a copy of the upstream gradient.

        The derivative of the identity is 1, so by the chain rule the
        gradient passes through unchanged.
        """
        passthrough = dvalues.copy()
        self.dinputs = passthrough

### Common Loss class

In [20]:
class Loss:
    """Base class for losses: shared regularization penalty and mean-loss helper."""

    def regularization_loss(self, layer):
        """Return the L1/L2 penalty for one layer.

        Each term is included only when its strength on `layer` is greater
        than zero; with every strength at zero the result is the integer 0.
        """
        # (strength, penalty) pairs; the penalty is computed lazily so the
        # parameters are only touched for terms that are switched on
        terms = (
            (layer.weight_regularizer_l1, lambda: np.sum(np.abs(layer.weights))),
            (layer.weight_regularizer_l2, lambda: np.sum(layer.weights * layer.weights)),
            (layer.bias_regularizer_l1, lambda: np.sum(np.abs(layer.biases))),
            (layer.bias_regularizer_l2, lambda: np.sum(layer.biases * layer.biases)),
        )

        penalty = 0
        for strength, magnitude in terms:
            if strength > 0:
                penalty += strength * magnitude()
        return penalty

    def remember_trainable_layers(self, trainable_layers):
        """Keep a reference to the layers that carry trainable parameters."""
        self.trainable_layers = trainable_layers

    def calculate(self, output, y):
        """Return the mean of the per-sample losses produced by `forward`.

        Only the data loss is returned; no regularization term is added here.
        """
        return np.mean(self.forward(output, y))

    def forward(self, y_pred, y_true):
        """Placeholder for subclasses; the base implementation returns an empty list."""
        return []

### Mean Squared Error loss

In [21]:
class Loss_MeanSquaredError(Loss):   # L2 Loss
    """Mean squared error, averaged over the last axis of each sample."""

    def forward(self, y_pred, y_true):
        """Return one loss value per sample: mean((y_true - y_pred)^2) over the last axis."""
        squared_error = (y_true - y_pred) ** 2
        return np.mean(squared_error, axis=-1)

    def backward(self, dvalues, y_true):
        """Store the gradient of the loss with respect to the predictions in `dinputs`.

        `dvalues` holds the predictions. The gradient is divided by the
        number of outputs per sample and then by the number of samples.
        """
        n_samples = len(dvalues)
        n_outputs = len(dvalues[0])

        per_output_gradient = -2 * (y_true - dvalues) / n_outputs
        self.dinputs = per_output_gradient / n_samples

### Mean Absolute Error Loss

In [22]:
class Loss_MeanAbsoluteError(Loss):   # L1 loss
    """Mean absolute error, averaged over the last axis of each sample."""

    # Forward pass
    def forward(self, y_pred, y_true):
        """Return one loss value per sample: mean(|y_true - y_pred|) over the last axis."""
        sample_losses = np.mean(np.abs(y_true - y_pred), axis=-1)
        return sample_losses

    # Backward pass
    def backward(self, dvalues, y_true):
        """Store the gradient of the loss with respect to the predictions in `dinputs`.

        `dvalues` holds the predictions. The gradient is divided by the
        number of outputs per sample and then by the number of samples.
        """
        samples = len(dvalues)
        outputs = len(dvalues[0])

        # d|y_true - y_pred| / d y_pred = sign(y_pred - y_true).
        # The sign must point from the target towards the prediction so that
        # stepping against the gradient moves the prediction closer to the
        # target (the same direction as the MSE gradient).
        self.dinputs = np.sign(dvalues - y_true) / outputs

        # Normalize gradient
        self.dinputs = self.dinputs / samples
        

In [23]:
class Model:
    """Container that chains layer objects and runs them in sequence.

    Typical use: `add()` each layer, `set()` the loss and optimizer,
    `finalize()` to link the layers, then `train()`.
    """

    def __init__(self):
        # Layer objects in the order they were added
        self.layers = []

    # Add objects to the model
    def add(self, layer):
        """Append a layer (or activation) object to the end of the network."""
        self.layers.append(layer)

    def set(self, *, loss, optimizer):
        """Store the loss and optimizer objects (keyword-only arguments)."""
        self.loss = loss
        self.optimizer = optimizer

    def finalize(self):
        """Link every layer to its neighbours and collect the trainable layers.

        The first layer's `prev` is a fresh `Layer_Input`; the last layer's
        `next` is the loss object, so `set()` must have been called first.
        A layer counts as trainable when it has a `weights` attribute.
        """
        # Create and set the input layer
        self.input_layer = Layer_Input()

        # Index of the final layer, used to decide what `next` points to
        last_index = len(self.layers) - 1

        # Initialize a list containing trainable layers
        self.trainable_layers = []

        for i, layer in enumerate(self.layers):
            # `prev` and `next` are resolved independently so that a model
            # with a single layer (which is both first and last) links to
            # the input layer and the loss instead of indexing past the end.
            layer.prev = self.input_layer if i == 0 else self.layers[i - 1]
            layer.next = self.layers[i + 1] if i < last_index else self.loss

            if hasattr(layer, 'weights'):
                self.trainable_layers.append(layer)

    def forward(self, X):
        """Run `X` through every layer and return the final layer's output.

        With no layers added, the input is returned unchanged.
        """
        # The input pseudo-layer only publishes X as its output
        self.input_layer.forward(X)
        output = self.input_layer.output

        # Each layer consumes the output of the object linked as its `prev`
        for layer in self.layers:
            layer.forward(layer.prev.output)
            output = layer.output

        return output

    def train(self, X, y, *, epochs=1, print_every=1):
        """Run `epochs` forward passes over `X` and print the last output.

        Only the forward pass is performed; `y` and `print_every` are
        accepted but not used yet. Nothing is printed when `epochs` is
        less than 1.
        """
        output = None

        # Main training loop
        for _ in range(1, epochs + 1):

            # Forward pass
            output = self.forward(X)

        # Temp
        if output is not None:
            print(output)
            

In [24]:
# Sine-wave regression data
X, y = sine_data()

# Network: 1 -> 64 -> 64 -> 1, ReLU after the hidden layers, linear output
model = Model()
for layer in (
    Layer_Dense(1, 64),
    Activation_ReLU(),
    Layer_Dense(64, 64),
    Activation_ReLU(),
    Layer_Dense(64, 1),
    Activation_Linear(),
):
    model.add(layer)

print("1")
# Attach the loss and optimizer objects
model.set(loss=Loss_MeanSquaredError(),
          optimizer=Optimizer_Adam(learning_rate=0.005, decay=1e-3))

print("2")
# Layers must be linked explicitly before training
model.finalize()

print("3")
model.train(X, y, epochs=100, print_every=10)

1
2
3
[[ 0.00000000e+00]
 [-1.13209149e-08]
 [-2.26418297e-08]
 [-3.39627420e-08]
 [-4.52836595e-08]
 [-5.66045735e-08]
 [-6.79254839e-08]
 [-7.92463979e-08]
 [-9.05673190e-08]
 [-1.01888233e-07]
 [-1.13209147e-07]
 [-1.24530047e-07]
 [-1.35850968e-07]
 [-1.47171875e-07]
 [-1.58492796e-07]
 [-1.69813731e-07]
 [-1.81134638e-07]
 [-1.92455531e-07]
 [-2.03776466e-07]
 [-2.15097387e-07]
 [-2.26418294e-07]
 [-2.37739215e-07]
 [-2.49060093e-07]
 [-2.60381057e-07]
 [-2.71701936e-07]
 [-2.83022899e-07]
 [-2.94343749e-07]
 [-3.05664656e-07]
 [-3.16985592e-07]
 [-3.28306527e-07]
 [-3.39627462e-07]
 [-3.50948341e-07]
 [-3.62269276e-07]
 [-3.73590098e-07]
 [-3.84911061e-07]
 [-3.96232025e-07]
 [-4.07552932e-07]
 [-4.18873867e-07]
 [-4.30194774e-07]
 [-4.41515681e-07]
 [-4.52836588e-07]
 [-4.64157495e-07]
 [-4.75478430e-07]
 [-4.86799308e-07]
 [-4.98120187e-07]
 [-5.09441122e-07]
 [-5.20762114e-07]
 [-5.32082936e-07]
 [-5.43403871e-07]
 [-5.54724693e-07]
 [-5.66045799e-07]
 [-5.77366677e-07]
 [-5.8