# Neural Networks from scratch in Python 

In [24]:
import numpy as np
import nnfs
from nnfs.datasets import spiral_data

nnfs.init()

### Dense Layer

In [58]:
class Layer_Dense:

    # Layer initialization
    def __init__(self, n_inputs, n_neurons):
        # Initialize weights and biases
        self.weights = 0.01 * np.random.randn(n_inputs, n_neurons)
        self.biases  = np.zeros((1, n_neurons))

    # Forward pass
    def forward(self, inputs):
        self.inputs = inputs
        # Calculate output values from inputs, weights and biases
        self.output = np.dot(inputs, self.weights) + self.biases

    # Backward pass
    def backward(self, dvalues):
        # Gradient on parameters
        self.dweights = np.dot(self.inputs.T, dvalues)
        self.dbiases  = np.sum(dvalues, axis=0, keepdims=True)
        # Gradient on values
        self.dinputs  = np.dot(dvalues, self.weights.T)

### ReLU activation

In [59]:
class Activation_ReLU:

    # Forward pass
    def forward(self, inputs):
        self.inputs = inputs

        # Calculate output values from inputs
        self.output = np.maximum(0, inputs)

    # Backward pass
    def backward(self, dvalues):
        # Since we need to modify the original variable, 
        # let's make a copy of the variable first 
        self.dinputs = dvalues.copy()

        # Zero gradient where input values were negative
        self.dinputs[self.inputs <= 0] = 0

### Softmax activation

In [27]:
class Activation_Softmax:

    # Forward pass
    def forward(self, inputs):
        # Remember input values
        self.inputs = inputs

        # Get unnormalized probabilities
        exp_values = np.exp(inputs - np.max(inputs, axis=1, keepdims=True))

        # Normalize them for each sample
        probabilities = exp_values / np.sum(exp_values, axis=1, keepdims=True)

        self.output = probabilities

    # Backward pass
    def backward(self, dvalues):

        # Create uninitialized array
        self.dinputs = np.empty_like(dvalues)

        # Enumerate outputs and gradients
        for index, (single_output, single_dvalues) in enumerate(zip(self.output, dvalues)):
            # Flatten output array
            single_output = single_output.reshape(-1,1)
            # Calculate Jacobian matrix of the output ...
            jacobian_matrix = np.diagflat(single_output) - np.dot(single_output, single_output.T)

            # Calculate sample-wise gradient and add it to the array of sample gradients
            self.dinputs[index] = np.dot(jacobian_matrix, single_dvalues)

### Common loss class

In [63]:
class Loss:

    # Calculates the data and regularization losses
    # given model output and ground truth values
    def calculate(self, output, y):

        # Calculate sample losses
        sample_losses = self.forward(output, y)

        # Calculate mean loss
        data_loss = np.mean(sample_losses)

        return data_loss

### Cross-entropy loss (subclass to Loss)

In [60]:
class Loss_CategoricalCrossentropy(Loss):

    # Forwad pass
    def forward(self, y_pred, y_true):

        # Number of samples in a batch
        samples = len(y_pred)

        # Clip data to prevent division by 0
        # Clip both sides to not drag mean towards any value
        y_pred_clipped = np.clip(y_pred, 1e-7, 1-1e-7)

        # Probabilities for target values - 
        # only if categorical labels
        # correct_confidences = 0  # Remove warning!

        if len(y_true.shape) == 1:
            correct_confidences = y_pred_clipped[range(samples), y_true]

        # Mask values - only for one-hot encoded labels
        elif len(y_true.shape) == 2:
            correct_confidences = np.sum(y_pred_clipped * y_true, axis=1)
        
        else:
            print("ERROR!")
            correct_confidences = 0

        # Losses
        negative_log_likelihoods = -np.log(correct_confidences)
        return negative_log_likelihoods

    def backward(self, dvalues, y_true):

        # Number of samples
        samples = len(dvalues)
        # Number of labels in every sample.
        # We'll use the first sample to count them
        labels = len(dvalues[0])

        # If labels are sparse, turn them into one-hot vector 
        if len(y_true.shape) == 1:
            y_true = np.eye(labels)[y_true]
        
        # Calculate gradient
        self.dinputs = -y_true / dvalues

        # Normalize gradient
        self.dinputs = self.dinputs / samples

### Activation_Softmax_Loss_CategoricalCrossEntropy

In [40]:
# softmax classifier - combined Softmax activation
# and cross-entropy loss for faster backward step
class Activation_Softmax_Loss_CategoricalCrossentropy():

    # Creates activation and loss funtion objects
    def __init__(self):
        self.activation = Activation_Softmax()
        self.loss       = Loss_CategoricalCrossentropy()

    # Forward pass
    def forward(self, inputs, y_true):
        # Output layer's activation function
        self.activation.forward(inputs)
        # Set the output
        self.output = self.activation.output
        # Calculate and return loss value
        return self.loss.calculate(self.output, y_true)

    # Backward pass
    def backward(self, dvalues, y_true):
        # Number of samples
        samples = len(dvalues)
        
        # If labels are one-hot encoded, turn them into discrete values
        if len(y_true.shape) == 2: 
            y_true = np.argmax(y_true, axis=1)
            
        # Copy so we can safely modify
        self.dinputs = dvalues.copy()
        # Calculate gradient
        self.dinputs[range(samples), y_true] -= 1
        # Normalize gradient
        self.dinputs = self.dinputs / samples

### Create Dataset and our model

In [31]:
X,y = spiral_data(samples=100, classes=3)

In [32]:
dense1 = Layer_Dense(2,3)     # Dense layer 1, 2 input features and 3 output values

activation1 = Activation_ReLU()

dense2 = Layer_Dense(3,3)      # Dense Layer 2, 3 input features and 3 output values

activation2 = Activation_Softmax()

loss_function = Loss_CategoricalCrossentropy()

### Pass data through the layers

In [33]:
dense1.forward(X)
activation1.forward(dense1.output)

dense2.forward(activation1.output)
activation2.forward(dense2.output)

# Let's see output of the first few samples
print(activation2.output[:5])


[[0.33333334 0.33333334 0.33333334]
 [0.3333332  0.3333332  0.33333364]
 [0.3333329  0.33333293 0.3333342 ]
 [0.3333326  0.33333263 0.33333477]
 [0.33333233 0.3333324  0.33333528]]


In [34]:
# Perform a forward pass through loss function
loss = loss_function.calculate(activation2.output, y)

print('loss: ', loss)

loss:  1.0986104


## Accuracy Calculation

In [35]:
# Calculate accuracy from output of activation2 and targets
# calculate values along first axis
predictions = np.argmax(activation2.output, axis=1)
if len(y.shape) == 2:
    y = np.argmax(y, axis=1)
accuracy = np.mean(predictions == y)

# Print accuracy
print('acc: ', accuracy)

acc:  0.34


In [44]:
# Check if we get the same values with the combined activation for 
# Softmax and CategoricalCrossEntropy as with not combined
import numpy as np
from timeit import timeit
import nnfs

nnfs.init()

softmax_outputs = np.array([
    [0.7, 0.1, 0.2],
    [0.1, 0.5, 0.4],
    [0.02, 0.9, 0.08]])

class_targets = np.array([0,1,1])

def f1():
    softmax_loss = Activation_Softmax_Loss_CategoricalCrossentropy()
    softmax_loss.backward(softmax_outputs, class_targets)
    dvalues1 = softmax_loss.dinputs

def f2():
    activation = Activation_Softmax()
    activation.output = softmax_outputs
    loss = Loss_CategoricalCrossentropy()
    loss.backward(softmax_outputs, class_targets)
    activation.backward(loss.dinputs)
    dvalues2 = activation.dinputs

t1 = timeit(lambda: f1(), number=10000)
t2 = timeit(lambda: f2(), number=10000)

print('Gradients: combined loss and activation:')
print(dvalues1)
print('Gradients: separate loss and activation:')
print(dvalues2)
print(f'Time: t1={t1}, t2={t2}, t2/t1={t2/t1}')

Gradients: combined loss and activation:
[[-0.1         0.03333333  0.06666667]
 [ 0.03333333 -0.16666667  0.13333333]
 [ 0.00666667 -0.03333333  0.02666667]]
Gradients: separate loss and activation:
[[-0.09999999  0.03333334  0.06666667]
 [ 0.03333334 -0.16666667  0.13333334]
 [ 0.00666667 -0.03333333  0.02666667]]
Time: t1=0.34074719999625813, t2=1.1670699000096647, t2/t1=3.425031519033702


### Full model code with loss calculation

In [46]:
# Create dataset
X,y = spiral_data(samples=100, classes=3)

# Create Dense layer with 2 input features and 3 output values
dense1 = Layer_Dense(2, 3)

# Create ReLU activation
activation1 = Activation_ReLU()

# Create 2nd Dence layer with 3 inputs features and 3 output values
dense2 = Layer_Dense(3, 3)

# Create Softmax classifier's combined loss and activation
loss_activation = Activation_Softmax_Loss_CategoricalCrossentropy()

In [61]:
# Perform a forward pass of our training data through this layer
dense1.forward(X)

# Perform a forward pass through activation function.
activation1.forward(dense1.output)

# Perform a forward pass through the 2nd Dense layer
dense2.forward(activation1.output)

# Perform a forward pass through the activiation/loss funtion
loss = loss_activation.forward(dense2.output, y)

# Let's see the output of the first few samples
print(loss_activation.output[:5])

# Print loss value
print('loss: ', loss)

# Calculate accuracy from output of activation2 and targets
# calculate values along first axis
predictions = np.argmax(loss_activation.output, axis=1)
if len(y.shape) == 2:
    y = np.argmax(y, axis=1)
accuracy = np.mean(predictions==y)
print('acc: ', accuracy)


[[0.33333334 0.33333334 0.33333334]
 [0.33333355 0.33333322 0.3333332 ]
 [0.33333382 0.33333313 0.3333331 ]
 [0.3333341  0.33333302 0.33333293]
 [0.33333433 0.3333329  0.33333278]]
loss:  1.0986081
acc:  0.33666666666666667


In [62]:
# Backward pass
loss_activation.backward(loss_activation.output, y)
dense2.backward(loss_activation.dinputs)
activation1.backward(dense2.dinputs)
dense1.backward(activation1.dinputs)

# print gradients
print(f'dense1.dweights\n : {dense1.dweights}\n' ) 
print(f'dense1.dbiases \n : {dense1.dbiases} \n' ) 
print(f'dense2.dweights\n : {dense2.dweights}\n' ) 
print(f'dense2.dbiases \n : {dense2.dbiases} \n' ) 

dense1.dweights
 : [[ 3.3042497e-06 -3.9488214e-06 -9.9410361e-05]
 [-2.2006869e-05  3.0671345e-04  1.6974623e-04]]

dense1.dbiases 
 : [[-1.8163289e-05 -5.1999162e-04  1.4667885e-05]] 

dense2.dweights
 : [[ 9.1446236e-05 -2.5220119e-04  1.6075491e-04]
 [-1.7278348e-04  3.9700870e-04 -2.2422521e-04]
 [ 4.4883698e-05 -1.2783038e-04  8.2946666e-05]]

dense2.dbiases 
 : [[ 4.6649948e-06 -8.3957566e-06  3.5949051e-06]] 



## Chapter 10: Optimizier

### Stochastic Gradient Descent (SGD)