# Simple one-neuron backpropagation
To simplify things further, let's find the influence one of the inputs has on ReLU output of the neuron

In [2]:
import numpy as np

In [3]:
inputs = [1, -2, 3]

bias = 1
weights = [-3, -1, 2]

def get_output(weights, bias):
    xw0 = weights[0] * inputs[0]
    xw1 = weights[1] * inputs[1]
    xw2 = weights[2] * inputs[2]
    z = xw0 + xw1 + xw2 + bias
    # Apply ReLU
    y = max(z, 0)
    return y

y = get_output(weights, bias)

print('Neuron output:', y)

Neuron output: 6


## Version 1
All calculations are performed explicitly

In [4]:
def improve(weights, bias, z):
    # Now start computing derivatives
    # Suppose that we received dvalue from the previous layer
    dvalue = 1

    # Calculate the derivative of the ReLU w.r.t the sum
    d_relu_d_sum = 1 if z else 0

    # Calculate the derivative of sum w.r.t. multiplication
    d_sum_d_mul = 1

    # Calculate the derivative w.r.t bias
    d_sum_d_bias = 1

    # Calculate the derivative of multiplication w.r.t weight ...
    d_mul_d_w1 = inputs[0]
    d_mul_d_w2 = inputs[1]
    d_mul_d_w3 = inputs[2]

    # and input
    d_mul_d_i1 = weights[0]
    d_mul_d_i2 = weights[1]
    d_mul_d_i3 = weights[2]

    # Group derivatives
    d_w1 = dvalue * d_relu_d_sum * d_sum_d_mul * d_mul_d_w1
    d_w2 = dvalue * d_relu_d_sum * d_sum_d_mul * d_mul_d_w2
    d_w3 = dvalue * d_relu_d_sum * d_sum_d_mul * d_mul_d_w3

    d_i1 = dvalue * d_relu_d_sum * d_sum_d_mul * d_mul_d_i1
    d_i2 = dvalue * d_relu_d_sum * d_sum_d_mul * d_mul_d_i2
    d_i3 = dvalue * d_relu_d_sum * d_sum_d_mul * d_mul_d_i3

    d_bias = dvalue * d_relu_d_sum * d_sum_d_bias

    learning_rate = 0.001

    weights[0] -= learning_rate * d_w1
    weights[1] -= learning_rate * d_w2
    weights[2] -= learning_rate * d_w3

    bias -= learning_rate * d_bias

In [5]:
z = get_output(weights, bias)
print(f'Before improvement:', z)

for i in range(50):
    improve(weights, bias, z)
    z2 = get_output(weights, bias)
    print(f'After "epoch" #{i+1}: {z2:.3f}')

Before improvement: 6
After "epoch" #1: 5.986
After "epoch" #2: 5.972
After "epoch" #3: 5.958
After "epoch" #4: 5.944
After "epoch" #5: 5.930
After "epoch" #6: 5.916
After "epoch" #7: 5.902
After "epoch" #8: 5.888
After "epoch" #9: 5.874
After "epoch" #10: 5.860
After "epoch" #11: 5.846
After "epoch" #12: 5.832
After "epoch" #13: 5.818
After "epoch" #14: 5.804
After "epoch" #15: 5.790
After "epoch" #16: 5.776
After "epoch" #17: 5.762
After "epoch" #18: 5.748
After "epoch" #19: 5.734
After "epoch" #20: 5.720
After "epoch" #21: 5.706
After "epoch" #22: 5.692
After "epoch" #23: 5.678
After "epoch" #24: 5.664
After "epoch" #25: 5.650
After "epoch" #26: 5.636
After "epoch" #27: 5.622
After "epoch" #28: 5.608
After "epoch" #29: 5.594
After "epoch" #30: 5.580
After "epoch" #31: 5.566
After "epoch" #32: 5.552
After "epoch" #33: 5.538
After "epoch" #34: 5.524
After "epoch" #35: 5.510
After "epoch" #36: 5.496
After "epoch" #37: 5.482
After "epoch" #38: 5.468
After "epoch" #39: 5.454
After "epoch

## Version 2
Removed variables that are **always** 1

In [6]:
# Let's see if we can improve some calculations
def improve(weights, bias, z):
    # Now start computing derivatives
    # Suppose that we received dvalue from the previous layer
    dvalue = 1

    # Calculate the derivative of the ReLU w.r.t the sum
    d_relu_d_sum = 1 if z else 0

    # Calculate the derivative of multiplication w.r.t weight ...
    d_mul_d_w1 = inputs[0]
    d_mul_d_w2 = inputs[1]
    d_mul_d_w3 = inputs[2]

    # and input
    d_mul_d_i1 = weights[0]
    d_mul_d_i2 = weights[1]
    d_mul_d_i3 = weights[2]

    # Group derivatives
    d_w1 = dvalue * d_relu_d_sum * d_mul_d_w1
    d_w2 = dvalue * d_relu_d_sum * d_mul_d_w2
    d_w3 = dvalue * d_relu_d_sum * d_mul_d_w3

    d_i1 = dvalue * d_relu_d_sum * d_mul_d_i1
    d_i2 = dvalue * d_relu_d_sum * d_mul_d_i2
    d_i3 = dvalue * d_relu_d_sum * d_mul_d_i3

    d_bias = dvalue * d_relu_d_sum

    learning_rate = 0.001

    weights[0] -= learning_rate * d_w1
    weights[1] -= learning_rate * d_w2
    weights[2] -= learning_rate * d_w3

    bias -= learning_rate * d_bias

### About dot product

In [91]:
"""
If a is an N-D array and b is a 1-D array, it is a sum product over the last axis of a and b.

If a is an N-D array and b is an M-D array (where M>=2), it is a sum product
over the last axis of a and the second-to-last axis of b
"""
inputs = np.array([
    [1., -2., 3.],
    [-1., 2., -3.],
])

d_relu=np.array([
    [6.],
    [6.],
])

# We were calculating dweights - the derivative of activation function w.r.t. inputs.
# If we have multiple inputs for current neuron we expect to see them as rows in our matrix

# The derivative is calculated by multiplying each input by derivatives of the activation function OF EACH INPUT


ReLU function derivative returns 1 or 0 for each sum, so it returns a np.array with `n` rows and 1 column for `n` inputs. When we multiply it by the `dvalues` we get the following array as the value of `d_relu`:
```Python
[[6.]
 [0.]]
```
Each row corresponds to an input entry. Thus, to get the product of `d_relu` and `inputs` (the `dweights`) we have to multiply `d_relu` by transposed `inputs`, BUT according to the rules above (**numpy determines the shape by the first element**), we have to move inputs to the first place to be compliant with case 1

In [92]:
dweights = np.dot(inputs.T, d_relu)

# Version 3

Let's consider the most complex situation: we have **2 neurons**, each has **3 inputs** and we have **4 samples**

In [357]:
inputs = np.array([
    [1.,   2.,  3.],
    [4.,   5.,  6.],
    [7.,   8.,  9.],
    [10., 11., 12.]
])

weights = np.array([
    [0.1, 0.1, 0.1],
    [0.2, 0.2, 0.2]
])

biases = np.array([
    [1, 1.01]
])

First, we perform **forward pass**

Obtain the value of the neuron:

In [358]:
neuron_outputs = np.dot(inputs, weights.T) + biases
neuron_outputs

array([[1.6 , 2.21],
       [2.5 , 4.01],
       [3.4 , 5.81],
       [4.3 , 7.61]])

And apply activation function to it:

In [359]:
values = np.maximum(neuron_outputs, 0)
# Columns are neurons and rows are their outputs for each sample
values

array([[1.6 , 2.21],
       [2.5 , 4.01],
       [3.4 , 5.81],
       [4.3 , 7.61]])

Second, perform backward pass. For simplicity let's try to decrease the output of this neuron.
We do not use the `dvalues` just yet!

Find the derivative of the ReLU function, which is 1 for any positive value and 0 for anything equal to or less than 0

In [360]:
d_relu = np.where(values > 0, 1, 0)
d_relu

array([[1, 1],
       [1, 1],
       [1, 1],
       [1, 1]])

Finding `dweights` - the measure of how the changes in the weight affect the value of the neuron. Resulting array has the gradients for each neuron for each input.

In [361]:
dweights = np.dot(inputs.T, d_relu)
dweights

array([[22., 22.],
       [26., 26.],
       [30., 30.]])

Let's calculate `dinputs` - the measure of how the value of the neuron changed when its input changed
For each input (the row), columns contain the gradient in given sample<br>
**I'm kinda uncertain about this**

In [362]:
dinputs = np.dot(d_relu, weights)
dinputs

array([[0.3, 0.3, 0.3],
       [0.3, 0.3, 0.3],
       [0.3, 0.3, 0.3],
       [0.3, 0.3, 0.3]])

\[**Something I'm uncertain about**\] Since we need to pass `dinputs` to previous layer and this layer expects a 1D array where each column corresponds to the value of the gradient that signifies the change in layer output when the inputs change

In [363]:
dinputs = dinputs.sum(axis=0, keepdims=True)
dinputs

array([[1.2, 1.2, 1.2]])

Let's calculate `dbiases`:

In [364]:
dbiases = d_relu

Let's summarize everything:

In [365]:
inputs = np.array([
    [1.,   2.,  3.],
    [4.,   5.,  6.],
    [7.,   8.,  9.],
    [10., 11., 12.]
], dtype='float32')

weights = np.array([
    [1, 1, 1],
    [2, 2, 2]
], dtype='float32')

biases = np.array([
    [1, 1.01]
])

In [366]:
def forward():
    neuron_outputs = np.dot(inputs, weights.T) + biases
    values = np.maximum(neuron_outputs, 0)
    return values

In [367]:
def backwards(learning_rate=0.001):
    global weights, inputs, biases
    
    d_relu = np.where(values > 0, 1, 0)
    dweights = np.dot(inputs.T, d_relu)
    dbiases = d_relu
    
    weights -= (learning_rate * dweights).T
    biases -= (learning_rate * dbiases).sum(axis=0, keepdims=True)

In [368]:
print('Initial values:\n', forward(), end='\n\n')

for step in range(3):
    backwards()
    print(f'After step {step+1}:\n{forward()}\n')

Initial values:
 [[ 7.   13.01]
 [16.   31.01]
 [25.   49.01]
 [34.   67.01]]

After step 1:
[[ 6.83199997 12.84200044]
 [15.59799928 30.60799928]
 [24.36400003 48.37400003]
 [33.12999887 66.14000269]]

After step 2:
[[ 6.66399993 12.67400089]
 [15.19599952 30.20599857]
 [23.72800006 47.73800006]
 [32.25999965 65.26999774]]

After step 3:
[[ 6.49600037 12.50600037]
 [14.79399976 29.80400166]
 [23.09200009 47.10200391]
 [31.39000043 64.40000424]]

