# Simple one-neuron backpropagation
To simplify things further, let's find the influence that one of the weights has on the ReLU output of the neuron.

In [176]:
import numpy as np

In [50]:
#
#Let's consider our neuron as a big function
# y = ReLU (
#        sum( 
#            mul(
#                    i_1,
#                    w_1
#                ),
#                
#            mul(
#                    i_2,
#                    w_2
#                ),
#            
#            mul(
#                i_3,
#                w_3
#            ),
#            
#            bias 
#        ) 
#    )
#
# 

In [51]:
# A single neuron: three inputs, three weights and one bias
weights = [1, 2, 3]
inputs = [1, 2, 3]
bias = 0.3

# Forward pass: weight every input, add the products and the bias, then apply ReLU
xw1, xw2, xw3 = (w * x for w, x in zip(weights, inputs))
z = xw1 + xw2 + xw3 + bias
y = max(z, 0)

# Gradient handed back to this neuron by the next layer during backpropagation
# (set to 1 here to keep the example simple)
dvalue = 1

# ReLU lets the gradient through when its input z is positive and blocks it otherwise
relu_gate = 1 if z > 0 else 0

# Chain rule, step 1: derivative of ReLU w.r.t. the value it received (our sum z)
d_relu_d_output = dvalue * relu_gate

# Step 2: the partial derivative of a sum w.r.t. any one of its terms is 1
d_relu_d_mul = d_relu_d_output * 1

# Step 3: the partial derivative of (w0 * x0) w.r.t. w0 is x0
d_relu_d_w0 = d_relu_d_mul * inputs[0]

# Substituting the intermediate results back in gives a single expression
d_relu_d_w0 = dvalue * relu_gate * 1 * inputs[0]
# and the multiplication by 1 can be dropped
d_relu_d_w0 = dvalue * relu_gate * inputs[0]


In [52]:
# We now know how the ReLU output reacts to a change in each weight, so we can
# nudge every weight against its gradient and check that the output goes down.
# Relies on dvalue, z, inputs, weights and bias defined by the previous cell.
relu_gate = 1 if z > 0 else 0
d_relu_d_w0, d_relu_d_w1, d_relu_d_w2 = (dvalue * relu_gate * x for x in inputs)

# Forward pass with the current weights
xw1, xw2, xw3 = (w * x for w, x in zip(weights, inputs))
z = xw1 + xw2 + xw3 + bias
y = max(z, 0)

print(f'Before applying "optimizer":\n\tWeights: {weights}\n\tReLU output: {y}\n')

# Take a small step against the gradient; note that this updates `weights` in place
for idx, grad in enumerate((d_relu_d_w0, d_relu_d_w1, d_relu_d_w2)):
    weights[idx] += -0.001 * grad

# Forward pass again, this time with the updated weights
xw1, xw2, xw3 = (w * x for w, x in zip(weights, inputs))
z = xw1 + xw2 + xw3 + bias
y = max(z, 0)

print(f'After applying "optimizer":\n\tWeights: {weights}\n\tReLU output: {y}')

Before applying "optimizer":
	Weights: [1, 2, 3]
	ReLU output: 14.3

After applying "optimizer":
	Weights: [0.999, 1.998, 2.997]
	ReLU output: 14.286000000000001


In [171]:
# Gradient passed in from the next layer.
# Row    - sample
# Column - derivative w.r.t. the corresponding neuron (0, 1, 2) of the current layer
# NOTE(review): dvalues is not referenced anywhere below - drelu is seeded from
# relu_outputs instead - and it has three rows while `inputs` currently holds a
# single sample. Confirm whether that stand-in is intentional.
dvalues = np.array([
    [1., 1., 1.],
    [2., 2., 2.],
    [3., 3., 3.]
])

# Input samples, one per row (only the first of the three samples is enabled)
inputs = np.array([
    [1,      2,   3,  2.5],
    #[2.,    5., -1.,    2],
    #[-1.5, 2.7, 3.3, -0.8]
])

# Three neurons with four weights each (one per input).
# Written neuron-per-row and transposed, so the stored shape is (inputs, neurons).
weights = np.array([
    [0.2,     0.8,  -0.5,    1],
    [0.5,   -0.91,  0.26, -0.5],
    [-0.26, -0.27, 0.17,  0.87]
]).T

# One bias per neuron, kept as a row vector of shape (1, neurons)
biases = np.array([
    [2, 3, 0.5]
])

# Forward pass
layer_outputs = inputs.dot(weights) + biases   # dense layer
relu_outputs = np.maximum(layer_outputs, 0)    # ReLU activation

# Backward pass

# ReLU: start from the ReLU outputs (standing in for the gradient coming from the
# next layer) and zero every position where the pre-activation value was not positive
drelu = np.where(layer_outputs <= 0, 0, relu_outputs)

# Dense layer, gradient w.r.t. the inputs - multiply by the weights
dinputs = drelu.dot(weights.T)

# Dense layer, gradient w.r.t. the weights - multiply by the inputs
dweights = inputs.T.dot(drelu)

print(dweights)

# Gradient w.r.t. the biases - sum over the samples (first axis);
# keepdims keeps the result a (1, neurons) row vector like `biases`
dbiases = drelu.sum(axis=0, keepdims=True)

# Update the parameters in place with a small step against the gradient
weights -= 0.001 * dweights
biases -= 0.001 * dbiases

# print(weights)
# print(biases)

[[ 4.8     1.21    2.385 ]
 [ 9.6     2.42    4.77  ]
 [14.4     3.63    7.155 ]
 [12.      3.025   5.9625]]


In [182]:
# Input samples, one per row (only the first of the three samples is enabled)
inputs = np.array([
    [1,      2,   3,  2.5],
    #[2.,    5., -1.,    2],
    #[-1.5, 2.7, 3.3, -0.8]
])

# We have 3 sets of weights - one set for each neuron
# we have 4 inputs, thus 4 weights
# recall that we keep weights transposed
weights = np.array([
    [0.2,     0.8,  -0.5,    1],
    [0.5,   -0.91,  0.26, -0.5],
    [-0.26, -0.27, 0.17,  0.87]
]).T

# One bias for each neuron
# biases are the row vector with a shape (1, neurons)
biases = np.array([
    [2, 3, 0.5]
])

# Same weight gradient as the vectorised np.dot(inputs.T, drelu), but spelled out
# with explicit loops: for every sample and every neuron, the (ReLU-gated) neuron
# value is multiplied by each input of that sample.
# Everything needed is computed inside this cell, so it does not depend on
# relu_outputs / layer_outputs left over from another cell.
n_inputs, n_neurons = weights.shape

# One accumulator row per neuron, one column per input
dweights = [[0.0] * n_inputs for _ in range(n_neurons)]

# For each sample
for sample in inputs:
    for neuron_idx, (neuron_weights, bias) in enumerate(zip(weights.T, biases[0])):
        # Forward pass of this single neuron: dense output followed by ReLU
        neuron_value = np.maximum(0, np.dot(sample, neuron_weights) + bias)
        # The ReLU output stands in for the incoming gradient; it is blocked (0)
        # when the neuron did not fire
        drelu = neuron_value if neuron_value > 0 else 0

        # Accumulate over samples so the result keeps the shape of `weights`
        # no matter how many samples are enabled (values are rounded to 2 decimals)
        for input_idx, inp in enumerate(sample):
            dweights[neuron_idx][input_idx] += round(drelu * inp, 2)

# Transpose to (inputs, neurons) so the layout matches `weights`
dweights = np.array(dweights).T
dweights

array([[ 4.8 ,  1.21,  2.38],
       [ 9.6 ,  2.42,  4.77],
       [14.4 ,  3.63,  7.15],
       [12.  ,  3.02,  5.96]])