<a href="https://colab.research.google.com/github/malcolmlett/ml-learning/blob/main/Gradient_understanding_v1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# For a blog post on gradients within deep neural networks

In [None]:
import numpy as np
import tensorflow as tf
import math
import matplotlib.pyplot as plt

In [None]:
# Minimal single-layer linear model: Y = X.W, no bias, no activation.
# Takeaways:
#  - dY/dW scales with the batch size n, because tape.gradient() on a non-scalar
#    target sums the gradients of every element of that target
#  - dJ/dW is averaged across all n if that is what the loss function does, however
#  - dJ/dW is summed across n (or combined in any other way) when the loss is computed that way.
X = tf.ones([15, 3])
W = tf.Variable(tf.ones([3, 3]), dtype=tf.float32)

with tf.GradientTape(persistent=True) as tape:
  Y_pred = X @ W
  loss1 = tf.reduce_mean(Y_pred)  # mean over all n*3 outputs
  loss2 = tf.reduce_sum(Y_pred)   # plain sum over all n*3 outputs

# the tape is persistent, so it can be queried once per target
grad_Y = tape.gradient(Y_pred, W)
grad_J1 = tape.gradient(loss1, W)
grad_J2 = tape.gradient(loss2, W)

print(f"dY/dW: {grad_Y}")
print(f"dJ1/dW: {grad_J1}")
print(f"dJ2/dW: {grad_J2}")

dY/dW: [[15. 15. 15.]
 [15. 15. 15.]
 [15. 15. 15.]]
dJ1/dW: [[0.33333334 0.33333334 0.33333334]
 [0.33333334 0.33333334 0.33333334]
 [0.33333334 0.33333334 0.33333334]]
dJ2/dW: [[15. 15. 15.]
 [15. 15. 15.]
 [15. 15. 15.]]


In [None]:
# multi-layer simple model without bias or activations
#
# Note on reading the output: tape.gradient() with a non-scalar target (Z1, Z2, Z3, Y_pred)
# returns the gradient of the SUM of that target's elements, so those results grow with n.
# Only the gradients of the scalar `loss` are averaged, because the loss itself is a mean.
W1 = tf.Variable(tf.ones(shape=(3,3)), dtype=tf.float32)
W2 = tf.Variable(tf.ones(shape=(3,3)), dtype=tf.float32)
W3 = tf.Variable(tf.ones(shape=(3,3)), dtype=tf.float32)
X = tf.ones(shape=(5,3))

with tf.GradientTape(persistent=True) as tape:
  # X is a constant tensor rather than a tf.Variable, so the tape does not track it
  # automatically; without this watch() the dZ1/dX query below returns None.
  tape.watch(X)

  Z1 = tf.matmul(X, W1)
  Z2 = tf.matmul(Z1, W2)  # even if we had an activation function, none of these are any different than for the single layer case;
  Z3 = tf.matmul(Z2, W3)  # so they still have to be SUMMED over n

  Y_pred = Z3
  loss = tf.reduce_mean(Y_pred)

# gradient of each layer's output w.r.t. its own weights
print(f"dZ1/dW1: {tape.gradient(Z1, W1)}")
print(f"dZ2/dW2: {tape.gradient(Z2, W2)}")
print(f"dZ3/dW3: {tape.gradient(Z3, W3)}")
print("--")
# gradient of each layer's output w.r.t. its input (no activations here, so A1 == Z1 and A2 == Z2)
print(f"dZ1/dX:  {tape.gradient(Z1, X)}")
print(f"dZ2/dA1: {tape.gradient(Z2, Z1)}")
print(f"dZ3/dA2: {tape.gradient(Z3, Z2)}")
print("--")
# gradient of the (non-scalar) network output w.r.t. every layer's weights
print(f"dY/dW1: {tape.gradient(Y_pred, W1)}")
print(f"dY/dW2: {tape.gradient(Y_pred, W2)}")
print(f"dY/dW3: {tape.gradient(Y_pred, W3)}")
print("--")
# gradient of the scalar mean loss w.r.t. every layer's weights
print(f"dJ/dW1: {tape.gradient(loss, W1)}")
print(f"dJ/dW2: {tape.gradient(loss, W2)}")
print(f"dJ/dW3: {tape.gradient(loss, W3)}")

dZ1/dW1: [[5. 5. 5.]
 [5. 5. 5.]
 [5. 5. 5.]]
dZ2/dW2: [[15. 15. 15.]
 [15. 15. 15.]
 [15. 15. 15.]]
dZ3/dW3: [[45. 45. 45.]
 [45. 45. 45.]
 [45. 45. 45.]]
--
dZ1/dX:  None
dZ2/dA1: [[3. 3. 3.]
 [3. 3. 3.]
 [3. 3. 3.]
 [3. 3. 3.]
 [3. 3. 3.]]
dZ3/dA2: [[3. 3. 3.]
 [3. 3. 3.]
 [3. 3. 3.]
 [3. 3. 3.]
 [3. 3. 3.]]
--
dY/dW1: [[45. 45. 45.]
 [45. 45. 45.]
 [45. 45. 45.]]
dY/dW2: [[45. 45. 45.]
 [45. 45. 45.]
 [45. 45. 45.]]
dY/dW3: [[45. 45. 45.]
 [45. 45. 45.]
 [45. 45. 45.]]
--
dJ/dW1: [[3. 3. 3.]
 [3. 3. 3.]
 [3. 3. 3.]]
dJ/dW2: [[3.0000002 3.0000002 3.0000002]
 [3.0000002 3.0000002 3.0000002]
 [3.0000002 3.0000002 3.0000002]]
dJ/dW3: [[3.0000002 3.0000002 3.0000002]
 [3.0000002 3.0000002 3.0000002]
 [3.0000002 3.0000002 3.0000002]]


In [None]:
# multi-layer simple model with activation function modelled as a matrix multiplication
#
# Each "activation" is a multiplication by a diagonal matrix S (identity by default), so that
# A = Z.S and the activation's local derivative can be written down as a matrix.
#W1 = tf.Variable(tf.ones(shape=(3,3)), dtype=tf.float32)
W1 = tf.Variable([[1, 0.5, 0.25],
                  [0.75, 0.5, 0.3],
                  [0.9, 0.3, 0.1]], dtype=tf.float32)
S1 = tf.Variable(tf.linalg.diag([1., 1., 1.]), dtype=tf.float32)
#S1 = tf.Variable(tf.linalg.diag([2., 0., 1.]), dtype=tf.float32)
#W2 = tf.Variable(tf.ones(shape=(3,3)), dtype=tf.float32)
W2 = tf.Variable([[1, 0.5, 0.25],
                  [0.75, 0.5, 0.3],
                  [0.9, 0.3, 0.1]], dtype=tf.float32)
S2 = tf.Variable(tf.linalg.diag([1., 1., 1.]), dtype=tf.float32)
W3 = tf.Variable(tf.ones(shape=(3,3)), dtype=tf.float32)
S3 = tf.Variable(tf.linalg.diag([1., 1., 1.]), dtype=tf.float32)
X = tf.ones(shape=(7,3))

with tf.GradientTape(persistent=True) as tape:
  # X is a constant tensor rather than a tf.Variable, so the tape does not track it
  # automatically; without this watch() the dZ1/dX query below returns None.
  tape.watch(X)

  Z1 = tf.matmul(X, W1)
  A1 = tf.matmul(Z1, S1)
  Z2 = tf.matmul(A1, W2)
  A2 = tf.matmul(Z2, S2)
  Z3 = tf.matmul(A2, W3)
  A3 = tf.matmul(Z3, S3)

  Y_pred = A3
  loss = tf.reduce_mean(Y_pred)

print(f"Z1: {Z1}")
print(f"Z2: {Z2}")
print(f"Z3: {Z3}")
print("--")
print(f"A1: {A1}")
print(f"A2: {A2}")
print(f"A3: {A3}")
print("--")
# Why the results below are not the bare matrices S.T / W.T:
# tape.gradient(target, source) with a non-scalar target differentiates sum(target), i.e. it
# back-propagates an upstream gradient of ones. For target = source.M that yields ones @ M.T,
# where every row holds the row-sums of M. With M = identity that is a matrix of ones, not diag().
# If the per-sample Jacobian is wanted, tape.batch_jacobian(A3, Z3) returns it (it should equal
# S3.T for every sample).
#
# NOTE: the "expect Y-Y_hat" label is a hold-over; with loss = mean(Y_pred) there is no Y in
# this cell and dJ/dA3 is the constant 1/(n*3).
print(f"dJ/dA3: expect Y-Y_hat\n{tape.gradient(loss, A3)}")
print(f"dA3/dZ3: {tape.gradient(A3, Z3)}") # ones @ S3.T, i.e. row-sums of S3 (all ones while S3 is the identity)
print(f"dZ3/dA2: expect W3.T\n{tape.gradient(Z3, A2)}") # ones @ W3.T, i.e. row-sums of W3
print(f"dA2/dZ2: {tape.gradient(A2, Z2)}") # ones @ S2.T, i.e. row-sums of S2 (all ones while S2 is the identity)
print(f"dZ2/dA1: expect W2.T\n{tape.gradient(Z2, A1)}") # ones @ W2.T, i.e. row-sums of W2
print(f"dA1/dZ1: {tape.gradient(A1, Z1)}") # ones @ S1.T, i.e. row-sums of S1 (all ones while S1 is the identity)
print(f"dZ1/dX: expect W1.T\n{tape.gradient(Z1, X)}")  # ones @ W1.T, i.e. row-sums of W1
print("--")
# Weight gradients of a non-scalar target: input.T @ ones, so row i of the result is the
# sum over the n samples of input feature i, repeated across the columns.
print(f"dZ3/dW3: expect ~sum(n)<A2>\n{tape.gradient(Z3, W3)}") # expect sum over n of A2 = n * A2[0], one value per row
print(f"dZ2/dW2: expect ~sum(n)<A1>\n{tape.gradient(Z2, W2)}") # expect sum over n of A1 = n * A1[0], one value per row
print(f"dZ1/dW1: expect ~sum(n)<X>\n{tape.gradient(Z1, W1)}") # expect sum over n of X  = [1]*7 = [7], ie: [...]*n

Z1: [[2.65       1.3        0.65000004]
 [2.65       1.3        0.65000004]
 [2.65       1.3        0.65000004]
 [2.65       1.3        0.65000004]
 [2.65       1.3        0.65000004]
 [2.65       1.3        0.65000004]
 [2.65       1.3        0.65000004]]
Z2: [[4.21      2.17      1.1175001]
 [4.21      2.17      1.1175001]
 [4.21      2.17      1.1175001]
 [4.21      2.17      1.1175001]
 [4.21      2.17      1.1175001]
 [4.21      2.17      1.1175001]
 [4.21      2.17      1.1175001]]
Z3: [[7.4975004 7.4975004 7.4975004]
 [7.4975004 7.4975004 7.4975004]
 [7.4975004 7.4975004 7.4975004]
 [7.4975004 7.4975004 7.4975004]
 [7.4975004 7.4975004 7.4975004]
 [7.4975004 7.4975004 7.4975004]
 [7.4975004 7.4975004 7.4975004]]
--
A1: [[2.65       1.3        0.65000004]
 [2.65       1.3        0.65000004]
 [2.65       1.3        0.65000004]
 [2.65       1.3        0.65000004]
 [2.65       1.3        0.65000004]
 [2.65       1.3        0.65000004]
 [2.65       1.3        0.65000004]]
A2: [[4.21 

In [None]:
# simulating an actual training loop, with a
# multi-layer simple model with activation function modelled as a matrix multiplication
#
# Two losses are compared: a hand-written one (`loss`) and the Keras MSE (`loss2`), together
# with the gradients each one produces at the output layer and for the last weight matrix.
#W1 = tf.Variable(tf.ones(shape=(3,3)), dtype=tf.float32)
W1 = tf.Variable([[1, 0.5, 0.25],
                  [0.75, 0.5, 0.3],
                  [0.9, 0.3, 0.1]], dtype=tf.float32)
S1 = tf.Variable(tf.linalg.diag([1., 1., 1.]), dtype=tf.float32)
#S1 = tf.Variable(tf.linalg.diag([2., 0., 1.]), dtype=tf.float32)
#W2 = tf.Variable(tf.ones(shape=(3,3)), dtype=tf.float32)
W2 = tf.Variable([[1, 0.5, 0.25],
                  [0.75, 0.5, 0.3],
                  [0.9, 0.3, 0.1]], dtype=tf.float32)
S2 = tf.Variable(tf.linalg.diag([1., 1., 1.]), dtype=tf.float32)
#W3 = tf.Variable(tf.ones(shape=(3,3)), dtype=tf.float32)
W3 = tf.Variable([[1, 0.5, 0.25],
                  [0.75, 0.5, 0.3],
                  [0.9, 0.3, 0.1]], dtype=tf.float32)
S3 = tf.Variable(tf.linalg.diag([1., 1., 1.]), dtype=tf.float32)

X = tf.ones(shape=(7,3))
# NOTE: Y has a single column while the model emits 3 outputs per sample, so (Y - Y_pred)
# broadcasts the same target value across all 3 outputs.
Y = tf.ones(shape=(7,1))
n = X.shape[0]

with tf.GradientTape(persistent=True) as tape:
  Z1 = tf.matmul(X, W1)
  A1 = tf.matmul(Z1, S1)
  Z2 = tf.matmul(A1, W2)
  A2 = tf.matmul(Z2, S2)
  Z3 = tf.matmul(A2, W3)
  # ReLU on the output layer; built entirely from TF ops so nothing round-trips through numpy
  A3 = tf.where(Z3 > 0, Z3, tf.zeros_like(Z3)) #tf.matmul(Z3, S3)

  Y_pred = A3
  # half the squared error, summed over the 3 outputs and averaged over the n samples
  loss = 1/(2*n) * tf.reduce_sum((Y - Y_pred)**2)  # MSE loss
  # tf.keras.losses.MSE is a function that only averages over the last axis and returns one
  # value per sample; reduce over the batch as well so that loss2 is a scalar like `loss`.
  loss2 = tf.reduce_mean(tf.keras.losses.MSE(Y, Y_pred))


print(f"n: {n}")
print(f"loss: {loss}, {loss2}")
print("--")
print(f"Y:\n{Y}")
print(f"Y_hat:\n{Y_pred}")
print(f"Error:\n{Y - Y_pred}")
print("--")
# NOTE: the "expect Y-Y_hat" labels are loose. For `loss` the gradient is (Y_hat - Y)/n
# (opposite sign, scaled by 1/n); for `loss2` it is 2*(Y_hat - Y)/(n*3). The dZ3 versions
# are the same values wherever Z3 > 0 and zero elsewhere, because of the ReLU.
print(f"dJ/dA3: expect Y-Y_hat\n{tape.gradient(loss, A3)}")
print(f"dJ/dZ3: expect Y-Y_hat\n{tape.gradient(loss, Z3)}")
print(f"dJ2/dA3: expect Y-Y_hat\n{tape.gradient(loss2, A3)}")
print(f"dJ2/dZ3: expect Y-Y_hat\n{tape.gradient(loss2, Z3)}")
print("--")
# closed-form gradient of `loss` w.r.t. W3, valid while every Z3 entry is positive (ReLU pass-through)
expect_W3 = (1/n) * tf.matmul(tf.transpose(A2), Y_pred - Y)  # note: gives wrong or transposed results in any other form
print(f"dJ/dW3: expected:\n{expect_W3}")
print(f"dJ/dW3: expect mean (error*x)\n{tape.gradient(loss, W3)}")
print(f"dJ2/dW3: expect mean (error*x)\n{tape.gradient(loss2, W3)}")


n: 7
loss: 20.592548370361328, [13.728364 13.728364 13.728364 13.728364 13.728364 13.728364 13.728364]
--
Y:
[[1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]]
Y_hat:
[[6.8432503 3.5252502 1.81525  ]
 [6.8432503 3.5252502 1.81525  ]
 [6.8432503 3.5252502 1.81525  ]
 [6.8432503 3.5252502 1.81525  ]
 [6.8432503 3.5252502 1.81525  ]
 [6.8432503 3.5252502 1.81525  ]
 [6.8432503 3.52525   1.81525  ]]
Error:
[[-5.8432503  -2.5252502  -0.81525004]
 [-5.8432503  -2.5252502  -0.81525004]
 [-5.8432503  -2.5252502  -0.81525004]
 [-5.8432503  -2.5252502  -0.81525004]
 [-5.8432503  -2.5252502  -0.81525004]
 [-5.8432503  -2.5252502  -0.81525004]
 [-5.8432503  -2.52525    -0.81525004]]
--
dJ/dA3: expect Y-Y_hat
[[0.83475006 0.36075005 0.11646429]
 [0.83475006 0.36075005 0.11646429]
 [0.83475006 0.36075005 0.11646429]
 [0.83475006 0.36075005 0.11646429]
 [0.83475006 0.36075005 0.11646429]
 [0.83475006 0.36075005 0.11646429]
 [0.83475006 0.36075002 0.11646429]]
dJ/dZ3: expect Y-Y_hat
[[0.83475006 0.36075005 0.

In [None]:
# Sanity check by hand: sum of squared errors for one row of (Y - Y_pred), using the error
# values printed by the training-loop cell above. Dividing the result by 2 reproduces `loss`,
# and dividing it by 3 reproduces the per-sample Keras MSE value.
(-5.8432503)**2 + (-2.5252502)**2  + (-0.81525004)**2

41.185095268770134

In [None]:
# Element-wise squared error; relies on Y and Y_pred left behind by the training-loop cell.
# Y is (7,1) and broadcasts against the (7,3) predictions.
tf.square(Y - Y_pred)

<tf.Tensor: shape=(7, 3), dtype=float32, numpy=
array([[34.143574 ,  6.3768888,  0.6646326],
       [34.143574 ,  6.3768888,  0.6646326],
       [34.143574 ,  6.3768888,  0.6646326],
       [34.143574 ,  6.3768888,  0.6646326],
       [34.143574 ,  6.3768888,  0.6646326],
       [34.143574 ,  6.3768888,  0.6646326],
       [34.143574 ,  6.3768873,  0.6646326]], dtype=float32)>

In [None]:
# Mean of the squared error over the output axis only: one value per sample, matching what
# tf.keras.losses.MSE(Y, Y_pred) returns.
tf.reduce_mean(tf.square(Y - Y_pred), axis=1)

<tf.Tensor: shape=(7,), dtype=float32, numpy=
array([13.728364, 13.728364, 13.728364, 13.728364, 13.728364, 13.728364,
       13.728364], dtype=float32)>

In [None]:
# Hypothesis check: dividing the per-row sum by the batch size (7) does NOT reproduce the
# per-sample values returned by tf.keras.losses.MSE(Y, Y_pred).
tf.reduce_sum(tf.square(Y - Y_pred), axis=1)/7

<tf.Tensor: shape=(7,), dtype=float32, numpy=
array([5.8835845, 5.8835845, 5.8835845, 5.8835845, 5.8835845, 5.8835845,
       5.8835845], dtype=float32)>

In [None]:
# Hypothesis check: dividing the per-row sum by the number of outputs (3) DOES reproduce the
# per-sample values returned by tf.keras.losses.MSE(Y, Y_pred), i.e. that function averages
# over the last axis and not over the batch.
tf.reduce_sum(tf.square(Y - Y_pred), axis=1)/3

<tf.Tensor: shape=(7,), dtype=float32, numpy=
array([13.728364, 13.728364, 13.728364, 13.728364, 13.728364, 13.728364,
       13.728364], dtype=float32)>