In [8]:
#题目描述：

#假设我们有一个简单的三层全连接神经网络，用于分类任务。网络结构如下：

#输入层：3个神经元
#隐藏层1：4个神经元，使用 ReLU 激活函数
#隐藏层2：2个神经元，使用 ReLU 激活函数
#输出层：3个神经元，使用 Softmax 激活函数

In [1]:
import numpy as np

In [7]:
#relu激活函数
def relu(a):
  return np.maximum(0, a)

In [11]:
#softmax激活函数
def softmax(b):
  z = np.exp(b - np.max(b))
  return z / np.sum(z)

In [21]:
#前向传播求输出y_pred
def forward(W1, W2, W3, b1, b2, b3, X):
  Z1 = np.dot(X, W1.T) + b1.T # ndarray(2,4)
  H1 = relu(Z1) #ndarray(2,4)

  Z2 = np.dot(H1, W2.T) + b2 # ndarray(2,2)
  H2 = relu(Z2) #ndarray(2,2)

  Z3 = np.dot(H2, W3.T) + b3.T
  y_pred = softmax(Z3) #ndarray(2,3)

  return Z1, H1, Z2, H2, Z3, y_pred

In [17]:
#求损失loss函数
def compute_loss(y_pred, y_true):
  loss = (y_pred - y_true)**2 / 2 #adarray(2,3)
  return loss

In [32]:
#反向传播，求参数导数
def backward(y_pred, y_true, H1, H2, Z1, Z2, Z3, W2, W3, X):
  y_diff = y_pred - y_true #ndarray (2,3)

  dz3 = y_diff #ndarray(2,3)
  b3_grad = dz3.mean(axis=0, keepdims=True).T #abarray(3,1)
  W3_grad = np.dot(dz3.T, H2) #ndarray(3,2)

  dh2 = np.dot(dz3, W3) #shape(2,2)
  dz2 = dh2 * (Z2>0) #shape(2,2)
  b2_grad = np.mean(dz2, axis=0, keepdims=True).T #shape(2,1)
  W2_grad = np.dot(dz2.T, H1)

  dh1 = np.dot(dz2.T, W2) #shape(2,4)
  dz1 = dh1 * (Z1>0) #shape(2,4)
  b1_grad = np.mean(dz1, axis=0, keepdims=True).T #shape(4,1)
  W1_grad = np.dot(dz1.T, X) #shape(4,3)

  return W1_grad, W2_grad, W3_grad, b1_grad, b2_grad, b3_grad

In [34]:
#更新参数weight, bias
def update_params(W1, W2, W3, b1, b2, b3, learning_rate):
  W1 -= learning_rate * W1_grad
  W2 -= learning_rate * W2_grad
  W3 -= learning_rate * W3_grad

  b1 -= learning_rate * b1_grad
  b2 -= learning_rate * b2_grad
  b3 -= learning_rate * b3_grad
  return W1, W2, W3, b1, b2, b3

In [40]:
X = np.array([[0.2, 0.4, 0.1],
        [0.7, 0.3, 0.8]])   #ndarray (2, 3)

y_true = np.array([[1, 0, 0],
          [0, 0, 1]])   # ndarray(2, 3)

W1 = np.array([[0.1, 0.2, 0.3],
        [0.4, 0.5, 0.6],
        [0.7, 0.8, 0.9],    # ndarray(4, 3)
        [0.2, 0.3, 0.1]])   #（输入层到隐藏层1）

b1 = np.array([[0.1],
        [0.2],
        [0.3],  # ndarray(4, 1)
        [0.1]]) #（隐藏层1 偏置）

W2 = np.array([[0.5, 0.4, 0.3, 0.2], # ndarray(2, 4)
        [0.1, 0.6, 0.7, 0.8]]) #（隐藏层1 到隐藏层2）

b2 = np.array([[0.2],  # ndarray(2, 1)
        [0.3]])  #（隐藏层2 偏置）

W3 = np.array([[0.9, 0.8],
        [0.7, 0.6],   # ndarray(3, 2)
        [0.5, 0.4]])   #（隐藏层2 到输出层）

b3 = np.array([[0.1],
        [0.2],  # ndarray(3, 1)
        [0.3]]) #（输出层偏置）

learning_rate = 0.02

epochs = 40

for epoch in range(epochs):
  Z1, H1, Z2, H2, Z3, y_pred = forward(W1, W2, W3, b1, b2, b3, X)
  loss = compute_loss(y_pred, y_true)

  W1_grad, W2_grad, W3_grad, b1_grad, b2_grad, b3_grad = backward(y_pred, y_true, H1, H2, Z1, Z2, Z3, W2, W3, X)
  W1, W2, W3, b1, b2, b3 = update_params(W1, W2, W3, b1, b2, b3, learning_rate)

  print("epoch:", epoch + 1,
    "loss", np.round(loss.mean(), 4),
    "W1_grad", np.round(W1_grad.flatten(), 2),
    "W2_grad", np.round(W2_grad.flatten(), 2),
    "W3_grad", np.round(W3_grad.flatten(), 2),
    "b1_grad", np.round(b1_grad.flatten(), 2),
    "b2_grad", np.round(b2_grad.flatten(), 2),
    "b3_grad", np.round(b3_grad.flatten(), 2)
  )

epoch: 1 loss 0.158 W1_grad [-0.3  -0.24 -0.29 -0.17 -0.14 -0.16 -0.09 -0.08 -0.09 -0.02 -0.02 -0.02] W2_grad [-0.11 -0.26 -0.41 -0.15 -0.08 -0.19 -0.31 -0.12] W3_grad [-0.02 -0.04  0.41  0.66 -1.38 -2.23] b1_grad [-0.34 -0.2  -0.11 -0.03] b2_grad [-0.31 -0.26] b3_grad [-0.22  0.14 -0.42]
epoch: 2 loss 0.1556 W1_grad [-0.3  -0.25 -0.3  -0.19 -0.16 -0.19 -0.12 -0.1  -0.12 -0.05 -0.04 -0.05] W2_grad [-0.13 -0.29 -0.46 -0.16 -0.11 -0.25 -0.38 -0.14] W3_grad [-0.03 -0.04  0.39  0.63 -1.38 -2.21] b1_grad [-0.35 -0.22 -0.14 -0.06] b2_grad [-0.32 -0.28] b3_grad [-0.22  0.14 -0.41]
epoch: 3 loss 0.153 W1_grad [-0.31 -0.25 -0.3  -0.22 -0.18 -0.21 -0.15 -0.12 -0.15 -0.08 -0.06 -0.08] W2_grad [-0.15 -0.33 -0.51 -0.17 -0.13 -0.3  -0.46 -0.16] W3_grad [-0.03 -0.04  0.38  0.59 -1.39 -2.2 ] b1_grad [-0.35 -0.25 -0.17 -0.09] b2_grad [-0.34 -0.3 ] b3_grad [-0.22  0.13 -0.4 ]
epoch: 4 loss 0.1501 W1_grad [-0.31 -0.25 -0.31 -0.24 -0.19 -0.24 -0.18 -0.14 -0.18 -0.1  -0.08 -0.1 ] W2_grad [-0.17 -0.36 -0.56