In [1]:
#考虑一个用于多分类任务的神经网络，其结构如下：

#输入层： 3 个神经元
#隐藏层： 4 个神经元，使用 tanh 激活函数
#输出层： 4 个神经元，使用 Softmax 激活函数

In [2]:
import numpy as np

In [5]:
#sigmoid激活函数计算
def softmax(a):
  max_value = np.max(a, axis=1, keepdims=True) # 求出每行的最大值
  shifted = a - max_value # 每行减去最大值
  z = np.exp(shifted) # 指数转换

  sums = np.sum(z, axis=1, keepdims=True) #这一行的总和
  return z / sums

In [6]:
#tanh激活函数计算
def tanh(a):
  return np.tanh(a)

In [26]:
#正向传播,求输出函数y_pred
def forward(W1, W2, b1, b2, X):
  Z = np.dot(X, W1.T) + b1.T # shape(2,4)
  H = tanh(Z) # ndarray (2,4)

  Zo = np.dot(H, W2.T) + b2.T # ndarray (2,4)
  y_pred = softmax(Zo) # ndarray (2,4)

  return H, y_pred

In [13]:
#求损失函数loss
def compute_loss(y_pred, y_true):
  loss = -np.sum(y_true * np.log(y_pred + 1e-10)) / y_pred.shape[0]
  return loss

In [30]:
#求反向传播，计算梯度
def backward(y_pred, y_true, H, W2, X):
  y_diff = y_pred - y_true

  dzo = y_diff # shape (2,4)
  b2_grad = np.mean(y_diff, axis=0, keepdims=True).T #shape (4,1)
  W2_grad = np.dot(H.T, dzo) #shape

  dh = np.dot(dzo, W2) # ndarray (2,4)
  dz = 1 - np.tanh(dh)**2 # ndarray (2,4)
  b1_grad = np.mean(dz, axis=0, keepdims=True).T
  W1_grad = np.dot(dz.T, X)

  return W1_grad, W2_grad, b1_grad, b2_grad

In [24]:
#更新参数weight,bias
def update_params(W1, W2, b1, b2, W1_grad, W2_grad, b1_grad, b2_grad, learning_rate):
  W1 -= learning_rate * W1_grad
  W2 -= learning_rate * W2_grad
  b1 -= learning_rate * b1_grad
  b2 -= learning_rate * b2_grad
  return W1, W2, b1, b2

In [32]:
X = np.array([[0.1, 0.2, 0.3],
        [0.4, 0.5, 0.6]])

y_true = np.array([[1, 0, 0, 0],
          [0, 0, 1, 0]])

W1 = np.array([[0.1, 0.2, 0.3],
        [0.4, 0.5, 0.6],
        [0.7, 0.8, 0.9],
        [0.2, 0.3, 0.1]]) #(输入层到隐藏层)

b1 = np.array([[0.1],
        [0.2],
        [0.3],
        [0.1]]) #(隐藏层偏置)

W2 = np.array([[0.5, 0.4, 0.3, 0.2],
        [0.1, 0.6, 0.7, 0.8],
        [0.9, 0.8, 0.7, 0.6],
        [0.5, 0.4, 0.3, 0.2]]) #(隐藏层到输出层)

b2 = np.array([[0.2],
        [0.3],
        [0.1],
        [0.2]]) #(输出层偏置)

learning_rate = 0.03

epochs = 100

for epoch in range(epochs):
  H, y_pred = forward(W1, W2, b1, b2, X)
  loss = compute_loss(y_pred, y_true)
  W1_grad, W2_grad, b1_grad, b2_grad = backward(y_pred, y_true, H, W2, X)
  W1, W2, b1, b2 = update_params(W1, W2, b1, b2, W1_grad, W2_grad, b1_grad, b2_grad, learning_rate)

  print("Epochs:", epoch + 1,
    "loss:", np.round(np.mean(loss), 4),
    "W1_grad:", np.round(W1_grad.flatten(), 2),
    "W1_grad:", np.round(W2_grad.flatten(), 2),
    "W1_grad:", np.round(b1_grad.flatten(), 2),
    "W1_grad:", np.round(b2_grad.flatten(), 2)
    )

Epochs: 1 loss: 1.3271 W1_grad: [0.45 0.64 0.82 0.48 0.68 0.87 0.49 0.68 0.87 0.49 0.68 0.87] W1_grad: [-0.13  0.2  -0.17  0.11 -0.27  0.38 -0.31  0.21 -0.4   0.48 -0.36  0.27
 -0.11  0.18 -0.17  0.1 ] W1_grad: [0.94 0.97 0.96 0.95] W1_grad: [-0.33  0.31 -0.15  0.17]
Epochs: 2 loss: 1.3217 W1_grad: [0.45 0.63 0.82 0.48 0.68 0.87 0.49 0.68 0.87 0.49 0.68 0.87] W1_grad: [-0.1   0.17 -0.16  0.09 -0.24  0.36 -0.32  0.2  -0.37  0.47 -0.37  0.27
 -0.08  0.15 -0.15  0.08] W1_grad: [0.93 0.97 0.96 0.95] W1_grad: [-0.32  0.31 -0.16  0.18]
Epochs: 3 loss: 1.3173 W1_grad: [0.45 0.63 0.82 0.48 0.68 0.87 0.49 0.68 0.87 0.49 0.68 0.87] W1_grad: [-0.08  0.14 -0.14  0.08 -0.21  0.34 -0.32  0.19 -0.34  0.46 -0.38  0.27
 -0.06  0.12 -0.13  0.07] W1_grad: [0.93 0.97 0.96 0.96] W1_grad: [-0.31  0.3  -0.17  0.18]
Epochs: 4 loss: 1.3138 W1_grad: [0.44 0.63 0.81 0.48 0.68 0.87 0.49 0.68 0.87 0.49 0.68 0.87] W1_grad: [-0.05  0.11 -0.12  0.06 -0.18  0.31 -0.32  0.19 -0.31  0.44 -0.39  0.26
 -0.03  0.09 -0.11  