为了让各部分实验相互独立，这里将初始化、正则化、梯度检验三部分的工具函数分别拆分为独立的类。

### 软件包

In [1]:
# -*- coding: utf-8 -*-

import numpy as np
import matplotlib.pyplot as plt
import sklearn
import sklearn.datasets
import scipy.io as sio
from sympy.abc import theta

### 初始化 init_utils

In [2]:
# 用类的方法来写
class init_utils:
    """Utilities for the weight-initialisation experiment.

    Wraps a fixed three-layer network
    (LINEAR -> RELU -> LINEAR -> RELU -> LINEAR -> SIGMOID) together with a
    toy data set and plotting helpers.

    ``self.parameters`` must contain ``W1, b1, W2, b2, W3, b3`` before
    ``forward_propagation`` is called; this class does not create them.
    """

    def __init__(self, learning_rate=0.01):
        # Populated by the caller with the keys W1, b1, W2, b2, W3, b3.
        self.parameters = {}
        # Step size used by update_parameters when none is passed explicitly.
        self.learning_rate = learning_rate

    @staticmethod
    def sigmoid(z):
        """Return the element-wise logistic sigmoid of ``z``."""
        # Clip first so that np.exp cannot overflow for large-magnitude input.
        z = np.clip(z, -500, 500)
        s = 1 / (1 + np.exp(-z))
        return s

    @staticmethod
    def relu(z):
        """Return the element-wise ``max(0, z)``."""
        s = np.maximum(0, z)
        return s

    @staticmethod
    def comput_loss(Y_hat, Y):
        """Return the mean binary cross-entropy.

        Args:
            Y_hat: Predicted probabilities (output of the last layer).
            Y: True 0/1 labels; ``Y.shape[1]`` is used as the sample count.

        Note:
            ``Y_hat`` is not clipped here, so a prediction of exactly 0 or 1
            yields ``inf``/``nan`` terms from ``np.log``.
        """
        m = Y.shape[1]
        logprobs = np.multiply(-np.log(Y_hat), Y) + np.multiply(-np.log(1 - Y_hat), 1 - Y)
        loss = 1. / m * np.sum(logprobs)
        return loss

    def forward_propagation(self, X):
        """Run the forward pass of the three-layer network.

        Args:
            X: Input matrix with one sample per column.

        Returns:
            ``(a3, cache)`` where ``a3`` is the sigmoid output and ``cache``
            is the tuple ``(z1, a1, W1, b1, z2, a2, W2, b2, z3, a3, W3, b3)``
            consumed by ``backward_propagation``.
        """
        W1, b1 = self.parameters['W1'], self.parameters['b1']
        W2, b2 = self.parameters['W2'], self.parameters['b2']
        W3, b3 = self.parameters['W3'], self.parameters['b3']

        z1 = np.dot(W1, X) + b1
        a1 = self.relu(z1)
        z2 = np.dot(W2, a1) + b2
        a2 = self.relu(z2)
        z3 = np.dot(W3, a2) + b3
        a3 = self.sigmoid(z3)

        cache = (z1, a1, W1, b1, z2, a2, W2, b2, z3, a3, W3, b3)
        return a3, cache

    @staticmethod
    def backward_propagation(X, Y, cache):
        """Return the gradients of the mean cross-entropy loss.

        Args:
            X: Input matrix with one sample per column.
            Y: True 0/1 labels, same column layout as the network output.
            cache: Tuple returned by ``forward_propagation``.

        Returns:
            Dict with keys ``dZ3, dW3, db3, da2, dZ2, dW2, db2, da1, dZ1,
            dW1, db1``.
        """
        (z1, a1, W1, b1, z2, a2, W2, b2, z3, a3, W3, b3) = cache
        m = X.shape[1]

        # comput_loss averages over the m samples, so the 1/m factor has to
        # enter the gradient as well; applying it to dz3 propagates it to
        # every downstream gradient.
        dz3 = 1. / m * (a3 - Y)
        dW3 = np.dot(dz3, a2.T)
        db3 = np.sum(dz3, axis=1, keepdims=True)

        da2 = np.dot(W3.T, dz3)
        # ReLU derivative: 1 where the activation is positive, 0 elsewhere.
        dz2 = np.multiply(da2, np.int64(a2 > 0))
        dW2 = np.dot(dz2, a1.T)
        db2 = np.sum(dz2, axis=1, keepdims=True)

        da1 = np.dot(W2.T, dz2)
        dz1 = np.multiply(da1, np.int64(a1 > 0))
        dW1 = np.dot(dz1, X.T)
        db1 = np.sum(dz1, axis=1, keepdims=True)

        gradients = {
            "dZ3": dz3, "dW3": dW3, "db3": db3,
            "da2": da2, "dZ2": dz2, "dW2": dW2, "db2": db2,
            "da1": da1, "dZ1": dz1, "dW1": dW1, "db1": db1
        }

        return gradients

    def update_parameters(self, gradients, learning_rate=None):
        """Apply one gradient-descent step to ``self.parameters`` in place.

        Args:
            gradients: Dict holding ``dW<l>`` and ``db<l>`` for every layer.
            learning_rate: Step size; defaults to ``self.learning_rate``.
        """
        if learning_rate is None:
            learning_rate = self.learning_rate

        # Every layer contributes one W and one b entry.
        num_layers = len(self.parameters) // 2
        for layer in range(1, num_layers + 1):
            w_key, b_key = "W" + str(layer), "b" + str(layer)
            self.parameters[w_key] = self.parameters[w_key] - learning_rate * gradients["d" + w_key]
            self.parameters[b_key] = self.parameters[b_key] - learning_rate * gradients["d" + b_key]

    def predict(self, X, y):
        """Predict 0/1 labels for ``X`` and print the accuracy against ``y``.

        Args:
            X: Input matrix with one sample per column.
            y: True labels; row 0 is compared with the predictions.

        Returns:
            Integer array of shape ``(1, m)`` with the predicted labels.
        """
        m = X.shape[1]
        # The builtin int replaces np.int, which NumPy 1.24 removed.
        p = np.zeros((1, m), dtype=int)

        Y_hat, caches = self.forward_propagation(X)

        # Threshold the probabilities of the single output unit at 0.5.
        p[0, :] = Y_hat[0, :] > 0.5

        print("Accuracy: " + str(np.mean((p[0, :] == y[0, :]))))

        return p

    @staticmethod
    def load_dataset(is_plot=True):
        """Generate the noisy concentric-circles binary classification data.

        The training set has 300 samples (seed 1) and the test set has 100
        samples (seed 2), both produced by ``sklearn.datasets.make_circles``
        with ``noise=.05``.

        Args:
            is_plot: When true, scatter-plot the training set
                (``plt.show()`` is not called here).

        Returns:
            ``(train_X, train_Y, test_X, test_Y)`` with the feature matrices
            transposed to one sample per column and the labels reshaped to a
            single row.
        """
        np.random.seed(1)  # fixed seed: the training set is reproducible
        train_X, train_Y = sklearn.datasets.make_circles(n_samples=300, noise=.05)
        np.random.seed(2)  # different seed so the test set differs
        test_X, test_Y = sklearn.datasets.make_circles(n_samples=100, noise=.05)

        if is_plot:
            plt.scatter(train_X[:, 0], train_X[:, 1], c=train_Y, s=40, cmap=plt.cm.Spectral)

        train_X = train_X.T
        train_Y = train_Y.reshape((1, train_Y.shape[0]))
        test_X = test_X.T
        test_Y = test_Y.reshape((1, test_Y.shape[0]))

        return train_X, train_Y, test_X, test_Y

    @staticmethod
    def plot_decision_boundary(model, X, y):
        """Plot the decision regions of ``model`` together with the data.

        Args:
            model: Callable that receives an ``(n_points, 2)`` array of grid
                coordinates and returns one value per point.
            X: Data with feature 1 in row 0 and feature 2 in row 1.
            y: Values used to colour the scattered data points.
        """
        # Pad the plotted area by one unit around the data.
        x_min, x_max = X[0, :].min() - 1, X[0, :].max() + 1
        y_min, y_max = X[1, :].min() - 1, X[1, :].max() + 1
        h = 0.01  # grid spacing

        xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

        # Evaluate the model on every grid point, then restore the grid shape.
        Z = model(np.c_[xx.ravel(), yy.ravel()])
        Z = Z.reshape(xx.shape)

        plt.contourf(xx, yy, Z, cmap=plt.cm.Spectral)
        plt.ylabel('x2')
        plt.xlabel('x1')
        plt.scatter(X[0, :], X[1, :], c=y, cmap=plt.cm.Spectral)
        plt.show()

    def predict_dec(self, X):
        """Classify points for ``plot_decision_boundary``.

        Unlike ``predict`` this prints nothing and returns no probabilities.

        Args:
            X: Array with one point per row; it is transposed before the
                forward pass.

        Returns:
            Boolean array that is true where the network output exceeds 0.5.
        """
        Y_hat, cache = self.forward_propagation(X.T)
        predictions = (Y_hat > 0.5)
        return predictions

# init_utils.load_dataset(is_plot=True)

### 正则化 reg_utils

In [3]:
class reg_utils:
    """Utilities for the regularisation experiment.

    Wraps a three-layer network
    (LINEAR -> RELU -> LINEAR -> RELU -> LINEAR -> SIGMOID) with parameter
    initialisation, a data loader and plotting helpers.
    """

    def __init__(self, parameters=None, learning_rate=0.01, layers_dims=None):
        """Create the helper.

        Args:
            parameters: Optional dict of existing parameters. A dict is
                shallow-copied; any other value results in an empty dict, to
                be filled by ``initialize_parameters``.
            learning_rate: Step size used by ``update_parameters`` when none
                is passed explicitly.
            layers_dims: Layer sizes including the input layer; defaults to
                ``[2, 3, 2, 1]``.
        """
        self.learning_rate = learning_rate
        # A None sentinel avoids sharing one mutable default list across
        # instances.
        self.layers_dims = [2, 3, 2, 1] if layers_dims is None else layers_dims
        self.parameters = dict(parameters) if isinstance(parameters, dict) else {}

    @staticmethod
    def sigmoid(z):
        """Return the element-wise logistic sigmoid of ``z``."""
        # Clip first so that np.exp cannot overflow for large-magnitude input.
        z = np.clip(z, -500, 500)
        s = 1 / (1 + np.exp(-z))
        return s

    @staticmethod
    def relu(z):
        """Return the element-wise ``max(0, z)``."""
        s = np.maximum(0, z)
        return s

    def initialize_parameters(self):
        """Initialise ``W<l>``/``b<l>`` for every layer in ``layers_dims``.

        Weights are standard-normal draws divided by the square root of the
        previous layer size; biases are zero. The global NumPy seed is reset
        to 3, so the result is reproducible.

        Returns:
            The populated ``self.parameters`` dict.
        """
        np.random.seed(3)
        # layers_dims includes the input layer, so there are
        # len(layers_dims) - 1 weight matrices: layers 1 .. num_dims - 1.
        num_dims = len(self.layers_dims)

        for l in range(1, num_dims):
            fan_in = self.layers_dims[l - 1]
            fan_out = self.layers_dims[l]
            self.parameters["W" + str(l)] = np.random.randn(fan_out, fan_in) / np.sqrt(fan_in)
            self.parameters["b" + str(l)] = np.zeros((fan_out, 1))

            # Internal sanity checks on the shapes just created.
            assert(self.parameters["W" + str(l)].shape == (fan_out, fan_in))
            assert(self.parameters["b" + str(l)].shape == (fan_out, 1))

        return self.parameters

    def forward_propagation(self, X):
        """Run the forward pass of the three-layer network.

        Args:
            X: Input matrix with one sample per column.

        Returns:
            ``(a3, cache)`` where ``a3`` is the sigmoid output and ``cache``
            is the tuple ``(z1, a1, W1, b1, z2, a2, W2, b2, z3, a3, W3, b3)``
            consumed by ``backward_propagation``.
        """
        W1 = self.parameters["W1"]
        b1 = self.parameters["b1"]
        W2 = self.parameters["W2"]
        b2 = self.parameters["b2"]
        W3 = self.parameters["W3"]
        b3 = self.parameters["b3"]

        # The hidden layers use ReLU: backward_propagation differentiates
        # them with the (a > 0) mask, and the class defines no tanh method.
        z1 = np.dot(W1, X) + b1
        a1 = self.relu(z1)
        z2 = np.dot(W2, a1) + b2
        a2 = self.relu(z2)
        z3 = np.dot(W3, a2) + b3
        a3 = self.sigmoid(z3)

        cache = (z1, a1, W1, b1, z2, a2, W2, b2, z3, a3, W3, b3)

        return a3, cache

    @staticmethod
    def computer_cost(Y_hat, Y):
        """Return the mean binary cross-entropy.

        Args:
            Y_hat: Predicted probabilities (output of the last layer).
            Y: True 0/1 labels; ``Y.shape[1]`` is used as the sample count.
        """
        m = Y.shape[1]
        logprobs = np.multiply(np.log(Y_hat), Y) + np.multiply(np.log(1 - Y_hat), (1 - Y))
        cost = - np.sum(logprobs) / m

        return cost

    @staticmethod
    def backward_propagation(X, Y, cache):
        """Return the gradients of the mean cross-entropy cost.

        Args:
            X: Input matrix with one sample per column.
            Y: True 0/1 labels, same column layout as the network output.
            cache: Tuple returned by ``forward_propagation``.

        Returns:
            Dict with keys ``dZ3, dW3, db3, da2, dZ2, dW2, db2, da1, dZ1,
            dW1, db1``.
        """
        (z1, a1, W1, b1, z2, a2, W2, b2, z3, a3, W3, b3) = cache
        m = X.shape[1]

        # computer_cost averages over the m samples, so the 1/m factor has
        # to enter the gradient as well; applying it to dz3 propagates it to
        # every downstream gradient.
        dz3 = 1. / m * (a3 - Y)
        dW3 = np.dot(dz3, a2.T)
        db3 = np.sum(dz3, axis=1, keepdims=True)

        da2 = np.dot(W3.T, dz3)
        # ReLU derivative: 1 where the activation is positive, 0 elsewhere.
        dz2 = np.multiply(da2, np.int64(a2 > 0))
        dW2 = np.dot(dz2, a1.T)
        db2 = np.sum(dz2, axis=1, keepdims=True)

        da1 = np.dot(W2.T, dz2)
        dz1 = np.multiply(da1, np.int64(a1 > 0))
        dW1 = np.dot(dz1, X.T)
        db1 = np.sum(dz1, axis=1, keepdims=True)

        gradients = {
            "dZ3": dz3, "dW3": dW3, "db3": db3,
            "da2": da2, "dZ2": dz2, "dW2": dW2, "db2": db2,
            "da1": da1, "dZ1": dz1, "dW1": dW1, "db1": db1
        }

        return gradients

    def update_parameters(self, gradients, learning_rate=None):
        """Apply one gradient-descent step to ``self.parameters`` in place.

        Args:
            gradients: Dict holding ``dW<l>`` and ``db<l>`` for every layer.
            learning_rate: Step size; defaults to ``self.learning_rate``.
        """
        if learning_rate is None:
            learning_rate = self.learning_rate

        # Every layer contributes one W and one b entry.
        num_layers = len(self.parameters) // 2
        for layer in range(1, num_layers + 1):
            w_key, b_key = "W" + str(layer), "b" + str(layer)
            self.parameters[w_key] = self.parameters[w_key] - learning_rate * gradients["d" + w_key]
            self.parameters[b_key] = self.parameters[b_key] - learning_rate * gradients["d" + b_key]

    @staticmethod
    def load_2D_dataset(is_plot=False):
        """Load the 2-D data set from ``datasets/data.mat``.

        Args:
            is_plot: When true, scatter-plot the training set
                (``plt.show()`` is not called here).

        Returns:
            ``(train_X, train_Y, test_X, test_Y)``: the ``X``, ``y``,
            ``Xval`` and ``yval`` entries of the file, each transposed.
        """
        data = sio.loadmat('datasets/data.mat')
        train_X = data['X'].T
        train_Y = data['y'].T
        test_X = data['Xval'].T
        test_Y = data['yval'].T
        if is_plot:
            plt.scatter(train_X[0, :], train_X[1, :], c=train_Y, s=40, cmap=plt.cm.Spectral)
        return train_X, train_Y, test_X, test_Y

    def predict(self, X, y):
        """Predict 0/1 labels for ``X`` and print the accuracy against ``y``.

        Args:
            X: Input matrix with one sample per column.
            y: True labels; row 0 is compared with the predictions.

        Returns:
            Integer array of shape ``(1, m)`` with the predicted labels.
        """
        m = X.shape[1]
        # The builtin int replaces np.int, which NumPy 1.24 removed.
        p = np.zeros((1, m), dtype=int)

        Y_hat, caches = self.forward_propagation(X)

        # Threshold the probabilities of the single output unit at 0.5.
        p[0, :] = Y_hat[0, :] > 0.5

        print("Accuracy: " + str(np.mean((p[0, :] == y[0, :]))))

        return p

    @staticmethod
    def plot_decision_boundary(model, X, y):
        """Plot the decision regions of ``model`` together with the data.

        Args:
            model: Callable that receives an ``(n_points, 2)`` array of grid
                coordinates and returns one value per point.
            X: Data with feature 1 in row 0 and feature 2 in row 1.
            y: Values used to colour the scattered data points.
        """
        # Pad the plotted area by one unit around the data.
        x_min, x_max = X[0, :].min() - 1, X[0, :].max() + 1
        y_min, y_max = X[1, :].min() - 1, X[1, :].max() + 1
        h = 0.01  # grid spacing

        xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

        # Evaluate the model on every grid point, then restore the grid shape.
        Z = model(np.c_[xx.ravel(), yy.ravel()])
        Z = Z.reshape(xx.shape)

        plt.contourf(xx, yy, Z, cmap=plt.cm.Spectral)
        plt.ylabel('x2')
        plt.xlabel('x1')
        plt.scatter(X[0, :], X[1, :], c=y, cmap=plt.cm.Spectral)
        plt.show()

    def predict_dec(self, X):
        """Classify points for ``plot_decision_boundary``.

        Args:
            X: Array with one point per row; it is transposed before the
                forward pass.

        Returns:
            Boolean array that is true where the network output exceeds 0.5.
        """
        Y_hat, cache = self.forward_propagation(X.T)
        predictions = (Y_hat > 0.5)
        return predictions

# reg_utils.load_2D_dataset(is_plot=True)


### 梯度检验 gc_utils

In [38]:
class gc_utils:
    """Utilities for gradient checking on a three-layer network.

    Converts between the parameter/gradient dictionaries
    (``W1, b1, W2, b2, W3, b3``) and a single column vector, so that a
    numerical gradient can perturb one entry at a time.
    """

    # Fixed flattening order shared by all conversions.
    _PARAM_KEYS = ("W1", "b1", "W2", "b2", "W3", "b3")
    _GRAD_KEYS = ("dW1", "db1", "dW2", "db2", "dW3", "db3")
    # Layout of the reference 4 -> 5 -> 3 -> 1 network:
    # 5*4 + 5 + 3*5 + 3 + 1*3 + 1 = 47 entries in total.
    _DEFAULT_SHAPES = {
        "W1": (5, 4), "b1": (5, 1),
        "W2": (3, 5), "b2": (3, 1),
        "W3": (1, 3), "b3": (1, 1),
    }

    def __init__(self, parameters):
        self.parameters = parameters

    @staticmethod
    def sigmoid(z):
        """Return the element-wise logistic sigmoid of ``z``."""
        # Clip first so that np.exp cannot overflow for large-magnitude input.
        z = np.clip(z, -500, 500)
        s = 1 / (1 + np.exp(-z))
        return s

    @staticmethod
    def relu(z):
        """Return the element-wise ``max(0, z)``."""
        s = np.maximum(0, z)
        return s

    def dictionary_to_vector(self):
        """Flatten ``self.parameters`` into one column vector.

        Returns:
            ``(vector, keys)`` where ``vector`` has shape ``(n, 1)`` and
            ``keys[i]`` names the parameter that entry ``i`` came from
            (handy when debugging a failing gradient check).
        """
        keys = []
        columns = []
        for key in self._PARAM_KEYS:
            column = np.reshape(self.parameters[key], (-1, 1))
            keys.extend([key] * column.shape[0])
            columns.append(column)

        return np.concatenate(columns, axis=0), keys

    def _parameter_shapes(self):
        """Return the shapes in ``self.parameters``, or the default layout."""
        current = self.parameters
        if isinstance(current, dict) and all(key in current for key in self._PARAM_KEYS):
            return {key: np.shape(current[key]) for key in self._PARAM_KEYS}
        return dict(self._DEFAULT_SHAPES)

    @staticmethod
    def _total_size(shapes):
        """Return the number of scalar entries described by ``shapes``."""
        return sum(int(np.prod(shape)) for shape in shapes.values())

    def vector_to_dictionary(self, theta):
        """Rebuild the parameter dictionary from a flattened vector.

        The shapes already stored in ``self.parameters`` are reused when they
        account for exactly ``theta``'s size. Otherwise the reference
        4 -> 5 -> 3 -> 1 layout (47 entries) is used.

        Args:
            theta: Flattened parameters in ``W1, b1, W2, b2, W3, b3`` order.

        Returns:
            The new ``self.parameters`` dict. ``self.parameters`` is rebound
            to a fresh dict rather than mutated.

        Raises:
            ValueError: If ``theta``'s size matches neither layout.
        """
        flat = np.reshape(theta, -1)

        shapes = self._parameter_shapes()
        if flat.size != self._total_size(shapes):
            shapes = dict(self._DEFAULT_SHAPES)
        expected = self._total_size(shapes)
        if flat.size != expected:
            raise ValueError(
                "theta has %d entries but the parameter layout needs %d"
                % (flat.size, expected)
            )

        rebuilt = {}
        start = 0
        for key in self._PARAM_KEYS:
            size = int(np.prod(shapes[key]))
            rebuilt[key] = flat[start:start + size].reshape(shapes[key])
            start += size

        self.parameters = rebuilt
        return self.parameters

    def gradients_to_vector(self, gradients):
        """Flatten the gradient dictionary into one column vector.

        Args:
            gradients: Dict holding ``dW1, db1, dW2, db2, dW3, db3``; any
                other keys are ignored.

        Returns:
            Column vector of shape ``(n, 1)`` in the same order as the one
            produced by ``dictionary_to_vector``.
        """
        columns = [np.reshape(gradients[key], (-1, 1)) for key in self._GRAD_KEYS]
        return np.concatenate(columns, axis=0)


# # 测试
# parameters = {
#     "W1": np.random.randn(5, 4),
#     "b1": np.random.randn(5, 1),
#     "W2": np.random.randn(3, 5),
#     "b2": np.random.randn(3, 1),
#     "W3": np.random.randn(1, 3),
#     "b3": np.random.randn(1, 1)
# }
# gradients = {
#     "dW1": np.random.randn(5, 4),
#     "db1": np.random.randn(5, 1),
#     "dW2": np.random.randn(3, 5),
#     "db2": np.random.randn(3, 1),
#     "dW3": np.random.randn(1, 3),
#     "db3": np.random.randn(1, 1)
# }
# 
# # 实例化对象
# utils = gc_utils(parameters)

# 执行测试
# theta = utils.test_dictionary_to_vector()
# utils.test_vector_to_dictionary(theta)
# utils.test_gradients_to_verctor(gradients)



✅ 测试 gradients_to_vector()
theta shape: (47, 1)
theta:
 [[ 0.26420726]
 [-1.38781385]
 [-0.34017428]
 [-2.77613197]
 [-0.94402699]
 [ 0.46871342]
 [-0.57081029]
 [-0.28589654]
 [ 0.66540024]
 [-0.30886063]
 [-0.2374728 ]
 [ 1.54074145]
 [-1.05898817]
 [-0.56024939]
 [ 0.34294975]
 [-0.02619994]
 [-0.37552326]
 [ 0.41785468]
 [ 0.07723412]
 [-0.17817048]
 [ 1.24579233]
 [ 0.32770042]
 [-0.61012564]
 [-0.79818856]
 [ 0.77278608]
 [ 1.25692487]
 [ 2.21832917]
 [-0.33087184]
 [ 1.86246082]
 [-0.17362849]
 [-0.62221911]
 [-0.0276552 ]
 [-1.2857421 ]
 [-1.09168258]
 [-1.62613926]
 [-0.3150761 ]
 [ 0.07025188]
 [ 0.00905691]
 [ 0.22080611]
 [-0.08227922]
 [ 0.09129198]
 [ 0.06030096]
 [ 0.33770794]
 [ 0.28489654]
 [-0.19556954]
 [-0.84500725]
 [ 0.81312262]]
