In [26]:
import numpy as np

## 初始化神经网络参数(init_layers)

In [27]:
# Layer-by-layer description of the network. Each dict gives the number of
# inputs to the layer, the number of units in it, and the name of the
# activation applied to its output.
# The output layer uses 'sigmoid' so that predictions lie in (0, 1): the
# binary cross-entropy cost takes log(Y_hat) and log(1 - Y_hat), which a
# ReLU output (exactly 0, or values above 1) would turn into inf/nan.
nn_architecture = [
    {'input_dim':2, 'output_dim':4, 'activation':'relu'},
    {'input_dim':4, 'output_dim':6, 'activation':'relu'},
    {'input_dim':6, 'output_dim':6, 'activation':'relu'},
    {'input_dim':6, 'output_dim':4, 'activation':'relu'},
    {'input_dim':4, 'output_dim':1, 'activation':'sigmoid'},
]

In [28]:
def init_layers(nn_architecture, seed=42):
    """Create randomly initialised weights and biases for every layer.

    Args:
        nn_architecture: list of layer dicts; only 'input_dim' and
            'output_dim' are read here.
        seed: value passed to np.random.seed so the draw is reproducible.

    Returns:
        dict mapping 'W<i>' to an (output_dim, input_dim) array and 'b<i>'
        to an (output_dim, 1) array, with layers numbered from 1.
    """
    np.random.seed(seed)
    init_scale = 0.1  # shrink the standard-normal samples
    params = {}

    # For each layer the weights are drawn before the bias; the order matters
    # because both come from the same seeded global generator.
    for layer_no, spec in enumerate(nn_architecture, start=1):
        n_in, n_out = spec['input_dim'], spec['output_dim']
        params[f'W{layer_no}'] = np.random.randn(n_out, n_in) * init_scale
        params[f'b{layer_no}'] = np.random.randn(n_out, 1) * init_scale

    return params

In [29]:
init_layers(nn_architecture) # Initialise the network's weights and biases; the returned dict is displayed as the cell output

{'W1': array([[ 0.04967142, -0.01382643],
        [ 0.06476885,  0.15230299],
        [-0.02341534, -0.0234137 ],
        [ 0.15792128,  0.07674347]]),
 'b1': array([[-0.04694744],
        [ 0.054256  ],
        [-0.04634177],
        [-0.04657298]]),
 'W2': array([[ 0.02419623, -0.19132802, -0.17249178, -0.05622875],
        [-0.10128311,  0.03142473, -0.09080241, -0.14123037],
        [ 0.14656488, -0.02257763,  0.00675282, -0.14247482],
        [-0.05443827,  0.01109226, -0.11509936,  0.0375698 ],
        [-0.06006387, -0.02916937, -0.06017066,  0.18522782],
        [-0.00134972, -0.10577109,  0.08225449, -0.12208436]]),
 'b2': array([[ 0.02088636],
        [-0.19596701],
        [-0.1328186 ],
        [ 0.01968612],
        [ 0.07384666],
        [ 0.01713683]]),
 'W3': array([[-0.01156483, -0.03011037, -0.1478522 , -0.07198442, -0.04606388,
          0.10571222],
        [ 0.03436183, -0.17630402,  0.0324084 , -0.03850823, -0.0676922 ,
          0.06116763],
        [ 0.10309995, 

## 设置激活函数及其导数(sigmoid, relu, sigmoid_backward, relu_backward)

In [30]:
# Activation functions are what give a neural network its non-linear
# computing power; ReLU and Sigmoid are implemented below.

def sigmoid(Z):
    """Element-wise logistic sigmoid, 1 / (1 + exp(-Z)).

    Args:
        Z: scalar or NumPy array of pre-activation values.

    Returns:
        Values in (0, 1) with the same shape as Z; sigmoid(0) is 0.5 and the
        function increases with Z.
    """
    # The exponent must be negated: 1 / (1 + exp(Z)) would be sigmoid(-Z).
    return 1 / (1 + np.exp(-Z))

def relu(Z):
    """Rectified linear unit: element-wise maximum of 0 and Z."""
    floor = 0
    return np.maximum(floor, Z)

def sigmoid_backward(dA, Z):
    """Back-propagate a gradient through the sigmoid activation.

    Args:
        dA: gradient with respect to the activation output.
        Z: pre-activation values that were fed to sigmoid in the forward pass.

    Returns:
        dA * s * (1 - s) with s = sigmoid(Z), i.e. dA multiplied by the
        derivative of the sigmoid.
    """
    s = sigmoid(Z)
    scaled = dA * s
    return scaled * (1 - s)

def relu_backward(dA, Z):
    """Back-propagate a gradient through the ReLU activation.

    Args:
        dA: gradient with respect to the activation output.
        Z: pre-activation values that were fed to relu in the forward pass.

    Returns:
        A copy of dA in which every entry whose Z is <= 0 is set to 0;
        dA itself is not modified.
    """
    grad = np.copy(dA)
    inactive = Z <= 0  # units that were clipped to zero in the forward pass
    grad[inactive] = 0
    return grad

maximum 的用法：
```

>> np.maximum([-2, -1, 0, 1, 2], 0)

array([0, 0, 0, 1, 2])

```
逐元素比较两个输入，并取每个位置上的较大值

## Forward Propagation(single and full layer forward propagation)

In [31]:
def single_layer_forward_propagation(A_prev, W_curr, b_curr, activation='relu'):
    """Run one layer forward: an affine transform followed by an activation.

    Args:
        A_prev: input to this layer (the previous layer's activations).
        W_curr: weight matrix of the current layer.
        b_curr: bias of the current layer.
        activation: name of the activation, 'relu' or 'sigmoid'.

    Returns:
        (A_curr, Z_curr): the activated output and the pre-activation value.
        Z_curr is returned too because back-propagation needs it.

    Raises:
        ValueError: if `activation` is not one of the supported names.
    """
    # Resolve the activation first so an unsupported name fails with a clear
    # message instead of an UnboundLocalError at the return statement.
    if activation == 'relu':
        activation_func = relu
    elif activation == 'sigmoid':
        activation_func = sigmoid
    else:
        raise ValueError('Non-supported activation function: %r' % (activation,))

    # Affine part of the layer: weights times input, plus bias
    Z_curr = np.dot(W_curr, A_prev) + b_curr

    return activation_func(Z_curr), Z_curr

**Note: 这里为什么要同时返回经过激活函数的输出 A_curr 以及没经过激活函数的 Z_curr 呢？因为后面反向传播要用**

In [32]:
# Build the complete forward pass out of the single-layer forward step

def full_forward_propagation(X, params_values, nn_architecture):
    """Push the input through every layer of the network in order.

    Args:
        X: input data fed to the first layer.
        params_values: dict holding 'W<i>' and 'b<i>' for each layer i (1-based).
        nn_architecture: list of layer dicts; 'activation' is read from each.

    Returns:
        (A_last, memory): the output of the final layer, and a dict storing
        for each layer i its input under 'A<i-1>' (so 'A0' is X) and its
        pre-activation value under 'Z<i>'. These are the values the backward
        pass looks up.
    """
    memory = {}
    activations = X  # output of the most recently processed layer

    for layer_no, spec in enumerate(nn_architecture, start=1):
        layer_input = activations

        # The layer's output becomes the next layer's input; pre_activation
        # is the value before the activation function was applied.
        activations, pre_activation = single_layer_forward_propagation(
            layer_input,
            params_values['W' + str(layer_no)],
            params_values['b' + str(layer_no)],
            spec['activation'],
        )

        memory['A' + str(layer_no - 1)] = layer_input
        memory['Z' + str(layer_no)] = pre_activation

    return activations, memory

## 计算损失函数

In [33]:
# The task that follows is a classification problem, so cross-entropy is used as the loss

def get_cost_value(Y_hat, Y):
    """Binary cross-entropy between predictions and labels.

    Args:
        Y_hat: predicted probabilities; the sum is divided by Y_hat.shape[1]
            (presumably one column per example — confirm against the caller).
        Y: ground-truth labels, shaped so that np.dot(Y, log(Y_hat).T) is valid.

    Returns:
        The cost with all length-1 axes squeezed away.
    """
    n_examples = Y_hat.shape[1]
    positive_term = np.dot(Y, np.log(Y_hat).T)          # y * log(y_hat)
    negative_term = np.dot(1 - Y, np.log(1 - Y_hat).T)  # (1 - y) * log(1 - y_hat)
    return np.squeeze(-1 / n_examples * (positive_term + negative_term))

def get_accuracy_value(Y_hat, Y):
    """Fraction of columns whose predicted classes equal the labels in every row.

    Args:
        Y_hat: predicted probabilities, converted to class labels by
            convert_prob_into_class.
        Y: ground-truth labels, comparable element-wise with the converted
            predictions.

    Returns:
        Mean over columns of "all rows match".

    NOTE(review): convert_prob_into_class is not defined in the visible part
    of this notebook; it must exist before this function is called — confirm.
    """
    predicted_classes = convert_prob_into_class(Y_hat)
    column_is_correct = (predicted_classes == Y).all(axis=0)
    return column_is_correct.mean()

使用交叉熵函数公式：
$$
L(\widehat{y}, y)=-\left(y\log\widehat{y}+(1-y)\log(1-\widehat{y})\right)
$$
这是二分类的交叉熵函数

## Backward Propagation(single and full layer backward propagation)

In [21]:
def single_layer_backward_propagation(dA_curr, W_curr, b_curr, Z_curr, A_prev, activation='relu'):
    """Back-propagate the gradient through a single layer.

    Args:
        dA_curr: gradient of the cost with respect to this layer's output.
        W_curr: weight matrix of this layer.
        b_curr: bias of this layer (not used in the computation).
        Z_curr: pre-activation values saved during the forward pass.
        A_prev: input this layer received during the forward pass; its second
            dimension is used as the averaging factor m.
        activation: name of the layer's activation, 'relu' or 'sigmoid'.

    Returns:
        (dA_prev, dW_curr, db_curr): gradients with respect to the layer's
        input, its weights and its bias.

    Raises:
        ValueError: if `activation` is not one of the supported names.
    """
    # Resolve the activation derivative first so an unsupported name fails
    # with a clear message instead of an UnboundLocalError further down.
    if activation == 'relu':
        backward_activation_func = relu_backward
    elif activation == 'sigmoid':
        backward_activation_func = sigmoid_backward
    else:
        raise ValueError('Non-supported activation function: %r' % (activation,))

    m = A_prev.shape[1]

    # Gradient with respect to the pre-activation Z (chain rule through the activation)
    dZ_curr = backward_activation_func(dA_curr, Z_curr)
    # Gradient with respect to the weights, divided by m
    dW_curr = np.dot(dZ_curr, A_prev.T) / m
    # Gradient with respect to the bias: sum along axis 1, keeping a column shape
    db_curr = np.sum(dZ_curr, axis=1, keepdims=True) / m
    # Gradient with respect to this layer's input, handed on to the previous layer
    dA_prev = np.dot(W_curr.T, dZ_curr)

    return dA_prev, dW_curr, db_curr

keepdims的作用：

假如原先是二维矩阵，那么沿 axis=1 求和之后，keepdims=True 得到的仍然是一个二维矩阵（形状为 (n, 1)）。如果 keepdims 是 False，那么得到的是一个一维数组（形状为 (n,)），而不是标量

矩阵的求导：https://zhuanlan.zhihu.com/p/24709748

In [22]:
def full_backward_propagation(Y_hat, Y, memory, params_values, nn_architecture):
    """Back-propagate the cross-entropy gradient through every layer.

    Args:
        Y_hat: network predictions from the forward pass.
        Y: ground-truth labels; reshaped to Y_hat's shape before use.
        memory: dict from the forward pass holding 'A<i-1>' and 'Z<i>' for
            every layer i.
        params_values: dict holding 'W<i>' and 'b<i>' for every layer i.
        nn_architecture: list of layer dicts; 'activation' is read from each.

    Returns:
        dict mapping 'dW<i>' and 'db<i>' to the gradients of layer i.
    """
    grads_values = {}
    Y = Y.reshape(Y_hat.shape)  # line the labels up element-wise with the predictions

    # Derivative of the binary cross-entropy with respect to Y_hat; this
    # seeds the backward pass.
    dA_prev = -(np.divide(Y, Y_hat) - np.divide(1 - Y, 1 - Y_hat))

    # Walk the layers from last to first; layer_idx_prev is the 0-based
    # position, layer_idx_curr the 1-based layer number used in the dict keys.
    for layer_idx_prev, layer in reversed(list(enumerate(nn_architecture))):
        layer_idx_curr = layer_idx_prev + 1
        activ_function_curr = layer['activation']

        dA_curr = dA_prev

        A_prev = memory['A' + str(layer_idx_prev)]
        Z_curr = memory['Z' + str(layer_idx_curr)]
        W_curr = params_values['W' + str(layer_idx_curr)]
        b_curr = params_values['b' + str(layer_idx_curr)]

        dA_prev, dW_curr, db_curr = single_layer_backward_propagation(
            dA_curr, W_curr, b_curr, Z_curr, A_prev, activ_function_curr)

        grads_values['dW' + str(layer_idx_curr)] = dW_curr
        grads_values['db' + str(layer_idx_curr)] = db_curr

    return grads_values

## 更新权重参数(params update)

In [23]:
def update(parmas_values, grads_values, nn_architecture, learning_rate):
    """Apply one gradient-descent step to every layer's weights and biases.

    Args:
        parmas_values: dict holding 'W<i>' and 'b<i>' arrays for each layer i
            (1-based); the arrays are updated in place.
        grads_values: dict holding the matching 'dW<i>' and 'db<i>' gradients.
        nn_architecture: list of layer dicts; only its length is used.
        learning_rate: step size multiplying each gradient.

    Returns:
        The same `parmas_values` dict, after the update.
    """
    # Keys are 1-based ('W1', 'b1', ...), so the enumeration starts at 1;
    # starting at 0 would look up a non-existent 'W0' and skip the last layer.
    for layer_idx, _ in enumerate(nn_architecture, start=1):
        parmas_values['W' + str(layer_idx)] -= learning_rate * grads_values['dW' + str(layer_idx)]
        parmas_values['b' + str(layer_idx)] -= learning_rate * grads_values['db' + str(layer_idx)]
    return parmas_values

## Putting things together(train)

In [24]:
def train(X, Y, nn_architecture, epochs, learning_rate):
    """Train the network with full-batch gradient descent.

    Args:
        X: input data passed to the forward pass.
        Y: ground-truth labels.
        nn_architecture: list of layer dicts describing the network.
        epochs: number of forward/backward/update iterations to run.
        learning_rate: step size used for every parameter update.

    Returns:
        (params_values, cost_history, accuracy_history): the trained
        parameters, and the cost and accuracy measured in each epoch before
        that epoch's update.
    """
    # Initialise the weights; they are stored in a dict
    params_values = init_layers(nn_architecture, seed=2)
    cost_history = []
    accuracy_history = []

    for i in range(epochs):
        # Y_hat is the prediction; cache holds each layer's saved values from the forward pass
        Y_hat, cache = full_forward_propagation(X, params_values, nn_architecture)

        # Record both metrics: the cost comes from get_cost_value, the
        # accuracy from get_accuracy_value.
        cost = get_cost_value(Y_hat, Y)
        cost_history.append(cost)
        accuracy = get_accuracy_value(Y_hat, Y)
        accuracy_history.append(accuracy)

        grads_values = full_backward_propagation(Y_hat, Y, cache, params_values, nn_architecture)
        params_values = update(params_values, grads_values, nn_architecture, learning_rate)

    return params_values, cost_history, accuracy_history