In [1]:
import numpy as np
import random
import ipynb_importer
import mnist_loader

importing Jupyter notebook from mnist_loader.ipynb


In [2]:
# Standard library
import random

# Third-party libraries
import numpy as np

class Network(object):
    def __init__(self, sizes):
        """初始化神经网络
        1. 根据输入，得到神经网络的结构
        2. 根据神经网络的结构使用均值为0，方差为1的高斯分布初始化参数权值w和偏差b。
        输入：
        sizes: list, 表示神经网络各个layer的数目，例如[784, 30, 10]表示3层的神经网络。
                    输入层784个神经元，隐藏层只有1层，有30个神经元，输出层有10个神经元。
        """
        np.random.seed(41)
        random.seed(41)
        self.num_layers = len(sizes)
        self.sizes = sizes
        self.biases = [np.random.randn(y, 1) for y in sizes[1:]]
        self.weights = [np.random.randn(y, x) for x, y in zip(sizes[:-1], sizes[1:])]

    def feed_forward(self, a):
        """Return the output of the network if ``a`` is input."""
        for b, w in zip(self.biases, self.weights):
            a = sigmoid(np.dot(w, a)+b)
        return a

    def SGD(self, training_data, epochs, mini_batch_size, alpha, test_data=None):
        """随机梯度下降
        输入：
        training_data：是由tuples ``(x, y)``组成的list，x表示输入，y表示预计输出
        epoches：int, 表示训练整个数据集的次数
        mini_batch_size: int, 在SGD过程中每次迭代使用训练集的数目
        alpha: float, 学习速率
        test_data: 是由tuples ``(x, y)``组成的list，x表示输入，y表示预计输出。
                    如果提供了``test_data``，则每经过一次epoch，都计算并输出当前网络训练结果在测试集上的准确率。
                    虽然可以检测网络训练效果，但是会降低网络训练的速度。
        """
        if test_data:
            n_test = len(test_data)
        m = len(training_data)
        for j in range(epochs):
            np.random.shuffle(training_data)
            mini_batches = [training_data[k:k+mini_batch_size]
                        for k in range(0, m, mini_batch_size)]
            for mini_batch in mini_batches:
                self.update_mini_batch(mini_batch, alpha)
            if test_data:
                print("Epoch {0}: {1} / {2}".format(j, self.evaluate(test_data), n_test))
            else:
                print("Epoch {0} complete".format(j))

    def update_mini_batch(self, mini_batch, alpha):
        """每迭代一次mini_batch，根据梯度下降方法，使用反向传播得到的结果更新权值``w``和偏差``b``
        输入：
        mini_batch: 由tuples ``(x, y)``组成的list
        alpha: int，学习速率
        """
        nabla_b = [np.zeros(b.shape) for b in self.biases]
        nabla_w = [np.zeros(w.shape) for w in self.weights]
        for x, y in mini_batch:
            delta_nabla_b, delta_nable_w = self.back_prop(x, y)
            nabla_b = [nb+dnb for nb, dnb in zip(nabla_b, delta_nabla_b)]
            nabla_w = [nw+dnw for nw, dnw in zip(nabla_w, delta_nable_w)]
        self.weights = [w-(alpha/len(mini_batch))*nw
                    for w, nw in zip(self.weights, nabla_w)]
        self.biases = [b-(alpha/len(mini_batch))*nb
                    for b, nb in zip(self.biases, nabla_b)]

    def back_prop(self, x, y):
        """反向传播
        1. 前向传播，获得每一层的激活值
        2. 根据输出值计算得到输出层的误差``delta``
        3. 根据``delta``计算输出层C_x对参数``w``, ``b``的偏导
        4. 反向传播得到每一层的误差，并根据误差计算当前层C_x对参数``w``, ``b``的偏导
        输入：
        x: np.ndarray, 单个训练数据
        y: np.ndarray, 训练数据对应的预计输出值
        输出：
        nabla_b: list, C_x对``b``的偏导
        nabla_w: list, C_x对``w``的偏导
        """
        nabla_b = [np.zeros(b.shape) for b in self.biases]
        nabla_w = [np.zeros(w.shape) for w in self.weights]

        # forward prop
        activation = x
        activations = [x]
        zs = []
        for b, w in zip(self.biases, self.weights):
            z = np.dot(w, activation)+b
            zs.append(z)
            activation = sigmoid(z)
            activations.append(activation)
        # backward prop
        delta = self.cost_derivative(activations[-1], y)*sigmoid_prime(zs[-1])
        nabla_b[-1] = delta
        nabla_w[-1] = np.dot(delta, activations[-2].transpose())
        for l in range(2, self.num_layers):
            z = zs[-l];
            sp = sigmoid_prime(z)
            delta = np.dot(self.weights[-l+1].transpose(), delta)*sp
            nabla_b[-l] = delta
            nabla_w[-l] = np.dot(delta, activations[-l-1].transpose())
        return (nabla_b, nabla_w)

    def evaluate(self, test_data):
        """计算准确率，将测试集中的x带入训练后的网络计算得到输出值，
            并得到最终的分类结果，与预期的结果进行比对，最终得到测试集中被正确分类的数目
        输入：
        test_data: 由tuples ``(x, y)``组成的list
        输出：
        int, 测试集中正确分类的数据个数
        """
        test_results = [(np.argmax(self.feed_forward(x)), y) for x, y in test_data]
        return sum(int(x==y) for (x, y) in test_results)

    def cost_derivative(self, output_activations, y):
        """代价函数对a的偏导
        输入：
        output_activations： np.ndarray, 输出层的激活值，即a^L
        y: np.ndarray, 预计输出值
        输出：
        output_activations-y: list, 偏导值
        """
        return (output_activations-y)

## 激活函数及其导数
def sigmoid(z):
    """The sigmoid function"""
    return 1.0/(1.0+np.exp(-z))

def sigmoid_prime(z):
    """Derivative of the sigmoid function"""
    return sigmoid(z)*(1-sigmoid(z))

In [3]:
net = Network([2,3,1])
print(net.biases)
print(net.weights)

[array([[-0.27071232],
       [ 0.10484805],
       [ 0.25052782]]), array([[-0.92519997]])]
[array([[ 0.56714366, -1.04018022],
       [-0.15367595,  0.78985181],
       [-1.22621585, -0.94800699]]), array([[-0.56965394, -0.97715021, -0.77063171]])]


In [4]:
%%time
training_data, test_data = mnist_loader.load_data_wrapper("../data")

Wall time: 9.92 s


In [5]:
%%time
net = Network([784, 30, 10])
net.SGD(training_data, 30, 10, 3, test_data=test_data)

Epoch 0: 9121 / 10000
Epoch 1: 9271 / 10000
Epoch 2: 9317 / 10000
Epoch 3: 9371 / 10000
Epoch 4: 9362 / 10000
Epoch 5: 9395 / 10000
Epoch 6: 9393 / 10000
Epoch 7: 9475 / 10000
Epoch 8: 9473 / 10000
Epoch 9: 9473 / 10000
Epoch 10: 9450 / 10000
Epoch 11: 9466 / 10000
Epoch 12: 9477 / 10000
Epoch 13: 9497 / 10000
Epoch 14: 9475 / 10000
Epoch 15: 9477 / 10000
Epoch 16: 9481 / 10000
Epoch 17: 9483 / 10000
Epoch 18: 9498 / 10000
Epoch 19: 9471 / 10000
Epoch 20: 9488 / 10000
Epoch 21: 9486 / 10000
Epoch 22: 9465 / 10000
Epoch 23: 9461 / 10000
Epoch 24: 9499 / 10000
Epoch 25: 9496 / 10000
Epoch 26: 9501 / 10000
Epoch 27: 9498 / 10000
Epoch 28: 9499 / 10000
Epoch 29: 9506 / 10000
Wall time: 3min 32s


In [6]:
%%time
net = Network([784, 50, 10])
net.SGD(training_data, 30, 10, 3, test_data=test_data)

Epoch 0: 9176 / 10000
Epoch 1: 9307 / 10000
Epoch 2: 9406 / 10000
Epoch 3: 9433 / 10000
Epoch 4: 9476 / 10000
Epoch 5: 9508 / 10000
Epoch 6: 9499 / 10000
Epoch 7: 9502 / 10000
Epoch 8: 9528 / 10000
Epoch 9: 9533 / 10000
Epoch 10: 9569 / 10000
Epoch 11: 9573 / 10000
Epoch 12: 9559 / 10000
Epoch 13: 9592 / 10000
Epoch 14: 9566 / 10000
Epoch 15: 9588 / 10000
Epoch 16: 9575 / 10000
Epoch 17: 9588 / 10000
Epoch 18: 9584 / 10000
Epoch 19: 9587 / 10000
Epoch 20: 9583 / 10000
Epoch 21: 9607 / 10000
Epoch 22: 9589 / 10000
Epoch 23: 9595 / 10000
Epoch 24: 9605 / 10000
Epoch 25: 9600 / 10000
Epoch 26: 9600 / 10000
Epoch 27: 9595 / 10000
Epoch 28: 9592 / 10000
Epoch 29: 9599 / 10000
Wall time: 4min 46s
