In [2]:
pwd

'/home/massquantity/Workspace/neural-networks-and-deep-learning/mine'

In [1]:
import os, sys
sys.path.append('/home/massquantity/Workspace/neural-networks-and-deep-learning')

In [2]:
import os, sys
sys.path

['',
 '/usr/lib/spark/spark-2.3.2-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip',
 '/usr/lib/spark/spark-2.3.2-bin-hadoop2.7/python',
 '/home/massquantity/Workspace/neural-networks-and-deep-learning/mine',
 '/home/massquantity/.conda/envs/py35/lib/python35.zip',
 '/home/massquantity/.conda/envs/py35/lib/python3.5',
 '/home/massquantity/.conda/envs/py35/lib/python3.5/plat-linux',
 '/home/massquantity/.conda/envs/py35/lib/python3.5/lib-dynload',
 '/home/massquantity/.conda/envs/py35/lib/python3.5/site-packages',
 '/home/massquantity/.conda/envs/py35/lib/python3.5/site-packages/python_recsys-0.2-py3.5.egg',
 '/home/massquantity/.conda/envs/py35/lib/python3.5/site-packages/IPython/extensions',
 '/home/massquantity/.ipython',
 '/home/massquantity/Workspace/neural-networks-and-deep-learning']

In [2]:
from src import mnist_loader
import numpy as np
import random

In [5]:
class Network:
    def __init__(self, sizes):
        self.num_layers = len(sizes)
        self.sizes = sizes
        self.weights = [np.random.randn(back_layer, forward_layer) \
                        for forward_layer, back_layer in zip(sizes[:-1], sizes[1:])]
        self.biases = [np.random.randn(layer, 1) for layer in sizes[1:]]

    def feedward(self, a):
        for b, w in zip(self.biases[:-1], self.weights[:-1]):
            a = Network.relu(np.dot(w, a) + b)  ####
        a = np.dot(self.weights[-1], a) + self.biases[-1]
        return a

    def SGD(self, training_data, epochs, mini_batch_size, eta, test_data=None):
        training_data = list(training_data)
        if test_data:
            test_data = list(test_data)

        for j in range(epochs):
            random.shuffle(training_data)
            for k in range(0, len(training_data), mini_batch_size):
                mini_batch = training_data[k : k+mini_batch_size]
                self.update_mini_batch(mini_batch, eta)
            if test_data:
                print("Epoch {0}: {1} / {2}".format(j+1, self.evaluate(test_data), len(test_data)))
            else:
                print("Epoch {} complete".format(j+1))

    def update_mini_batch(self, mini_batch, eta):
        gradient_b = [np.zeros(b.shape) for b in self.biases]
        gradient_w = [np.zeros(w.shape) for w in self.weights]

        for x, y in mini_batch:
            add_b, add_w = self.backprop(x, y)
            gradient_b = [gb+ab for gb, ab in zip(gradient_b, add_b)]
            gradient_w = [gw+aw for gw, aw in zip(gradient_w, add_w)]

        self.biases = [bias - eta * gb / len(mini_batch) for bias, gb in zip(self.biases, gradient_b)]
        self.weights = [weight - eta * gw / len(mini_batch) for weight, gw in zip(self.weights, gradient_w)]

    def backprop(self, x, y):
        gradient_b = [np.zeros(b.shape) for b in self.biases]
        gradient_w = [np.zeros(w.shape) for w in self.weights]

        ## forward
        a = x
        a_hold = [x]
        z_hold = []
        for w, b in zip(self.weights[:-1], self.biases[:-1]):
            z = np.dot(w, a) + b
            a = Network.relu(z)   #####
            z_hold.append(z)
            a_hold.append(a)
        final_layer = np.dot(self.weights[-1], a) + self.biases[-1]  ####
        z_hold.append(final_layer)                                 ####
        a_hold.append(Network.softmax(final_layer))               #####

        ## backward
        delta = self.cost_derivative(a_hold[-1], y) # * self.relu_derivative(z_hold[-1])   ####
        gradient_w[-1] = np.dot(delta, a_hold[-2].T)
        gradient_b[-1] = delta

        for l in range(2, self.num_layers):
            delta = np.dot(self.weights[-l+1].T, delta) * Network.relu_derivative(z_hold[-l])   ### 
            gradient_w[-l] = np.dot(delta, a_hold[-l-1].T)
            gradient_b[-l] = delta

        return gradient_b, gradient_w

    def evaluate(self, test_data):
        test_results = [(np.argmax(self.feedward(x)), y) for x, y in test_data]
        return sum(int(x == y) for x, y in test_results)

    @staticmethod
    def sigmoid(z):
        return 1.0 / (1.0 + np.exp(-z))

    @staticmethod
    def cost_derivative(x, y):
        return x - y

    @staticmethod
    def sigmoid_derivative(z):
        return Network.sigmoid(z) * (1 - Network.sigmoid(z))

    @staticmethod
    def relu(z):
        return np.maximum(z, 0)

    @staticmethod
    def relu_derivative(z):
        mask = (z <= 0)
        dout = np.ones(z.shape)
        dout[mask] = 0.0
        return dout
    
    @staticmethod
    def softmax(z):
        max_val = np.max(z)
        z = z - max_val
        return np.exp(z) / np.sum(np.exp(z))
    
    @staticmethod
    def softmax(x):
        if x.ndim == 2:
            x = x.T
            x = x - np.max(x, axis=0)
            y = np.exp(x) / np.sum(np.exp(x), axis=0)
            return y.T 

        x = x - np.max(x) # 溢出对策
        return np.exp(x) / np.sum(np.exp(x))

In [None]:
training_data, validation_data, test_data = mnist_loader.load_data_wrapper()

In [7]:
net = Network([784, 30, 10])

In [8]:
net.SGD(training_data, 30, 32, 0.1, test_data=test_data)

Epoch 1: 7745 / 10000
Epoch 2: 7949 / 10000
Epoch 3: 8591 / 10000
Epoch 4: 8321 / 10000
Epoch 5: 8731 / 10000
Epoch 6: 8904 / 10000
Epoch 7: 8560 / 10000
Epoch 8: 8972 / 10000
Epoch 9: 8785 / 10000
Epoch 10: 9074 / 10000
Epoch 11: 9018 / 10000
Epoch 12: 9082 / 10000
Epoch 13: 9117 / 10000
Epoch 14: 9052 / 10000
Epoch 15: 9174 / 10000
Epoch 16: 9191 / 10000
Epoch 17: 9188 / 10000
Epoch 18: 9188 / 10000
Epoch 19: 9175 / 10000
Epoch 20: 9205 / 10000
Epoch 21: 9205 / 10000
Epoch 22: 9284 / 10000
Epoch 23: 9284 / 10000
Epoch 24: 9306 / 10000
Epoch 25: 9324 / 10000
Epoch 26: 9264 / 10000
Epoch 27: 9322 / 10000
Epoch 28: 9315 / 10000
Epoch 29: 9358 / 10000
Epoch 30: 9373 / 10000


In [118]:
training_data, validation_data, test_data = mnist_loader.load_data_wrapper()

In [119]:
X = []
y = []

for data, target in training_data:
    X.append(data)
    y.append(target)

X = np.array(X).reshape(50000, -1)
y = np.array(y).reshape(50000, -1)

In [120]:
X.shape, y.shape

((50000, 784), (50000, 10))

In [121]:
test_X = []
test_y = []

for data, target in test_data:
    test_X.append(data)
    test_y.append(target)

test_X = np.array(test_X).reshape(10000, -1)
test_y = np.array(test_y).reshape(10000, -1)

In [122]:
test_X.shape, test_y.shape

((10000, 784), (10000, 1))

In [130]:
import random
import numpy as np

class Network:
    def __init__(self, sizes):
        self.num_layers = len(sizes)
        self.sizes = sizes
        self.weights = [np.random.randn(forward_layer, back_layer) \
                        for forward_layer, back_layer in zip(sizes[:-1], sizes[1:])]
        self.biases = [np.random.randn(layer) for layer in sizes[1:]]

    def feedward(self, a):
        for b, w in zip(self.biases[:-1], self.weights[:-1]):
            a = Network.relu(np.dot(a, w) + b)  ####
        a = np.dot(a, self.weights[-1]) + self.biases[-1]
   #     a = Network.softmax(a)  #####
        return a

    def SGD(self, X, y, epochs, mini_batch_size, eta, test_X=None, test_y=None):
    #     training_data = list(training_data)
    #     if test_data:
    #         test_data = list(test_data)

        for j in range(epochs):
            random_mask = np.random.choice(len(X), len(X), replace=False)
            X = X[random_mask]
            y = y[random_mask]
            
            for k in range(0, len(X), mini_batch_size):
                X_batch = X[k : k + mini_batch_size]
                y_batch = y[k : k + mini_batch_size]
                self.update_mini_batch(X_batch, y_batch, eta)
                
       #     print("Tringing Epoch {0}: {1} / {2}".format(j+1, self.evaluate(X, y), len(X)))
            if test_X is not None:
                print("Epoch {0}: {1}".format(j+1, self.evaluate(test_X, test_y)))
            else:
                print("Epoch {} complete".format(j+1))

    def update_mini_batch(self, X_batch, y_batch, eta):
        gradient_b = [np.zeros(b.shape) for b in self.biases]
        gradient_w = [np.zeros(w.shape) for w in self.weights]
        
        add_b, add_w = self.backprop(X_batch, y_batch)
        gradient_b = [gb+ab for gb, ab in zip(gradient_b, add_b)]
        gradient_w = [gw+aw for gw, aw in zip(gradient_w, add_w)]

        self.biases = [bias - eta * gb / len(X_batch) for bias, gb in zip(self.biases, gradient_b)]
        self.weights = [weight - eta * gw / len(X_batch) for weight, gw in zip(self.weights, gradient_w)]

    def backprop(self, x, y):
        gradient_b = [np.zeros(b.shape) for b in self.biases]
        gradient_w = [np.zeros(w.shape) for w in self.weights]

        ## forward
        a = x
        a_hold = [x]
        z_hold = []
        for w, b in zip(self.weights[:-1], self.biases[:-1]):
            z = np.dot(a, w) + b
            a = Network.relu(z)   #####
            z_hold.append(z)
            a_hold.append(a)
        final_layer = np.dot(a, self.weights[-1]) + self.biases[-1]  ####
        z_hold.append(final_layer)                                 ####
        a_hold.append(Network.softmax(final_layer))               #####

        ## backward
        delta = self.cost_derivative(a_hold[-1], y) # * self.relu_derivative(z_hold[-1])   ####
        gradient_w[-1] = np.dot(a_hold[-2].T, delta)
        gradient_b[-1] = np.sum(delta, axis=0)   ########

        for l in range(2, self.num_layers):
            delta = np.dot(delta, self.weights[-l+1].T) * Network.relu_derivative(z_hold[-l])   ### 
            gradient_w[-l] = np.dot(a_hold[-l-1].T, delta)
            gradient_b[-l] = np.sum(delta, axis=0)   ##  ###
            
        return gradient_b, gradient_w

    def evaluate(self, test_X, test_y):
        test_results = np.argmax(self.feedward(test_X), axis=1)
        test_y = test_y.reshape(10000)  #####
        accuracy = np.sum(test_results == test_y) / float(test_X.shape[0])
        return accuracy

    @staticmethod
    def sigmoid(z):
        return 1.0 / (1.0 + np.exp(-z))

    @staticmethod
    def cost_derivative(x, y):
        return x - y

    @staticmethod
    def sigmoid_derivative(z):
        return Network.sigmoid(z) * (1 - Network.sigmoid(z))

    @staticmethod
    def relu(z):
        return np.maximum(z, 0)

    @staticmethod
    def relu_derivative(z):
        mask = (z <= 0)
        dout = np.ones(z.shape)
        dout[mask] = 0.0
        return dout
    '''
    @staticmethod
    def softmax(z):
        max_val = np.max(z)
        z_max = z - max_val
        return np.exp(z) / np.sum(np.exp(z)) '''
    
    @staticmethod
    def softmax(x):
        if x.ndim == 2:
            x = x.T
            x = x - np.max(x, axis=0)
            y = np.exp(x) / np.sum(np.exp(x), axis=0)
            return y.T 

        x = x - np.max(x) # 溢出对策
        return np.exp(x) / np.sum(np.exp(x))

In [139]:
net = Network([784, 30, 10])

In [140]:
net.SGD(X, y, 100, 32, 0.01, test_X, test_y)

Epoch 1: 0.6719
Epoch 2: 0.7094
Epoch 3: 0.7369
Epoch 4: 0.7507
Epoch 5: 0.7777
Epoch 6: 0.7892
Epoch 7: 0.792
Epoch 8: 0.8057
Epoch 9: 0.7782
Epoch 10: 0.8122
Epoch 11: 0.8177
Epoch 12: 0.8184
Epoch 13: 0.8132
Epoch 14: 0.8251
Epoch 15: 0.8248
Epoch 16: 0.8415
Epoch 17: 0.842
Epoch 18: 0.84
Epoch 19: 0.8494
Epoch 20: 0.8507
Epoch 21: 0.8464
Epoch 22: 0.846
Epoch 23: 0.8575
Epoch 24: 0.8623
Epoch 25: 0.8515
Epoch 26: 0.8633
Epoch 27: 0.8625
Epoch 28: 0.8584
Epoch 29: 0.8621
Epoch 30: 0.869
Epoch 31: 0.8737
Epoch 32: 0.863
Epoch 33: 0.8735
Epoch 34: 0.8679
Epoch 35: 0.8772
Epoch 36: 0.8766
Epoch 37: 0.8742
Epoch 38: 0.8739
Epoch 39: 0.8734
Epoch 40: 0.8801
Epoch 41: 0.8808
Epoch 42: 0.8825
Epoch 43: 0.8825
Epoch 44: 0.886
Epoch 45: 0.8817
Epoch 46: 0.8755
Epoch 47: 0.8915
Epoch 48: 0.8882
Epoch 49: 0.8908
Epoch 50: 0.8868
Epoch 51: 0.8897
Epoch 52: 0.8941
Epoch 53: 0.891
Epoch 54: 0.8851
Epoch 55: 0.8938
Epoch 56: 0.8969
Epoch 57: 0.8921
Epoch 58: 0.8934
Epoch 59: 0.8991
Epoch 60: 0.895

In [12]:
# momentum， Adam

class Network:  
    def __init__(self, sizes):
        self.num_layers = len(sizes)
        self.sizes = sizes
        self.weights = [np.random.randn(back_layer, forward_layer) \
                        for forward_layer, back_layer in zip(sizes[:-1], sizes[1:])]
        self.biases = [np.random.randn(layer, 1) for layer in sizes[1:]]
   #     self.velocity_w = [np.zeros(w.shape) for w in self.weights]
   #     self.velocity_b = [np.zeros(b.shape) for b in self.biases]
    
        self.vw = [np.zeros(w.shape) for w in self.weights] 
        self.hw = [np.zeros(w.shape) for w in self.weights]
        self.vb = [np.zeros(b.shape) for b in self.biases]
        self.hb = [np.zeros(b.shape) for b in self.biases]
        self.iter = 0


    def feedward(self, a):
        for b, w in zip(self.biases[:-1], self.weights[:-1]):
            a = Network.relu(np.dot(w, a) + b)  ####
        a = np.dot(self.weights[-1], a) + self.biases[-1]
        return a

    def Adam(self, training_data, epochs, mini_batch_size, eta, test_data=None):  ##### SGD
        training_data = list(training_data)
        if test_data:
            test_data = list(test_data)
        
        for j in range(epochs):
            random.shuffle(training_data)
            for k in range(0, len(training_data), mini_batch_size):
                mini_batch = training_data[k : k+mini_batch_size]
                self.update_mini_batch(mini_batch, eta)
            if test_data:
                print("Epoch {0}: {1} / {2}".format(j+1, self.evaluate(test_data), len(test_data)))
            else:
                print("Epoch {} complete".format(j+1))

    def update_mini_batch(self, mini_batch, eta, momentum=0.9, rho1=0.9, rho2=0.999):
        gradient_b = [np.zeros(b.shape) for b in self.biases]
        gradient_w = [np.zeros(w.shape) for w in self.weights]

        for x, y in mini_batch:
            add_b, add_w = self.backprop(x, y)
            gradient_b = [gb+ab for gb, ab in zip(gradient_b, add_b)]
            gradient_w = [gw+aw for gw, aw in zip(gradient_w, add_w)]
            
     #   self.velocity_w = [momentum * vw - eta * gw / len(mini_batch) for vw, gw in zip(self.velocity_w, gradient_w)]
     #   self.velocity_b = [momentum * vb - eta * gb / len(mini_batch) for vb, gb in zip(self.velocity_b, gradient_b)]

     #   self.biases = [bias + vb for bias, vb in zip(self.biases, self.velocity_b)]
     #   self.weights = [weight + vw for weight, vw in zip(self.weights, self.velocity_w)]
        
        self.iter += 1
        gradient_w = [gw / len(mini_batch) for gw in gradient_w]
        gradient_b = [gb / len(mini_batch) for gb in gradient_b]
        
        self.vw = [rho1 * vw + (1 - rho1) * gw for vw, gw in zip(self.vw, gradient_w)]
        self.vb = [rho1 * vb + (1 - rho1) * gb for vb, gb in zip(self.vb, gradient_b)]
        self.hw = [rho2 * hw + (1 - rho2) * (gw ** 2) for hw, gw in zip(self.hw, gradient_w)]
        self.hb = [rho2 * hb + (1 - rho2) * (gb ** 2) for hb, gb in zip(self.hb, gradient_b)]
        unbias_vw = [vw / (1 - rho1 ** self.iter) for vw in self.vw]
        unbias_vb = [vb / (1 - rho1 ** self.iter) for vb in self.vb]
        unbias_hw = [hw / (1 - rho2 ** self.iter) for hw in self.hw]
        unbias_hb = [hb / (1 - rho2 ** self.iter) for hb in self.hb]
        self.weights = [weight - eta * vw / (np.sqrt(hw) + 1e-8) for weight, vw, hw in zip(self.weights, unbias_vw, unbias_hw)]
        self.biases = [bias - eta * vb / (np.sqrt(hb) + 1e-8) for bias, vb, hb in zip(self.biases, unbias_vb, unbias_hb)]
        

    def backprop(self, x, y):
        gradient_b = [np.zeros(b.shape) for b in self.biases]
        gradient_w = [np.zeros(w.shape) for w in self.weights]

        ## forward
        a = x
        a_hold = [x]
        z_hold = []
        for w, b in zip(self.weights[:-1], self.biases[:-1]):
            z = np.dot(w, a) + b
            a = Network.relu(z)   #####
            z_hold.append(z)
            a_hold.append(a)
        final_layer = np.dot(self.weights[-1], a) + self.biases[-1]  ####
        z_hold.append(final_layer)                                 ####
        a_hold.append(Network.softmax(final_layer))               #####

        ## backward
        delta = self.cost_derivative(a_hold[-1], y) # * self.relu_derivative(z_hold[-1])   ####
        gradient_w[-1] = np.dot(delta, a_hold[-2].T)
        gradient_b[-1] = delta

        for l in range(2, self.num_layers):
            delta = np.dot(self.weights[-l+1].T, delta) * Network.relu_derivative(z_hold[-l])   ### 
            gradient_w[-l] = np.dot(delta, a_hold[-l-1].T)
            gradient_b[-l] = delta

        return gradient_b, gradient_w

    def evaluate(self, test_data):
        test_results = [(np.argmax(self.feedward(x)), y) for x, y in test_data]
        return sum(int(x == y) for x, y in test_results)

    @staticmethod
    def sigmoid(z):
        return 1.0 / (1.0 + np.exp(-z))

    @staticmethod
    def cost_derivative(x, y):
        return x - y

    @staticmethod
    def sigmoid_derivative(z):
        return Network.sigmoid(z) * (1 - Network.sigmoid(z))

    @staticmethod
    def relu(z):
        return np.maximum(z, 0)

    @staticmethod
    def relu_derivative(z):
        mask = (z <= 0)
        dout = np.ones(z.shape)
        dout[mask] = 0.0
        return dout
    
    @staticmethod
    def softmax(z):
        max_val = np.max(z)
        z_max = z - max_val
        return np.exp(z) / np.sum(np.exp(z))

In [16]:
training_data, validation_data, test_data = mnist_loader.load_data_wrapper()

In [17]:
net = Network([784, 30, 10])

In [18]:
# net.SGD(training_data, 30, 32, 0.1, test_data=test_data)

In [19]:
net.Adam(training_data, 30, 32, 0.001, test_data=test_data)

Epoch 1: 5623 / 10000
Epoch 2: 7180 / 10000
Epoch 3: 7986 / 10000
Epoch 4: 8368 / 10000
Epoch 5: 8641 / 10000
Epoch 6: 8844 / 10000
Epoch 7: 8986 / 10000
Epoch 8: 9080 / 10000
Epoch 9: 9157 / 10000
Epoch 10: 9198 / 10000
Epoch 11: 9247 / 10000
Epoch 12: 9256 / 10000
Epoch 13: 9325 / 10000
Epoch 14: 9346 / 10000
Epoch 15: 9373 / 10000
Epoch 16: 9381 / 10000
Epoch 17: 9417 / 10000
Epoch 18: 9407 / 10000
Epoch 19: 9427 / 10000
Epoch 20: 9453 / 10000
Epoch 21: 9461 / 10000
Epoch 22: 9436 / 10000
Epoch 23: 9455 / 10000
Epoch 24: 9481 / 10000
Epoch 25: 9476 / 10000
Epoch 26: 9487 / 10000
Epoch 27: 9477 / 10000
Epoch 28: 9458 / 10000
Epoch 29: 9486 / 10000
Epoch 30: 9512 / 10000
