I learn best with toy code that I can play with. I believe that only practice can help you convert theory into your own knowledge.

This tutorial teaches backpropagation via a very simple toy example, a short python implementation.

This implementation is based on apache spark, train datasets are splited and stored in different nodes (via spark), network training (back propagation) is carried out via distributed evalution.

results show that this spark based implementation is much faster (four cores were assigned for my virtual machine) than my previous one in https://github.com/lzcbetter/step-by-step-neural-network

tests were carried out on a virtual machine (provided in online course https://courses.edx.org/courses/BerkeleyX/CS190.1x/1T2015/info) on Macbook Pro, i7 CPU. four cores and 3GB RAM were assigned to the virtual machine

In [37]:
# Import necessary modules, numpy will be used for matrix and array with better performance

import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import random
import math
from operator import add

In [3]:
def load_training_set():
    X = np.load("X.npy")
    y = np.load("y.npy")
    dataset = []
    for i in range(y.shape[0]):
        dataset.append((X[i], y[i]))
        
    return dataset

In [4]:
# spark context (sc) was created in another place, which comes with the virtual machine os image
def parse_data():
    dataset = load_training_set()
    data_set = sc.parallelize(dataset)
    
    #split it into training, validation and test sets. Use the randomSplit method
    weights = [.8, .1, .1]
    seed = 42
    train_set, validate_set, test_set = data_set.randomSplit(weights, seed)
    
    return (train_set, validate_set, test_set)

In [16]:
train_set, validate_set, test_set = parse_data()
print train_set.count(), validate_set.count(), test_set.count(), test_set.collect()[0][0][0];

4012 519 469 0.0


In [24]:
# derivative of our sigmoid function, in terms of the output (i.e. y)
def dsigmoid(y):
    gz = sigmoid(y)
    return gz * (1.0 - gz)

def sigmoid(x):
    # exp function provided by numpy can support vector operation by default
    return 1.0 / (1.0 + np.exp(-x)) 


In [31]:
class NN:
    def __init__(self, ni, nh, no):
        # number of input, hidden, and output nodes
        self.n1 = ni + 1 # +1 for bias node
        self.n2 = nh
        self.n3 = no
        # create weights variables (the theta in model)
        self.w1 = self.weights_init(ni, nh)
        self.w2 = self.weights_init(nh, no)

        # to accumulate the gradient from all the train samples   
        self.Delta1 = np.zeros(shape=(self.n2, self.n1))  # for w1
        self.Delta2 = np.zeros(shape=(self.n3, self.n2+1))# for w2
        
    def weights_init(self, l_in, l_out):
            eps_init = 0.12
            ret = np.random.rand(l_out, 1+l_in) * 2 * eps_init - eps_init
            return ret
        

In [32]:
def ann_train_eval(w, sample):
    w1, w2 = w
    x = sample[0].reshape((-1, 1))
    y = np.eye(10)[sample[1]-1].reshape((-1, 1))
    a1 = np.vstack(([1.0], x))

    # hidden activations
    z2 = np.dot(w1, a1)
    a2 = np.vstack(([1.0], sigmoid(z2)))

    # output activations
    z3 = np.dot(w2, a2)
    a3 = sigmoid(z3)
    
    ## start back propogation
    # calculate error terms for output
    delta3 = a3 - y
    # do not forget to skip the first column which is for bias and should not be included
    delta2 = (np.dot(w2.T, delta3))[1:] * dsigmoid(z2)
    
    # gradient from all the train samples for accumulating
    Delta1 = np.dot(delta2, a1.T)
    Delta2 = np.dot(delta3, a2.T)

    # calculate error, mse was used, just to show the minimization, 
    # if use other optimize method, should use cost function 
    
    return (Delta1, Delta2, abs(a3[sample[1]-1] - 1.0))
    

In [56]:
def nn_predict(ann, inputs):
    x = inputs.reshape((-1, 1))
    a1 = np.vstack(([1.0], x))

    # hidden activations
    z2 = np.dot(ann.w1, a1)
    a2 = np.vstack(([1.0], sigmoid(z2)))

    # output activations
    z3 = np.dot(ann.w2, a2)
    a3 = sigmoid(z3)
    return a3

weights update formula (gradient descent) $\frac { \partial  }{ \partial { \Theta  }_{ ij }^{ (l) } } J(\Theta )={ D }_{ ij }^{ (l) }=\frac { 1 }{ m } { \Delta  }_{ ij }^{ (l) }+\frac { \lambda  }{ m } { \Theta  }_{ ij }^{ (l) }$ 

where $\lambda $ is the Regularization parameter, learn rate was set as 1.0

In [58]:
def nn_train(ann, train_set, max_iter):
    Lambda = 10.01 # Regularization parameter (to avoid overfit issue)
    m = train_set.count()
    for iteration in range(max_iter):
        eval_res = train_set.map(lambda x: ann_train_eval((ann.w1, ann.w2), x))
        
        # calculate derivation of weights via average bp results
        dw1 = eval_res.map(lambda x: x[0]).reduce(add) / train_set.count()
        dw2 = eval_res.map(lambda x: x[1]).reduce(add) / train_set.count()
        
        # mean error sololy for display
        mean_err = eval_res.map(lambda x: x[2]).reduce(add) / train_set.count()
        
        # for all the weights, you should not apply regulization on the first column since they are for bias
        for i in range(ann.w1.shape[0]):
            ann.w1[i, 0] = ann.w1[i, 0] - dw1[i, 0]
        for i in range(ann.w1.shape[0]):
            for j in range(1, ann.w1.shape[1]):
                # here learn rate is 1.0, Lambda is the Regularization parameter
                ann.w1[i, j] = ann.w1[i, j] - (dw1[i, j] + (Lambda/m) * ann.w1[i, j])
        
        for i in range(ann.w2.shape[0]):
            ann.w2[i, 0] = ann.w2[i, 0] - dw2[i, 0]
        for i in range(ann.w2.shape[0]):
            for j in range(1, ann.w2.shape[1]):
                ann.w2[i, j] = ann.w2[i, j] - (dw2[i, j] + (Lambda/m) * ann.w2[i, j])
        if 0 == iteration % 10:
            print ("mean error p", mean_err)
    return ann
    

In [79]:
n = NN(400, 25, 10)
nt = nn_train(n, train_set, 400)


('mean error p', array([[ 0.48310611]]))
('mean error p', array([[ 0.89350778]]))
('mean error p', array([[ 0.87246453]]))
('mean error p', array([[ 0.81690122]]))
('mean error p', array([[ 0.73897448]]))
('mean error p', array([[ 0.66864562]]))
('mean error p', array([[ 0.61160388]]))
('mean error p', array([[ 0.56483364]]))
('mean error p', array([[ 0.52553856]]))
('mean error p', array([[ 0.49205851]]))
('mean error p', array([[ 0.46344238]]))
('mean error p', array([[ 0.43902272]]))
('mean error p', array([[ 0.41819046]]))
('mean error p', array([[ 0.40035382]]))
('mean error p', array([[ 0.38497411]]))
('mean error p', array([[ 0.37159645]]))
('mean error p', array([[ 0.35985521]]))
('mean error p', array([[ 0.34946377]]))
('mean error p', array([[ 0.34019929]]))
('mean error p', array([[ 0.33188837]]))
('mean error p', array([[ 0.32439524]]))
('mean error p', array([[ 0.31761243]]))
('mean error p', array([[ 0.3114536]]))
('mean error p', array([[ 0.30584808]]))
('mean error p', 

In [82]:
# test trained neural network on train dataset
n_set = train_set.count()

val_res = train_set.map(lambda x : 1 + np.argmax(nn_predict(nt, x[0]))).collect()
actual_res = train_set.collect()

accurate = 0
for idx in range(n_val_set):
    if val_res[idx] == actual_res[idx][1]:
        accurate += 1
print("train set accuracy: {0} %".format(100.0 * accurate / n_val_set))
    

train set accuracy: 97.6878612717 %


In [83]:
# test trained neural network on validation data set
n_val_set = validate_set.count()

val_res = validate_set.map(lambda x : 1 + np.argmax(nn_predict(nt, x[0]))).collect()
actual_res = validate_set.collect()

accurate = 0
for idx in range(n_val_set):
    if val_res[idx] == actual_res[idx][1]:
        accurate += 1
print("validation set accuracy: {0} %".format(100.0 * accurate / n_val_set))
    

validation set accuracy: 88.4393063584 %


In [84]:
# test trained neural network on validation data set
n_val_set = test_set.count()

val_res = test_set.map(lambda x : 1 + np.argmax(nn_predict(nt, x[0]))).collect()
actual_res = test_set.collect()

accurate = 0
for idx in range(n_val_set):
    if val_res[idx] == actual_res[idx][1]:
        accurate += 1
print("test set accuracy: {0} %".format(100.0 * accurate / n_val_set))

test set accuracy: 91.2579957356 %
