"""
Simplistic implementation of the two-layer neural network.
Training method is stochastic (online) gradient descent with momentum.

As an example it computes XOR for given input.

Some details:
- tanh activation for hidden layer
- sigmoid activation for output layer
- cross-entropy loss

Less than 100 lines of active code.

"""

import numpy as np
import time

n_hidden = 10
n_in = 10
n_out = 10
n_samples = 300

learning_rate = 0.01
momentum = 0.9

#np.random.seed(0)

def sigmoid(x):
    return 1.0/(1.0 + np.exp(-x))

def tanh_prime(x):
    return  1 - np.tanh(x)**2

def train(x, t, V, W, bv, bw):

    # forward
    A = np.dot(x, V) + bv
    Z = np.tanh(A)

    B = np.dot(Z, W) + bw
    Y = sigmoid(B)

    # backward
    Ew = Y - t
    Ev = tanh_prime(A) * np.dot(W, Ew)

    dW = np.outer(Z, Ew)
    dV = np.outer(x, Ev)

    loss = -np.mean ( t * np.log(Y) + (1 - t) * np.log(1 - Y) )

    # Note that we use error for each layer as a gradient
    # for biases

    return  loss, (dV, dW, Ev, Ew)

def predict(x, V, W, bv, bw):
    A = np.dot(x, V) + bv
    B = np.dot(np.tanh(A), W) + bw
    return (sigmoid(B) > 0.5).astype(int)

# Setup initial parameters
# Note that initialization is cruxial for first-order methods!

V = np.random.normal(scale=0.1, size=(n_in, n_hidden))
W = np.random.normal(scale=0.1, size=(n_hidden, n_out))

bv = np.zeros(n_hidden)
bw = np.zeros(n_out)

params = [V,W,bv,bw]

# Generate some data

X = np.random.binomial(1, 0.5, (n_samples, n_in))
T = X ^ 1

# Train
for epoch in range(100):
    err = []
    upd = [0]*len(params)

    t0 = time.clock()
    for i in range(X.shape[0]):
        loss, grad = train(X[i], T[i], *params)

        for j in range(len(params)):
            params[j] -= upd[j]

        for j in range(len(params)):
            upd[j] = learning_rate * grad[j] + momentum * upd[j]

        err.append( loss )

    print ("Epoch: %d, Loss: %.8f, Time: %.4fs" % (
                epoch, np.mean( err ), time.clock()-t0 ))

# Try to predict something

x = np.random.binomial(1, 0.5, n_in)
print ("XOR prediction:")
print (x)
print (predict(x, *params))


In [63]:
import numpy as np
import time

In [64]:
layers = [
    {"layerName":"input", "numberOfNodes":10},
    {"layerName":"hidden0", "numberOfNodes":10},
    {"layerName":"hidden1", "numberOfNodes":10},
    {"layerName":"output", "numberOfNodes":10},
]

In [65]:

numberOfSamples = 300
inputSamples = np.random.binomial(1, 0.5, (numberOfSamples, layers[0]["numberOfNodes"]))
targetSamples = inputSamples ^ 1 #create traget samples by simply invert the input samples

learning_rate = 0.01
momentum = 0.9


In [66]:

# debug
print(inputSamples)
print(targetSamples)


[[1 0 0 ..., 0 1 0]
 [1 0 0 ..., 0 1 1]
 [0 0 0 ..., 0 0 1]
 ..., 
 [0 0 1 ..., 1 1 1]
 [1 0 0 ..., 0 0 0]
 [0 1 1 ..., 1 1 1]]
[[0 1 1 ..., 1 0 1]
 [0 1 1 ..., 1 0 0]
 [1 1 1 ..., 1 1 0]
 ..., 
 [1 1 0 ..., 0 0 0]
 [0 1 1 ..., 1 1 1]
 [1 0 0 ..., 0 0 0]]


In [67]:
# initialize hidden layers
for i in range(1, len(layers)-1): # only iterate over the mid layers, leave out first and last layer (input and output)
    layers[i]["weights"] = np.random.normal(scale=0.1, size=(layers[i-1]["numberOfNodes"], layers[i]["numberOfNodes"]))
    layers[i]["bias"] = np.zeros(layers[i+1]["numberOfNodes"])


In [68]:

# debug

# print weights and bias of hidden layers
for i in range(1, len(layers)-1):
    print("hidden layer " + str(i))
    print("weights:")
    print(layers[i]["weights"])
    print("bias:")
    print(layers[i]["bias"])
    print("")


hidden layer 1
weights:
[[-0.03146092 -0.07319642 -0.00839339 -0.11188275 -0.07475475 -0.12557313
   0.12359797 -0.12568176  0.09978646  0.13069654]
 [ 0.16960256  0.00952491 -0.01594721 -0.01028723  0.21224227  0.00053433
  -0.16479698  0.01146405  0.00662584  0.0674725 ]
 [-0.05950089 -0.07806738  0.03552074  0.01983741 -0.15436149  0.01321648
  -0.03951212 -0.11676827  0.03413702 -0.08435535]
 [-0.05144317 -0.03858733  0.02636307 -0.00984572  0.12660463 -0.17338015
   0.22112147  0.02182099  0.01350749 -0.03970833]
 [ 0.03441612 -0.09416352 -0.03612493 -0.05476971 -0.01149447  0.10712293
  -0.08803129  0.0098169   0.19824704 -0.06478523]
 [ 0.12262981 -0.00425057 -0.25177428  0.01596778 -0.05479156 -0.10229381
  -0.09902942 -0.00288189  0.04311854  0.11694531]
 [-0.05330592  0.09008884 -0.11226976 -0.15375181 -0.06201079  0.056049
   0.10238065  0.03852106  0.18834893 -0.04070274]
 [ 0.10987134  0.11937343 -0.08569935 -0.05423213  0.03697317 -0.03186419
  -0.10504311 -0.03306947  0.

In [69]:
# define activation functions

def sigmoid(x):
    return 1.0/(1.0 + np.exp(-x))

def tanh_prime(x):
    return  1 - np.tanh(x)**2


In [70]:

def calc_loss_and_weight(inputSamples, targetSamples, listOfHiddenLayers):
    return false
    

def train_old(x, t, V, W, bv, bw):

    # forward propagation
    A = np.dot(x, V) + bv
    Z = np.tanh(A)

    B = np.dot(Z, W) + bw
    Y = sigmoid(B)

    # backward propagation
    Ew = Y - t
    Ev = tanh_prime(A) * np.dot(W, Ew)

    dW = np.outer(Z, Ew)
    dV = np.outer(x, Ev)

    loss = -np.mean ( t * np.log(Y) + (1 - t) * np.log(1 - Y) )

    # Note that we use error for each layer as a gradient
    # for biases

    return  loss, (dV, dW, Ev, Ew)


In [71]:
# old code


def train(x, t, V, W, bv, bw):

    # forward
    A = np.dot(x, V) + bv
    Z = np.tanh(A)

    B = np.dot(Z, W) + bw
    Y = sigmoid(B)

    # backward
    Ew = Y - t
    Ev = tanh_prime(A) * np.dot(W, Ew)

    dW = np.outer(Z, Ew)
    dV = np.outer(x, Ev)

    loss = -np.mean ( t * np.log(Y) + (1 - t) * np.log(1 - Y) )

    # Note that we use error for each layer as a gradient
    # for biases

    return  loss, (dV, dW, Ev, Ew)


In [72]:


def predict(x, V, W, bv, bw):
    A = np.dot(x, V) + bv
    B = np.dot(np.tanh(A), W) + bw
    return (sigmoid(B) > 0.5).astype(int)

params = [
    layers[1]["weights"],
    layers[2]["weights"],
    layers[1]["bias"],
    layers[2]["bias"],
]

X = inputSamples
T = targetSamples

# Train
for epoch in range(10):
    err = []
    upd = [0]*len(params)

    t0 = time.clock()
    for i in range(X.shape[0]):
        loss, grad = train(X[i], T[i], *params)
        
        # is there a mathematical reason for that???
        #for j in range(len(params)):
        #    params[j] -= upd[j]

        for j in range(len(params)):
            upd[j] = learning_rate * grad[j] + momentum * upd[j]
            
        for j in range(len(params)-2):
            params[j] -= upd[j]

        err.append( loss )

    print ("Epoch: %d, Loss: %.8f, Time: %.4fs" % (
                epoch, np.mean( err ), time.clock()-t0 ))

# Try to predict something

x = np.random.binomial(1, 0.5, layers[0]["numberOfNodes"])
print ("XOR prediction:")
print (x)
print (predict(x, *params))

Epoch: 0, Loss: 0.43270164, Time: 0.0484s
Epoch: 1, Loss: 0.13874419, Time: 0.0241s
Epoch: 2, Loss: 0.09389856, Time: 0.0387s
Epoch: 3, Loss: 0.07903321, Time: 0.0284s
Epoch: 4, Loss: 0.07263637, Time: 0.0374s
Epoch: 5, Loss: 0.06795481, Time: 0.0255s
Epoch: 6, Loss: 0.06384804, Time: 0.0374s
Epoch: 7, Loss: 0.06055031, Time: 0.0330s
Epoch: 8, Loss: 0.06302405, Time: 0.0438s
Epoch: 9, Loss: 0.07813340, Time: 0.0323s
XOR prediction:
[1 0 0 0 1 0 0 1 0 0]
[0 1 1 1 0 1 1 0 1 1]
