In [136]:
import numpy as np

In [137]:
X = np.eye(8)
#Each row is a learning example

Y = X
print(Y)

# We need to have 8 inputs (each number in a row of the matrix), a hidden layer with 3 nodes + bias and an output layer with 8 nodes

n_input = 8 # We need to have 8 inputs (each number in a row of the matrix)
n_hidden = 3
n_output = 8

[[1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1.]]


In [138]:
epsilon = 0.01

#Weights
np.random.seed(42)

w1 = np.random.normal(0, epsilon**2, (n_input, n_hidden)) # 8 x 3 Weights for input to hidden
print(w1)
w2 = np.random.normal(0, epsilon**2, (n_hidden, n_output)) # 3 x 8 Weights for hidden to output
#print(w2)

#Biases
b1 = np.random.normal(0, epsilon**2, n_hidden) # Bias for hidden layer (3)
b2 = np.random.normal(0, epsilon**2, n_output) # Bias for output layer (8)

[[ 4.96714153e-05 -1.38264301e-05  6.47688538e-05]
 [ 1.52302986e-04 -2.34153375e-05 -2.34136957e-05]
 [ 1.57921282e-04  7.67434729e-05 -4.69474386e-05]
 [ 5.42560044e-05 -4.63417693e-05 -4.65729754e-05]
 [ 2.41962272e-05 -1.91328024e-04 -1.72491783e-04]
 [-5.62287529e-05 -1.01283112e-04  3.14247333e-05]
 [-9.08024076e-05 -1.41230370e-04  1.46564877e-04]
 [-2.25776300e-05  6.75282047e-06 -1.42474819e-04]]


In [139]:
#Activation function, our single neuron corresponds exactly to the input-output mapping defined by logistic regression.
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

#Used to compute the weighted input for a layer, which then is put into the activation function
def weighted_input(X, W, b):
    return np.dot(X, W) + b

#No regularization yet
#One-half squared-error cost function
def cost(Y, second_layer_output):
    return np.mean(0.5 * (Y - second_layer_output) ** 2)

In [140]:

#Could also have used a*(1-a) if we don't want to calculate z serperatly
def sigmoid_derivative(z):
    return sigmoid(z) * (1 - sigmoid(z))

#Element-wise multiplication
def delta_output(Y, a, z):
    return (a-Y)*sigmoid_derivative(z)

#In order to do the matrix multiplication, we use this order because each row is a learning example
def delta_hidden(W, next_delta, z):
    return np.dot(next_delta, np.transpose(W))*sigmoid_derivative(z)

#In order to do the matrix multiplication, again we use this order because each row is a learning example
def gradient_weight(next_delta, a):
    return np.dot(np.transpose(a), next_delta)
    
#Sum over rows (axis=0), as again each row is a learning example
def gradient_bias(next_delta):
    return np.sum(next_delta, axis=0)


In [141]:
Delta_w1 = 0
Delta_w2 = 0
Delta_b1 = 0
Delta_b2 = 0

alpha = 0.001

for i in range(1000):
    #Hidden layer
    z2 = weighted_input(X, w1, b1)
    a2 = sigmoid(z2)
    
    #Output layer
    z3 = weighted_input(a2, w2, b2)
    a3 = sigmoid(z3)

    #Output layer
    delta3 = delta_output(Y, a3, z3)

    #Hidden layer
    delta2 = delta_hidden(w2, delta3, z2)
    
    #Gradient for weight is same shape as W1/W2 (8x3/3x8)
    #Gradient for bias is same shape as b1/b2 (1x3/1x8)
    Delta_w2 += gradient_weight(delta3, a2)
    Delta_b2 += gradient_bias(delta3)

    Delta_w1 += gradient_weight(delta2, X)
    Delta_b1 += gradient_bias(delta2)

    #Update weights, without regularization term
    w1 += -alpha*np.mean(Delta_w1)
    w2 += -alpha*np.mean(Delta_w2)

    #Update bias
    b1 += -alpha*np.mean(Delta_b1)
    b2 += -alpha*np.mean(Delta_b2)
    
    if i % 100 == 0:
            print("Loss after iteration", i, ":", cost(Y, a3))
             
            # print(w1)
            # print(w2)
            
    
           

Loss after iteration 0 : 0.1249971171330625
Loss after iteration 100 : 0.05999217289289761
Loss after iteration 200 : 0.06248039442734494
Loss after iteration 300 : 0.06249990535973206
Loss after iteration 400 : 0.06249999967287157
Loss after iteration 500 : 0.062499999999122355
Loss after iteration 600 : 0.06249999999999803
Loss after iteration 700 : 0.0625
Loss after iteration 800 : 0.0625
Loss after iteration 900 : 0.0625
