In [268]:
import numpy as np

In [269]:
# Network architecture: 8 inputs (one per number in a row of the matrix),
# a hidden layer with 3 nodes (+ bias) and an output layer with 8 nodes.
n_input = 8
n_hidden = 3
n_output = 8

# Training data: the 8x8 identity matrix, where each row is a learning example.
X = np.eye(n_input)

# The targets are the inputs themselves; Y refers to the same array object as X.
Y = X
print(Y)

[[1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1.]]


In [270]:
# Parameter initialisation: every weight and bias is drawn from a zero-mean
# normal distribution. epsilon**2 is passed as the `scale` argument of
# np.random.normal, so it acts as the standard deviation (about 0.01).
# NOTE(review): if a *variance* of epsilon**2 was intended, the scale should
# be epsilon instead -- confirm before changing, as it alters every draw.
epsilon = 0.1
init_scale = epsilon**2

# Fixed seed so the draws below are reproducible. The order of the draws
# (w1, w2, b1, b2) decides which random numbers end up in which array.
np.random.seed(42)

# Weights
w1 = np.random.normal(0, init_scale, (n_input, n_hidden))   # input -> hidden (n_input x n_hidden)
print(w1)
w2 = np.random.normal(0, init_scale, (n_hidden, n_output))  # hidden -> output (n_hidden x n_output)

# Biases
b1 = np.random.normal(0, init_scale, n_hidden)  # hidden layer, one value per hidden node
b2 = np.random.normal(0, init_scale, n_output)  # output layer, one value per output node

[[ 0.00496714 -0.00138264  0.00647689]
 [ 0.0152303  -0.00234153 -0.00234137]
 [ 0.01579213  0.00767435 -0.00469474]
 [ 0.0054256  -0.00463418 -0.0046573 ]
 [ 0.00241962 -0.0191328  -0.01724918]
 [-0.00562288 -0.01012831  0.00314247]
 [-0.00908024 -0.01412304  0.01465649]
 [-0.00225776  0.00067528 -0.01424748]]


In [271]:
def sigmoid(x):
    """Logistic (sigmoid) activation function: 1 / (1 + exp(-x)).

    With this activation, a single neuron corresponds exactly to the
    input-output mapping defined by logistic regression.
    """
    denominator = 1 + np.exp(-x)
    return 1 / denominator

def weighted_input(X, W, b):
    """Compute the weighted input of a layer: dot(X, W) + b.

    The result is what is subsequently fed into the activation function.
    """
    linear_part = np.dot(X, W)
    return linear_part + b

def cost(Y, second_layer_output):
    """One-half squared-error cost function (no regularization yet).

    Computes 0.5 * (Y - second_layer_output)**2 element-wise and returns
    the mean over all elements.
    """
    residual = Y - second_layer_output
    half_squared_error = 0.5 * residual ** 2
    return np.mean(half_squared_error)

In [272]:

def sigmoid_derivative(z):
    """Derivative of the sigmoid with respect to its input z.

    Uses sigmoid'(z) = sigmoid(z) * (1 - sigmoid(z)). If the activation
    a = sigmoid(z) is already available, a * (1 - a) gives the same value
    without having to calculate z separately.
    """
    # Evaluate the sigmoid once and reuse it rather than computing it twice.
    s = sigmoid(z)
    return s * (1 - s)

def delta_output(Y, a, z):
    """Error term (delta) of the output layer.

    Element-wise product of the output error (a - Y) and the sigmoid
    derivative evaluated at the weighted input z.
    """
    output_error = a - Y
    slope = sigmoid_derivative(z)
    return output_error * slope

def delta_hidden(W, next_delta, z):
    """Error term (delta) of a hidden layer, propagated back from the next layer.

    Each row is a learning example, so the next layer's delta is multiplied
    by the transposed weights (in that order); the product is then scaled
    element-wise by the sigmoid derivative at the weighted input z.
    """
    propagated_error = np.dot(next_delta, np.transpose(W))
    return propagated_error * sigmoid_derivative(z)

def gradient_weight(next_delta, a):
    """Gradient with respect to a weight matrix.

    Each row is a learning example, so the activations are transposed
    before being multiplied with the delta of the following layer.
    """
    activations_t = np.transpose(a)
    return np.dot(activations_t, next_delta)
    
def gradient_bias(next_delta):
    """Gradient with respect to a bias vector.

    Sums the deltas over the rows (axis=0), as each row is a learning example.
    """
    summed_over_examples = np.sum(next_delta, axis=0)
    return summed_over_examples


In [273]:

# Batch gradient descent: every iteration runs a forward pass over all
# learning examples, back-propagates the error and updates the parameters.
# The updates modify w1, w2, b1 and b2 in place, so re-running this cell
# continues training from the current values; re-run the initialisation
# cell first to start from scratch.
alpha = 0.5              # learning rate
n_iterations = 10001     # one extra step so the last report lands on iteration 10000
report_every = 1000      # print the loss and outputs every this many iterations
n_examples = X.shape[0]  # each row of X is a learning example

for i in range(n_iterations):

    # Forward pass
    a1 = X                           # input layer
    z2 = weighted_input(a1, w1, b1)  # hidden layer
    a2 = sigmoid(z2)
    z3 = weighted_input(a2, w2, b2)  # output layer
    a3 = sigmoid(z3)

    # Backward pass: error terms of the output layer, then the hidden layer
    delta3 = delta_output(Y, a3, z3)
    delta2 = delta_hidden(w2, delta3, z2)

    # Gradient for weight is same shape as w1/w2 (8x3/3x8)
    # Gradient for bias is same shape as b1/b2 (1x3/1x8)
    Delta_w2 = gradient_weight(delta3, a2)
    Delta_b2 = gradient_bias(delta3)
    Delta_w1 = gradient_weight(delta2, a1)
    Delta_b1 = gradient_bias(delta2)

    # Update weights and biases, without regularization term.
    # Divide by the number of learning examples (instead of a hard-coded 8).
    w1 -= alpha * (Delta_w1 / n_examples)
    w2 -= alpha * (Delta_w2 / n_examples)
    b1 -= alpha * (Delta_b1 / n_examples)
    b2 -= alpha * (Delta_b2 / n_examples)

    # The reported loss and outputs come from the forward pass above,
    # i.e. from the parameters as they were before this iteration's update.
    if i % report_every == 0:
        print("Loss after iteration", i, ":", cost(Y, a3))
        print(a3)
            
          
           

Loss after iteration 0 : 0.12471918594172093
[[0.49926481 0.49735264 0.50095755 0.50115934 0.49997571 0.49420559
  0.4962487  0.50472465]
 [0.49925726 0.49735304 0.50095031 0.50116414 0.49997988 0.49420886
  0.49624818 0.50473059]
 [0.4992559  0.4973462  0.50095522 0.50115707 0.49998315 0.49419755
  0.49624033 0.50473092]
 [0.49925955 0.49735363 0.50095635 0.50116402 0.4999854  0.49421449
  0.49625443 0.50471743]
 [0.49925488 0.49736165 0.50095197 0.50117674 0.49999628 0.49423846
  0.49627122 0.50470385]
 [0.49926695 0.49735733 0.50096091 0.50116415 0.49998163 0.49421973
  0.4962609  0.50470912]
 [0.49927348 0.49736096 0.50096051 0.50116422 0.49997176 0.49422007
  0.4962622  0.50471223]
 [0.49925769 0.49734856 0.5009653  0.50115997 0.49999784 0.49421371
  0.49625567 0.50470285]]
Loss after iteration 1000 : 0.054680272743123266
[[0.12449936 0.12443852 0.12451027 0.12455369 0.12471104 0.1246134
  0.12470286 0.12447258]
 [0.12449359 0.12454301 0.12453527 0.12465687 0.12481934 0.12466507
 