In [1]:
import numpy as np # only dependency
import numpy # import only for types

## Main code of the 3-layer neural network

In [2]:
def nonlin(x: numpy.ndarray, deriv: bool = False) -> numpy.ndarray:
  """
  Sigmoid activation, or its derivative expressed through the sigmoid output.

  The sigmoid maps any value to a value between 0 and 1.
  In our situation we use it to convert numbers to probabilities.

  Parameters:
  ----------
  x : numpy.ndarray
    With ``deriv=False``: the values to pass through the sigmoid.
    With ``deriv=True``: values that are ALREADY sigmoid outputs ``s``
    (not the raw pre-activation input), because the derivative is
    computed as ``s * (1 - s)``.
  deriv : bool
    When truthy, return ``x * (1 - x)`` instead of the sigmoid of ``x``.

  Returns:
  -------
  numpy.ndarray
    Element-wise result, same shape as ``x``.
  """
  if deriv:
    return x * (1 - x)
  # For large negative x, exp(-x) overflows to inf; 1 / (1 + inf) is still
  # the correct limit 0.0, so only the spurious overflow warning is silenced.
  # The computed values are unchanged.
  with np.errstate(over="ignore"):
    return 1 / (1 + np.exp(-x))

In [3]:
# Training inputs: one row per example, three features each.
# The third column is the constant 1 in every row.
X = np.array([[0, 0, 1],
              [0, 1, 1],
              [1, 0, 1],
              [1, 1, 1]])

# Training targets as a column vector, one value per row of X.
# Each target equals the XOR of the first two features of its row.
y = np.array([0, 1, 1, 0]).reshape(-1, 1)

# Show both arrays, inputs first.
print(X, y, sep="\n")

[[0 0 1]
 [0 1 1]
 [1 0 1]
 [1 1 1]]
[[0]
 [1]
 [1]
 [0]]


In [18]:
def training(iter_cnt: int = 60_000) -> None:
  """
  Train the 3-layer network on the notebook-level dataset ``X`` / ``y``.

  All training logic lives here: weight initialization, the forward pass,
  backpropagation and the weight updates (applied with an implicit step
  size of 1). The mean absolute error is printed every 10,000 iterations,
  and the last prediction is printed next to the targets at the end.

  Parameters:
  ----------
  iter_cnt : int
    Count of optimization iterations. With ``iter_cnt <= 0`` no update is
    performed and the prediction of the untrained (randomly initialized)
    network is printed.

  Returns:
  -------
  None
    Results are only printed; the learned weights are not returned.
  """
  # Seed the global NumPy RNG so every run draws the same initial weights.
  # The numbers are still randomly distributed, but identically on each call,
  # which makes it easier to see how a code change affects the network.
  np.random.seed(1)

  layer0 = X  # The first layer is simply the input data.

  # Layer sizes are taken from the data instead of being hard-coded.
  n_inputs = layer0.shape[1]
  n_outputs = y.shape[1]
  n_hidden = 4

  # Initialize weights uniformly in [-1, 1), i.e. with mean 0.
  # synapse0 connects layer0 to layer1: one row per input feature,
  # one column per hidden unit.
  synapse0 = 2 * np.random.random((n_inputs, n_hidden)) - 1
  # synapse1 connects layer1 to layer2: one row per hidden unit,
  # one column per output. (layer1 itself has shape (n_samples, n_hidden).)
  synapse1 = 2 * np.random.random((n_hidden, n_outputs)) - 1

  # Forward pass with the untrained weights, so that there is a prediction
  # to report even when the loop below does not run (iter_cnt <= 0).
  layer2 = nonlin(np.dot(nonlin(np.dot(layer0, synapse0)), synapse1))

  for iteration in range(iter_cnt):
    # Feed forward through layers 0, 1 and 2.
    layer1 = nonlin(np.dot(layer0, synapse0))  # hidden layer
    layer2 = nonlin(np.dot(layer1, synapse1))  # output layer: the network's guess

    # layer2 holds a guess for each input row; subtracting it from the true
    # answer gives positive and negative numbers that show how much the
    # network missed.
    layer2_error = y - layer2

    # Monitor the error.
    if iteration % 10_000 == 0:
      print("Error:", np.mean(np.abs(layer2_error)))

    # Error scaled by the sigmoid slope at the output: guesses close to
    # 0 or 1 (confident ones) have a small slope, so they are muted.
    layer2_delta = layer2_error * nonlin(layer2, deriv=True)

    # Backpropagation: send the output delta back across synapse1 to find
    # how much each hidden value contributed to the output error.
    # This must use synapse1 as it was BEFORE this iteration's update.
    layer1_error = layer2_delta.dot(synapse1.T)

    # Same slope scaling for the hidden layer.
    layer1_delta = layer1_error * nonlin(layer1, deriv=True)

    # Update weights: each product sums the per-example updates for every
    # weight in one matrix multiplication.
    synapse1 += layer1.T.dot(layer2_delta)
    synapse0 += layer0.T.dot(layer1_delta)

  # layer2 is the output of the last forward pass, i.e. it was computed
  # before the final weight update was applied.
  print("\nPrediction result:")
  print(layer2)
  print("Values that we trying predict:")
  print(y)


## Examples

In [19]:
%%time
# Train with the default iteration count (60_000) and time the whole cell.
# training() returns nothing; it prints the error log and the final prediction.
training()

Error: 0.4964100319027255
Error: 0.008584525653247157
Error: 0.00578945986250781
Error: 0.004629176776769985
Error: 0.00395876528027365
Error: 0.0035101225678616753

Prediction result:
[[0.00260572]
 [0.99672209]
 [0.99701711]
 [0.00386759]]
Values that we trying predict:
[[0]
 [1]
 [1]
 [0]]
CPU times: user 646 ms, sys: 9.9 ms, total: 656 ms
Wall time: 658 ms
