In [5]:
import numpy as np # only depency
import numpy # import only for types

## The main code

In [111]:
def nonlin(x: numpy.ndarray, deriv: bool = False) -> numpy.ndarray:
  """
  Sigmoid activation function, or its derivative.

  The sigmoid function maps any real value to a value between 0 and 1.
  In our situation we use it to convert numbers to probabilities.

  Parameters:
  ----------
  x : numpy.ndarray
    With ``deriv=False``: the raw values to pass through the sigmoid.
    With ``deriv=True``: values that are ALREADY sigmoid outputs
    (i.e. ``s = nonlin(z)``); the derivative is then computed as ``s * (1 - s)``.
    Any array-like (list, tuple, scalar) is accepted and converted with
    ``np.asarray``.
  deriv : bool
    Flag for generating the derivative of the sigmoid instead of the sigmoid itself.

  Returns:
  -------
  numpy.ndarray
    Element-wise result with the same shape as ``x``.
  """
  # Accept plain Python sequences too; ndarrays pass through without a copy.
  x = np.asarray(x)
  if deriv:
    # Derivative expressed through the sigmoid output: s'(z) = s(z) * (1 - s(z)).
    return x * (1 - x)
  return 1 / (1 + np.exp(-x))


In [116]:
# Training inputs
# A (4 x 3) matrix: one row per training example, one column per input node,
# so the network has 3 input nodes and is trained on 4 examples.
X = np.array([[0, 0, 1], [0, 1, 1], [1, 0, 1], [1, 1, 1]])
print(X)

# Training targets (the expected answers)
# The four answers are listed in a flat sequence and then reshaped into a
# column, giving a (4 x 1) matrix: one output value per training example.
# So, this network has 3 inputs (dataset above) and 1 output.
y = np.array([0, 0, 1, 1]).reshape(-1, 1)
print(y)

[[0 0 1]
 [0 1 1]
 [1 0 1]
 [1 1 1]]
[[0]
 [0]
 [1]
 [1]]


In [135]:
def learning(iter_cnt: int = 10_000) -> None:
  """
  Trains the network on the module-level dataset (X, y) and prints the result.
  All training logic is collected here. Main function.

  Parameters:
  ----------
  iter_cnt : int
    Count of optimization iterations. With 0 (or a negative value) no weight
    update is performed and the predictions of the freshly initialized
    network are printed.

  Returns:
  -------
  None
    The predictions and the target values are printed to stdout.
  """
  # Seed random numbers to make the calculation deterministic.
  # The numbers are still randomly distributed, but in exactly the same way
  # on every call, which makes it easier to see how changes affect the network.
  np.random.seed(1)

  # Forward propagation input: the first layer, layer0, is simply our data.
  # This is "full batch" training: all training examples are processed at once.
  # It never changes, so it is bound once, outside the loop.
  layer0 = X

  # Initialize weights randomly in [-1, 1) (mean 0).
  # With only 2 layers (input and output) a single weight matrix connects them.
  # Its shape is (number of inputs, number of outputs), taken from the data;
  # for the dataset of this notebook that is (3, 1).
  # The "neural network" is really just this matrix: all of the learning
  # is stored in synapse0.
  synapse0 = 2 * np.random.random((layer0.shape[1], y.shape[1])) - 1

  # Prediction of the untrained network. Binding layer1 here keeps the final
  # print valid even when the loop below does not run (iter_cnt <= 0).
  layer1 = nonlin(np.dot(layer0, synapse0))

  # This loop iterates multiple times over the training code to fit the network to the dataset.
  for _ in range(iter_cnt):
    # Prediction step: multiply layer0 by synapse0, then pass the result
    # through the sigmoid. One guess per training example comes out,
    # i.e. a (4 x 1) matrix for the 4 examples of this notebook.
    layer1 = nonlin(np.dot(layer0, synapse0))

    # Error: subtract the guess (layer1) from the true answer (y).
    # layer1_error is a vector of positive and negative numbers
    # reflecting how much the network missed.
    layer1_error = y - layer1

    # Scale the error by the slope of the sigmoid at the values in layer1.
    # A shallow slope (close to 0) means the output was very high or very low,
    # i.e. the network was confident; a guess near 0.5 is not confident.
    # So the "wishy-washy" predictions are updated most heavily, while the
    # confident ones are mostly left alone (multiplied by a number close to 0).
    layer1_delta = layer1_error * nonlin(layer1, True)

    # Update weights: computes the update of every weight for every training
    # example, sums them, and applies the result, all in one line.
    synapse0 += np.dot(layer0.T, layer1_delta)

  # NOTE: layer1 holds the last forward pass, i.e. the predictions made
  # just before the final weight update.
  print("Training result:")
  print(layer1)
  print()
  print("Values that we trying to predict:")
  print(y)

## Examples

In [153]:
%%time

# Baseline run: train with the default number of iterations.
print("\t\t\t\t__Default settings__")
learning()

# Sweep over 1, 1001, ..., 10001 iterations. Every call re-seeds and
# re-initializes the weights, so each run is an independent training from scratch.
for iter_cnt in range(1, 11_000, 1_000):
  print(f"\t\t\t\t__{iter_cnt} optimize iterations__")
  learning(iter_cnt)

				__Default settings__
Training result:
[[0.00966449]
 [0.00786506]
 [0.99358898]
 [0.99211957]]

Values that we trying to predict:
[[0]
 [0]
 [1]
 [1]]
				__1 optimize iterations__
Training result:
[[0.2689864 ]
 [0.36375058]
 [0.23762817]
 [0.3262757 ]]

Values that we trying to predict:
[[0]
 [0]
 [1]
 [1]]
				__1001 optimize iterations__
Training result:
[[0.03176745]
 [0.02575143]
 [0.97907779]
 [0.97416005]]

Values that we trying to predict:
[[0]
 [0]
 [1]
 [1]]
				__2001 optimize iterations__
Training result:
[[0.02210122]
 [0.01793507]
 [0.985409  ]
 [0.98200541]]

Values that we trying to predict:
[[0]
 [0]
 [1]
 [1]]
				__3001 optimize iterations__
Training result:
[[0.0179128 ]
 [0.01454746]
 [0.98815758]
 [0.98540875]]

Values that we trying to predict:
[[0]
 [0]
 [1]
 [1]]
				__4001 optimize iterations__
Training result:
[[0.01544409]
 [0.0125494 ]
 [0.98978021]
 [0.98741602]]

Values that we trying to predict:
[[0]
 [0]
 [1]
 [1]]
				__5001 optimize iterations__
