<a href="https://colab.research.google.com/github/nivgold/Neural-Network-From-Scratch/blob/main/neural_network.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Imports cell

import numpy as np
import tensorflow as tf

# Section 1 - Forward Propagation

In [None]:
def initialize_parameters(layer_dims):
  """
  Create the initial weights and biases of every layer in the network.

  layer_dims: ndarray of shape(num_of_layers, ), layer sizes with the input layer first

  Returns a dictionary holding, for every layer l >= 1, "W{l}" of shape
  (layer_dims[l], layer_dims[l-1]) and "B{l}" of shape (layer_dims[l], 1).
  """
  parameters = {}
  consecutive_sizes = zip(layer_dims[:-1], layer_dims[1:])
  for layer_number, (fan_in, fan_out) in enumerate(consecutive_sizes, start=1):
    # random normal weights scaled by sqrt(2 / fan_in); biases start at zero
    scale = np.sqrt(2/fan_in)
    parameters[f"W{layer_number}"] = np.random.randn(fan_out, fan_in) * scale
    parameters[f"B{layer_number}"] = np.zeros((fan_out, 1))

  return parameters


def linear_forward(A, W, B):
  """
  Linear part of a layer's forward propagation: Z = W @ A + B.

  A: ndarray of shape (size_of_prev_layer, num_of_examples)
  W: ndarray of shape (size_of_current_layer, size_of_prev_layer)
  B: ndarray of shape (size_of_current_layer, 1), broadcast over the examples

  Returns Z and a dictionary caching the three inputs under 'A', 'W' and 'B'.
  """
  linear_cache = dict(A=A, W=W, B=B)

  # pre-activation values of the current layer
  weighted_input = W @ A
  Z = weighted_input + B
  return Z, linear_cache


def softmax(Z):
  """
  Column-wise softmax activation.

  Z: ndarray of shape (size_of_current_layer, num_of_examples)

  Returns the activations A (same shape as Z, every column sums to 1) and Z
  itself as the activation cache.
  """
  # subtracting the per-example maximum keeps np.exp from overflowing
  shifted = Z - np.max(Z, axis=0, keepdims=True)
  exponentials = np.exp(shifted)
  column_totals = np.sum(exponentials, axis=0, keepdims=True)

  return exponentials/column_totals, Z


def relu(Z):
  """
  ReLU activation: keeps the positive entries of Z and zeroes the rest.

  Z: ndarray of shape (size_of_current_layer, num_of_examples)

  Returns the activations A and Z itself as the activation cache.
  """
  is_positive = Z > 0
  return np.where(is_positive, Z, 0), Z


def linear_activation_forward(A_prev, W, B, activation):
  """
  Forward propagation of one full layer: linear step followed by the activation.

  A_prev: ndarray of shape (size_of_prev_layer, num_of_examples)
  W: ndarray of shape (size_of_current_layer, size_of_prev_layer)
  B: ndarray of shape (size_of_current_layer, 1)
  activation: an implemented python's activation function returning (A, activation_cache)

  Returns the activations A and the layer cache: the linear cache ('A', 'W', 'B')
  extended with the activation cache under 'Z'.
  """
  pre_activation, layer_cache = linear_forward(A_prev, W, B)
  A, cached_Z = activation(pre_activation)

  # keep everything the backward pass needs in a single dictionary
  layer_cache["Z"] = cached_Z
  return A, layer_cache


def L_model_forward(X, parameters, use_batchnorm):
  """
  Forward propagation through the whole network: relu on every hidden layer,
  softmax on the output layer.

  X: ndarray of shape (input_size, num_of_examples)
  parameters: python's dictionary containing the weights of the network ("W{l}", "B{l}")
  use_batchnorm: a boolean flag used to determine whether to apply batchnorm after
                 the activation of every hidden layer

  Returns AL, the output of the softmax layer, and the list of per-layer caches.
  """
  # every layer contributes one W and one B entry
  num_of_layers = len(parameters) // 2

  caches = []
  activations = X
  # hidden layers
  for layer_index in range(1, num_of_layers):
    activations, cache = linear_activation_forward(
        activations, parameters[f"W{layer_index}"], parameters[f"B{layer_index}"], relu)
    if use_batchnorm:
      # NOTE: only the un-normalized layer cache is stored; the normalization
      # itself is not recorded in the caches
      activations = apply_batchnorm(activations)
    caches.append(cache)

  # output layer
  AL, output_cache = linear_activation_forward(
      activations, parameters[f"W{num_of_layers}"], parameters[f"B{num_of_layers}"], softmax)
  caches.append(output_cache)

  return AL, caches


def compute_cost(AL, Y):
  """
  Categorical cross-entropy cost averaged over the examples.

  AL: ndarray of shape (num_of_classes, num_of_examples), predicted probabilities
  Y: ndarray of shape (num_of_classes, num_of_examples), target labels

  Returns the cost as a scalar.
  """
  # small constant that keeps np.log away from log(0)
  epsilon = 1e-12
  log_probabilities = np.log(AL+epsilon)
  per_example_loss = np.sum(np.multiply(Y, log_probabilities), axis=0)
  return -np.mean(per_example_loss)


def apply_batchnorm(A, epsilon=7e-1):
  """
  Normalize the activations of a layer over the batch (per neuron).

  A: ndarray of shape (size_of_current_layer, num_of_examples)
  epsilon: constant added to the variance before taking the square root. The
           default keeps the value this function always used.

  Returns NA, ndarray with the same shape as A: (A - mean) / sqrt(var + epsilon),
  where mean and var are computed for every row over the examples.
  """
  # NOTE(review): the default 7e-1 is very large for a stability constant - rows
  # whose variance is well below it are mostly centered and barely rescaled. It
  # looks like a typo for a much smaller value; confirm before changing it, since
  # every recorded result was produced with 0.7.

  # Calculating mean and var of current layer-batch activations values
  mu = np.mean(A, axis=1, keepdims=True)
  var = np.var(A, axis=1, keepdims=True)

  # Normalizing activations values
  NA = (A - mu) / (np.sqrt(var + epsilon))
  return NA

# Section 2 - Backpropagation

In [None]:
def linear_backward(dZ, cache):
  """
  Backward propagation of the linear step Z = W @ A_prev + B of a single layer.

  dZ: ndarray of shape (size_of_current_layer, num_of_examples)
  cache: tuple of values (A_prev, W, B) coming from the forward propagation in the current layer

  Returns (dA_prev, dW, dB): dA_prev has the shape of A_prev, dW the shape of W,
  dB is of shape (size_of_current_layer, 1). dW and dB are averaged over the examples.
  """
  # B itself is not needed to compute any of the gradients
  A_prev, W, _ = cache
  batch_examples = A_prev.shape[1]

  dW = (dZ @ A_prev.T) / batch_examples
  dB = np.mean(dZ, axis=1, keepdims=True)
  dA_prev = W.T @ dZ
  return dA_prev, dW, dB

def linear_activation_backward(dA, cache, activation):
  """
  Backward propagation of one full layer (activation followed by the linear step).

  dA: ndarray of shape (size_of_current_layer, num_of_examples). For "softmax"
      this is the target matrix Y (see softmax_backward).
  cache: dictionary holding the linear cache ('A', 'W', 'B') and the activation
         cache ('Z'); for the softmax layer it must also hold 'AL'
  activation: the activation function to be used (str, either "softmax" or "relu")

  Returns (dA_prev, dW, dB) as computed by linear_backward.
  Raises ValueError when `activation` is not one of the supported names.
  """
  # fail with a clear message instead of hitting an unbound `dZ` further down
  if activation not in ('relu', 'softmax'):
    raise ValueError(f"unsupported activation: {activation!r} (expected 'relu' or 'softmax')")

  linear_cache = (cache['A'], cache['W'], cache['B'])
  # Separating the 2 activation cases: Relu, Softmax
  if activation == 'relu':
    dZ = relu_backward(dA, cache['Z'])
  else:
    dZ = softmax_backward(dA, cache['AL'])

  return linear_backward(dZ, linear_cache)
  
def relu_backward(dA, activation_cache):
  """
  Gradient of the cost with respect to Z for a relu layer.

  dA: ndarray of shape (size_of_current_layer, num_of_examples)
  activation_cache: containing the calculated Z value from the forward propagation

  Returns dZ: dA where Z was positive, 0 elsewhere.
  """
  unit_was_active = activation_cache > 0
  return np.where(unit_was_active, dA, 0)

def softmax_backward(dA, activation_cache):
  """
  Gradient of the cost with respect to Z for the softmax output layer.

  dA: Y, the target matrix (not an actual dA gradient)
  activation_cache: AL, the softmax output calculated in the forward propagation

  Returns dZ = AL - Y.
  """
  return activation_cache - dA

def L_model_backward(AL, Y, caches):
  """
  Backward propagation through the whole network.

  AL: ndarray of shape (num_of_classes, num_of_examples)
  Y: ndarray of shape (num_of_classes, num_of_examples)
  caches: all of the caches from the forward propagation. The last cache is
          modified in place: AL is stored in it under "AL".

  Returns a dictionary with "dA{l}", "dW{l}" and "dB{l}" for every layer l.
  """
  num_of_layers = len(caches)

  # output layer: the softmax gradient needs AL, so it travels inside the cache
  output_cache = caches[-1]
  output_cache["AL"] = AL
  dA, dW, dB = linear_activation_backward(Y, output_cache, "softmax")
  grads = {
      f"dA{num_of_layers}": dA,
      f"dW{num_of_layers}": dW,
      f"dB{num_of_layers}": dB,
  }

  # hidden layers, from the last one down to the first
  for layer_number in range(num_of_layers-1, 0, -1):
    dA, dW, dB = linear_activation_backward(dA, caches[layer_number-1], "relu")
    grads[f"dA{layer_number}"] = dA
    grads[f"dW{layer_number}"] = dW
    grads[f"dB{layer_number}"] = dB

  return grads

def update_parameters(parameters, grads, learning_rate):
  """
  Perform one gradient-descent step on every parameter.

  parameters: python's dictionary containing the weights of the network
  grads: python's dictionary containing the gradients of all of the parameters,
         keyed by "d" + parameter name
  learning_rate: the step size of the update

  Returns a new dictionary with the updated parameters.
  """
  return {
      name: value - learning_rate * grads[f"d{name}"]
      for name, value in parameters.items()
  }

# Section 3 - Training Loop

In [None]:
def L_layer_model(X, Y, layer_dims, learning_rate, num_iterations, batch_size):
  """
  Train the network with mini-batch gradient descent and early stopping.

  The examples are split randomly 80/20 into a training and a validation set.
  After every mini-batch update the cost on the whole validation set is computed.
  Training stops after `num_iterations` updates, or earlier when 100 consecutive
  updates did not improve the best validation cost by more than 0.001.

  X: ndarray of shape (height*width, num_of_examples)
  Y: ndarray of shape (num_of_classes, num_of_examples)
  layer_dims: ndarray of shape (num_of_layers)
  learning_rate: the step size passed to update_parameters
  num_iterations: the maximal number of mini-batch updates
  batch_size: the number of examples in a single training batch

  Returns (parameters, costs): the parameters with the best validation cost and
  the validation cost recorded every 100 training iterations.
  Raises ValueError when batch_size is smaller than 1.
  """
  if batch_size < 1:
    raise ValueError(f"batch_size must be at least 1, got {batch_size}")

  def train_val_split(X, Y, train_size=0.8):
    """
    Randomly split the examples (columns of X and Y) into a train and a validation set.

    Parameters
    ----------
    X: ndarray of shape (height*width, num_of_examples)
    Y: ndarray of shape (num_of_classes, num_of_examples)
    train_size: float between 0-1, the fraction of the examples used for training

    Returns
    -------
    x_train, y_train, x_validation, y_validation
    """
    num_of_examples = X.shape[1]
    indices = np.random.permutation(num_of_examples)
    num_of_train_examples = int(num_of_examples*train_size)
    training_idx, validation_idx = indices[:num_of_train_examples], indices[num_of_train_examples:]
    return X[:, training_idx], Y[:, training_idx], X[:, validation_idx], Y[:, validation_idx]

  # Splitting the data:
  x_train, y_train, x_validation, y_validation = train_val_split(X, Y)

  print("training examples: ", x_train.shape[1])
  print("validation examples: ", x_validation.shape[1])

  # initialize
  parameters = initialize_parameters(layer_dims)
  costs = []

  # report variables
  training_iteration = 0
  epochs = 0

  # stopping criterion variables
  PATIENCE = 100
  MAX_SMALL_IMPROVEMENT = 0.001
  iterations_to_improve = PATIENCE
  last_cost = np.inf
  stop = False

  # storing the best weights achieved so far
  last_parameters = parameters

  # The batches are identical in every epoch (the data is not reshuffled), so
  # they are split once. An integer section count of at least 1 keeps
  # np.array_split from raising when batch_size exceeds the training set size.
  num_of_batches = max(1, int(x_train.shape[1] // batch_size))
  batches = list(zip(np.array_split(x_train, num_of_batches, axis=1),
                     np.array_split(y_train, num_of_batches, axis=1)))

  while not stop:
    epochs += 1

    for batch_data, batch_targets in batches:
      training_iteration += 1

      # Update params by batch values:
      AL, caches = L_model_forward(batch_data, parameters, False)
      grads = L_model_backward(AL, batch_targets, caches)
      parameters = update_parameters(parameters, grads, learning_rate)

      # computing validation set cost
      validation_AL, _ = L_model_forward(x_validation, parameters, False)
      validation_set_cost = compute_cost(validation_AL, y_validation)

      if training_iteration % 100 == 0:
        # save cost
        costs.append(validation_set_cost)
        print_progress = int((training_iteration/num_iterations)*100)
        print("="+"="*print_progress+">"+"."*(100-print_progress))
        print(f"training iteration: {training_iteration}/{num_iterations}")
        print("validation cost: ", validation_set_cost)
        print("-"*102)

      if validation_set_cost + MAX_SMALL_IMPROVEMENT >= last_cost:
        iterations_to_improve -= 1
      else:
        # update last cost, save best weights so far, and reset the patience counter
        last_cost = validation_set_cost
        last_parameters = parameters
        iterations_to_improve = PATIENCE

      # stop when the patience ran out or the iteration budget is used up
      # (`>=` also terminates for a non-positive or non-integer num_iterations)
      if iterations_to_improve == 0 or training_iteration >= num_iterations:
        stop = True
        break

  print(f"Training done after {epochs} epochs and {training_iteration}/{num_iterations} iterations")
  final_train_acc = predict(x_train, y_train, last_parameters)
  final_validation_acc = predict(x_validation, y_validation, last_parameters)
  print("Final Train Accuracy: {:.3f}".format(final_train_acc*100))
  print("Final Validation Accuracy: {:.3f}".format(final_validation_acc*100))
  return last_parameters, costs


def predict(X, Y, parameters):
  """
  Compute the accuracy of the network on the given data.

  X: ndarray of shape (height*width, num_of_examples)
  Y: ndarray of shape (num_of_classes, num_of_examples)
  parameters: network learned weights

  Returns the fraction of examples whose one-hot prediction equals the target column.
  """
  # outputs shape: (num_of_classes, num_of_examples)
  outputs, _ = L_model_forward(X, parameters, False)

  # turn the most probable class of every example into a one-hot column
  winning_class = np.argmax(outputs, axis=0)
  preds = np.zeros_like(outputs)
  preds[winning_class, np.arange(preds.shape[1])] = 1

  # calculate accuracy
  num_of_examples = Y.shape[1]
  correct_class = sum(
      1 for sample_index in range(num_of_examples)
      if np.array_equal(preds[:, sample_index], Y[:, sample_index]))

  return correct_class / num_of_examples

# Section 4 - Regular Training Phase

In [None]:
# load the MNIST train/test split through keras
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data(path='mnist.npz')

# flatten every image into one row and normalize the input by dividing by 255
x_train = x_train.reshape(len(x_train), -1) / 255
x_test = x_test.reshape(len(x_test), -1) / 255

def encode_labels(y, num_classes=None):
  """
  Performs one-hot encoding to a label vector.

  Parameters
  ----------
  y: ndarray of shape(num_of_examples, ) holding non-negative integer labels
  num_classes: optional number of columns of the encoding. When omitted it is
               inferred as y.max()+1, which requires a non-empty y and yields
               fewer columns when the highest classes are missing from y. Pass
               it explicitly to get the same width for different label vectors.

  Returns
  -------
  y_encoded: The one-hot encoding of the given label vector y,
             integer ndarray of shape (num_of_examples, num_classes).
  """
  y = np.asarray(y)
  if num_classes is None:
    num_classes = int(y.max()) + 1

  y_encoded = np.zeros((y.size, num_classes), dtype=int)
  y_encoded[np.arange(y.size), y] = 1
  return y_encoded

# one-hot encode the MNIST labels of both splits
y_train, y_test = [encode_labels(labels) for labels in (y_train, y_test)]
# layer sizes: the flattened input followed by the widths of the remaining layers
layer_dims = np.array([x_train.shape[1]] + [20, 7, 5, 10])

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz


In [None]:
# fixed seed so the split and the initial weights are reproducible
np.random.seed(42)
print("Start Training")
# L_layer_model expects the examples as columns, hence the transposes
trained_params, costs = L_layer_model(
    X=x_train.T,
    Y=y_train.T,
    layer_dims=layer_dims,
    learning_rate=0.009,
    num_iterations=10000,
    batch_size=64,
)

training examples:  48000
validation examples:  12000
==>...................................................................................................
training iteration: 100/10000
validation cost:  2.130688830167577
------------------------------------------------------------------------------------------------------
===>..................................................................................................
training iteration: 200/10000
validation cost:  1.9920518212247604
------------------------------------------------------------------------------------------------------
====>.................................................................................................
training iteration: 300/10000
validation cost:  1.8754808790988269
------------------------------------------------------------------------------------------------------
=====>................................................................................................
training iteration: 400/1

In [None]:
# accuracy on the held-out test set, as a percentage
test_acc = predict(X=x_test.T, Y=y_test.T, parameters=trained_params) * 100
print("Final Test Accuracy: {:.3f}".format(test_acc))

Final Test Accuracy: 90.840


# Section 5 - Batch Normalization

In [None]:
def L_layer_model_batchnorm(X, Y, layer_dims, learning_rate, num_iterations, batch_size):
  """
  Train the network with batch normalization, mini-batch gradient descent and
  early stopping.

  The examples are split randomly 80/20 into a training and a validation set.
  After every mini-batch update the cost on the whole validation set is computed
  (with batchnorm applied to the validation activations). Training stops after
  `num_iterations` updates, or earlier when 100 consecutive updates did not
  improve the best validation cost by more than 0.001.

  X: ndarray of shape (height*width, num_of_examples)
  Y: ndarray of shape (num_of_classes, num_of_examples)
  layer_dims: ndarray of shape (num_of_layers)
  learning_rate: the step size passed to update_parameters
  num_iterations: the maximal number of mini-batch updates
  batch_size: the number of examples in a single training batch

  Returns (parameters, costs): the parameters with the best validation cost and
  the validation cost recorded every 100 training iterations.
  Raises ValueError when batch_size is smaller than 1.
  """
  if batch_size < 1:
    raise ValueError(f"batch_size must be at least 1, got {batch_size}")

  def train_val_split(X, Y, train_size=0.8):
    """
    Randomly split the examples (columns of X and Y) into a train and a validation set.

    Parameters
    ----------
    X: ndarray of shape (height*width, num_of_examples)
    Y: ndarray of shape (num_of_classes, num_of_examples)
    train_size: float between 0-1, the fraction of the examples used for training

    Returns
    -------
    x_train, y_train, x_validation, y_validation
    """
    num_of_examples = X.shape[1]
    indices = np.random.permutation(num_of_examples)
    num_of_train_examples = int(num_of_examples*train_size)
    training_idx, validation_idx = indices[:num_of_train_examples], indices[num_of_train_examples:]
    return X[:, training_idx], Y[:, training_idx], X[:, validation_idx], Y[:, validation_idx]

  # Splitting the data:
  x_train, y_train, x_validation, y_validation = train_val_split(X, Y)

  print("training examples: ", x_train.shape[1])
  print("validation examples: ", x_validation.shape[1])

  # initialize
  parameters = initialize_parameters(layer_dims)
  costs = []

  # report variables
  training_iteration = 0
  epochs = 0

  # stopping criterion variables
  PATIENCE = 100
  MAX_SMALL_IMPROVEMENT = 0.001
  iterations_to_improve = PATIENCE
  last_cost = np.inf
  stop = False

  # storing the best weights achieved so far
  last_parameters = parameters

  # The batches are identical in every epoch (the data is not reshuffled), so
  # they are split once. An integer section count of at least 1 keeps
  # np.array_split from raising when batch_size exceeds the training set size.
  num_of_batches = max(1, int(x_train.shape[1] // batch_size))
  batches = list(zip(np.array_split(x_train, num_of_batches, axis=1),
                     np.array_split(y_train, num_of_batches, axis=1)))

  while not stop:
    epochs += 1

    for batch_data, batch_targets in batches:
      training_iteration += 1

      # Update params by batch values:
      AL, caches = L_model_forward(batch_data, parameters, True)
      grads = L_model_backward(AL, batch_targets, caches)
      parameters = update_parameters(parameters, grads, learning_rate)

      # computing validation set cost
      validation_AL, _ = L_model_forward(x_validation, parameters, True)
      validation_set_cost = compute_cost(validation_AL, y_validation)

      if training_iteration % 100 == 0:
        # save cost
        costs.append(validation_set_cost)
        print_progress = int((training_iteration/num_iterations)*100)
        print("="+"="*print_progress+">"+"."*(100-print_progress))
        print(f"training iteration: {training_iteration}/{num_iterations}")
        print("validation cost: ", validation_set_cost)
        print("-"*102)

      if validation_set_cost + MAX_SMALL_IMPROVEMENT >= last_cost:
        iterations_to_improve -= 1
      else:
        # update last cost, save best weights so far, and reset the patience counter
        last_cost = validation_set_cost
        last_parameters = parameters
        iterations_to_improve = PATIENCE

      # stop when the patience ran out or the iteration budget is used up
      # (`>=` also terminates for a non-positive or non-integer num_iterations)
      if iterations_to_improve == 0 or training_iteration >= num_iterations:
        stop = True
        break

  print(f"Batch Normalization Training done after {epochs} epochs and {training_iteration}/{num_iterations} iterations")
  final_train_acc = predict_batchnorm(x_train, y_train, last_parameters)
  final_validation_acc = predict_batchnorm(x_validation, y_validation, last_parameters)
  print("Batch Normalization Final Train Accuracy: {:.3f}".format(final_train_acc*100))
  print("Batch Normalization Final Validation Accuracy: {:.3f}".format(final_validation_acc*100))
  return last_parameters, costs

def predict_batchnorm(X, Y, parameters):
  """
  Compute the accuracy of the batch-normalized network on the given data.

  X: ndarray of shape (height*width, num_of_examples)
  Y: ndarray of shape (num_of_classes, num_of_examples)
  parameters: network learned weights

  Returns the fraction of examples whose one-hot prediction equals the target column.
  """
  # outputs shape: (num_of_classes, num_of_examples); batchnorm is applied to
  # the activations of X itself inside the forward pass
  outputs, _ = L_model_forward(X, parameters, True)

  # turn the most probable class of every example into a one-hot column
  winning_class = np.argmax(outputs, axis=0)
  preds = np.zeros_like(outputs)
  preds[winning_class, np.arange(preds.shape[1])] = 1

  # calculate accuracy
  num_of_examples = Y.shape[1]
  correct_class = sum(
      1 for sample_index in range(num_of_examples)
      if np.array_equal(preds[:, sample_index], Y[:, sample_index]))

  return correct_class / num_of_examples

In [None]:
# same seed as the regular run so both start from the same split and weights
np.random.seed(42)
print("Start Batch Normalization Training")
# L_layer_model_batchnorm expects the examples as columns, hence the transposes
trained_params_batchnorm, costs_batchnorm = L_layer_model_batchnorm(
    X=x_train.T,
    Y=y_train.T,
    layer_dims=layer_dims,
    learning_rate=0.009,
    num_iterations=10000,
    batch_size=64,
)

training examples:  48000
validation examples:  12000
==>...................................................................................................
training iteration: 100/10000
validation cost:  2.1160924118522866
------------------------------------------------------------------------------------------------------
===>..................................................................................................
training iteration: 200/10000
validation cost:  2.004428277426092
------------------------------------------------------------------------------------------------------
====>.................................................................................................
training iteration: 300/10000
validation cost:  1.9010776328349481
------------------------------------------------------------------------------------------------------
=====>................................................................................................
training iteration: 400/1

In [None]:
# accuracy of the batch-normalized network on the held-out test set, as a percentage
test_acc_batchnrom = predict_batchnorm(X=x_test.T, Y=y_test.T, parameters=trained_params_batchnorm) * 100
print("Batch Normalization Final Test Accuracy: {:.3f}".format(test_acc_batchnrom))

Final Test Accuracy: 87.630


# Section 6 - Dropout

In [None]:
def apply_dropout(A_new, keep_prob):
  """
  Apply (inverted) dropout to the activations of a layer.

  A_new: ndarray of shape (size_of_current_layer, num_of_examples)
  keep_prob: float in (0, 1], the keep probability of every neuron in the dropout layer

  Returns (A_dropout, dropout_cache): the activations with the dropped entries
  zeroed and the kept ones divided by keep_prob, and a dictionary holding the
  boolean mask ("dropout_vector") and "keep_prob" for the backward pass.
  Raises ValueError when keep_prob is outside (0, 1].
  """
  # keep_prob == 0 would divide by zero below, and values above 1 are not probabilities
  if not 0 < keep_prob <= 1:
    raise ValueError(f"keep_prob must be in (0, 1], got {keep_prob}")

  # create the dropout matrix which indicates which neurons stay turned on
  dropout_vector = np.random.rand(A_new.shape[0], A_new.shape[1]) < keep_prob
  # "activate" the dropout and rescale the surviving activations to reduce the
  # effect of the turned off neurons; a regular (not in-place) division also
  # works when A_new holds integers
  A_dropout = np.multiply(A_new, dropout_vector) / keep_prob
  dropout_cache = {
      "dropout_vector": dropout_vector,
      "keep_prob": keep_prob
  }
  return A_dropout, dropout_cache

def L_model_forward_dropout(X, parameters, use_batchnorm, dropout_keep_prob):
  """
  Forward propagation through the whole network with dropout after every hidden
  (relu) layer; the softmax output layer has no dropout.

  X: ndarray of shape (input_size, num_of_examples)
  parameters: python's dictionary containing the weights of the network ("W{l}", "B{l}")
  use_batchnorm: NOT SUPPORTED WITH DROPOUT - the flag is accepted but ignored
  dropout_keep_prob: indicating the keep probability for every neuron in every hidden layer

  Returns AL, the output of the softmax layer, and the list of per-layer caches.
  The caches of the hidden layers also hold "dropout_vector" and "keep_prob".
  """
  # every layer contributes one W and one B entry
  num_of_layers = len(parameters) // 2

  caches = []
  activations = X
  # hidden layers
  for layer_index in range(1, num_of_layers):
    layer_output, cache = linear_activation_forward(
        activations, parameters[f"W{layer_index}"], parameters[f"B{layer_index}"], relu)

    # drop neurons and remember the mask for the backward pass
    activations, dropout_cache = apply_dropout(layer_output, dropout_keep_prob)
    cache.update(dropout_cache)
    caches.append(cache)

  # output layer
  AL, output_cache = linear_activation_forward(
      activations, parameters[f"W{num_of_layers}"], parameters[f"B{num_of_layers}"], softmax)
  caches.append(output_cache)

  return AL, caches

def dropout_backward(dA_dropout, dropout_cache):
  """
  Backward propagation through a dropout layer.

  dA_dropout: ndarray of shape: (size_of_current_layer, num_of_examples)
  dropout_cache: pair (dropout_vector, keep_prob) used in the forward propagation in this dropout layer

  Returns dA: the gradient zeroed where the neuron was dropped and scaled by
  1/keep_prob where it was kept.
  """
  mask, keep_prob = dropout_cache
  scaled_mask = mask*(1/keep_prob)
  return np.multiply(dA_dropout, scaled_mask)

def L_model_backward_dropout(AL, Y, caches):
  """
  Backward propagation through the whole network when dropout was used in the
  forward pass.

  AL: ndarray of shape (num_of_classes, num_of_examples)
  Y: ndarray of shape (num_of_classes, num_of_examples)
  caches: all of the caches from the forward propagation. The caches of the
          hidden layers also contain "dropout_vector" and "keep_prob". The last
          cache is modified in place: AL is stored in it under "AL".

  Returns a dictionary with "dA_dropout{l}", "dW{l}" and "dB{l}" for every layer l.
  """
  num_of_layers = len(caches)

  # output layer: the softmax gradient needs AL, so it travels inside the cache
  output_cache = caches[-1]
  output_cache["AL"] = AL
  dA_dropout, dW, dB = linear_activation_backward(Y, output_cache, "softmax")
  grads = {
      f"dA_dropout{num_of_layers}": dA_dropout,
      f"dW{num_of_layers}": dW,
      f"dB{num_of_layers}": dB,
  }

  # hidden layers, from the last one down to the first
  for layer_number in range(num_of_layers-1, 0, -1):
    layer_cache = caches[layer_number-1]
    # undo the dropout of this layer before going through its relu
    dA = dropout_backward(dA_dropout, (layer_cache['dropout_vector'], layer_cache['keep_prob']))

    dA_dropout, dW, dB = linear_activation_backward(dA, layer_cache, "relu")
    grads[f"dA_dropout{layer_number}"] = dA_dropout
    grads[f"dW{layer_number}"] = dW
    grads[f"dB{layer_number}"] = dB

  return grads

def L_layer_model_dropout(X, Y, layer_dims, learning_rate, num_iterations, batch_size, dropout_keep_prob=0.8):
  """
  Train the network with dropout, mini-batch gradient descent and early stopping.

  The examples are split randomly 80/20 into a training and a validation set.
  After every mini-batch update the cost on the whole validation set is computed
  WITHOUT dropout. Training stops after `num_iterations` updates, or earlier when
  100 consecutive updates did not improve the best validation cost by more than 0.001.

  X: ndarray of shape (height*width, num_of_examples)
  Y: ndarray of shape (num_of_classes, num_of_examples)
  layer_dims: ndarray of shape (num_of_layers)
  learning_rate: the step size passed to update_parameters
  num_iterations: the maximal number of mini-batch updates
  batch_size: the number of examples in a single training batch
  dropout_keep_prob: keep probability of every hidden neuron in the training forward pass

  Returns (parameters, costs): the parameters with the best validation cost and
  the validation cost recorded every 100 training iterations.
  Raises ValueError when batch_size is smaller than 1.
  """
  if batch_size < 1:
    raise ValueError(f"batch_size must be at least 1, got {batch_size}")

  def train_val_split(X, Y, train_size=0.8):
    """
    Randomly split the examples (columns of X and Y) into a train and a validation set.

    Parameters
    ----------
    X: ndarray of shape (height*width, num_of_examples)
    Y: ndarray of shape (num_of_classes, num_of_examples)
    train_size: float between 0-1, the fraction of the examples used for training

    Returns
    -------
    x_train, y_train, x_validation, y_validation
    """
    num_of_examples = X.shape[1]
    indices = np.random.permutation(num_of_examples)
    num_of_train_examples = int(num_of_examples*train_size)
    training_idx, validation_idx = indices[:num_of_train_examples], indices[num_of_train_examples:]
    return X[:, training_idx], Y[:, training_idx], X[:, validation_idx], Y[:, validation_idx]

  # Splitting the data:
  x_train, y_train, x_validation, y_validation = train_val_split(X, Y)

  print("training examples: ", x_train.shape[1])
  print("validation examples: ", x_validation.shape[1])

  # initialize
  parameters = initialize_parameters(layer_dims)
  costs = []

  # report variables
  training_iteration = 0
  epochs = 0

  # stopping criterion variables
  PATIENCE = 100
  MAX_SMALL_IMPROVEMENT = 0.001
  iterations_to_improve = PATIENCE
  last_cost = np.inf
  stop = False

  # storing the best weights achieved so far
  last_parameters = parameters

  # The batches are identical in every epoch (the data is not reshuffled), so
  # they are split once. An integer section count of at least 1 keeps
  # np.array_split from raising when batch_size exceeds the training set size.
  num_of_batches = max(1, int(x_train.shape[1] // batch_size))
  batches = list(zip(np.array_split(x_train, num_of_batches, axis=1),
                     np.array_split(y_train, num_of_batches, axis=1)))

  while not stop:
    epochs += 1

    for batch_data, batch_targets in batches:
      training_iteration += 1

      # Update params by batch values (dropout is active only here):
      AL, caches = L_model_forward_dropout(batch_data, parameters, False, dropout_keep_prob)
      grads = L_model_backward_dropout(AL, batch_targets, caches)
      parameters = update_parameters(parameters, grads, learning_rate)

      # computing validation set cost with the full network: dropout is a
      # training-time regularizer, and randomly dropping neurons here would make
      # the early-stopping signal noisy and inconsistent with `predict`
      validation_AL, _ = L_model_forward(x_validation, parameters, False)
      validation_set_cost = compute_cost(validation_AL, y_validation)

      if training_iteration % 100 == 0:
        # save cost
        costs.append(validation_set_cost)
        print_progress = int((training_iteration/num_iterations)*100)
        print("="+"="*print_progress+">"+"."*(100-print_progress))
        print(f"training iteration: {training_iteration}/{num_iterations}")
        print("validation cost: ", validation_set_cost)
        print("-"*102)

      if validation_set_cost + MAX_SMALL_IMPROVEMENT >= last_cost:
        iterations_to_improve -= 1
      else:
        # update last cost, save best weights so far, and reset the patience counter
        last_cost = validation_set_cost
        last_parameters = parameters
        iterations_to_improve = PATIENCE

      # stop when the patience ran out or the iteration budget is used up
      # (`>=` also terminates for a non-positive or non-integer num_iterations)
      if iterations_to_improve == 0 or training_iteration >= num_iterations:
        stop = True
        break

  print(f"Dropout Training done after {epochs} epochs and {training_iteration}/{num_iterations} iterations")
  final_train_acc = predict(x_train, y_train, last_parameters)
  final_validation_acc = predict(x_validation, y_validation, last_parameters)
  print("Dropout Final Train Accuracy: {:.3f}".format(final_train_acc*100))
  print("Dropout Final Validation Accuracy: {:.3f}".format(final_validation_acc*100))
  return last_parameters, costs

In [None]:
# same seed as the other runs so all start from the same split and weights
np.random.seed(42)
print("Start Dropout Training")
# L_layer_model_dropout expects the examples as columns, hence the transposes
trained_params_dropout, costs_dropout = L_layer_model_dropout(
    X=x_train.T,
    Y=y_train.T,
    layer_dims=layer_dims,
    learning_rate=0.009,
    num_iterations=10000,
    batch_size=64,
    dropout_keep_prob=0.8,
)

training examples:  48000
validation examples:  12000
==>...................................................................................................
training iteration: 100/10000
validation cost:  2.2266398088147015
------------------------------------------------------------------------------------------------------
===>..................................................................................................
training iteration: 200/10000
validation cost:  2.1516442111213756
------------------------------------------------------------------------------------------------------
====>.................................................................................................
training iteration: 300/10000
validation cost:  2.0970634081000257
------------------------------------------------------------------------------------------------------
=====>................................................................................................
training iteration: 400/

In [None]:
# accuracy of the dropout-trained network on the held-out test set, as a
# percentage; `predict` runs the forward pass without dropout
test_acc_dropout = predict(X=x_test.T, Y=y_test.T, parameters=trained_params_dropout) * 100
print("Dropout Final Test Accuracy: {:.3f}".format(test_acc_dropout))

Final Test Accuracy: 67.650
