In [90]:
import numpy as np

from keras.datasets import mnist
from keras.utils import to_categorical

In [91]:
def initialize_weights(shapes, outputs):
  """Initializes weights of model according to shape.

     Args:
       shapes = [784, 300, 10]
       outputs = ["relu", "sigmoid"]
     returns:
       model with uniform random weights [-1,+1], zero bias and output function
       [
        [random(784, 300), zeros(300), "relu"]
        [random(300, 10), zeros(10), "sigmoid"] 
      ]
  """
  models = []
  for i in range(len(shapes)):
    if i is 0:
      continue

    models.append([
      np.random.uniform(-1, 1, (shapes[i - 1], shapes[i])),
      np.zeros(shapes[i]).reshape(1, shapes[i]),
      outputs[i - 1]
    ])

  return models

In [92]:
def relu(x):
  """Computes relu of function."""
  return np.maximum(0, x)

def sigmoid(x):
  """Computes sigmoid of function."""
  return 1 / (1 + np.exp(-x))

def relu_derivative(z):
  """Computes derivative of relu of function."""
  z_copy = np.array(z, copy=True)
  z_copy[z_copy <= 0] = 0
  z_copy[z_copy > 0] = 1
  return z_copy

def sigmoid_derivative(z):
  """Computes derivative of sigmoid of function."""
  sig = sigmoid(z)
  return sig * (1 - sig)

In [93]:
def activate(z, funcName):
  """Activate the given input based on activation function name
  
     Args:
       z: input tensor of shape (B, Ni)
       funcName: string of supported activation function name
     Returns:
       tensor of shape (B, Ni) after applied activation function
  """
  supported_funcs = {
    "relu": relu,
    "sigmoid": sigmoid,
    "linear": lambda x: x
  }

  if funcName not in supported_funcs:
    raise Exception("Unsupported function")

  return supported_funcs[funcName](z)
  

def forward(x, model):
  """Performs forward pass of training step.

     Args:
       x: input tensor of shape (B, Ni)
       model: list of model weights (see initialize weights)
     Returns:
       List containing dictionary { "y": y, "z": z } for each layer of network.
  """     

  # Add input as first layer, which will be consumed in backward
  layer_output = [{"z": x, "y": x}]

  for w, b, activation_func in model:
    y_prev = layer_output[-1]["y"]
    
    z = np.dot(y_prev, w) + b
    y = activate(z, activation_func)
    
    layer_output.append({"z": z, "y": y})

  return layer_output

In [94]:
def predict(x, model):
  """Predicts the output of a model.

     Args:
       x: input tensor of shape (B, Ni)
       model: list of model weights (see initialize weights)
     Returns:
       Prediction of model, with the same shape as the labeled data (B, No).
  """
  fwd = forward(x, model)
  return fwd[-1]["y"]

In [95]:
def mse(y, p):
  """Computes Mean-Square Error between y and p.
     Args:
       y: labeled data of size (B, No) 
       p: predicted label of size (B, No)
     Returns:
       MSE of y-p
  """
  return np.mean(np.square(y - p))

def mse_derivative(y, p):
  """Computes derivative of Mean-Square Error between y and p.
     Args:
       y: labeled data of size (B, No) 
       p: predicted label of size (B, No)
     Returns:
       derivative of MSE = y-p
  """
  return p - y

In [96]:
def binary_crossentropy(y, p):
  """Computes binary crossentropy between y and p.
     Args:
       y: labeled data of size (B, No) 
       p: predicted label of size (B, No)
     Returns:
       BCE of (y, p) = mean(sum(y log(p) + (1-y) log(1-p))) 
  """
  return - np.mean(
        np.multiply(y, np.log(p)) + np.multiply((1 - y), np.log(1 - p)))

def binary_crossentropy_derivative(y, p):
  """Computes derivative of binary crossentropy between y and p.
     Args:
       y: labeled data of size (B, No) 
       p: predicted label of size (B, No)
     Returns:
       derivative of BCE of (y, p) = -[y / p - (1 - y) / (1 - p)]
  """
  return p - y, / np.multiply(p, 1 - p)

SyntaxError: invalid syntax (<ipython-input-96-57055c5f3436>, line 20)

In [97]:
def get_cost(y, p, funcName, is_last_layer_error=False):
  """Calculate lost based on cost function name.
  
     Args:
       y: input tensor of shape (B, Ni)
       p: label vector of shape (B, 1)
       funcName: string of supported lost function name
     Returns:
       tensor of shape (B, Ni) after applied cost function
  """
  supported_cost_funcs = {
    "mse": mse,
    "binary_crossentropy": binary_crossentropy,
  }
    
  supported_cost_funcs_derivative = {
    "mse": mse_derivative,
    "binary_crossentropy": binary_crossentropy_derivative,
  }

  # Should also appear in supported_cost_funcs_derivative 
  if funcName not in supported_cost_funcs:
    raise Exception("Unsupported function")

  func = supported_cost_funcs_derivative if is_last_layer_error else supported_cost_funcs

  return func[funcName](y, p)

def get_activation_derivative(z, funcName):
  """Computes derivative of activation function for computing dZ.
     Args:
       z: input tensor of shape (B, Ni)
       funcName: string of supported activation function name
     Returns:
       derivative of activation function
  """
  supported_funcs = {
    "relu": relu_derivative,
    "sigmoid": sigmoid_derivative,
    "linear": lambda _: 1
  }

  if funcName not in supported_funcs:
    raise Exception("Unsupported function")

  return supported_funcs[funcName](z)

def backward(y, x, models, loss):
  """Computes backward step of training.
     Args:
       y: labeled data of size (B, No) 
       x: input tensor of shape (B, Ni)
       model: list of model weights (see initialize weights)
       loss: one of ("mse", "binary_crossentropy")
     Returns:
       tuple with loss evaluation of (y, predict(x)) and list of dictionary
       containing { "dw": dw, "db": db } for each layer of network. Remember
       that shape of dw for each layer should be equal to shape of weight for
       the same layer.
  """
  y_predicted = x[-1]["y"]
  y = y.reshape(y_predicted.shape)
  b = y.shape[0] # number of training data

  cost = get_cost(y, y_predicted, loss)
  weights = []

  dY_prev = get_cost(y, y_predicted, loss, is_last_layer_error=True)

  for i, model in reversed(list(enumerate(models))):
    dY_curr = dY_prev

    Y_prev = x[i]["y"]
    Z_curr = x[i + 1]["z"]
    W_curr, _, activation_func = model
    
    dZ_curr = get_activation_derivative(Z_curr, activation_func) * dY_curr
    dW_curr = np.dot(Y_prev.T, dZ_curr) / b
    db_curr = np.sum(dZ_curr, axis=0, keepdims=True) / b
    dY_prev = np.dot(dZ_curr, W_curr.T)

    weights.insert(0, {"dw": dW_curr, "db": db_curr})

  return (cost, weights)

In [98]:
def update(weights, dweights, alpha):
  """Gradient descent for weights and biases."""
  for i in range(len(weights)):
    weights[i][0] += - alpha * dweights[i]["dw"]
    weights[i][1] += - alpha * dweights[i]["db"]