# Numerical Gradient Checking

We highly recommend reading `neural_networks.utils.grad_check.check_gradients` and making sure you understand how numerical gradient checking is carried out. This function is used throughout the notebook to check the gradients of the neural network layers you write. Make sure to check the gradient of each layer as soon as you finish its implementation.

The function returns the relative error of the numerical gradient (approximated using finite differences) with respect to the analytical gradient (computed via backpropagation). Correct implementations should get very small errors, usually less than `1e-8` for 64-bit float matrices (the default).

In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
from neural_networks.utils.grad_check import check_gradients
from neural_networks.layers import FullyConnected
from neural_networks.activations import Identity, Sigmoid, TanH, ReLU, SoftMax

## Gradient Checks for Activation Functions

### Identity Activation

In [None]:
X = np.random.randn(2, 3)
dLdY = np.random.randn(2, 3)

# instantiate the identity activation and run a forward and a backward
# pass to obtain the analytical gradient w.r.t. the input X
identity_activation = Identity()
_ = identity_activation.forward(X)
grad = identity_activation.backward(X, dLdY)

# compare the analytical gradient against the finite-difference estimate
# (activations have no parameters, so the check is w.r.t. the input X)
print(
    "Relative error for identity activation:",
    check_gradients(
        fn=identity_activation.forward,  # the function we are checking
        grad=grad,  # the analytically computed gradient
        x=X,        # the variable w.r.t. which we are taking the gradient
        dLdf=dLdY,  # gradient of the loss w.r.t. the output of fn
    )
)

### Sigmoid Activation

In [None]:
X = np.random.randn(2, 3)
dLdY = np.random.randn(2, 3)

# instantiate the sigmoid activation and run a forward and a backward
# pass to obtain the analytical gradient w.r.t. the input X
sigmoid_activation = Sigmoid()
_ = sigmoid_activation.forward(X)
grad = sigmoid_activation.backward(X, dLdY)

# compare the analytical gradient against the finite-difference estimate
# (activations have no parameters, so the check is w.r.t. the input X)
print(
    "Relative error for sigmoid activation:",
    check_gradients(
        fn=sigmoid_activation.forward,  # the function we are checking
        grad=grad,  # the analytically computed gradient
        x=X,        # the variable w.r.t. which we are taking the gradient
        dLdf=dLdY,  # gradient of the loss w.r.t. the output of fn
    )
)

### Tanh Activation

In [None]:
X = np.random.randn(2, 3)
dLdY = np.random.randn(2, 3)

# instantiate the tanh activation and run a forward and a backward
# pass to obtain the analytical gradient w.r.t. the input X
tanh_activation = TanH()
_ = tanh_activation.forward(X)
grad = tanh_activation.backward(X, dLdY)

# compare the analytical gradient against the finite-difference estimate
# (activations have no parameters, so the check is w.r.t. the input X)
print(
    "Relative error for tanh activation:",
    check_gradients(
        fn=tanh_activation.forward,  # the function we are checking
        grad=grad,  # the analytically computed gradient
        x=X,        # the variable w.r.t. which we are taking the gradient
        dLdf=dLdY,  # gradient of the loss w.r.t. the output of fn
    )
)

### ReLU Activation

In [2]:
X = np.random.randn(2, 3)
dLdY = np.random.randn(2, 3)

# instantiate the ReLU activation and run a forward and a backward
# pass to obtain the analytical gradient w.r.t. the input X
relu_activation = ReLU()
out = relu_activation.forward(X)
grad = relu_activation.backward(X, dLdY)

# compare the analytical gradient against the finite-difference estimate
# (activations have no parameters, so the check is w.r.t. the input X)
print(
    "Relative error for relu activation:",
    check_gradients(
        fn=relu_activation.forward,  # the function we are checking
        grad=grad,  # the analytically computed gradient
        x=X,        # the variable w.r.t. which we are taking the gradient
        dLdf=dLdY,  # gradient of the loss w.r.t. the output of fn
    )
)

Relative error for relu activation: 1.434512569475495e-11


### Softmax Activation

In [56]:
X = np.random.randn(2, 3)
dLdY = np.random.randn(2, 3)

# instantiate the softmax activation and run a forward and a backward
# pass to obtain the analytical gradient w.r.t. the input X
softmax_activation = SoftMax()
_ = softmax_activation.forward(X)
grad = softmax_activation.backward(X, dLdY)

# compare the analytical gradient against the finite-difference estimate
# (activations have no parameters, so the check is w.r.t. the input X)
print(
    "Relative error for softmax activation:",
    check_gradients(
        fn=softmax_activation.forward,  # the function we are checking
        grad=grad,  # the analytically computed gradient
        x=X,        # the variable w.r.t. which we are taking the gradient
        dLdf=dLdY,  # gradient of the loss w.r.t. the output of fn
    )
)

Relative error for softmax activation: 4.081933724651398e-11


In [54]:
# Scratch arithmetic: dot product of two hard-coded 3-vectors.
# NOTE(review): the second vector sums to 1, so it is presumably a softmax
# output row copied from an earlier run — confirm, or remove this cell.
np.dot(np.array([-0.95770902, -1.13035283, -0.59303436]), np.array([0.39816798, 0.32755455, 0.27427747]))

-0.9142372423769254

In [9]:
# Scratch check: math.exp(1) evaluates to e.
# `math` is imported here rather than in the import cell at the top of the
# notebook; the hand-computed softmax cells that follow rely on this import.
import math
math.exp(1)

2.718281828459045

In [15]:
# Hand check: reciprocal of the softmax denominator for the scores (-5, -4, -3).
# Requires `math` from the scratch cell that imports it.
1/(math.exp(-5) + math.exp(-4) + math.exp(-3))

13.36172178003184

In [18]:
# Hand check: the literal is the recorded value of
# 1/(exp(-5) + exp(-4) + exp(-3)); multiplying it by exp(-3) gives the softmax
# probability of the score -3 among (-5, -4, -3).
13.36172178003184 * math.exp(-3)

0.6652409557748219

## Gradient Checks for Full Layers

### Fully Connected Layer

In [20]:
X = np.random.randn(2, 3)
dLdY = np.random.randn(2, 4)

# build a fully connected layer with 4 outputs and an identity activation,
# then run one forward and one backward pass before reading its gradients
fc_layer = FullyConnected(n_out=4, activation="identity")
_ = fc_layer.forward(X)
_ = fc_layer.backward(dLdY)

# gradient-check every parameter of the layer in turn
for param in fc_layer.parameters:
    rel_error = check_gradients(
        # the function we are checking, viewed as a function of `param`
        fn=fc_layer.forward_with_param(param, X),
        # the analytically computed gradient
        grad=fc_layer.gradients[param],
        # the variable w.r.t. which we are taking the gradient
        x=fc_layer.parameters[param],
        # gradient at previous layer
        dLdf=dLdY,
    )
    print(f"Relative error for {param}:", rel_error)

Relative error for W: 2.7367051042921758e-11
Relative error for b: 4.0608046578013836e-11


### Cross Entropy

In [27]:
from neural_networks.losses import CrossEntropy

num_pts = 5
num_classes = 6

# one-hot encoded y: one randomly chosen class per point
y_idxs = np.random.randint(0, num_classes, (num_pts,))
y = np.zeros((num_pts, num_classes))
y[range(num_pts), y_idxs] = 1

# normalized predictions: positive scores rescaled so that each row sums to 1
scores = np.random.uniform(0, 1, size=(num_pts, num_classes))
y_hat = scores / scores.sum(axis=1, keepdims=True)

cross_entropy_loss = CrossEntropy("cross_entropy")

def forward_fn(Y, Y_hat):
    """Return the loss as a function of the predictions only, with labels fixed.

    `check_gradients` is given a single variable `x`, so the labels `Y` are
    captured in a closure and the returned callable takes just the predictions.

    Parameters
    ----------
    Y : labels bound into the returned function.
    Y_hat : unused; it is shadowed by the inner function's own argument and is
        kept only so that existing two-argument calls keep working.

    Returns
    -------
    A callable mapping predictions to `cross_entropy_loss.forward(Y, predictions)`.
    """
    def inner_forward(Y_hat):
        return cross_entropy_loss.forward(Y, Y_hat)
    return inner_forward

loss = cross_entropy_loss.forward(y, y_hat)
grad = cross_entropy_loss.backward(y, y_hat)

print(
    "Relative error for cross entropy loss:",
    check_gradients(
        fn=forward_fn(y, y_hat),  # the function we are checking
        grad=grad,  # the analytically computed gradient
        x=y_hat,        # the variable w.r.t. which we are taking the gradient
        dLdf=1,  # the loss is the quantity being differentiated, so dL/df = 1
    )
)

Relative error for cross entropy loss: 4.8903201844751053e-11


In [22]:
# NOTE(review): `Y` is not assigned at top level in any cell visible in this
# notebook (it only appears as a parameter of `forward_fn`), so this cell
# presumably relies on leftover kernel state and would raise NameError after
# Restart & Run All — define `Y` explicitly or remove the cell.
Y

array([[0.09003057, 0.24472847, 0.66524096],
       [0.33333333, 0.33333333, 0.33333333]])

In [23]:
# Inspect the normalized predictions built in the cross-entropy cell
# (random scores divided by their row sums).
y_hat

array([[0.2497458 , 0.23554239, 0.21380396, 0.11193534, 0.1620443 ,
        0.02692821],
       [0.21489054, 0.10523732, 0.0917055 , 0.12118648, 0.2526063 ,
        0.21437386],
       [0.12360695, 0.33966647, 0.10014228, 0.27904106, 0.01956997,
        0.13797327],
       [0.40778411, 0.01751083, 0.01252566, 0.16680121, 0.27540759,
        0.1199706 ],
       [0.08691567, 0.26699982, 0.29851563, 0.01686418, 0.19137455,
        0.13933016]])

In [24]:
# Toy input for a hand check of the loss: two one-hot rows, both selecting class 0.
a = np.array([[1, 0, 0], [1, 0, 0]])

In [25]:
# Toy predictions holding the same values as `a`, i.e. a perfect prediction.
b = np.array([[1, 0, 0], [1, 0, 0]])

In [28]:
# Sanity check: the loss for predictions equal to the one-hot labels.
# The recorded result is -0.0, i.e. zero loss.
cross_entropy_loss.forward(a, b)

-0.0

In [29]:
# NOTE(review): this cell raises ValueError. `a` and `b` are both (2, 3), and
# np.dot on 2-D arrays is a matrix product, which needs a.shape[1] == b.shape[0].
# Use `a * b` for an elementwise product or `np.dot(a, b.T)` for a matrix
# product — or remove this scratch cell so the notebook runs end to end.
np.dot(a, b)

ValueError: shapes (2,3) and (2,3) not aligned: 3 (dim 1) != 2 (dim 0)