In [1]:
from dataclasses import replace
import numpy as np
from crater import Tensor, Layer, Gradients
from cifar10 import Classifier
from cifar10.data import load_batch, make_normalizer, vector_to_image

from tqdm.auto import trange

In [2]:
# Load the first CIFAR-10 batch; the result is a dict with "features" and "labels".
data = load_batch("../data/data_batch_1")
# Keep only the first 20 feature columns (presumably so the slow numerical
# gradient check further down stays tractable -- TODO confirm the intent).
data["features"] = data["features"][:, :20]
# Display the available keys as the cell output.
data.keys()

dict_keys(['features', 'labels'])

In [3]:
# Classifier built from dims [20, 20, 10] (presumably input, hidden and output
# sizes -- the first entry matches the 20 retained feature columns), with a
# normalizer derived from the truncated features.
classifier = Classifier.from_dims(
    [20, 20, 10],
    normalize=make_normalizer(data["features"]),
)

# The gradient check is run without a regularization term.
regularization = 0

# Collect each layer's `.data` for weights and biases, in layer order, so they
# can be perturbed by the numerical gradient routine.
weights_list, biases_list = [], []
for layer in classifier.layers:
    weights_list.append(layer.weights.data)
    biases_list.append(layer.biases.data)

def compute_loss(weights, biases):
    """
    Rebuild a classifier from raw parameter arrays and return its loss.

    Reads the module-level ``classifier`` (for its normalizer), ``data`` and
    ``regularization``.

    :param weights: iterable of per-layer weight arrays, in layer order
    :param biases: iterable of per-layer bias arrays, in layer order
    :return: the ``.data`` of the loss computed on ``data``
    """
    def clip_below_zero(tensor):
        # Same activation for every rebuilt layer: clamp values at 0 from below.
        return tensor.clip(low=0)

    rebuilt_layers = []
    for layer_weights, layer_biases in zip(weights, biases):
        rebuilt_layers.append(
            Layer(
                weights=Tensor.from_numpy(layer_weights),
                biases=Tensor.from_numpy(layer_biases),
                activation=clip_below_zero,
            )
        )

    rebuilt = Classifier(layers=rebuilt_layers, normalize=classifier.normalize)
    loss = rebuilt.loss(data, regularization=regularization)
    return loss.data

# Sanity check: evaluate the loss with the classifier's own parameters fed back in as raw arrays.
compute_loss(weights_list, biases_list)

2.3665220054465768

In [4]:
# adapted from the code given in the assignment
def ComputeGradsNumSlow(W, b, h):
    """
    Compute gradients numerically via the centred difference method.

    Every scalar entry of every array in ``W`` and ``b`` is perturbed by ``-h``
    and ``+h`` in turn and the loss is re-evaluated with ``compute_loss`` each
    time, i.e. two loss evaluations per parameter. The inputs are not modified.

    :param W: list of numpy arrays, containing (in order) the weights of the first layer, then the second, etc.
    :param b: list of numpy arrays, containing (in order) the biases of the first layer, then the second, etc.
    :param h: float, step size for numerical analysis
    :return: list of numpy arrays grad_W, grad_b, of same format as W and b, respectively
    """
    grad_W = [np.zeros(w_n.shape) for w_n in W]
    grad_b = [np.zeros(b_n.shape) for b_n in b]

    def loss_for_biases(b_try):
        return compute_loss(W, b_try)

    def loss_for_weights(W_try):
        return compute_loss(W_try, b)

    for j in trange(len(grad_b)):
        # np.ndindex visits every entry in row-major order whatever the rank,
        # so (n,), (n, 1) and wider bias arrays are all handled per element.
        for index in np.ndindex(*grad_b[j].shape):
            grad_b[j][index] = _centred_difference(loss_for_biases, b, j, index, h)

    for j in trange(len(grad_W)):
        for index in np.ndindex(*grad_W[j].shape):
            grad_W[j][index] = _centred_difference(loss_for_weights, W, j, index, h)

    return grad_W, grad_b


def _centred_difference(loss_of, arrays, j, index, h):
    """
    Centred-difference estimate of the loss derivative w.r.t. ``arrays[j][index]``.

    :param loss_of: callable taking a list of arrays shaped like ``arrays`` and returning the loss
    :param arrays: list of numpy arrays (left unmodified)
    :param j: position in ``arrays`` of the array to perturb
    :param index: index tuple of the entry to perturb inside ``arrays[j]``
    :param h: float, step size
    :return: (loss(+h) - loss(-h)) / (2*h)
    """
    losses = []
    for step in (-h, h):
        # Only the perturbed array needs a private copy; the rest are reused as-is.
        perturbed = list(arrays)
        perturbed[j] = arrays[j].copy()
        perturbed[j][index] += step
        losses.append(loss_of(perturbed))
    loss_minus, loss_plus = losses
    return (loss_plus - loss_minus) / (2 * h)


# Numerical estimate of the gradients (two loss evaluations per parameter, hence slow) ...
numeric_gradients_w, numeric_gradients_b = ComputeGradsNumSlow(weights_list, biases_list, h=1e-5)
# ... versus the gradients reported by the classifier itself on the same data and regularization.
analytical_gradients = classifier.gradients(data, regularization=regularization)

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
def relative_error(a, b, eps=1e-6):
    """
    Largest element-wise relative difference between two arrays.

    Each element's error is ``|a - b| / max(eps, |a| + |b|)``; the ``eps`` floor
    keeps the ratio finite when both entries are (near) zero.

    :param a: numpy array
    :param b: numpy array compatible with ``a`` under element-wise arithmetic
    :param eps: lower bound applied to the denominator
    :return: the maximum relative error over all elements
    """
    difference = np.abs(a - b)
    magnitude = np.abs(a) + np.abs(b)
    scale = np.maximum(eps, magnitude)
    return np.max(difference / scale)

# Maximum relative error between numerical and analytical weight gradients, one entry per layer.
weight_gradient_errors = [
    relative_error(numeric_w, analytical_gradients[layer.weights])
    for numeric_w, layer in zip(numeric_gradients_w, classifier.layers)
]
weight_gradient_errors

[0.00040923705736977664, 0.0008832403949661394]

In [6]:
# Maximum relative error between numerical and analytical bias gradients, one entry per layer.
bias_gradient_errors = [
    relative_error(numeric_b, analytical_gradients[layer.biases])
    for numeric_b, layer in zip(numeric_gradients_b, classifier.layers)
]
bias_gradient_errors

[0.0012928717386260142, 0.00034767456762201777]