In [1]:
from dataclasses import replace
import numpy as np
from crater import Tensor, Gradients
from crater.premade import Classifier, Layer, BatchNormalization
from cifar10.data import load_batch, make_normalizer, vector_to_image

from tqdm.auto import trange

In [2]:
# Load one data batch and keep only the first 20 feature columns, which matches
# the first entry of the dims later passed to Classifier.from_dims.
# NOTE(review): presumably truncated so the element-by-element numeric gradient
# check stays cheap — confirm.
data = load_batch("../data/data_batch_1")
data["features"] = data["features"][:, :20]
data.keys()

dict_keys(['features', 'labels'])

In [3]:
# Network under test, built from dims [20, 20, 20, 20, 10]; inputs go through a
# normalizer derived from the loaded features.
classifier = Classifier.from_dims(
    [20, 20, 20, 20, 10],
    normalize=make_normalizer(data["features"]),
)

# Regularization strength handed to both the loss and the gradient computation.
regularization = 0

# Raw parameter arrays grouped by kind. Each key is also the name of the
# matching keyword argument of compute_loss. The last entry of
# classifier.batch_normalizations is skipped for shifts and scales.
arrays = {
    "weights": [dense.weights.data for dense in classifier.layers],
    "biases": [dense.biases.data for dense in classifier.layers],
    "shifts": [bn.shift.data for bn in classifier.batch_normalizations[:-1]],
    "scales": [bn.scale.data for bn in classifier.batch_normalizations[:-1]],
}

def compute_loss(weights=arrays["weights"], biases=arrays["biases"], shifts=arrays["shifts"], scales=arrays["scales"]):
    """Rebuild a classifier from raw parameter arrays and return its loss on ``data``.

    Every argument is a list of numpy arrays, one per layer (``weights``,
    ``biases``) or per batch normalization (``shifts``, ``scales``). The
    defaults are the lists stored in ``arrays`` at the moment this function
    was defined, so calling with a single keyword swaps out just that group.

    The rebuilt model reuses ``classifier.normalize`` and is evaluated with the
    module-level ``regularization``. Returns the ``.data`` of the loss.
    """
    # One fresh BatchNormalization per (shift, scale) pair, with zero mean and
    # unit variance; the trailing None pairs with the last layer.
    batch_norms = []
    for shift, scale in zip(shifts, scales):
        batch_norms.append(
            BatchNormalization(
                mean=np.zeros(shift.shape),
                variance=np.ones(scale.shape),
                persistence=0.9,
                shift=shift,
                scale=scale,
            )
        )
    batch_norms.append(None)

    layers = []
    for w, b, batch_norm in zip(weights, biases, batch_norms):
        layers.append(
            Layer(
                weights=Tensor.from_numpy(w),
                biases=Tensor.from_numpy(b),
                activation=Classifier._make_activation(batch_norm),
            )
        )

    model = Classifier(
        layers=layers,
        batch_normalizations=batch_norms,
        normalize=classifier.normalize,
    )
    loss = model.loss(data, regularization=regularization)
    return loss.data

# Sanity check: print the loss for the default parameter arrays while
# BatchNormalization is switched to "test" mode.
with BatchNormalization.mode("test"):
    print(compute_loss())

2.4443435765147163


In [4]:
def estimate_numeric_gradients(keyword, arrays, h=1e-5):
    """Estimate the gradient of ``compute_loss`` by central finite differences.

    Parameters
    ----------
    keyword : str
        Name of the ``compute_loss`` keyword argument that receives the
        perturbed arrays (e.g. ``"weights"``).
    arrays : list of np.ndarray
        Parameter arrays passed to ``compute_loss`` under ``keyword``. They are
        copied before every perturbation and never modified.
    h : float
        Step added to / subtracted from one element at a time.

    Returns
    -------
    list of np.ndarray
        One array per input array, with the same shape, holding
        ``(loss(x + h) - loss(x - h)) / (2 * h)`` for each element ``x``.
    """
    result = [np.zeros_like(array) for array in arrays]
    for j in trange(len(arrays)):
        # np.ndindex walks every index tuple of the array, including the single
        # empty tuple of a 0-d array.
        for idx in np.ndindex(arrays[j].shape):
            attempt = [array.copy() for array in arrays]
            attempt[j][idx] -= h
            c1 = compute_loss(**{keyword: attempt})

            attempt = [array.copy() for array in arrays]
            attempt[j][idx] += h
            c2 = compute_loss(**{keyword: attempt})

            result[j][idx] = (c2 - c1) / (2 * h)
    return result


# The helper above deliberately has a different name from the
# `numeric_gradients` dict built here: rebinding the function's own name to the
# dict would make this cell fail with "'dict' object is not callable" when run
# a second time.
with BatchNormalization.mode("test"):
    numeric_gradients = {
        keyword: estimate_numeric_gradients(keyword, parameter_arrays)
        for keyword, parameter_arrays in arrays.items()
    }
    analytical_gradients = classifier.gradients(data, regularization=regularization)

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
def relative_error(a, b, eps=1e-6):
    """Summarise the element-wise relative difference between ``a`` and ``b``.

    Each element is scored as ``|a - b| / max(eps, |a| + |b|)``; the ``eps``
    floor keeps the ratio finite where both inputs are zero or tiny.

    Returns a dict with the ``mean`` and the ``max`` of those scores.
    """
    difference = np.abs(a - b)
    magnitude = np.abs(a) + np.abs(b)
    ratios = difference / np.maximum(eps, magnitude)
    return {"mean": np.mean(ratios), "max": np.max(ratios)}

# Per-layer relative error between the numeric and the analytical gradients of
# the weights (one dict of mean/max per layer).
[
    relative_error(numeric, analytical_gradients[layer.weights])
    for numeric, layer in zip(numeric_gradients["weights"], classifier.layers)
]

[{'mean': 1.5877386601338356e-05, 'max': 0.0006999944024035659},
 {'mean': 3.5747771864188587e-06, 'max': 0.0002312662656286539},
 {'mean': 8.193142753863375e-05, 'max': 0.007720698674401974},
 {'mean': 1.36996441370411e-06, 'max': 3.533130105981164e-05}]

In [6]:
# Per-layer relative error between the numeric and the analytical gradients of
# the biases (one dict of mean/max per layer).
[
    relative_error(numeric, analytical_gradients[layer.biases])
    for numeric, layer in zip(numeric_gradients["biases"], classifier.layers)
]

[{'mean': 0.0002759063443307716, 'max': 0.003990483975207583},
 {'mean': 5.845346271049524e-05, 'max': 0.0008270757042837316},
 {'mean': 0.0001499878319032988, 'max': 0.0019210507814969061},
 {'mean': 2.317086649516884e-10, 'max': 4.4169532397067386e-10}]

In [7]:
# Relative error between the numeric and the analytical gradients of each
# batch-normalization shift. The last entry of classifier.batch_normalizations
# is excluded explicitly, the same way the "shifts" arrays were collected,
# instead of relying on zip() stopping at the shorter input.
# NOTE(review): the recorded output of this cell is digit-for-digit identical
# to the first three rows of the bias-error cell — confirm this is expected.
[
    relative_error(numeric, analytical_gradients[norm.shift])
    for numeric, norm in zip(numeric_gradients["shifts"], classifier.batch_normalizations[:-1])
]

[{'mean': 0.0002759063443307716, 'max': 0.003990483975207583},
 {'mean': 5.845346271049524e-05, 'max': 0.0008270757042837316},
 {'mean': 0.0001499878319032988, 'max': 0.0019210507814969061}]

In [8]:
# Relative error between the numeric and the analytical gradients of each
# batch-normalization scale. The last entry of classifier.batch_normalizations
# is excluded explicitly, the same way the "scales" arrays were collected,
# instead of relying on zip() stopping at the shorter input.
[
    relative_error(numeric, analytical_gradients[norm.scale])
    for numeric, norm in zip(numeric_gradients["scales"], classifier.batch_normalizations[:-1])
]

[{'mean': 3.1504980157057612e-06, 'max': 2.192519498731107e-05},
 {'mean': 3.958553650560632e-06, 'max': 3.704633743367314e-05},
 {'mean': 2.2492458787036813e-06, 'max': 2.8993886798153917e-05}]