## Playground for tinygrad
https://github.com/geohot/tinygrad/blob/91a352a8e2697828a4b1eafa2bdc1a9a3b7deffa/tinygrad/tensor.py

In [2]:
def fetch_mnist(cache_dir="/tmp"):
    """Load the MNIST dataset as uint8 numpy arrays, downloading it if needed.

    Each gzip archive is cached in ``cache_dir`` under the MD5 hex digest of
    its URL, so later calls read from disk instead of the network.

    Args:
        cache_dir: Directory that holds the cached archives. Defaults to
            ``"/tmp"``, the location that was previously hard-coded.

    Returns:
        A tuple ``(X_train, Y_train, X_test, Y_test)``. Image arrays are
        reshaped to ``(-1, 28, 28)``; label arrays are one-dimensional.

    Raises:
        requests.HTTPError: If a download answers with an error status.
    """
    import gzip
    import hashlib
    import os
    import numpy

    def fetch(url):
        fp = os.path.join(cache_dir, hashlib.md5(url.encode('utf-8')).hexdigest())
        if os.path.isfile(fp):
            with open(fp, "rb") as f:
                dat = f.read()
        else:
            # Imported lazily so a warm cache works even without `requests`.
            import requests
            response = requests.get(url)
            # Fail here rather than caching (and later decompressing) an
            # HTTP error page.
            response.raise_for_status()
            dat = response.content
            # Download first, then publish the file with an atomic rename:
            # a failed or interrupted download must never leave behind a
            # truncated file that later calls would treat as a valid cache.
            tmp = "%s.%d.tmp" % (fp, os.getpid())
            with open(tmp, "wb") as f:
                f.write(dat)
            os.replace(tmp, fp)
        return numpy.frombuffer(gzip.decompress(dat), dtype=numpy.uint8).copy()

    # The leading 16 bytes (images) / 8 bytes (labels) are skipped as header.
    X_train = fetch("http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz")[0x10:].reshape((-1, 28, 28))
    Y_train = fetch("http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz")[8:]
    X_test = fetch("http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz")[0x10:].reshape((-1, 28, 28))
    Y_test = fetch("http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz")[8:]
    return X_train, Y_train, X_test, Y_test

In [3]:
# inspired by https://github.com/karpathy/micrograd/blob/master/micrograd/engine.py
from functools import partialmethod
import numpy as np


# **** start with two base classes ****

class Tensor:
    """A numpy array together with the state needed for reverse-mode autograd."""

    def __init__(self, data):
        """Wrap ``data`` in a Tensor.

        Args:
            data: The values to wrap; must be a ``numpy.ndarray`` (subclasses
                are accepted).

        Raises:
            AssertionError: If ``data`` is not a ``numpy.ndarray``.
        """
        if not isinstance(data, np.ndarray):
            # Raised explicitly rather than via `assert` so the check is not
            # stripped under `python -O`; `(data,)` keeps the formatting
            # valid when `data` itself is a tuple.
            raise AssertionError("error constructing tensor with %r" % (data,))
        self.data = data
        # gradient with respect to this tensor, set by backward()
        self.grad = None

        # internal variables used for autograd graph construction
        self._ctx = None

    def __str__(self):
        return "Tensor %r with grad %r" % (self.data, self.grad)

    def _topological_order(self):
        """Return the tensors reachable from self, every parent before its consumers."""
        order = []
        visited = set()
        # Iterative post-order DFS; the boolean marks "parents already pushed".
        stack = [(self, False)]
        while stack:
            node, expanded = stack.pop()
            if expanded:
                order.append(node)
                continue
            if id(node) in visited:
                continue
            visited.add(id(node))
            stack.append((node, True))
            if node._ctx is not None:
                for parent in node._ctx.parents:
                    if id(parent) not in visited:
                        stack.append((parent, False))
        return order

    def backward(self, allow_fill=True):
        """Propagate gradients from this tensor to everything it was computed from.

        Tensors are processed in reverse topological order, and a tensor
        that feeds several operations (or the same operation twice) receives
        the sum of all its contributions. Gradients are overwritten at the
        start of each pass, not accumulated across calls.

        Args:
            allow_fill: When true and ``self.grad`` is ``None``, seed the
                gradient with ones ("implicit gradient creation"); this
                requires a single-element tensor.

        Raises:
            AssertionError: If no gradient is available, if implicit creation
                is requested for a tensor with more than one element, or if
                an operation returns a gradient whose shape differs from its
                input's shape.
        """
        if self._ctx is None:
            return

        if self.grad is None and allow_fill:
            # fill in the first grad with one
            # this is "implicit gradient creation"
            if self.data.size != 1:
                raise AssertionError(
                    "implicit gradient creation needs a single-element tensor")
            self.grad = np.ones_like(self.data)

        if self.grad is None:
            raise AssertionError("backward called on a tensor without a gradient")

        # ids of tensors whose grad has already been written during this pass
        touched = set()
        for node in reversed(self._topological_order()):
            ctx = node._ctx
            if ctx is None:
                continue
            grads = ctx.backward(ctx, node.grad)
            if len(ctx.parents) == 1:
                grads = [grads]
            for t, g in zip(ctx.parents, grads):
                if g.shape != t.data.shape:
                    raise AssertionError(
                        "grad shape must match tensor shape in %r, %r != %r" %
                        (ctx, g.shape, t.data.shape))
                if id(t) in touched:
                    # Out-of-place add: `g` may alias another tensor's grad
                    # (e.g. an op that returns grad_output unchanged).
                    t.grad = t.grad + g
                else:
                    t.grad = g
                    touched.add(id(t))

    def mean(self):
        """Return the mean of all elements as a one-element Tensor.

        Computed as ``sum() * (1 / size)``; relies on the ``sum`` and ``mul``
        methods, which are attached to Tensor outside this class body.
        """
        div = Tensor(np.array([1 / self.data.size]))
        return self.sum().mul(div)


# An instantiation of the Function is the Context
class Function:
    """Base class for operations; each instance acts as a per-call context.

    The instance remembers the tensors the operation was applied to
    (``parents``) and anything ``forward`` stored for use in ``backward``
    (``saved_tensors``).
    """

    def __init__(self, *tensors):
        self.saved_tensors = []
        self.parents = tensors

    def save_for_backward(self, *x):
        """Append every value in ``x`` to ``saved_tensors``."""
        self.saved_tensors += x

    # note that due to how partialmethod works, self and arg are switched:
    # `self` is the first input tensor and `arg` is the Function subclass
    def apply(self, arg, *x):
        """Run ``arg.forward`` on the inputs' data and return the result Tensor.

        The returned Tensor has its ``_ctx`` set to the freshly created
        context, which links it back to its inputs.
        """
        inputs = (self,) + x
        ctx = arg(*inputs)
        arrays = [tensor.data for tensor in inputs]
        ret = Tensor(arg.forward(ctx, *arrays))
        ret._ctx = ctx
        return ret


def register(name, fxn):
    """Attach ``fxn`` to Tensor as a method called ``name``.

    The method is ``fxn.apply`` with ``fxn`` pre-bound, so calling
    ``tensor.<name>(*others)`` invokes ``fxn.apply(tensor, fxn, *others)``.
    """
    method = partialmethod(fxn.apply, fxn)
    setattr(Tensor, name, method)


# **** implement a few functions ****

class Mul(Function):
    """Product of two arrays using numpy's ``*`` operator."""

    @staticmethod
    def forward(ctx, x, y):
        # both operands are needed to compute the gradients later
        ctx.save_for_backward(x, y)
        product = x * y
        return product

    @staticmethod
    def backward(ctx, grad_output):
        lhs, rhs = ctx.saved_tensors
        # d(lhs*rhs)/d(lhs) = rhs and d(lhs*rhs)/d(rhs) = lhs
        grad_lhs = rhs * grad_output
        grad_rhs = lhs * grad_output
        return grad_lhs, grad_rhs


register('mul', Mul)


class Add(Function):
    """Sum of two arrays using numpy's ``+`` operator."""

    @staticmethod
    def forward(ctx, x, y):
        total = x + y
        return total

    @staticmethod
    def backward(ctx, grad_output):
        # the incoming gradient passes through unchanged to both inputs
        return (grad_output,) * 2


register('add', Add)


class ReLU(Function):
    """Rectified linear unit: ``max(input, 0)`` element-wise."""

    @staticmethod
    def forward(ctx, input):
        ctx.save_for_backward(input)
        return np.maximum(input, 0)

    @staticmethod
    def backward(ctx, grad_output):
        (activations,) = ctx.saved_tensors
        negative = activations < 0
        # work on a copy so the caller's gradient array is left untouched
        masked = grad_output.copy()
        masked[negative] = 0
        return masked


register('relu', ReLU)


class Dot(Function):
    """Matrix product ``input.dot(weight)``."""

    @staticmethod
    def forward(ctx, input, weight):
        ctx.save_for_backward(input, weight)
        product = input.dot(weight)
        return product

    @staticmethod
    def backward(ctx, grad_output):
        lhs, rhs = ctx.saved_tensors
        grad_wrt_input = grad_output.dot(rhs.T)
        # (grad_output^T . input)^T, kept in the original evaluation order
        grad_wrt_weight = grad_output.T.dot(lhs).T
        return grad_wrt_input, grad_wrt_weight


register('dot', Dot)


class Sum(Function):
    """Sum of all elements, returned as a one-element array."""

    @staticmethod
    def forward(ctx, input):
        ctx.save_for_backward(input)
        total = input.sum()
        return np.array([total])

    @staticmethod
    def backward(ctx, grad_output):
        (source,) = ctx.saved_tensors
        # every input element receives the same upstream gradient
        ones = np.ones_like(source)
        return grad_output * ones


register('sum', Sum)


class LogSoftmax(Function):
    """Log-softmax along axis 1."""

    @staticmethod
    def forward(ctx, input):
        # Subtract the per-row maximum before exponentiating; the naive form
        # np.log(np.exp(x).sum(axis=1)) is what this replaces.
        row_max = input.max(axis=1)
        shifted = input - row_max.reshape((-1, 1))
        log_norm = row_max + np.log(np.exp(shifted).sum(axis=1))

        output = input - log_norm.reshape((-1, 1))
        ctx.save_for_backward(output)
        return output

    @staticmethod
    def backward(ctx, grad_output):
        (log_probs,) = ctx.saved_tensors
        probs = np.exp(log_probs)
        row_totals = grad_output.sum(axis=1).reshape((-1, 1))
        return grad_output - probs * row_totals


register('logsoftmax', LogSoftmax)


In [4]:
import numpy as np


def layer_init(m, h):
    """Return an ``(m, h)`` float32 array of random initial weights.

    Values are drawn uniformly from [-1, 1) and divided by ``sqrt(m * h)``.
    """
    scale = np.sqrt(m * h)
    samples = np.random.uniform(-1., 1., size=(m, h))
    return (samples / scale).astype(np.float32)


class SGD:
    """Plain stochastic gradient descent over a fixed list of tensors."""

    def __init__(self, tensors, lr):
        self.lr = lr
        self.tensors = tensors

    def step(self):
        """Subtract ``lr * grad`` from each tensor's data, in place."""
        for tensor in self.tensors:
            update = self.lr * tensor.grad
            tensor.data -= update

In [5]:
#!/usr/bin/env python
# import numpy as np
# from tinygrad.tensor import Tensor
# from tinygrad.nn import layer_init, SGD
# from tinygrad.utils import fetch_mnist

from tqdm import trange

# Load the MNIST dataset: train/test images and labels as numpy arrays.
# fetch_mnist() caches the downloaded archives on disk, so only the first
# run needs network access.

X_train, Y_train, X_test, Y_test = fetch_mnist()


# train a model

class TinyBobNet:
    """Two-layer network: 784 -> 128 -> 10 with a ReLU in between."""

    def __init__(self):
        first = layer_init(784, 128)
        second = layer_init(128, 10)
        self.l1 = Tensor(first)
        self.l2 = Tensor(second)

    def forward(self, x):
        """Return ``logsoftmax(relu(x . l1) . l2)`` for the input tensor ``x``."""
        hidden = x.dot(self.l1).relu()
        scores = hidden.dot(self.l2)
        return scores.logsoftmax()


# optimizer


model = TinyBobNet()
optim = SGD([model.l1, model.l2], lr=0.01)

BS = 128  # mini-batch size
losses, accuracies = [], []
for i in (t := trange(1000)):
    # draw a random mini-batch of indices (sampled with replacement)
    samp = np.random.randint(0, X_train.shape[0], size=(BS))

    x = Tensor(X_train[samp].reshape((-1, 28 * 28)))
    Y = Y_train[samp]
    # targets: -1 at the true class, 0 elsewhere, so outs * y keeps only
    # the negated output at each sample's label
    y = np.zeros((len(samp), 10), np.float32)
    y[range(y.shape[0]), Y] = -1.0
    y = Tensor(y)

    # network
    outs = model.forward(x)

    # NLL loss function
    loss = outs.mul(y).mean()
    loss.backward()
    optim.step()

    cat = np.argmax(outs.data, axis=1)
    accuracy = (cat == Y).mean()

    # printing
    loss = loss.data
    losses.append(loss)
    accuracies.append(accuracy)
    # `loss` is a one-element array; extract the scalar explicitly because
    # implicit conversion of a non-0-d array to a scalar is deprecated
    # since NumPy 1.25.
    t.set_description("loss %.2f accuracy %.2f" % (loss.item(), accuracy))


# evaluate
def numpy_eval():
    """Return the fraction of test images whose argmax prediction matches the label."""
    flat_images = X_test.reshape((-1, 28 * 28))
    scores = model.forward(Tensor(flat_images))
    predicted = np.argmax(scores.data, axis=1)
    hits = Y_test == predicted
    return hits.mean()


# Score the trained model on the test set and report the result; the assert
# makes the run fail if accuracy does not exceed 95%.
accuracy = numpy_eval()
print("test set accuracy is %f" % accuracy)
assert accuracy > 0.95

loss 0.02 accuracy 0.95: 100%|██████████| 1000/1000 [00:07<00:00, 135.49it/s]


test set accuracy is 0.962500


In [6]:
model.l1.data

array([[ 1.97946676e-03,  1.26960746e-04,  2.38204352e-03, ...,
         1.62725593e-03, -1.91289769e-03,  5.89614210e-04],
       [-1.50905410e-03, -8.47915944e-04, -1.29279113e-04, ...,
         1.60180428e-03, -1.19924149e-03, -3.13649699e-03],
       [ 2.91572395e-03, -1.64959207e-03,  1.55070925e-03, ...,
         3.03992396e-03,  2.81632203e-03,  2.72833928e-03],
       ...,
       [ 2.09908793e-03,  2.37899576e-03, -1.70894340e-03, ...,
         9.52412665e-04,  1.39679445e-03, -1.39341236e-03],
       [ 2.95553752e-03,  1.46551232e-03, -2.74204922e-05, ...,
        -1.86793460e-03, -2.42592511e-03, -1.90780370e-03],
       [-1.18676934e-03, -2.62226723e-03, -1.19277436e-04, ...,
         1.63433055e-04, -1.38105033e-03,  3.04675312e-03]], dtype=float32)