In [1]:
import timeit

from tinygrad import Device, nn, TinyJit, Tensor
from tinygrad.nn.datasets import mnist

from timestep.config import Settings

settings = Settings()
settings.model_dump()

{'app_dir': '/home/mjschock/.config/timestep',
 'bearerinfo_func': 'timestep.api.decode_token',
 'default_hf_repo_id': 'Mozilla/TinyLlama-1.1B-Chat-v1.0-llamafile',
 'default_llamafile_host': '0.0.0.0',
 'default_llamafile_port': 8080,
 'default_model_filename': 'TinyLlama-1.1B-Chat-v1.0.F16.llamafile',
 'default_multimodal_model_projector_filename': None,
 'openai_api_key': SecretStr('**********'),
 'openai_base_url': 'http://localhost:8000/api/openai/v1',
 'openai_org_id': 'organization_id',
 'openai_project_id': 'project_id',
 'poetry_repositories_testpypi_url': 'https://test.pypi.org/legacy/',
 'poetry_virtualenvs_in_project': True,
 'poetry_virtualenvs_prefer_active_python': True,
 'prefect_api_url': 'http://127.0.0.1:4200/api',
 'prefect_logging_level': 'INFO',
 'prefect_logging_log_prints': True,
 'pyenv_version': '3.10.14',
 'verbose': True}

## MNIST Training

In [2]:
print(Device.DEFAULT)

CUDA


In [3]:
class Model:
  def __init__(self):
    self.l1 = nn.Conv2d(1, 32, kernel_size=(3,3))
    self.l2 = nn.Conv2d(32, 64, kernel_size=(3,3))
    self.l3 = nn.Linear(1600, 10)

  def __call__(self, x:Tensor) -> Tensor:
    x = self.l1(x).relu().max_pool2d((2,2))
    x = self.l2(x).relu().max_pool2d((2,2))

    return self.l3(x.flatten(1).dropout(0.5))

In [4]:
X_train, Y_train, X_test, Y_test = mnist()
X_train.shape, Y_train.shape, X_test.shape, Y_test.shape

((60000, 1, 28, 28), (60000,), (10000, 1, 28, 28), (10000,))

In [5]:
model = Model()
acc = (model(X_test).argmax(axis=1) == Y_test).mean()

# NOTE: tinygrad is lazy, and hasn't actually run anything by this point
# print(acc.item())  # ~10% accuracy, as expected from a random model

print(f"Accuracy: {acc.item():.2%}")

Accuracy: 9.67%


In [6]:
optim = nn.optim.Adam(nn.state.get_parameters(model))
batch_size = 128

def step():
  Tensor.training = True  # makes dropout work
  samples = Tensor.randint(batch_size, high=X_train.shape[0])
  X, Y = X_train[samples], Y_train[samples]
  optim.zero_grad()
  loss = model(X).sparse_categorical_crossentropy(Y).backward()
  optim.step()

  return loss

In [7]:
times: list[float] = timeit.repeat(step, repeat=5, number=1)
times_avg = sum(times) / len(times)

print(f"Time per step: {times_avg:.3f} seconds")

times if settings.verbose else None

Time per step: 0.696 seconds


[2.815063425999938,
 0.48530614800006333,
 0.06252777600002446,
 0.059415125999976226,
 0.057024180999974305]

In [8]:
jit_step = TinyJit(step)

jit_times: list[float] = timeit.repeat(jit_step, repeat=5, number=1)
jit_avg_time = sum(jit_times) / len(jit_times)

assert jit_avg_time < times_avg, "JIT should be faster"

print(f"Time per JIT step: {jit_avg_time:.3f} seconds ({times_avg / jit_avg_time:.1f}x faster)")

times if settings.verbose else None

Time per JIT step: 0.045 seconds (15.4x faster)


[2.815063425999938,
 0.48530614800006333,
 0.06252777600002446,
 0.059415125999976226,
 0.057024180999974305]

In [9]:
for step in range(7000):
  loss = jit_step()

  if step%100 == 0:
    Tensor.training = False
    acc = (model(X_test).argmax(axis=1) == Y_test).mean().item()
    print(f"step {step:4d}, loss {loss.item():.2f}, acc {acc*100.:.2f}%")

    if acc > 0.92:
      break

step    0, loss 2.41, acc 73.02%
step  100, loss 0.29, acc 96.06%
