In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from tinygrad import Tensor, nn, TinyJit
import numpy as np
import random
import math

In [3]:
def build_dataset():
  data = []
  for i in range(100):
    for j in range(100):
      s = i + j
      data.append([i // 10, i % 10, j // 10, j % 10, s // 100, (s // 10) % 10, s % 10])
  random.shuffle(data)
  data = Tensor(data)
  X_train = data[:8000, :-1]
  Y_train = data[:8000, 1:]
  X_test = data[8000:, :-1]
  Y_test = data[8000:, 1:]
  return X_train, Y_train, X_test, Y_test

In [4]:
X_train, Y_train, X_test, Y_test = build_dataset()

X_train, Y_train, X_test, Y_test

(<Tensor <LB METAL (8000, 6) int ShapeTracker(views=(View(shape=(8000, 6), strides=(7, 1), offset=0, mask=None, contiguous=False),))> on METAL with grad None>,
 <Tensor <LB METAL (8000, 6) int ShapeTracker(views=(View(shape=(8000, 6), strides=(7, 1), offset=1, mask=None, contiguous=False),))> on METAL with grad None>,
 <Tensor <LB METAL (2000, 6) int ShapeTracker(views=(View(shape=(2000, 6), strides=(7, 1), offset=56000, mask=None, contiguous=False),))> on METAL with grad None>,
 <Tensor <LB METAL (2000, 6) int ShapeTracker(views=(View(shape=(2000, 6), strides=(7, 1), offset=56001, mask=None, contiguous=False),))> on METAL with grad None>)

In [5]:
class Attention:
  def __init__(self, n_heads, embed_size) -> None:
    self.n_heads = n_heads
    self.internal_embed_size = embed_size * 2
    bound = 1 / math.sqrt(self.internal_embed_size)
    self.keys = Tensor.uniform(
      n_heads, embed_size, self.internal_embed_size, low=-bound, high=bound
    )
    self.queries = Tensor.uniform(
      n_heads, embed_size, self.internal_embed_size, low=-bound, high=bound
    )
    self.values = Tensor.uniform(
      n_heads, embed_size, self.internal_embed_size, low=-bound, high=bound
    )
    self.linear = Tensor.uniform(
      n_heads * self.internal_embed_size, embed_size, low=-bound, high=bound
    )

  def __call__(self, x: Tensor) -> Tensor:
    B, T, C = x.shape

    x = x.unsqueeze(1).expand((B, self.n_heads, T, C))

    K = x @ self.keys
    Q = x @ self.queries
    V = x @ self.values

    # dot_attn = Q @ K.transpose(-2, -1)
    # scaled_dot_attn = dot_attn / math.sqrt(self.attn_embed_size)
    # masked_scaled_dot_attn = scaled_dot_attn#.tril().where(scaled_dot_attn, float("-inf"))
    # attn_scores = masked_scaled_dot_attn.softmax()

    # ret = attn_scores @ V
    # return ret

    ret = Tensor.scaled_dot_product_attention(
      K, Q, V, attn_mask=Tensor.ones((T, T)).tril()
    )
    ret = ret.reshape((B, T, self.n_heads * self.internal_embed_size))
    ret = ret @ self.linear
    ret = ret.gelu()

    return ret


class Transformer:
  def __init__(self, vocab_size, embed_size, n_layers, n_heads) -> None:
    self.token_embed = nn.Embedding(vocab_size, embed_size)
    self.h = [Attention(n_heads, embed_size) for _ in range(n_layers)]
    self.linear = nn.Linear(embed_size, vocab_size)

  def forward(self, x: Tensor) -> Tensor:
    logits = x.sequential([self.token_embed, *self.h, self.linear])
    return logits

  def loss(self, x: Tensor, y: Tensor) -> Tensor:
    logits = self.forward(x)
    loss = logits.sparse_categorical_crossentropy(y)
    return logits, loss

  def __call__(self, x: Tensor) -> Tensor:
    logits = self.forward(x)
    return logits[:, -1, :]

In [6]:
model = Transformer(vocab_size=10, embed_size=128, n_layers=2, n_heads=2)
sum(p.numel() for p in nn.state.get_parameters(model))

526858

In [7]:
optim = nn.optim.AdamW(nn.state.get_parameters(model))
batch_size = 128


@TinyJit
@Tensor.train()
def train_step():
  optim.zero_grad()
  samples = Tensor.randint(batch_size, high=X_train.shape[0])
  X_samples, Y_samples = X_train[samples], Y_train[samples]
  _, loss = model.loss(X_samples, Y_samples)
  loss.backward()
  optim.step()
  return loss

In [8]:
for step in range(1, 4001):
  loss = train_step()
  if step == 1 or step % 250 == 0:
    with Tensor.inference_mode():
      acc = (model(X_test).argmax(axis=-1) == Y_test[:, -1]).mean().item()
      print(f"step {step}, loss {loss.item():.2f}, acc {acc*100.:.2f}%")

step 1, loss 2.30, acc 9.45%
step 250, loss 0.59, acc 21.40%
step 500, loss 0.27, acc 67.45%
step 750, loss 0.11, acc 91.10%
step 1000, loss 0.03, acc 97.15%
step 1250, loss 0.03, acc 98.10%
step 1500, loss 0.01, acc 98.65%
step 1750, loss 0.02, acc 98.75%
step 2000, loss 0.01, acc 98.75%
step 2250, loss 0.00, acc 98.95%
step 2500, loss 0.01, acc 98.70%
step 2750, loss 0.02, acc 97.95%
step 3000, loss 0.01, acc 99.55%
step 3250, loss 0.00, acc 99.65%
step 3500, loss 0.03, acc 99.35%
step 3750, loss 0.00, acc 99.35%
step 4000, loss 0.00, acc 99.10%


In [9]:
predictions = model(X_test).argmax(axis=-1)
true_labels = Y_test[:, -1]
incorrect_mask = predictions != true_labels
incorrect_indices = np.where(incorrect_mask.numpy())[0].tolist()

for index in incorrect_indices:
  print(
    f"Example: {X_test[index].tolist()}, Prediction: {predictions[index].item()}, True: {true_labels[index].item()}"
  )

Example: [1, 0, 0, 0, 0, 1], Prediction: 1, True: 0
Example: [0, 3, 2, 1, 0, 2], Prediction: 3, True: 4
Example: [0, 3, 2, 0, 0, 2], Prediction: 2, True: 3
Example: [4, 0, 0, 0, 0, 4], Prediction: 4, True: 0
Example: [9, 0, 0, 0, 0, 9], Prediction: 9, True: 0
Example: [8, 7, 0, 0, 0, 8], Prediction: 5, True: 7
Example: [8, 0, 0, 0, 0, 8], Prediction: 8, True: 0
Example: [0, 9, 3, 0, 0, 3], Prediction: 0, True: 9
Example: [9, 6, 0, 3, 0, 9], Prediction: 1, True: 9
Example: [0, 0, 8, 0, 0, 8], Prediction: 8, True: 0
Example: [8, 8, 0, 0, 0, 8], Prediction: 6, True: 8
Example: [0, 0, 0, 0, 0, 0], Prediction: 7, True: 0
Example: [0, 0, 0, 3, 0, 0], Prediction: 7, True: 3
Example: [0, 3, 5, 0, 0, 5], Prediction: 2, True: 3
Example: [0, 9, 4, 1, 0, 5], Prediction: 1, True: 0
Example: [1, 7, 0, 0, 0, 1], Prediction: 8, True: 7
Example: [0, 3, 0, 1, 0, 0], Prediction: 3, True: 4
Example: [3, 0, 9, 3, 1, 2], Prediction: 2, True: 3
