In [1]:
from tinygrad import Tensor, nn, TinyJit
import numpy as np

In [2]:
corpus = open("shakespeare.txt").read()

In [3]:
print(len(corpus), "chars")
print(corpus[:1000])

1115394 chars
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst 

In [4]:
vocab = sorted(list(set(corpus)))
vocab_size = len(vocab)
vocab_size, "".join(vocab)

(65, "\n !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz")

In [5]:
new_line_char = "\n"

In [6]:
encode = lambda s: [vocab.index(c) for c in s]
decode = lambda l: "".join([vocab[i] for i in l])

decode(encode("hello"))

'hello'

In [7]:
data = Tensor(encode(corpus))
split = int(0.9 * len(data))
train_data = data[:split]
test_data = data[split:]

In [8]:
def get_batch(data: Tensor, batch_size, block_size):
  indices = Tensor.randint((batch_size,), high=len(data) - block_size).reshape(
    (batch_size, 1)
  ) + Tensor.arange(block_size)
  return data[indices], data[indices + 1]

In [9]:
x, y = get_batch(train_data, batch_size=4, block_size=8)
x.shape, y.shape

((4, 8), (4, 8))

In [10]:
print([decode(row) for row in x.numpy()])
print([decode(row) for row in y.numpy()])

['\nAnd fro', 'brazen w', 'my soldi', 'ng his f']
['And from', 'razen wa', 'y soldie', 'g his fo']


In [11]:
class Bigram:
  def __init__(self, vocab_size: int):
    assert vocab_size >= 1
    self.vocab_size = vocab_size
    self.embed = nn.Embedding(vocab_size, vocab_size)

  def __call__(self, x: Tensor) -> Tensor:
    assert len(x.shape) == 1
    return self.embed(x.reshape((-1, 1))).squeeze(1)

  def loss(self, logits: Tensor, y: Tensor) -> Tensor:
    assert (
      len(logits.shape) == 2
      and len(y.shape) == 1
      and logits.shape[0] == y.shape[0]
      and logits.shape[1] == self.vocab_size
    )
    return logits.sparse_categorical_crossentropy(y)

  def generate(self, x: Tensor, max_len=50):
    with Tensor.inference_mode():
      for _ in range(max_len):
        prev_x = x[-1:]
        p = self(prev_x).squeeze().softmax().numpy()
        next_x = Tensor([np.random.choice(vocab_size, p=p)])
        x = x.cat(next_x)
    return x

In [12]:
bigram = Bigram(vocab_size)


In [13]:
optim = nn.optim.AdamW(nn.state.get_parameters(bigram))
batch_size = 128

@TinyJit
@Tensor.train()
def train_step():
  optim.zero_grad()
  x_samples, y_samples = get_batch(train_data, batch_size, block_size=1)
  loss = bigram.loss(bigram(x_samples.reshape(-1)), y_samples.reshape(-1)).backward()
  optim.step()
  return loss

In [14]:
losses = []
for step in range(1, 20001):
  loss = train_step().item()
  losses.append(loss)
  if step == 1 or step % 1000 == 0:
    with Tensor.inference_mode():
      acc = (bigram(test_data[:-1]).argmax(axis=1) == test_data[1:]).mean().item()
      print(f"step {step}, loss {loss:.2f}, acc {acc*100.:.2f}%")

step 1, loss 4.19, acc 1.83%
step 1000, loss 3.41, acc 22.44%
step 2000, loss 2.93, acc 26.31%
step 3000, loss 2.73, acc 26.89%
step 4000, loss 2.69, acc 26.93%
step 5000, loss 2.69, acc 26.91%
step 6000, loss 2.63, acc 26.91%
step 7000, loss 2.51, acc 26.95%
step 8000, loss 2.52, acc 26.97%
step 9000, loss 2.66, acc 26.91%
step 10000, loss 2.63, acc 27.01%
step 11000, loss 2.27, acc 26.98%
step 12000, loss 2.46, acc 26.74%
step 13000, loss 2.36, acc 27.01%
step 14000, loss 2.36, acc 26.99%
step 15000, loss 2.41, acc 26.98%
step 16000, loss 2.60, acc 26.97%
step 17000, loss 2.49, acc 26.97%
step 18000, loss 2.44, acc 27.01%
step 19000, loss 2.39, acc 27.01%
step 20000, loss 2.41, acc 27.01%


In [18]:
print(decode(bigram.generate(Tensor([vocab.index(new_line_char)]), max_len=500).numpy()))


O:
Matr deneas, Doule angh ho busor VINGUSiserr ck o y te the! Balty d y Gr itre br el mecineset Momen merdgo kere VCOfan'for, s?
DY ad I I fo mans
DUMBEWhin.
Whathetrofe hid ate
I:
LELAnto, s torce,
MINUhillvorthend,
Serofofe Bilictlas o an!
HAnosepofowir.felint A lonye moupunonour ntom l; gsow hemelion torthy s f me aperve k fe ashoJULe f

Dus tie leis t bllerd thanouprey, o ing atid saimangore t tatate, manecha it muis, h hel
Thende lk'd stheeaie, at areret
Thepes:
Ser reas I.

ENTou lin' l f


In [165]:
B, T, C = 1000, 100, 200
x = Tensor.randint((B, T, C))

def using_cumsum():
  a = x.cumsum(axis=1)
  b = a / Tensor.arange(1, T+1).reshape((T, 1))


def using_matmul():
  a = Tensor.ones((T, T)).tril() @ x
  b = a / Tensor.arange(1, T+1).reshape((T, 1))


def using_softmax():
  a = Tensor.ones((T, T)).tril().where(0, float("-inf")).softmax()
  b = a @ x

In [167]:
import timeit

np.mean(timeit.repeat(using_cumsum, repeat=10000, number=1)) * 1000, np.mean(timeit.repeat(using_matmul, repeat=10000, number=1)) * 1000, np.mean(timeit.repeat(using_softmax, repeat=10000, number=1)) * 1000

(np.float64(0.7295794918107276),
 np.float64(0.935482974491606),
 np.float64(1.0409363099006441))