In [199]:
whole_text = open('../data/tinyshakespeare.txt', 'r').read()
lines = whole_text.splitlines()
len(lines)


40000

In [200]:
vocab = sorted(set(''.join(whole_text)))
vocab_size = len(vocab)
vocab_size, ''.join(vocab)

(65, "\n !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz")

In [201]:
# tokenizer
stoi = {c:i for i,c in enumerate(vocab)}
itos = {v:k for k,v in stoi.items()}
encode = lambda str: [stoi[c] for c in str]
decode = lambda ints: ''.join([itos[i] for i in ints])


In [202]:
import torch

In [203]:
data = torch.tensor(encode(whole_text))
data.shape, data.dtype

(torch.Size([1115394]), torch.int64)

In [204]:
n = int(data.shape[0] * .9)
train_data = data[:n]
val_data = data[n:]
train_data.shape, val_data.shape

(torch.Size([1003854]), torch.Size([111540]))

In [205]:
block_size = 8
train_data[:block_size+1]


tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [206]:
# time (T) apparently (or T as in Token?)
x = train_data[:block_size]
y = train_data[1:block_size+1]
for t in range(len(x),):
  context = x[:t+1]
  print(f'{context} -> {y[t]}')

# this is apparently called T (time) dimension? or Token? 
# i think from the BTC acronym we'll see more of later

tensor([18]) -> 47
tensor([18, 47]) -> 56
tensor([18, 47, 56]) -> 57
tensor([18, 47, 56, 57]) -> 58
tensor([18, 47, 56, 57, 58]) -> 1
tensor([18, 47, 56, 57, 58,  1]) -> 15
tensor([18, 47, 56, 57, 58,  1, 15]) -> 47
tensor([18, 47, 56, 57, 58,  1, 15, 47]) -> 58


In [207]:
torch.manual_seed(1337)
# batch (B)
block_size = 8
batch_size = 4

def get_batch(split):
  data = train_data if split == 'train' else val_data
  # get offset starts for all batches
  ix = torch.randint(0, data.shape[0]-block_size, (batch_size,))
  x = torch.stack([data[i:i+block_size] for i in ix])
  y = torch.stack([data[i+1:i+block_size+1] for i in ix])
  return x,y

xb,yb = get_batch('train')
xb,xb.shape, yb, yb.shape

(tensor([[24, 43, 58,  5, 57,  1, 46, 43],
         [44, 53, 56,  1, 58, 46, 39, 58],
         [52, 58,  1, 58, 46, 39, 58,  1],
         [25, 17, 27, 10,  0, 21,  1, 54]]),
 torch.Size([4, 8]),
 tensor([[43, 58,  5, 57,  1, 46, 43, 39],
         [53, 56,  1, 58, 46, 39, 58,  1],
         [58,  1, 58, 46, 39, 58,  1, 46],
         [17, 27, 10,  0, 21,  1, 54, 39]]),
 torch.Size([4, 8]))

In [208]:
import torch
import torch.nn as nn
import torch.nn.functional as F
torch.manual_seed(1337)

n_embd = 32

class Head(nn.Module):
  def __init__(self, head_size):
    super().__init__()
    self.key = nn.Linear(n_embd, head_size, bias=False)
    self.query = nn.Linear(n_embd, head_size, bias=False)
    self.value = nn.Linear(n_embd, head_size, bias=False)
    self.register_buffer("tril", torch.tril(torch.ones(block_size, block_size)))

  def forward(self, x):
    B,T,C = x.shape
    k = self.key(x)
    q = self.query(x) # (B,T,C)
    # note: not sure which this square root came from hmmm, we may need to recheck the paper
    wei = k @ q.transpose(-2,-1) * C**-0.5 
    # decoder
    # note: review this indexing again [:T,:T]
    wei = wei.masked_fill(self.tril[:T,:T]==0, float('-inf'))
    wei = F.softmax(wei, dim=-1)

    v = self.value(x) 
    out = wei @ v
    return out

class MultiHeadAttention(nn.Module):
  def __init__(self, num_heads, head_size):
    super().__init__()
    self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
    self.proj = nn.Linear(n_embd, n_embd)
  
  def forward(self, x):
    out =  torch.cat([h(x) for h in self.heads], dim=-1)
    out = self.proj(out)
    return out

class FeedForward(nn.Module):
  def __init__(self, n_embd):
    super().__init__()
    self.net = nn.Sequential(
      nn.Linear(n_embd, 4 * n_embd),
      nn.ReLU(),
      # projection
      nn.Linear(4 * n_embd, n_embd),
    )

  def forward(self,x):
    return self.net(x)

class Block(nn.Module):
  def __init__(self, n_embd, n_head):
    super().__init__()
    head_size = n_embd // n_head
    self.sa = MultiHeadAttention(n_head, head_size)
    self.ffwd = FeedForward(n_embd)

  def forward(self,x):
    x = x + self.sa(x)
    x = x + self.ffwd(x)
    return x

class Bigram(nn.Module):
  def __init__(self,):
    super().__init__()
    self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
    self.position_embedding_table = nn.Embedding(block_size, n_embd)

    self.blocks = nn.Sequential(
      Block(n_embd, n_head=4),
      Block(n_embd, n_head=4),
      Block(n_embd, n_head=4),
    )

    self.lm_head = nn.Linear(n_embd, vocab_size)
  
  def forward(self, idx, targets=None):
    B,T = idx.shape
    # idx = (B,T) 
    tok_emb = self.token_embedding_table(idx) # -> (B, T, n_embd)
    pos_emb = self.position_embedding_table(torch.arange(T))
    x = tok_emb + pos_emb
    x = self.blocks(x)
    logits = self.lm_head(x) # -> (B, T, vocab_size)

    if targets is None:
      return logits, None

    B,T,C = logits.shape
    targets = targets.view(B*T)
    loss = F.cross_entropy(logits.view(B*T,C), targets)
    return logits, loss
  
  def generate(self, idx, max_tokens=10):
    for _ in range(max_tokens):
      idx_inrange = idx[:,-block_size:]
      logits, loss = self(idx_inrange)
      # take the last T as this contains the predictions for next char
      logits = logits[:, -1, :] # (B,T,C) -> (B,C)
      probs = F.softmax(logits, dim=-1)
      hit = torch.multinomial(probs, num_samples=1) # note: this fn returns indices
      idx = torch.cat((idx, hit), dim=1)
    return idx


m = Bigram()
logits, loss = m(xb, yb)
print(logits.shape)
print(loss.shape)

infe = m.generate(torch.zeros((1,1), dtype=torch.long), max_tokens=10)[0]
print(decode(infe.tolist()))


torch.Size([4, 8, 65])
torch.Size([])

?
fq:zj?dB


In [209]:
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [210]:
batch_size=32
for _ in range(10_000):
  xb,yb = get_batch("train")
  logits,loss = m(xb,yb)
  optimizer.zero_grad(set_to_none=True)
  loss.backward()
  optimizer.step()

print(loss)

tensor(1.8891, grad_fn=<NllLossBackward0>)


In [211]:
print(loss.item())

1.8890974521636963


In [212]:
infe = m.generate(torch.zeros((1,1), dtype=torch.long), max_tokens=1023)[0]
print(decode(infe.tolist()))


CANTIONE:
Nay unlic;
And peavend,
Is there of may: but like eye.
O make be head,
And lawn, be!
Why, a bloid 'I?
I shood be forth
Coideepree; as serving of for will it as my somen thou head
To Client: bone
MAWI:

Edward.

JULIO:
How miert hust,
To cick and tritalt, think meady
Shall I noot:
Comed frauself wome's put the LAnchitice
And had as admose: nep, but to her bestratice more you matteat o'a celts;
Not my heoplonourt Not no my plaid, by the doe? not at I some.

PELOUCHESS:
Westit all be a ferea, and of but ou with and of I come to bence's flauded can I what youst sight.

POLIFFOLYCUS:
I cute-
Are as mother you of Irom!

POMPIFFO:
Why, wom grave afferather me my the some of my pruch,
The mery so by his enting his I for, nest just you sue of dight,
Thou any my him defults, do forthorparlas:
Ay, my not, my son
Rom sover officious we do you, scominers! hy kink youners, too I drope.

Thy know the piermas, anto now's of Ell a prome wich pesick in a no he demans: but Anger shord in it re