In [14]:
# Load the corpus. The context manager closes the file handle deterministically
# (a bare open(...).read() leaves that to the garbage collector), and the
# explicit encoding keeps decoding independent of the platform's locale default.
with open('../data/tinyshakespeare.txt', 'r', encoding='utf-8') as corpus_file:
  whole_text = corpus_file.read()
lines = whole_text.splitlines()
len(lines)


40000

In [15]:
# Character-level vocabulary: every distinct character of the corpus, sorted so
# that the character -> id assignment is the same on every run.
vocab = sorted(set(whole_text))
vocab_size = len(vocab)
vocab_size, ''.join(vocab)

(65, "\n !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz")

In [16]:
# Character-level tokenizer: two lookup tables plus encode/decode helpers.
stoi = {ch: idx for idx, ch in enumerate(vocab)}  # character -> integer id
itos = {idx: ch for ch, idx in stoi.items()}      # integer id -> character

def encode(str):
  """Return the list of integer ids for the characters of `str`."""
  return [stoi[ch] for ch in str]

def decode(ints):
  """Return the string spelled by the iterable of integer ids `ints`."""
  return ''.join(itos[token_id] for token_id in ints)


In [17]:
import torch

In [18]:
# Encode the whole corpus as one 1-D tensor of token ids. The dtype is pinned to
# int64 instead of being inferred: embedding lookups and cross-entropy class
# targets need integer ids, and inference would produce float32 for an empty
# corpus.
data = torch.tensor(encode(whole_text), dtype=torch.long)
data.shape, data.dtype

(torch.Size([1115394]), torch.int64)

In [19]:
# Hold out the final 10% of the token stream for validation; the first 90% is
# the training set.
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]
train_data.shape, val_data.shape

(torch.Size([1003854]), torch.Size([111540]))

In [20]:
# Small context length used for the walkthrough in the following cell.
# NOTE(review): `block_size` is a global that a later cell reassigns to 128;
# re-running this cell afterwards resets it to 8 for everything that reads it.
block_size = 8
# Show block_size + 1 tokens: the extra one is the target for the last position.
train_data[:block_size+1]


tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [21]:
# T ("time") axis: a single window of block_size tokens packs block_size
# training examples, one per prefix length. The target of each prefix is the
# token that immediately follows it.
x = train_data[:block_size]
y = train_data[1:block_size+1]
for t, target in enumerate(y):
  context = x[:t+1]
  print(f'{context} -> {target}')

# T is the middle axis of the `B,T,C = x.shape` convention used by the model
# code later in the notebook.

tensor([18]) -> 47
tensor([18, 47]) -> 56
tensor([18, 47, 56]) -> 57
tensor([18, 47, 56, 57]) -> 58
tensor([18, 47, 56, 57, 58]) -> 1
tensor([18, 47, 56, 57, 58,  1]) -> 15
tensor([18, 47, 56, 57, 58,  1, 15]) -> 47
tensor([18, 47, 56, 57, 58,  1, 15, 47]) -> 58


In [22]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Uncomment to force CPU execution:
#device = 'cpu'

In [23]:
# Optional: uncomment to make batch sampling reproducible.
#torch.manual_seed(1337)
# batch (B)
# NOTE(review): this overrides the block_size = 8 assigned in an earlier cell.
block_size = 128  # number of tokens in each sampled window
batch_size = 32   # number of windows sampled per batch

def get_batch(split):
  """Sample a random batch of (input, target) windows.

  `split == 'train'` draws from `train_data`; any other value draws from
  `val_data`. Returns `(x, y)`, each of shape (batch_size, block_size) and
  moved to `device`; `y` is the same window as `x` shifted one token ahead.
  """
  source = val_data
  if split == 'train':
    source = train_data
  # Random window starts. The exclusive upper bound leaves room for the
  # one-token look-ahead that the targets need.
  starts = torch.randint(0, len(source) - block_size, (batch_size,))
  inputs, targets = [], []
  for start in starts:
    inputs.append(source[start:start + block_size])
    targets.append(source[start + 1:start + block_size + 1])
  return torch.stack(inputs).to(device), torch.stack(targets).to(device)

# Draw one training batch and display the input/target tensors with their shapes.
xb,yb = get_batch('train')
xb,xb.shape, yb, yb.shape

(tensor([[63, 53, 59,  ...,  1, 47, 58],
         [53, 59, 56,  ..., 59, 56, 43],
         [43, 52, 58,  ...,  5, 42,  1],
         ...,
         [43, 60, 43,  ...,  1, 39, 58],
         [43,  1, 58,  ..., 53, 52,  1],
         [52,  1, 39,  ..., 42,  5, 57]], device='cuda:0'),
 torch.Size([32, 128]),
 tensor([[53, 59, 56,  ..., 47, 58,  1],
         [59, 56,  1,  ..., 56, 43, 42],
         [52, 58,  1,  ..., 42,  1, 46],
         ...,
         [60, 43, 56,  ..., 39, 58, 58],
         [ 1, 58, 46,  ..., 52,  1, 58],
         [ 1, 39, 40,  ...,  5, 57,  1]], device='cuda:0'),
 torch.Size([32, 128]))

In [24]:
import torch
import torch.nn as nn
import torch.nn.functional as F
# Optional: uncomment to make weight initialisation and sampling reproducible.
#torch.manual_seed(1337)

n_embd = 384   # width of the token/position embeddings and of every block
dropout = 0.2  # dropout probability passed to every nn.Dropout in the model

class Head(nn.Module):
  """One head of causal (masked) scaled dot-product self-attention.

  Args:
    head_size: output width of the key, query and value projections.

  `forward` takes x of shape (B, T, n_embd) with T <= block_size and returns a
  tensor of shape (B, T, head_size).
  """

  def __init__(self, head_size):
    super().__init__()
    self.key = nn.Linear(n_embd, head_size, bias=False)
    self.query = nn.Linear(n_embd, head_size, bias=False)
    self.value = nn.Linear(n_embd, head_size, bias=False)
    # Lower-triangular mask, registered as a buffer so it moves with the module
    # across devices without being a trainable parameter.
    self.register_buffer("tril", torch.tril(torch.ones(block_size, block_size)))
    self.dropout = nn.Dropout(dropout)

  def forward(self, x):
    _, T, _ = x.shape
    k = self.key(x)    # (B,T,head_size)
    q = self.query(x)  # (B,T,head_size)
    # Scaled dot-product attention: row i holds position i's *query* scored
    # against every key, scaled by 1/sqrt(d_k) where d_k is the key width
    # (head_size). Scaling by the input width n_embd instead would shrink the
    # scores too much and flatten the softmax.
    wei = q @ k.transpose(-2,-1) * k.shape[-1]**-0.5  # (B,T,T)
    # Decoder-style causal mask: [:T,:T] crops the block_size x block_size mask
    # to the current sequence length; position i may only see positions <= i.
    wei = wei.masked_fill(self.tril[:T,:T]==0, float('-inf'))
    wei = F.softmax(wei, dim=-1)
    wei = self.dropout(wei)

    v = self.value(x)  # (B,T,head_size)
    out = wei @ v      # (B,T,head_size)
    return out

class MultiHeadAttention(nn.Module):
  """Several attention heads run in parallel, concatenated and projected to n_embd.

  Args:
    num_heads: number of `Head` modules to run side by side.
    head_size: output width of each head.
  """

  def __init__(self, num_heads, head_size):
    super().__init__()
    self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
    # The concatenated head outputs are num_heads * head_size wide. That equals
    # n_embd only when n_embd is divisible by num_heads, so the projection is
    # sized from the real concat width rather than assuming n_embd.
    self.proj = nn.Linear(num_heads * head_size, n_embd)
    self.dropout = nn.Dropout(dropout)

  def forward(self, x):
    # Concatenate along the channel axis, then mix the heads and apply dropout.
    out = torch.cat([h(x) for h in self.heads], dim=-1)
    out = self.dropout(self.proj(out))
    return out

class FeedForward(nn.Module):
  """Per-position MLP: widen to 4 * n_embd, ReLU, project back, then dropout."""

  def __init__(self, n_embd):
    super().__init__()
    hidden = 4 * n_embd
    layers = [
      nn.Linear(n_embd, hidden),
      nn.ReLU(),
      nn.Linear(hidden, n_embd),  # projection back to the input width
      nn.Dropout(dropout),
    ]
    self.net = nn.Sequential(*layers)

  def forward(self, x):
    return self.net(x)

class Block(nn.Module):
  """Transformer block: self-attention then feed-forward, each applied to a
  layer-normalised input and added back through a residual connection."""

  def __init__(self, n_embd, n_head):
    super().__init__()
    # Each head gets an equal share of the embedding width.
    self.sa = MultiHeadAttention(n_head, n_embd // n_head)
    self.ffwd = FeedForward(n_embd)
    self.ln1, self.ln2 = nn.LayerNorm(n_embd), nn.LayerNorm(n_embd)

  def forward(self, x):
    attended = x + self.sa(self.ln1(x))
    return attended + self.ffwd(self.ln2(attended))

class Bigram(nn.Module):
  """Character-level, decoder-only transformer language model.

  Despite the name, this is token + position embeddings followed by a stack of
  transformer `Block`s, a final LayerNorm and a linear head over the vocabulary.

  Args:
    n_layer: number of transformer blocks (default 3).
    n_head: attention heads per block (default 4).
  """

  def __init__(self, n_layer=3, n_head=4):
    super().__init__()
    self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
    self.position_embedding_table = nn.Embedding(block_size, n_embd)

    self.blocks = nn.Sequential(
      *[Block(n_embd, n_head=n_head) for _ in range(n_layer)]
    )

    self.ln_f = nn.LayerNorm(n_embd)

    self.lm_head = nn.Linear(n_embd, vocab_size)

  def forward(self, idx, targets=None):
    """Compute next-token logits and, when `targets` is given, the loss.

    Args:
      idx: (B, T) integer token ids, with T no larger than the context length.
      targets: optional (B, T) integer token ids to predict.

    Returns:
      `(logits, loss)`: logits has shape (B, T, vocab_size); loss is a scalar
      cross-entropy tensor, or None when `targets` is None.

    Raises:
      ValueError: if T exceeds the number of learned positions.
    """
    B,T = idx.shape
    max_positions = self.position_embedding_table.num_embeddings
    if T > max_positions:
      raise ValueError(
        f'sequence length {T} exceeds the model context length {max_positions}'
      )
    tok_emb = self.token_embedding_table(idx) # -> (B, T, n_embd)
    # Build positions on the input's device rather than a notebook-level
    # global, so the model works wherever it and its inputs actually live.
    pos_emb = self.position_embedding_table(torch.arange(T, device=idx.device))
    x = tok_emb + pos_emb  # (T, n_embd) broadcasts over the batch axis
    x = self.blocks(x)
    x = self.ln_f(x)
    logits = self.lm_head(x) # -> (B, T, vocab_size)

    if targets is None:
      return logits, None

    C = logits.shape[-1]
    # cross_entropy wants (N, C) logits and (N,) targets; reshape (unlike view)
    # also accepts non-contiguous targets.
    loss = F.cross_entropy(logits.reshape(B*T, C), targets.reshape(B*T))
    return logits, loss

  def generate(self, idx, max_tokens=10):
    """Autoregressively sample `max_tokens` new tokens after `idx`.

    Sampling runs in eval mode (dropout disabled) and without gradient
    tracking; the module's previous train/eval mode is restored afterwards.

    Args:
      idx: (B, T) integer token ids used as the prompt.
      max_tokens: number of tokens to append.

    Returns:
      (B, T + max_tokens) tensor of token ids, prompt included.
    """
    # Crop with the context length the model was built with, not the mutable
    # notebook global, which may have been reassigned since construction.
    context_len = self.position_embedding_table.num_embeddings
    was_training = self.training
    self.eval()
    try:
      with torch.no_grad():
        for _ in range(max_tokens):
          idx_inrange = idx[:, -context_len:]
          logits, _loss = self(idx_inrange)
          # the last time step holds the prediction for the next token
          logits = logits[:, -1, :] # (B,T,C) -> (B,C)
          probs = F.softmax(logits, dim=-1)
          hit = torch.multinomial(probs, num_samples=1) # sampled indices, (B,1)
          idx = torch.cat((idx, hit), dim=1)
    finally:
      self.train(was_training)
    return idx


# Build the model on the target device and smoke-test it: one forward pass on
# the sample batch, then a short sample from the still-untrained weights.
m = Bigram().to(device)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss.shape)

start_idx = torch.zeros((1,1), dtype=torch.long, device=device)
infe = m.generate(start_idx, max_tokens=10)[0]
print(decode(infe.tolist()))


torch.Size([32, 128, 65])
torch.Size([])

mXnxM,:aOJ


In [25]:
# AdamW over all model parameters with a fixed learning rate of 1e-3.
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [26]:
# Put the model in training mode explicitly so dropout is active no matter what
# an earlier cell (or a re-run after sampling) left the mode set to.
m.train()
for _ in range(10_000):
  xb,yb = get_batch("train")
  logits,loss = m(xb,yb)
  # set_to_none=True drops the gradient tensors instead of zero-filling them
  optimizer.zero_grad(set_to_none=True)
  loss.backward()
  optimizer.step()

# Loss of the final training batch only: a noisy single-batch estimate.
print(loss)

tensor(1.2606, device='cuda:0', grad_fn=<NllLossBackward0>)


In [27]:
# .item() extracts the plain Python float from the 0-dim loss tensor.
print(loss.item())

1.2606072425842285


In [28]:
# Sample from the trained model, seeded with token id 0 (the newline character).
# Sampling is done in eval mode so dropout does not perturb the predictions, and
# under no_grad so no autograd graph is built; the previous mode is restored so
# a later training run is unaffected.
was_training = m.training
m.eval()
try:
  with torch.no_grad():
    infe = m.generate(torch.zeros((1,1), dtype=torch.long, device=device), max_tokens=1023)[0]
finally:
  m.train(was_training)
print(decode(infe.tolist()))


For whitely Tybalt, as you swore a king,
Is say: thereof your grace, thy widow toWeleve,
is you and both, legs are that I subdue and speared
Could be cowe a turn; then: for you to die
Where was more in his eyes, yours, finishes, he
should but Vienna, and kneel thee no blood of esteeming irlt
Tullus Attony; Give to your enemies' elieves?

HASTINGS:
So Chillips we please you like on the armour.

TYBALT:
Bolded the house; I have yet stout it.

MONTAGUE:
As burthen me as come from me impossible:
And for mortal,
The adeep-breathe anuuted, that you makes-king
Have ceemed in him. Lord remember pyrthy crow
Derives no leisurp words on their stands,
But laught me miserable at with us
and some unatterites: he when our committed
For every malitive a
Tired approbribation of what he saw said thousand deface?
To all the poor corow friar! it is to such poor.

DUKE VINCENTIO:
No, but I say, nor Paulina, or thy crown behind; but which that thou spakest to vail them in
delight. I have it been too lately