In [1]:
data_file = '../gpt/input.txt'
text = open(data_file, 'r').read()

In [2]:
print(text[:100])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


## testing tiktoken

In [3]:
import tiktoken
tok = tiktoken.get_encoding('cl100k_base')

In [4]:
tok.encode('hi there !')

[6151, 1070, 758]

In [5]:
tok.decode_single_token_bytes(6151)

b'hi'

In [6]:
tok.decode_single_token_bytes(1070)

b' there'

In [7]:
tok.decode_single_token_bytes(758)

b' !'

## `char` level tokenization

In [31]:
vocab = sorted(list(set(text)))
vocab_size = len(vocab)
print(vocab_size)
print(''.join(vocab))

65

 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz


In [32]:
stoi = {ch:i for i,ch in enumerate(vocab)}
itos = {i:ch for i,ch in enumerate(vocab)}
print(itos)

{0: '\n', 1: ' ', 2: '!', 3: '$', 4: '&', 5: "'", 6: ',', 7: '-', 8: '.', 9: '3', 10: ':', 11: ';', 12: '?', 13: 'A', 14: 'B', 15: 'C', 16: 'D', 17: 'E', 18: 'F', 19: 'G', 20: 'H', 21: 'I', 22: 'J', 23: 'K', 24: 'L', 25: 'M', 26: 'N', 27: 'O', 28: 'P', 29: 'Q', 30: 'R', 31: 'S', 32: 'T', 33: 'U', 34: 'V', 35: 'W', 36: 'X', 37: 'Y', 38: 'Z', 39: 'a', 40: 'b', 41: 'c', 42: 'd', 43: 'e', 44: 'f', 45: 'g', 46: 'h', 47: 'i', 48: 'j', 49: 'k', 50: 'l', 51: 'm', 52: 'n', 53: 'o', 54: 'p', 55: 'q', 56: 'r', 57: 's', 58: 't', 59: 'u', 60: 'v', 61: 'w', 62: 'x', 63: 'y', 64: 'z'}


In [33]:
encode = lambda text: [stoi[ch] for ch in text]
decode = lambda tokens: ''.join([itos[ix] for ix in tokens])

In [34]:
import torch
data = torch.tensor(encode(text))
print(data[:10])
print(data.dtype)

n = int(0.9 * len(data))
train = data[:n]
test = data[n:]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47])
torch.int64


## visualizing a chunk of text in encoded format with targets for GPT

In [7]:
block_size = 8
x = data[:block_size]
y = data[1:block_size+1] # shifted to right by 1, from x
print(x)
print(y)

tensor([18, 47, 56, 57, 58,  1, 15, 47])
tensor([47, 56, 57, 58,  1, 15, 47, 58])


In [8]:
for i in range(block_size):
    print(f'input {x[:i+1]} --> target {y[i]}')

input tensor([18]) --> target 47
input tensor([18, 47]) --> target 56
input tensor([18, 47, 56]) --> target 57
input tensor([18, 47, 56, 57]) --> target 58
input tensor([18, 47, 56, 57, 58]) --> target 1
input tensor([18, 47, 56, 57, 58,  1]) --> target 15
input tensor([18, 47, 56, 57, 58,  1, 15]) --> target 47
input tensor([18, 47, 56, 57, 58,  1, 15, 47]) --> target 58


### introduce `batch` dimension

In [18]:
torch.manual_seed(1337)

block_size = 8
batch_size = 4

def get_batch(split, block_size=8, batch_size=8, device='cpu'):
    data = train if split == 'train' else test
    ix = torch.randint(high=len(data)-block_size, size=(batch_size,))
    x = torch.stack([data[i   : i   + block_size] for i in ix])
    y = torch.stack([data[i+1 : i+1 + block_size] for i in ix])
    return x.to(device), y.to(device)

x,y = get_batch('train', block_size=8, batch_size=4)
print(x)
print(y)

tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])


In [10]:
# this is how inputs to a GPT model would look like
# context-length ranging from 1-block_size

B,T = x.shape
for b in range(batch_size):
    for t in range(block_size):
        context = x[b, :t+1]
        target =  y[b, t  ]
        print(f'when input is {context} : target is {target}')

when input is tensor([24]) : target is 43
when input is tensor([24, 43]) : target is 58
when input is tensor([24, 43, 58]) : target is 5
when input is tensor([24, 43, 58,  5]) : target is 57
when input is tensor([24, 43, 58,  5, 57]) : target is 1
when input is tensor([24, 43, 58,  5, 57,  1]) : target is 46
when input is tensor([24, 43, 58,  5, 57,  1, 46]) : target is 43
when input is tensor([24, 43, 58,  5, 57,  1, 46, 43]) : target is 39
when input is tensor([44]) : target is 53
when input is tensor([44, 53]) : target is 56
when input is tensor([44, 53, 56]) : target is 1
when input is tensor([44, 53, 56,  1]) : target is 58
when input is tensor([44, 53, 56,  1, 58]) : target is 46
when input is tensor([44, 53, 56,  1, 58, 46]) : target is 39
when input is tensor([44, 53, 56,  1, 58, 46, 39]) : target is 58
when input is tensor([44, 53, 56,  1, 58, 46, 39, 58]) : target is 1
when input is tensor([52]) : target is 58
when input is tensor([52, 58]) : target is 1
when input is tensor(

## Model 1
## Bigram LM

In [16]:

import torch.nn as nn
import torch.nn.functional as F

class BigramLM(nn.Module):
    def __init__(self):
        super().__init__()
        self.tok_embedding = nn.Embedding(vocab_size, vocab_size)
    
    def forward(self, idx, targets=None):
        logits = self.tok_embedding(idx) # (B,T,vocab_size)

        B,T,C = logits.shape
        if targets is not None:
            loss = F.cross_entropy(logits.view(B*T,vocab_size), targets.view(B*T))
        else:
            loss = None
        return logits, loss

    def generate(self, idx, max_new_tokens):
        # ids: B,T
        for _ in range(max_new_tokens):
            logits,_ = self(idx) # (B,T,C)
            logits = logits[:, -1, :] # (B,C)
            probs = F.softmax(logits, dim=-1) # (B,1)
            new_idx = torch.multinomial(probs, num_samples=1) # (B,1)
            idx = torch.cat((idx, new_idx), dim=1) # (B,T+1)
        return idx

In [17]:
model = BigramLM()
logits, loss = model(x,y)
# print(logits.shape)
# print(loss)

idx = torch.zeros((1,1), dtype=torch.long)
print(idx)
idx = model.generate(idx, max_new_tokens=100)
print(idx)

print(decode(idx[0].tolist()))

tensor([[0]])
tensor([[ 0, 50, 44, 22, 43, 59, 49, 30, 59, 39, 30, 22, 23, 36, 13, 37, 58, 36,
         64, 44, 22, 10, 20, 17, 28, 47, 59,  7,  7, 57, 16, 47, 53, 47, 11, 21,
         24, 15, 53,  9, 54, 20, 26, 32, 51, 16, 61, 22, 57, 44, 46, 43, 23, 30,
         62, 38, 15, 18, 57,  0, 50, 38, 22,  1, 36, 29, 41, 12, 10, 57, 10, 20,
         17, 64, 17, 52, 36, 39, 50, 17, 28, 49, 50, 41, 28, 33,  1, 41, 24,  5,
         16, 54, 42, 24, 15, 39, 44, 14, 46, 43, 20]])

lfJeukRuaRJKXAYtXzfJ:HEPiu--sDioi;ILCo3pHNTmDwJsfheKRxZCFs
lZJ XQc?:s:HEzEnXalEPklcPU cL'DpdLCafBheH


In [18]:
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-2)

In [19]:
block_size=8
batch_size=4
for iter in range(1000):
    xb,yb = get_batch('train', block_size=block_size, batch_size=batch_size)
    logits, loss = model(xb, yb)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    if iter % 100 == 0:
        print(f'{loss=}')

loss=tensor(4.8740, grad_fn=<NllLossBackward0>)
loss=tensor(4.1951, grad_fn=<NllLossBackward0>)
loss=tensor(3.3270, grad_fn=<NllLossBackward0>)
loss=tensor(3.0485, grad_fn=<NllLossBackward0>)
loss=tensor(3.0234, grad_fn=<NllLossBackward0>)
loss=tensor(2.6585, grad_fn=<NllLossBackward0>)
loss=tensor(2.5237, grad_fn=<NllLossBackward0>)
loss=tensor(2.6933, grad_fn=<NllLossBackward0>)
loss=tensor(2.5405, grad_fn=<NllLossBackward0>)
loss=tensor(2.2774, grad_fn=<NllLossBackward0>)


In [20]:
idx = torch.zeros((1,1), dtype=torch.long)
idx = model.generate(idx, max_new_tokens=100)

print(decode(idx[0].tolist()))


NqBfe o m!d omry-mo y an?k't d looSvar thavend tthanI pt BYQkir
N
kink llll heescerge.
As
Wim heANve


## Model 2
## let's build some diversion and add positional embeddings, for better modeling

In [21]:
# Bigram LM
import torch.nn as nn
import torch.nn.functional as F

class BigramLM(nn.Module):
    def __init__(self, n_embd):
        super().__init__()
        self.tok_embedding = nn.Embedding(vocab_size, n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)
    
    def forward(self, idx, targets=None):
        tok_emb = self.tok_embedding(idx) # (B,T,vocab_size)
        logits = self.lm_head(tok_emb)

        B,T,C = logits.shape
        if targets is not None:
            loss = F.cross_entropy(logits.view(B*T,vocab_size), targets.view(B*T))
        else:
            loss = None
        return logits, loss

    def generate(self, idx, max_new_tokens):
        # ids: B,T
        for _ in range(max_new_tokens):
            logits,_ = self(idx) # (B,T,C)
            logits = logits[:, -1, :] # (B,C)
            probs = F.softmax(logits, dim=-1) # (B,1)
            new_idx = torch.multinomial(probs, num_samples=1) # (B,1)
            idx = torch.cat((idx, new_idx), dim=1) # (B,T+1)
        return idx

In [22]:
block_size=8
batch_size=4
for iter in range(1000):
    xb,yb = get_batch('train', block_size=block_size, batch_size=batch_size)
    logits, loss = model(xb, yb)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    if iter % 100 == 0:
        print(f'{loss=}')

loss=tensor(2.5480, grad_fn=<NllLossBackward0>)
loss=tensor(2.7870, grad_fn=<NllLossBackward0>)
loss=tensor(2.6660, grad_fn=<NllLossBackward0>)
loss=tensor(2.4883, grad_fn=<NllLossBackward0>)
loss=tensor(2.7111, grad_fn=<NllLossBackward0>)
loss=tensor(2.4736, grad_fn=<NllLossBackward0>)
loss=tensor(2.3161, grad_fn=<NllLossBackward0>)
loss=tensor(2.4362, grad_fn=<NllLossBackward0>)
loss=tensor(2.3136, grad_fn=<NllLossBackward0>)
loss=tensor(2.5640, grad_fn=<NllLossBackward0>)


In [23]:
idx = torch.zeros((1,1), dtype=torch.long)
idx = model.generate(idx, max_new_tokens=100)

print(decode(idx[0].tolist()))


Bon ntrete .
th havef i$MWingithacu ind't ho I hamyon!'s


ANESTHite:
MMICLIfiror tlld chaitarslsen'


In [11]:
eval_iters = 200
device = "mps"# if torch.backends.mps.is_available() else "cpu"
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'test']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            x,y = get_batch(split, device=device)
            _,loss = model(x,y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [12]:
block_size=8
batch_size=4
n_embd = 16

model = BigramLM(n_embd)
model = model.to(device)
print(f'{sum([p.numel() for p in model.parameters()])} params')
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-3)

for iter in range(1000):
    xb,yb = get_batch('train', block_size=block_size, batch_size=batch_size, device=device)
    logits, loss = model(xb, yb)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    if iter % 100 == 0:
        out = estimate_loss()
        print(f'iter: {iter} | train loss = {out['train']:.4f} | test loss = {out['test']:.4f}')

NameError: name 'BigramLM' is not defined

In [26]:
idx = torch.zeros((1,1), dtype=torch.long, device=device)
idx = model.generate(idx, max_new_tokens=100)

print(decode(idx[0].tolist()))


Uoasthapdse tizenderst els yu frnie hy:


Hak, COI teg aglellthorr gtecowor hend ge?
Ten, reothakech


## Model 3

In [27]:
device = "mps" if torch.backends.mps.is_available() else "cpu"
print(device)

mps


In [28]:
class GPT(nn.Module):
    def __init__(self, n_embd):
        super().__init__()
        self.tok_embedding = nn.Embedding(vocab_size, n_embd)
        self.pos_embedding = nn.Embedding(block_size, n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B,T = idx.shape
        tok_emb = self.tok_embedding(idx) # (B,T,n_embd)
        pos_emb = self.pos_embedding(torch.arange(T, device=device)) # (T,n_embd)
        logits = self.lm_head(tok_emb + pos_emb)

        B,T,C = logits.shape
        if targets is not None:
            loss = F.cross_entropy(logits.view(B*T,vocab_size), targets.view(B*T))
        else:
            loss = None
        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx: B,T
        B,T = idx.shape
        idx_chopped = idx[:, -block_size:]

        for _ in range(max_new_tokens):
            logits,_ = self(idx_chopped) # (B,T,C)
            logits = logits[:, -1, :] # (B,C)
            probs = F.softmax(logits, dim=-1) # (B,1)
            new_idx = torch.multinomial(probs, num_samples=1) # (B,1)
            idx = torch.cat((idx, new_idx), dim=1) # (B,T+1)
        return idx


block_size=8
batch_size=4
n_embd = 16

model = GPT(n_embd)
print(f'{sum([p.numel() for p in model.parameters()])} params')
model = model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-3)

for iter in range(1000):
    xb,yb = get_batch('train', block_size=block_size, batch_size=batch_size)
    xb,yb = xb.to(device), yb.to(device)
    logits, loss = model(xb, yb)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    if iter % 100 == 0:
        out = estimate_loss()
        print(f'iter: {iter} | train loss = {out['train']:.4f} | test loss = {out['test']:.4f}')

2273 params
iter: 0 | train loss = 4.6241 | test loss = 4.6222
iter: 100 | train loss = 3.2371 | test loss = 3.2595
iter: 200 | train loss = 2.9886 | test loss = 2.9884
iter: 300 | train loss = 2.8827 | test loss = 2.8730
iter: 400 | train loss = 2.7515 | test loss = 2.7698
iter: 500 | train loss = 2.7015 | test loss = 2.7233
iter: 600 | train loss = 2.6965 | test loss = 2.6799
iter: 700 | train loss = 2.6509 | test loss = 2.6503
iter: 800 | train loss = 2.6213 | test loss = 2.6359
iter: 900 | train loss = 2.6014 | test loss = 2.6268


In [29]:
idx = torch.zeros((1,1), dtype=torch.long, device=device)
idx = model.generate(idx, max_new_tokens=100)

print(decode(idx[0].tolist()))


HPWLAHPYD
p
IMIAPDGAA IF
S
bTGDTDTAMDYTI
TAkFT
NAI-hDN
L:

OTAAGSAIASO
 N
FWBAAIFADAIFAIBTT
!'AcC
AA


## Self-attention tricks

### 1. Basic idea
- an element in a batch has to become `block_size` distinct examples
- incrementaly the examples will **aggregate** information from all previously occuring tokens

In [30]:
B,T,C = 2,4,2

x = torch.randn((B,T,C))
x.shape

torch.Size([2, 4, 2])

In [31]:
x

tensor([[[-0.5296, -0.0165],
         [-1.1309, -0.1778],
         [-0.5091, -2.4500],
         [-1.2056,  0.2115]],

        [[ 1.7162, -0.5178],
         [-1.6298, -1.5357],
         [ 0.1144,  1.0883],
         [ 0.5618, -0.3086]]])

In [32]:
xbow = torch.zeros_like(x)
for b in range(B):
    for t in range(T):
        prev = x[b, :t+1]
        xbow[b, t] = prev.mean(dim=0)
xbow

tensor([[[-0.5296, -0.0165],
         [-0.8303, -0.0972],
         [-0.7232, -0.8814],
         [-0.8438, -0.6082]],

        [[ 1.7162, -0.5178],
         [ 0.0432, -1.0267],
         [ 0.0669, -0.3217],
         [ 0.1906, -0.3184]]])

### 2. Matrix multiply using a triangular matrix

In [33]:
a = torch.tril(torch.ones((T,T)))
a = a / a.sum(dim=1, keepdim=True)
a

tensor([[1.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500]])

In [34]:
xbow2 = a @ x # (t,t) @ (b,t,c) -> (_,t,t) @ (b,t,c) -> (b,t,c)
xbow

tensor([[[-0.5296, -0.0165],
         [-0.8303, -0.0972],
         [-0.7232, -0.8814],
         [-0.8438, -0.6082]],

        [[ 1.7162, -0.5178],
         [ 0.0432, -1.0267],
         [ 0.0669, -0.3217],
         [ 0.1906, -0.3184]]])

In [35]:
torch.allclose(xbow, xbow2)

True

### 3. Using softmax

In [36]:
tril = torch.tril(torch.ones((T,T)))

wei = torch.zeros((T,T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)

xbow3 = wei @ x # (t,t) @ (b,t,c) -> (b,t,c)

In [37]:
torch.allclose(xbow2, xbow3)

True

### 4. self-attention

In [38]:
head_size = 16

query = nn.Linear(C, head_size)
key = nn.Linear(C, head_size)
value = nn.Linear(C, head_size)

q = query(x) # (b,t,head_size)
k = key(x) # (b,t,head_size)
v = value(x) # (b,t,head_size)

wei = (q @ k.transpose(-1,-2)) / head_size ** 0.5 # (b,t,head_size) @ (b,head_size,t) -> (b,t,t)

tril = torch.tril(torch.ones((T,T)))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)

out = wei @ v # (b,t,t) @ (b,t,head_size) -> (b,t,head_size)
out.shape

torch.Size([2, 4, 16])

In [39]:
wei

tensor([[[1.0000, 0.0000, 0.0000, 0.0000],
         [0.4843, 0.5157, 0.0000, 0.0000],
         [0.3075, 0.5056, 0.1868, 0.0000],
         [0.2823, 0.2793, 0.1133, 0.3251]],

        [[1.0000, 0.0000, 0.0000, 0.0000],
         [0.1683, 0.8317, 0.0000, 0.0000],
         [0.4849, 0.2289, 0.2862, 0.0000],
         [0.1907, 0.4075, 0.1803, 0.2215]]], grad_fn=<SoftmaxBackward0>)

## Plugging "self-attention" in our model

In [40]:
block_size=8
batch_size=32
n_embd = 32
# head_size = 16

class Head(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

    def forward(self, x):
        B,T,C = x.shape
        k = self.key(x) # (b,t,head_size)
        q = self.query(x) # (b,t,head_size)
        v = self.value(x) # (b,t,head_size)

        wei = (q @ k.transpose(-1,-2)) / head_size ** 0.5 # (b,t,head_size) @ (b,head_size,t) -> (b,t,t)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1)

        out = wei @ v # (b,t,t) @ (b,t,head_size) -> (b,t,head_size)
        return out

class GPT(nn.Module):
    def __init__(self, n_embd):
        super().__init__()
        self.tok_embedding = nn.Embedding(vocab_size, n_embd)
        self.pos_embedding = nn.Embedding(block_size, n_embd)
        self.sa_head = Head(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B,T = idx.shape
        tok_emb = self.tok_embedding(idx) # (B,T,n_embd)
        pos_emb = self.pos_embedding(torch.arange(T, device=device)) # (T,n_embd)
        x = tok_emb + pos_emb
        x = self.sa_head(x)
        logits = self.lm_head(x)

        B,T,C = logits.shape
        if targets is not None:
            loss = F.cross_entropy(logits.view(B*T,vocab_size), targets.view(B*T))
        else:
            loss = None
        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx: B,T
        B,T = idx.shape
        idx_chopped = idx[:, -block_size:]

        for _ in range(max_new_tokens):
            logits,_ = self(idx_chopped) # (B,T,C)
            logits = logits[:, -1, :] # (B,C)
            probs = F.softmax(logits, dim=-1) # (B,1)
            new_idx = torch.multinomial(probs, num_samples=1) # (B,1)
            idx = torch.cat((idx, new_idx), dim=1) # (B,T+1)
        return idx

model = GPT(n_embd)
print(f'{sum([p.numel() for p in model.parameters()])} params')
model = model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-2)

max_iters = 5000
for iter in range(max_iters):
    xb,yb = get_batch('train', block_size=block_size, batch_size=batch_size)
    xb,yb = xb.to(device), yb.to(device)
    logits, loss = model(xb, yb)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    if iter % 1000 == 0:
        out = estimate_loss()
        print(f'iter: {iter} | train loss = {out['train']:.4f} | test loss = {out['test']:.4f}')

idx = torch.zeros((1,1), dtype=torch.long, device=device)
idx = model.generate(idx, max_new_tokens=100)

print(decode(idx[0].tolist()))

7553 params
iter: 0 | train loss = 4.1184 | test loss = 4.1204
iter: 1000 | train loss = 2.4777 | test loss = 2.4819
iter: 2000 | train loss = 2.4394 | test loss = 2.4265
iter: 3000 | train loss = 2.3896 | test loss = 2.4276
iter: 4000 | train loss = 2.4021 | test loss = 2.4174

NAAFT
TSKTIONTRFSIDTTIEUWLHAWLSAQAnTTICCLFAA'AILBTITgWALHREQGTASTPLNLMnFCWSBIIRBLTBGBCADBlB
TQHuCAiS


## Plug "multi-headed attention"

In [41]:
block_size=8
batch_size=32
n_embd = 32
# head_size = 16

class MultiHeadedAttention(nn.Module):
    def __init__(self, n_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(n_heads)])

    def forward(self, x):
        # return self.heads(x)
        return torch.cat([h(x) for h in self.heads], dim=-1)

class Head(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

    def forward(self, x):
        B,T,C = x.shape
        k = self.key(x) # (b,t,head_size)
        q = self.query(x) # (b,t,head_size)
        v = self.value(x) # (b,t,head_size)

        wei = (q @ k.transpose(-1,-2)) / head_size ** 0.5 # (b,t,head_size) @ (b,head_size,t) -> (b,t,t)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1)

        out = wei @ v # (b,t,t) @ (b,t,head_size) -> (b,t,head_size)
        return out

class GPT(nn.Module):
    def __init__(self, n_embd):
        super().__init__()
        self.tok_embedding = nn.Embedding(vocab_size, n_embd)
        self.pos_embedding = nn.Embedding(block_size, n_embd)
        self.sa_heads = MultiHeadedAttention(4, n_embd//4)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B,T = idx.shape
        tok_emb = self.tok_embedding(idx) # (B,T,n_embd)
        pos_emb = self.pos_embedding(torch.arange(T, device=device)) # (T,n_embd)
        x = tok_emb + pos_emb
        x = self.sa_heads(x)
        logits = self.lm_head(x)

        B,T,C = logits.shape
        if targets is not None:
            loss = F.cross_entropy(logits.view(B*T,vocab_size), targets.view(B*T))
        else:
            loss = None
        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx: B,T
        B,T = idx.shape
        idx_chopped = idx[:, -block_size:]

        for _ in range(max_new_tokens):
            logits,_ = self(idx_chopped) # (B,T,C)
            logits = logits[:, -1, :] # (B,C)
            probs = F.softmax(logits, dim=-1) # (B,1)
            new_idx = torch.multinomial(probs, num_samples=1) # (B,1)
            idx = torch.cat((idx, new_idx), dim=1) # (B,T+1)
        return idx

model = GPT(n_embd)
print(f'{sum([p.numel() for p in model.parameters()])} params')
model = model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-2)

max_iters = 5000
for iter in range(max_iters):
    xb,yb = get_batch('train', block_size=block_size, batch_size=batch_size)
    xb,yb = xb.to(device), yb.to(device)
    logits, loss = model(xb, yb)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    if iter % 1000 == 0:
        out = estimate_loss()
        print(f'iter: {iter} | train loss = {out['train']:.4f} | test loss = {out['test']:.4f}')

idx = torch.zeros((1,1), dtype=torch.long, device=device)
idx = model.generate(idx, max_new_tokens=100)

print(decode(idx[0].tolist()))

7553 params
iter: 0 | train loss = 3.9767 | test loss = 3.9892
iter: 1000 | train loss = 2.3291 | test loss = 2.3421
iter: 2000 | train loss = 2.2475 | test loss = 2.2983
iter: 3000 | train loss = 2.2000 | test loss = 2.2958
iter: 4000 | train loss = 2.1946 | test loss = 2.2668

IWMWSrSMUO

ICCTmc

JETEBWTIPIBS
IE
mTbOTEBYLBPGAmOHoVH
ABVBfTTtI

MNWIMBHBfFPBM
ABP
GHLIDAATRAHMmTH


## Plug in "feed forward network" to the model

In [42]:
block_size=8
batch_size=32
n_embd = 32
# head_size = 16

class FeedForward(nn.Module):
    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4*n_embd),
            nn.ReLU(),
            nn.Linear(4*n_embd, n_embd),
            nn.ReLU(),
        )
    
    def forward(self, x):
        return self.net(x)

class MultiHeadedAttention(nn.Module):
    def __init__(self, n_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(n_heads)])

    def forward(self, x):
        # return self.heads(x)
        return torch.cat([h(x) for h in self.heads], dim=-1)

class Head(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

    def forward(self, x):
        B,T,C = x.shape
        k = self.key(x) # (b,t,head_size)
        q = self.query(x) # (b,t,head_size)
        v = self.value(x) # (b,t,head_size)

        wei = (q @ k.transpose(-1,-2)) / head_size ** 0.5 # (b,t,head_size) @ (b,head_size,t) -> (b,t,t)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1)

        out = wei @ v # (b,t,t) @ (b,t,head_size) -> (b,t,head_size)
        return out

class GPT(nn.Module):
    def __init__(self, n_embd):
        super().__init__()
        self.tok_embedding = nn.Embedding(vocab_size, n_embd)
        self.pos_embedding = nn.Embedding(block_size, n_embd)
        self.sa_heads = MultiHeadedAttention(4, n_embd//4)
        self.ffwd = FeedForward(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B,T = idx.shape
        tok_emb = self.tok_embedding(idx) # (B,T,n_embd)
        pos_emb = self.pos_embedding(torch.arange(T, device=device)) # (T,n_embd)
        x = tok_emb + pos_emb # (b,t,n_embd)
        x = self.sa_heads(x) # (b,t,n_embd)
        x = self.ffwd(x) # (b,t,n_embd)
        logits = self.lm_head(x) # (b,t,vocab_size)

        B,T,C = logits.shape
        if targets is not None:
            loss = F.cross_entropy(logits.view(B*T,vocab_size), targets.view(B*T))
        else:
            loss = None
        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx: B,T
        B,T = idx.shape
        idx_chopped = idx[:, -block_size:]

        for _ in range(max_new_tokens):
            logits,_ = self(idx_chopped) # (B,T,C)
            logits = logits[:, -1, :] # (B,C)
            probs = F.softmax(logits, dim=-1) # (B,1)
            new_idx = torch.multinomial(probs, num_samples=1) # (B,1)
            idx = torch.cat((idx, new_idx), dim=1) # (B,T+1)
        return idx

model = GPT(n_embd)
print(f'{sum([p.numel() for p in model.parameters()])} params')
model = model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-2)

max_iters = 5000
for iter in range(max_iters):
    xb,yb = get_batch('train', block_size=block_size, batch_size=batch_size)
    xb,yb = xb.to(device), yb.to(device)
    logits, loss = model(xb, yb)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    if iter % 1000 == 0:
        out = estimate_loss()
        print(f'iter: {iter} | train loss = {out['train']:.4f} | test loss = {out['test']:.4f}')

idx = torch.zeros((1,1), dtype=torch.long, device=device)
idx = model.generate(idx, max_new_tokens=100)

print(decode(idx[0].tolist()))

15905 params
iter: 0 | train loss = 4.1050 | test loss = 4.1043
iter: 1000 | train loss = 2.2440 | test loss = 2.2881
iter: 2000 | train loss = 2.1746 | test loss = 2.2144
iter: 3000 | train loss = 2.1351 | test loss = 2.2291
iter: 4000 | train loss = 2.1139 | test loss = 2.2423

FEYTSpTTECrWHIADJYNDDWMDTAIESYTWSSTTSG
-UAAQs NwNNFTOKQyFABeTTMeNQEEITAUyBGMj,HKVFhiOWrHFe.F IJASWyY


## Add MHA & FFWD to `Block`; and  plug in "Residual connections" within blocks

In [None]:
import torch
import math
import torch.nn as nn
from torch.nn import functional as F

block_size=256
batch_size=64
n_embd = 384
n_heads = 6
n_layer = 12
dropout = 0.2
eval_iters = 200
eval_interval = 500
device = "cuda" if torch.cuda.is_available() else "cpu"

def get_batch(split):
    data = train if split == 'train' else test
    ix = torch.randint(high=len(data)-block_size, size=(batch_size,))
    x = torch.stack([data[i   : i   + block_size] for i in ix])
    y = torch.stack([data[i+1 : i+1 + block_size] for i in ix])
    return x.to(device), y.to(device)

@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'test']:
        losses = torch.zeros(eval_iters, device=device)
        for k in range(eval_iters):
            x,y = get_batch(split)
            _,loss = model(x,y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

class Block(nn.Module):
    def __init__(self, n_heads, n_embd):
        super().__init__()
        head_size = n_embd // n_heads
        self.heads = MultiHeadedAttention(n_heads, head_size)
        self.ffwd = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.heads(self.ln1(x)) # (b,t,n_embd)
        x = x + self.ffwd(self.ln2(x)) # (b,t,n_embd)
        return x

class FeedForward(nn.Module):
    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4*n_embd),
            nn.ReLU(),
            nn.Linear(4*n_embd, n_embd),
            nn.Dropout(dropout)
        )

    def forward(self, x):
        return self.net(x)

class MultiHeadedAttention(nn.Module):
    def __init__(self, n_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(n_heads)])
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # return self.heads(x)
        out = torch.cat([h(x) for h in self.heads], dim=-1) # (b,t,n_embd)
        out = self.dropout(self.proj(out))
        return out  # (b,t,n_embd)

class Head(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.head_size = head_size
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B,T,C = x.shape
        k = self.key(x) # (b,t,head_size)
        q = self.query(x) # (b,t,head_size)
        v = self.value(x) # (b,t,head_size)

        # wei = (q @ k.transpose(-2,-1)) * self.head_size ** -0.5 # (b,t,head_size) @ (b,head_size,t) -> (b,t,t)
        wei = torch.matmul(q, k.transpose(-2,-1)) / math.sqrt(self.head_size) # (b,t,t)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)
        out = wei @ v # (b,t,t) @ (b,t,head_size) -> (b,t,head_size)
        return out

class GPT(nn.Module):
    def __init__(self):
        super().__init__()
        self.tok_embedding = nn.Embedding(vocab_size, n_embd)
        self.pos_embedding = nn.Embedding(block_size, n_embd)
        blocks = [Block(n_heads, n_embd) for _ in range(n_layer)]
        self.blocks = nn.Sequential(*blocks)
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size, bias=False)

    def forward(self, idx, targets=None):
        B,T = idx.shape
        tok_emb = self.tok_embedding(idx) # (B,T,n_embd)
        pos_emb = self.pos_embedding(torch.arange(T, device=idx.device)) # (T,n_embd)
        x = tok_emb + pos_emb # (b,t,n_embd)
        x = self.blocks(x)
        logits = self.lm_head(self.ln_f(x)) # (b,t,vocab_size)x

        B,T,C = logits.shape
        if targets is not None:
            loss = F.cross_entropy(logits.view(B*T,vocab_size), targets.view(B*T))
        else:
            loss = None
        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx: B,T
        B,T = idx.shape
        self.eval()
        for _ in range(max_new_tokens):
            idx_chopped = idx[:, -block_size:]
            logits,_ = self(idx_chopped) # (B,T,C)
            logits = logits[:, -1, :] # (B,C)
            probs = F.softmax(logits, dim=-1) # (B,1)
            new_idx = torch.multinomial(probs, num_samples=1) # (B,1)
            idx = torch.cat((idx, new_idx), dim=1) # (B,T+1)
        self.train()
        return idx

model = GPT()
print(f'{sum([p.numel() for p in model.parameters()])} params')
model = model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)

max_iters = 5000
for iter in range(max_iters):
    xb,yb = get_batch('train')
    # xb,yb = xb.to(device), yb.to(device)
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    if iter % eval_interval == 0:
        out = estimate_loss()
        print(f'iter: {iter} | train loss = {out['train']:.4f} | test loss = {out['test']:.4f}')

idx = torch.zeros((1,1), dtype=torch.long, device=device)
idx = model.generate(idx, max_new_tokens=500)
print(decode(idx[0].tolist()))

80128 params
iter: 0 | train loss = 4.2674 | test loss = 4.2777
iter: 500 | train loss = 2.6528 | test loss = 2.6731
iter: 1000 | train loss = 2.4729 | test loss = 2.4817
iter: 1500 | train loss = 2.3998 | test loss = 2.4054
iter: 2000 | train loss = 2.3446 | test loss = 2.3535
iter: 2500 | train loss = 2.2893 | test loss = 2.3150
iter: 3000 | train loss = 2.2585 | test loss = 2.2842
iter: 3500 | train loss = 2.2332 | test loss = 2.2431
iter: 4000 | train loss = 2.2107 | test loss = 2.2248
iter: 4500 | train loss = 2.1992 | test loss = 2.2027
tensor([ 0,  1, 47, 52, 44, 59, 53, 56,  1, 47, 57,  1, 57, 43, 39, 50, 50,  1,
        51, 43,  8,  1, 35, 53, 56, 42,  1, 51, 39, 41, 43,  1, 41, 46, 39, 58,
         8,  0,  0, 16, 33, 23, 21, 38, 13, 16, 53, 50, 50,  1, 47, 52,  1, 53,
        44,  1, 58, 46, 39, 57,  1, 58, 53,  1, 39, 56, 42,  1, 63, 53, 59, 56,
         1, 19, 53, 44,  1, 39, 57, 61, 43, 52,  1, 45, 53,  1, 46, 43,  1, 56,
        43,  5, 42,  1, 56, 43, 44, 53, 52,  1, 58]