In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
# Read the whole training corpus into memory. The context manager closes the
# file handle deterministically, and the explicit encoding avoids depending on
# the platform's default codec.
with open('../gpt/input.txt', 'r', encoding='utf-8') as f:
    shakespeare = f.read()

In [3]:
# Peek at the first 200 characters of the corpus.
print(shakespeare[:200])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you


In [4]:
# Character-level vocabulary: every distinct character in the corpus, sorted so
# the character -> index mapping is deterministic across runs.
# Iterating a str already yields its characters, so no join()/list() is needed.
vocab = sorted(set(shakespeare))
# print(''.join(vocab))
vocab_size = len(vocab)
vocab_size

65

In [5]:
# Lookup tables between characters and their integer ids (position in `vocab`).
stoi = {ch: i for i, ch in enumerate(vocab)}
itos = {i: ch for ch, i in stoi.items()}


def encode(s):
    """Map a string (any iterable of characters) to a list of integer ids.

    Raises:
        KeyError: if a character is not a key of `stoi`.
    """
    return [stoi[ch] for ch in s]
# encode('Hello')


def decode(l):
    """Map an iterable of integer ids back to the string they encode.

    Raises:
        KeyError: if an id is not a key of `itos`.
    """
    return ''.join(itos[i] for i in l)
# decode(encode('Hello'))

In [6]:
# Encode the full corpus as a 1-D tensor of token ids (one id per character).
data = torch.tensor(encode(shakespeare))
# Sanity check: encode() emits one id per character, so the lengths must match.
len(data) == len(shakespeare)

True

In [7]:
# Confirm the encoded corpus is a torch tensor.
type(data)

torch.Tensor

In [8]:
# Show the first 10 token ids next to the 10 characters they encode.
data[:10], shakespeare[:10]

(tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47]), 'First Citi')

In [9]:
# Prepare inputs and targets for next-token prediction.

block_size = 8 # context length (time dimension)
# Every prefix of a window is a training example for the token that follows it:
# [18]              --> [47]
# [18, 47]          --> [56]
# [18, 47, 56]      --> [57]
# [18, 47, 56, 57]  --> [58]

# y is x shifted by one position, so y[t] is the token that follows x[:t+1].
x, y = data[:block_size], data[1:block_size+1]
print(x, y)
print('-'*20)
for t in range(block_size):
    inp = x[:t+1]
    out = y[t]
    print(f'{inp} --> {out}')

tensor([18, 47, 56, 57, 58,  1, 15, 47]) tensor([47, 56, 57, 58,  1, 15, 47, 58])
--------------------
tensor([18]) --> 47
tensor([18, 47]) --> 56
tensor([18, 47, 56]) --> 57
tensor([18, 47, 56, 57]) --> 58
tensor([18, 47, 56, 57, 58]) --> 1
tensor([18, 47, 56, 57, 58,  1]) --> 15
tensor([18, 47, 56, 57, 58,  1, 15]) --> 47
tensor([18, 47, 56, 57, 58,  1, 15, 47]) --> 58


In [10]:
batch_size = 4

# Sample random window start positions. The exclusive upper bound is
# len(data) - block_size so that both the input window [i, i+block_size) and
# the one-step-shifted target window [i+1, i+1+block_size) lie fully inside
# `data`. With len(data) as the bound, a start near the end yields shorter
# slices and torch.stack fails on the ragged list.
ix = torch.randint(0, len(data) - block_size, (batch_size,))
x = torch.stack([data[i:i+block_size] for i in ix], dim=0)
y = torch.stack([data[i+1 : i+1 + block_size] for i in ix], dim=0)
print(x, '\n' ,y)

tensor([[43, 12,  0,  0, 23, 21, 26, 19],
        [43,  1, 39, 57,  1, 39,  1, 50],
        [47, 52,  6,  1, 59, 54, 11,  1],
        [ 6,  0, 13, 52, 42,  1, 40, 39]]) 
 tensor([[12,  0,  0, 23, 21, 26, 19,  1],
        [ 1, 39, 57,  1, 39,  1, 50, 39],
        [52,  6,  1, 59, 54, 11,  1, 63],
        [ 0, 13, 52, 42,  1, 40, 39, 49]])


In [11]:
# Contiguous split (no shuffling): the first 90% of the token stream is used
# for training, the remaining tail for validation.
n = int(len(data)*0.9)
train = data[:n]
val = data[n:]

In [12]:
def get_batch(split, bs):
    """Sample a random batch of (input, target) windows from one data split.

    Args:
        split: 'train' selects the training tensor; any other value selects
            the validation tensor.
        bs: number of windows (rows) in the batch.

    Returns:
        (xb, yb): two (bs, block_size) tensors. yb is xb shifted one position
        to the right, so yb[b, t] is the token that follows xb[b, :t+1].
    """
    d = train if split=='train' else val
    # Honour the `bs` argument instead of the notebook-level `batch_size`, so
    # the caller actually controls the batch size. The exclusive upper bound
    # keeps the shifted target window inside `d`.
    ix = torch.randint(0, len(d) - block_size, (bs,))
    xb = torch.stack([d[i   : i +   block_size] for i in ix], dim=0)
    yb = torch.stack([d[i+1 : i+1 + block_size] for i in ix], dim=0)
    return xb, yb

# Draw one training batch and print the input and target tensors.
xb, yb = get_batch('train', bs=batch_size)
print(xb, '\n', yb)
print()

# For every row and every time step, show the context seen so far and the
# token the model is expected to predict next.
for b in range(batch_size):
    for t in range(block_size):
        print(f'{xb[b, :t+1]} --> {yb[b, t]}')

tensor([[52,  1, 54, 56, 47, 57, 53, 52],
        [43, 39, 42, 11,  1, 63, 53, 59],
        [59, 50, 42,  1, 58, 46, 47, 57],
        [12,  0,  0, 16, 33, 23, 17,  1]]) 
 tensor([[ 1, 54, 56, 47, 57, 53, 52,  6],
        [39, 42, 11,  1, 63, 53, 59, 56],
        [50, 42,  1, 58, 46, 47, 57,  1],
        [ 0,  0, 16, 33, 23, 17,  1, 27]])

tensor([52]) --> 1
tensor([52,  1]) --> 54
tensor([52,  1, 54]) --> 56
tensor([52,  1, 54, 56]) --> 47
tensor([52,  1, 54, 56, 47]) --> 57
tensor([52,  1, 54, 56, 47, 57]) --> 53
tensor([52,  1, 54, 56, 47, 57, 53]) --> 52
tensor([52,  1, 54, 56, 47, 57, 53, 52]) --> 6
tensor([43]) --> 39
tensor([43, 39]) --> 42
tensor([43, 39, 42]) --> 11
tensor([43, 39, 42, 11]) --> 1
tensor([43, 39, 42, 11,  1]) --> 63
tensor([43, 39, 42, 11,  1, 63]) --> 53
tensor([43, 39, 42, 11,  1, 63, 53]) --> 59
tensor([43, 39, 42, 11,  1, 63, 53, 59]) --> 56
tensor([59]) --> 50
tensor([59, 50]) --> 42
tensor([59, 50, 42]) --> 1
tensor([59, 50, 42,  1]) --> 58
tensor([59, 50,

In [13]:
class BigramLM(nn.Module):
    """Bigram language model: next-token logits are a per-token table lookup.

    Each token id indexes a row of a (vocab_size, vocab_size) embedding table,
    and that row is used directly as the logits over the next token.
    """

    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, x, targets = None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            x: (b, t) integer tensor of token ids.
            targets: optional (b, t) integer tensor of next-token ids.

        Returns:
            (logits, loss): logits has shape (b, t, vocab_size); loss is the
            cross-entropy over all b*t positions, or None when `targets` is
            None.
        """
        logits = self.token_embedding_table(x) # (b, t, vocab_size)

        if targets is None:
            return logits, None

        # Flatten batch and time so every position is one classification
        # example: logits (b, t, c) -> (b*t, c), targets (b, t) -> (b*t).
        # Equivalent alternative: F.cross_entropy(logits.transpose(-1,-2), targets)
        B,T,C = logits.shape
        loss = F.cross_entropy(logits.view(B*T,C), targets.view(B*T))
        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens=100):
        """Autoregressively sample `max_new_tokens` tokens after `idx`.

        Runs with autograd disabled: sampling never needs gradients, so no
        graph is recorded for the repeated forward passes.

        Args:
            idx: (b, t) integer tensor holding the starting context.
            max_new_tokens: number of tokens appended to every row.

        Returns:
            (b, t + max_new_tokens) integer tensor: the context followed by
            the sampled tokens.
        """
        for _ in range(max_new_tokens):
            logits, _ = self(idx) # (b, t, vocab_size)
            logits = logits[:, -1, :] # last time step only: (b, vocab_size)
            probs = F.softmax(logits, dim=-1) # (b, vocab_size)
            idx_new = torch.multinomial(probs, num_samples=1) # (b, 1)
            idx = torch.cat((idx, idx_new), dim=-1) # (b, t+1)
        return idx

# Instantiate the untrained model and run one forward pass on the sample batch.
model = BigramLM()
logits, loss = model(xb, yb)
# loss.item()

# Sample from the untrained model, starting from a single token with id 0,
# and decode the generated ids back to text.
idx = torch.zeros((1,1), dtype=torch.long)
print(decode(model.generate(idx)[0].tolist()))


fLHubkquuedbo'heYWmxWefLguwcIWHXHEM;Ej!xdmWpmRU 3'UDp'CGfSlKEvGUuPNuTtJKxlT3'dN;GR.OuVpEyTArOarqICq:


In [14]:
# AdamW over all model parameters with a fixed learning rate of 1e-3.
optimizer = torch.optim.AdamW(params=model.parameters(), lr=1e-3)

In [15]:
max_iters = 10000
batch_size = 32

# Plain training loop: one random mini-batch and one optimizer update per
# iteration.
for _ in range(max_iters):
    # get a batch
    xb, yb = get_batch('train', bs=batch_size)

    # forward pass
    logits, loss = model(xb, yb)

    # set grad to None and backward pass
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
# Loss of the final mini-batch only (not an average over the dataset).
print(f'{loss.item() = }')

# Sample from the trained model, starting from a single token with id 0.
idx = torch.zeros((1,1), dtype=torch.long)
print(decode(model.generate(idx)[0].tolist()))

loss.item() = 2.4846417903900146

TCFOMErey sthdishee so w d adoustrgameres my therat hy Le wad t ngfe hyowangoupo:
BRCEYCxs.
ANus; ur


# Adding an intermediate embedding dimension `n_embd`

In [16]:
eval_iters = 250

@torch.no_grad()
def estimate_loss():
    """Average the model's loss over `eval_iters` random batches per split.

    The global `model` is put in eval mode while measuring and switched back
    to train mode before returning.

    Returns:
        dict with keys 'train' and 'val', each a 0-d tensor holding the mean
        of the sampled batch losses for that split.
    """
    model.eval()
    averages = {}
    for split in ('train', 'val'):
        per_batch = torch.zeros(eval_iters)
        for i in range(eval_iters):
            # Only the loss (second return value) of the forward pass is needed.
            _, batch_loss = model(*get_batch(split, bs=batch_size))
            per_batch[i] = batch_loss.item()
        averages[split] = per_batch.mean()
    model.train()
    return averages

In [17]:
n_embd = 32

class BigramLM(nn.Module):
    """Bigram language model with an intermediate embedding of size `n_embd`.

    Token ids are embedded into `n_embd` dimensions and a linear head maps
    each embedding to logits over the vocabulary.
    """

    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, x, targets = None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            x: (b, t) integer tensor of token ids.
            targets: optional (b, t) integer tensor of next-token ids.

        Returns:
            (logits, loss): logits has shape (b, t, vocab_size); loss is the
            cross-entropy over all b*t positions, or None when `targets` is
            None.
        """
        tok_emb = self.token_embedding_table(x) # (b, t, n_embd)
        logits = self.lm_head(tok_emb) # (b, t, vocab_size)

        if targets is None:
            return logits, None

        # Flatten batch and time so every position is one classification
        # example: logits (b, t, c) -> (b*t, c), targets (b, t) -> (b*t).
        # Equivalent alternative: F.cross_entropy(logits.transpose(-1,-2), targets)
        B,T,C = logits.shape
        loss = F.cross_entropy(logits.view(B*T,C), targets.view(B*T))
        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens=100):
        """Autoregressively sample `max_new_tokens` tokens after `idx`.

        Runs with autograd disabled: sampling never needs gradients, so no
        graph is recorded for the repeated forward passes.

        Args:
            idx: (b, t) integer tensor holding the starting context.
            max_new_tokens: number of tokens appended to every row.

        Returns:
            (b, t + max_new_tokens) integer tensor: the context followed by
            the sampled tokens.
        """
        for _ in range(max_new_tokens):
            logits, _ = self(idx) # (b, t, vocab_size)
            logits = logits[:, -1, :] # last time step only: (b, vocab_size)
            probs = F.softmax(logits, dim=-1) # (b, vocab_size)
            idx_new = torch.multinomial(probs, num_samples=1) # (b, 1)
            idx = torch.cat((idx, idx_new), dim=-1) # (b, t+1)
        return idx

# Fresh model and optimizer; this rebinds the notebook-level `model` and
# `optimizer` names used by the training and evaluation code.
model = BigramLM()
optimizer = torch.optim.AdamW(params=model.parameters(), lr=1e-3)

In [18]:
max_iters = 10000
batch_size = 32
eval_interval = 1000

# The loop variable is `step` rather than `iter` so the builtin iter() is not
# shadowed for the rest of the notebook.
for step in range(max_iters):
    # get a batch
    xb, yb = get_batch('train', bs=batch_size)

    # forward pass
    logits, loss = model(xb, yb)

    # Every `eval_interval` steps, report the averaged train/validation loss.
    if step % eval_interval == 0:
        out = estimate_loss()
        # Double-quoted f-string: reusing the outer quote character inside a
        # replacement field is a SyntaxError before Python 3.12.
        print(f"Iteration {step + 1} : Train Loss = {out['train']:.4f}, Validation Loss = {out['val']:.4f}")

    # set grad to None and backward pass
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

Iteration 1 : Train Loss = 4.3357, Validation Loss = 4.3406
Iteration 1001 : Train Loss = 2.5704, Validation Loss = 2.5716
Iteration 2001 : Train Loss = 2.5110, Validation Loss = 2.5100
Iteration 3001 : Train Loss = 2.4873, Validation Loss = 2.5049
Iteration 4001 : Train Loss = 2.4798, Validation Loss = 2.4967
Iteration 5001 : Train Loss = 2.4764, Validation Loss = 2.4949
Iteration 6001 : Train Loss = 2.4661, Validation Loss = 2.4998
Iteration 7001 : Train Loss = 2.4687, Validation Loss = 2.4924
Iteration 8001 : Train Loss = 2.4629, Validation Loss = 2.4920
Iteration 9001 : Train Loss = 2.4697, Validation Loss = 2.4860


In [19]:
# Sample from the trained model, starting from a single token with id 0,
# and decode the generated ids back to text.
idx = torch.zeros((1,1), dtype=torch.long)
print(decode(model.generate(idx)[0].tolist()))


int wind.
bu oure paner

Tincoor, be 'd ay, ancs m G pomwovel ape st licholath k. wnt t t.

MIVENThe


# Self-attention tricks

In [20]:
# Toy input for the averaging experiments: batch, time, channels.
# No seed is set, so the random values differ between runs.
B,T,C = 4,8,2
x = torch.randn((B,T,C))
x.shape

torch.Size([4, 8, 2])

In [21]:
# First sample of the batch: one row per time step, one column per channel.
x[0] # (t,c)

tensor([[ 1.7236, -1.2030],
        [-0.1934,  0.9959],
        [-1.1941, -0.2165],
        [-0.6883,  0.0201],
        [-0.3822, -0.9448],
        [-1.0139,  0.8695],
        [ 0.8555, -1.4451],
        [ 1.7656, -1.3940]])

In [43]:
# method 1: explicit loops.
# xbow[b, t] is the mean of x[b, 0..t], i.e. each position averages itself and
# all earlier positions of the same sample.
xbow = torch.zeros((B,T,C))
for b in range(B):
    for t in range(T):
        prev = x[b, :t+1] # (t+1, c): positions 0..t
        xbow[b,t] = prev.mean(dim=0) # (c,)
xbow.shape

torch.Size([4, 8, 2])

In [44]:
# e.g. xbow[0,1] == (x[0,0] + x[0,1]) / 2
xbow[0]

tensor([[ 1.7236, -1.2030],
        [ 0.7651, -0.1036],
        [ 0.1120, -0.1412],
        [-0.0880, -0.1009],
        [-0.1469, -0.2697],
        [-0.2914, -0.0798],
        [-0.1275, -0.2749],
        [ 0.1091, -0.4148]])

In [45]:
# method 2: the same running mean expressed as a matrix of averaging weights.

# Lower-triangular ones, then divide each row by its sum (in place) so that
# row t holds the weight 1/(t+1) on columns 0..t and 0 elsewhere.
wei = torch.tril(torch.ones(T,T))
wei /= wei.sum(dim=1, keepdim=True)
wei

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])

In [46]:
# Apply the averaging weights with one matmul; the (t,t) matrix is broadcast
# over the batch dimension of x.
xbow2 = wei @ x # (t,t) @ (b,t,c) --> (b,t,c)
xbow2[0]

tensor([[ 1.7236, -1.2030],
        [ 0.7651, -0.1036],
        [ 0.1120, -0.1412],
        [-0.0880, -0.1009],
        [-0.1469, -0.2697],
        [-0.2914, -0.0798],
        [-0.1275, -0.2749],
        [ 0.1091, -0.4148]])

In [47]:
# The matmul version must match the explicit-loop version.
torch.allclose(xbow, xbow2)

True

In [51]:
# method 3: build the same weights with a mask and a softmax.

# Lower-triangular mask: entry (i, j) is 1 when j <= i, else 0.
tril = torch.tril(torch.ones(T,T))
tril

tensor([[1., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1.]])

In [54]:
# Start from all-zero scores, set the positions where the mask is 0 to -inf
# (in place), then softmax each row: the -inf entries get weight 0 and the
# remaining equal scores share the weight uniformly.
wei = torch.zeros(T,T)
wei.masked_fill_(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)
wei

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])

In [55]:
# Apply the softmax-derived weights: (t,t) @ (b,t,c) --> (b,t,c)
xbow3 = wei @ x

In [56]:
# The masked-softmax version must match the explicit-loop version.
torch.allclose(xbow, xbow3)

True

In [69]:
# self attention - 1 Head

# NOTE: despite its name, `n_head` is used here as the output width of each
# projection (the size of the single head), not as a number of heads.
n_head = 16

# Three independent linear projections of the same input x.
query = nn.Linear(C, n_head)
key = nn.Linear(C, n_head)
value = nn.Linear(C, n_head)
q = query(x) # (b,t,c) @ (c,n_head) --> (b,t,n_head)
# q[0][0]: (n_head,) --> query for 1st token in 1st sample in the batch

k = key(x) # (b,t,c) @ (c,n_head) --> (b,t,n_head)

# Raw affinities: dot product of every query with every key of the same sample.
# NOTE(review): no 1/sqrt(n_head) scaling is applied before the softmax.
wei = q @ k.transpose(-1,-2) # (b,t,n_head) @ (b,n_head,t) --> (b,t,t)
# wei.shape

# Set positions where `tril` is 0 to -inf (in place) so they get zero weight,
# then normalise each row into weights that sum to 1.
wei = wei.masked_fill_(tril==0, float('-inf'))
wei = F.softmax(wei, dim=-1)

# Output: for each position, the weighted sum of the value vectors.
v = value(x) # (b,t,n_head)
out = wei @ v # (b,t,t) @ (b,t,n_head) --> (b,t,n_head)
out.shape

torch.Size([4, 8, 16])

In [None]:
# wei[0]

tensor([[-1.2169,  2.7673,  4.0862,  3.2983,  2.4641,  4.1285,  0.1882, -1.3504],
        [ 1.3089, -0.3266, -0.5393, -0.3296,  0.0983, -0.7367,  0.9044,  1.3835],
        [ 0.5735, -0.9772, -4.1071, -2.8949, -3.2507, -2.6907, -1.3457,  0.4672],
        [ 0.6178, -0.4946, -2.4876, -1.7053, -1.8949, -1.6097, -0.6265,  0.5569],
        [-0.3196,  0.2502, -2.0488, -1.3006, -2.0667, -0.6804, -1.4232, -0.4891],
        [ 1.4609, -1.2147, -3.0409, -2.1863, -1.8707, -2.5543,  0.0240,  1.4937],
        [-1.1503,  1.8720,  1.3867,  1.3032,  0.2842,  2.2325, -0.8635, -1.3414],
        [-1.3966,  2.8935,  4.1179,  3.3373,  2.3882,  4.2707,  0.0138, -1.5521]],
       grad_fn=<SelectBackward0>)

In [None]:
# wei = wei.masked_fill_(tril==0, float('-inf'))
# wei = F.softmax(wei, dim=-1)
# wei[0]

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.8369, 0.1631, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.8187, 0.1737, 0.0076, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.6796, 0.2234, 0.0304, 0.0666, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2862, 0.5059, 0.0508, 0.1073, 0.0499, 0.0000, 0.0000, 0.0000],
        [0.8622, 0.0594, 0.0096, 0.0225, 0.0308, 0.0156, 0.0000, 0.0000],
        [0.0124, 0.2542, 0.1565, 0.1439, 0.0520, 0.3646, 0.0165, 0.0000],
        [0.0013, 0.0943, 0.3207, 0.1469, 0.0569, 0.3736, 0.0053, 0.0011]],
       grad_fn=<SelectBackward0>)