In [1]:
import torch
from torch import nn
from torch.nn import functional as F
import requests
from pathlib import Path

In [2]:
if not Path('input.txt').is_file():
    print('Downloading \'input.txt\'...')
    with open('input.txt', 'wb') as f:
        request = requests.get('https://github.com/karpathy/ng-video-lecture/raw/master/input.txt')
        f.write(request.content)
else:
    print('\'input.txt\' already exists, skipping download')

with open('input.txt', 'r') as f:
    text = f.read()

print(f'The length of text is {len(text)}')

'input.txt' already exists, skipping download
The length of text is 1115394


In [3]:
print(text[:500])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor


In [4]:
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(vocab_size)

65


In [5]:
# We can use tiktoken tokenizer which was used for gpt2 or sentencepiece which was developed by google but we're keeping it simple for now

stoi = {k:v for v,k in enumerate(chars)}
itos = {k:v for k,v in enumerate(chars)}
encode = lambda word: [stoi[x] for x in word]
decode = lambda word: ''.join([itos[x] for x in word])

print(encode('Hi'))
print(decode(encode('Hi')))

[20, 47]
Hi


In [6]:
data = torch.tensor(encode(text), dtype=torch.int64)
print(data.shape, data.dtype)
print(data[:100])

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])


In [7]:
# Train and test split
n = int(0.9 * len(data))
train_data = data[:n]
test_data = data[n:]
print(f'Train split: {len(train_data)} | Test split: {len(test_data)}')

Train split: 1003854 | Test split: 111540


In [8]:
block_size = 8
train_data[:block_size + 1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [9]:
x = train_data[:block_size]
y = train_data[1:block_size+1]
for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f'When the input is {context}, the target is {target}')

When the input is tensor([18]), the target is 47
When the input is tensor([18, 47]), the target is 56
When the input is tensor([18, 47, 56]), the target is 57
When the input is tensor([18, 47, 56, 57]), the target is 58
When the input is tensor([18, 47, 56, 57, 58]), the target is 1
When the input is tensor([18, 47, 56, 57, 58,  1]), the target is 15
When the input is tensor([18, 47, 56, 57, 58,  1, 15]), the target is 47
When the input is tensor([18, 47, 56, 57, 58,  1, 15, 47]), the target is 58


In [10]:
torch.manual_seed(42)
batch_size = 4 # How many batches we would have
block_size = 8 # The maximum context length for prediction

def get_batch(split):
    # Generate small batch of data for X and y inputs
    data = train_data if split == 'train' else test_data
    ix = torch.randint(len(data) - block_size, [batch_size])
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y
get_batch('train')

(tensor([[57,  1, 46, 47, 57,  1, 50, 53],
         [ 1, 58, 46, 43, 56, 43,  1, 41],
         [17, 26, 15, 17, 10,  0, 32, 53],
         [57, 58,  6,  1, 61, 47, 58, 46]]),
 tensor([[ 1, 46, 47, 57,  1, 50, 53, 60],
         [58, 46, 43, 56, 43,  1, 41, 39],
         [26, 15, 17, 10,  0, 32, 53,  1],
         [58,  6,  1, 61, 47, 58, 46,  0]]))

In [11]:
xb, yb = get_batch('train')
print(f'Inputs: {xb.shape}')
print(f'Targets: {yb.shape}')

for b in range(batch_size):
    for t in range(block_size):
        context = xb[b, :t+1]
        targets = yb[b, t]
        print(f'When the input is {context.tolist()}, the target is {targets}')

Inputs: torch.Size([4, 8])
Targets: torch.Size([4, 8])
When the input is [6], the target is 0
When the input is [6, 0], the target is 14
When the input is [6, 0, 14], the target is 43
When the input is [6, 0, 14, 43], the target is 44
When the input is [6, 0, 14, 43, 44], the target is 53
When the input is [6, 0, 14, 43, 44, 53], the target is 56
When the input is [6, 0, 14, 43, 44, 53, 56], the target is 43
When the input is [6, 0, 14, 43, 44, 53, 56, 43], the target is 1
When the input is [39], the target is 1
When the input is [39, 1], the target is 42
When the input is [39, 1, 42], the target is 59
When the input is [39, 1, 42, 59], the target is 43
When the input is [39, 1, 42, 59, 43], the target is 1
When the input is [39, 1, 42, 59, 43, 1], the target is 39
When the input is [39, 1, 42, 59, 43, 1, 39], the target is 52
When the input is [39, 1, 42, 59, 43, 1, 39, 52], the target is 42
When the input is [47], the target is 41
When the input is [47, 41], the target is 43
When the

In [14]:
# Creating baseline model_0
torch.manual_seed(42)

class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, x: torch.tensor, targets: torch.tensor = None) -> torch.tensor:
        # print(x.shape)
        logits = self.token_embedding_table(x)
        if targets is None:
            loss = None
        else:
            # print(f'Logits shape before sizing down: {logits.shape}')
            # print(f'logits permute {logits.permute(0,2,1).shape}')
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            # print(f'Logits shape after sizing down: {logits.shape}')
            # print(f'Targets before view: {targets}')
            targets = targets.view(B*T)
            # print(f'Targets after view: {targets}')
            loss = F.cross_entropy(logits, targets)
            # print(loss)
        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            logits, loss = self(idx)
            # print(idx)
            # focus only on the last timestep
            # print('Logits before focus:\n', logits)
            logits = logits[:, -1, :] # becomes (B, C)
            # print('Logits after focus:\n', logits)
            # get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # print(probs)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # print(f'\nidx_next: {probs.argmax(dim=1)}')
            # print(f'\nidx_next: {idx_next}')
            # apply sampled index to the running index
            idx = torch.cat([idx, idx_next], dim=1) # (B, T+1)
        return idx
        
m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(f'Logits: {logits.shape}, Loss: {loss}')
print(decode(m.generate(torch.zeros(size=[1,1], dtype=torch.int64), max_new_tokens=10)[0].tolist()))

Logits: torch.Size([32, 65]), Loss: 4.837806701660156

uoiaF$z
M?


In [15]:
optimizer = torch.optim.Adam(params=m.parameters(), lr=1e-3)

In [16]:
batch_size = 32

for epoch in range(10000):
    m.train()
    xb, yb = get_batch('train')
    logits, loss = m(xb, yb)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    if (epoch+1) % 1000 == 0:
        print(f'Epoch: {epoch + 1} | Loss: {loss:.4f}')

Epoch: 1000 | Loss: 3.7676
Epoch: 2000 | Loss: 3.1697
Epoch: 3000 | Loss: 2.7759
Epoch: 4000 | Loss: 2.5619
Epoch: 5000 | Loss: 2.5612
Epoch: 6000 | Loss: 2.6210
Epoch: 7000 | Loss: 2.4096
Epoch: 8000 | Loss: 2.5250
Epoch: 9000 | Loss: 2.4746
Epoch: 10000 | Loss: 2.4609


In [17]:
m.generate(torch.zeros(size=[1,1], dtype=torch.int64), max_new_tokens=100)

tensor([[ 0, 35, 21,  1, 51, 53, 59,  6,  1, 57, 58,  1, 61, 39,  1, 58, 46, 58,
         46, 43,  1, 39, 63,  1, 32, 35, 47, 57,  1, 61,  1, 54, 56, 43, 63,  1,
         40, 43,  1, 50, 50, 50, 53, 53, 59, 50, 63,  7, 41, 53, 57, 43, 51, 39,
         58, 11,  0, 27, 10,  0, 42,  1, 61, 39, 58,  1, 51, 53, 51, 63,  1, 39,
         56,  1, 37, 43, 50, 39, 60, 43, 52, 43, 57, 57,  6,  1, 46, 39, 57, 46,
         43,  1, 53, 59, 57,  1, 58, 46, 39, 45, 56]])

In [18]:
print(decode(m.generate(torch.zeros(size=[1,1], dtype=torch.int64), max_new_tokens=100)[0].tolist()))



ait; l ICotherer w war ha yevelise 'tWhowe murfor add hiull y wharod ongozDI rthHicirds wavente, m 


In [75]:
# Self attention
B, T, C = 4, 8, 32 # Batch, Time, Channels
x = torch.randn(size=[B, T, C])
x.shape

torch.Size([4, 8, 32])

In [76]:
# Calculating the average of all tokens along with the previous tokens
# METHOD 1
xbow = torch.zeros(size=[B,T,C])
for b in range(B):
    for t in range(T):
        xprev = x[b, :t+1]
        xbow[b, t] = xprev.mean(dim=0)

In [77]:
x[0]

tensor([[-1.2248e+00,  9.6289e-01, -1.5785e+00,  6.7160e-01, -6.0152e-02,
          6.9784e-02, -1.6635e+00, -7.6506e-01,  1.2306e+00,  4.2521e-01,
         -1.6383e-02, -1.0749e-01, -1.3086e+00,  6.5981e-01, -7.0325e-02,
          2.7448e-01, -3.4501e-01, -1.1962e-01,  1.1862e+00, -1.2203e+00,
          2.9100e-01, -7.9642e-02,  1.3200e+00, -1.5197e+00, -2.9336e-01,
          2.1066e+00, -1.0875e-01,  6.0834e-01,  7.8943e-01,  7.8247e-01,
         -6.4659e-02, -2.3021e-04],
        [ 6.8309e-01,  1.0637e-01,  3.5032e-01,  1.2110e-01,  2.9843e-01,
          1.3448e+00,  1.4614e+00,  1.0566e+00,  8.1554e-01, -8.2406e-01,
          8.9328e-01, -3.8688e-01, -3.5718e-01, -1.1568e+00, -1.7660e+00,
         -2.5380e+00,  9.6943e-02, -7.9121e-01,  3.7120e-01,  1.5118e+00,
         -8.9146e-01,  5.2475e-01,  3.5178e-01,  2.4913e-01,  1.1900e+00,
          1.4109e+00,  7.9801e-01,  4.9413e-01, -1.8495e-01, -1.0381e+00,
         -1.0130e-01, -9.2718e-01],
        [ 2.3484e-01,  8.8615e-02, -3.47

In [78]:
xbow[0]

tensor([[-1.2248e+00,  9.6289e-01, -1.5785e+00,  6.7160e-01, -6.0152e-02,
          6.9784e-02, -1.6635e+00, -7.6506e-01,  1.2306e+00,  4.2521e-01,
         -1.6383e-02, -1.0749e-01, -1.3086e+00,  6.5981e-01, -7.0325e-02,
          2.7448e-01, -3.4501e-01, -1.1962e-01,  1.1862e+00, -1.2203e+00,
          2.9100e-01, -7.9642e-02,  1.3200e+00, -1.5197e+00, -2.9336e-01,
          2.1066e+00, -1.0875e-01,  6.0834e-01,  7.8943e-01,  7.8247e-01,
         -6.4659e-02, -2.3021e-04],
        [-2.7085e-01,  5.3463e-01, -6.1411e-01,  3.9635e-01,  1.1914e-01,
          7.0728e-01, -1.0103e-01,  1.4578e-01,  1.0231e+00, -1.9942e-01,
          4.3845e-01, -2.4719e-01, -8.3287e-01, -2.4850e-01, -9.1816e-01,
         -1.1317e+00, -1.2403e-01, -4.5541e-01,  7.7868e-01,  1.4574e-01,
         -3.0023e-01,  2.2255e-01,  8.3591e-01, -6.3528e-01,  4.4835e-01,
          1.7588e+00,  3.4463e-01,  5.5124e-01,  3.0224e-01, -1.2781e-01,
         -8.2981e-02, -4.6371e-01],
        [-1.0228e-01,  3.8596e-01, -5.25

In [79]:
# Calculating the average of all tokens along with the previous tokens
# METHOD 2
wei = torch.tril(torch.ones(T, T))
wei = wei / wei.sum(1, keepdim=True)
xbow2 = wei @ x # (B, T, T) * (B, T, C) ----> (T, C)... batch multiplier
torch.allclose(xbow, xbow2)

True

In [80]:
# Calculating the average of all tokens along with the previous tokens
# METHOD 3
tril = torch.tril(torch.ones(T, T))
wei = torch.zeros(T, T)
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = wei.softmax(dim=-1)
xbow3 = wei @ x
xbow3

tensor([[[-1.2248e+00,  9.6289e-01, -1.5785e+00,  ...,  7.8247e-01,
          -6.4659e-02, -2.3021e-04],
         [-2.7085e-01,  5.3463e-01, -6.1411e-01,  ..., -1.2781e-01,
          -8.2981e-02, -4.6371e-01],
         [-1.0228e-01,  3.8596e-01, -5.2530e-01,  ..., -4.5352e-01,
          -3.1897e-01, -3.8117e-01],
         ...,
         [-1.1658e-01,  2.5044e-01,  1.7023e-01,  ..., -4.9743e-01,
           4.1533e-01, -1.1010e-01],
         [-1.1365e-01,  3.6672e-01,  3.5860e-01,  ..., -4.9483e-01,
           3.9883e-01,  8.6812e-03],
         [-1.7675e-01,  3.8909e-01,  2.1764e-01,  ..., -4.4764e-01,
           2.7017e-01, -1.5059e-01]],

        [[-2.9485e-01, -2.7986e-01,  1.0837e+00,  ..., -1.1093e-02,
          -9.9528e-01, -2.9935e-01],
         [ 2.3609e-01, -6.0854e-01, -6.2338e-01,  ...,  1.0225e+00,
          -6.1500e-01, -8.2249e-01],
         [-1.4801e-02, -6.3508e-01, -2.5709e-01,  ...,  6.9255e-01,
          -5.9266e-01, -7.1289e-01],
         ...,
         [ 6.2443e-01, -7

In [81]:
torch.manual_seed(42)
a = torch.tril(torch.ones(size=[3,3]))
a = a / torch.sum(a, 1, keepdim=True)
b = torch.randint(0, 10, size=[3,2]).float()
c = a @ b
print(f'a = \n{a}')
print(f'b = \n{b}')
print(f'c = \n{c}')

a = 
tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
b = 
tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
c = 
tensor([[2.0000, 7.0000],
        [4.0000, 5.5000],
        [4.6667, 5.3333]])


In [132]:
embed_table = nn.Embedding(T, C)
print(f'X before pos embed:\n {x[0]}')
pos_embed = x + embed_table(torch.arange(T))
print(f'X after pos embed:\n {pos_embed[0]}')

X before pos embed:
 tensor([[-1.2248e+00,  9.6289e-01, -1.5785e+00,  6.7160e-01, -6.0152e-02,
          6.9784e-02, -1.6635e+00, -7.6506e-01,  1.2306e+00,  4.2521e-01,
         -1.6383e-02, -1.0749e-01, -1.3086e+00,  6.5981e-01, -7.0325e-02,
          2.7448e-01, -3.4501e-01, -1.1962e-01,  1.1862e+00, -1.2203e+00,
          2.9100e-01, -7.9642e-02,  1.3200e+00, -1.5197e+00, -2.9336e-01,
          2.1066e+00, -1.0875e-01,  6.0834e-01,  7.8943e-01,  7.8247e-01,
         -6.4659e-02, -2.3021e-04],
        [ 6.8309e-01,  1.0637e-01,  3.5032e-01,  1.2110e-01,  2.9843e-01,
          1.3448e+00,  1.4614e+00,  1.0566e+00,  8.1554e-01, -8.2406e-01,
          8.9328e-01, -3.8688e-01, -3.5718e-01, -1.1568e+00, -1.7660e+00,
         -2.5380e+00,  9.6943e-02, -7.9121e-01,  3.7120e-01,  1.5118e+00,
         -8.9146e-01,  5.2475e-01,  3.5178e-01,  2.4913e-01,  1.1900e+00,
          1.4109e+00,  7.9801e-01,  4.9413e-01, -1.8495e-01, -1.0381e+00,
         -1.0130e-01, -9.2718e-01],
        [ 2.3484e-0

In [149]:
# Implement single self attention head, the hyper parameter involved is the head size
torch.manual_seed(42)

head_size = 16

key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)

k = key(x) # (B, T, 16)
q = query(x) # (B, T, 16)
v = value(x)

wei = k @ q.transpose(-1, -2) * head_size**-0.5 # Transpose the last two dimensions k(B, T, 16) @ q(B, 16, T)
tril = torch.tril(torch.ones(T, T)) # For decoder blocks only
wei = wei.masked_fill(tril[:T, :T] == 0, float('-inf')) # For decoder blocks only
wei = wei.softmax(dim=-1)
out = wei @ v
out.shape

torch.Size([4, 8, 16])

In [150]:
wei[0]

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.4497, 0.5503, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2722, 0.2643, 0.4634, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2855, 0.2142, 0.2455, 0.2548, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1407, 0.1532, 0.2461, 0.2000, 0.2600, 0.0000, 0.0000, 0.0000],
        [0.1482, 0.1193, 0.1536, 0.1219, 0.2351, 0.2219, 0.0000, 0.0000],
        [0.1431, 0.1660, 0.0998, 0.1260, 0.1595, 0.1397, 0.1658, 0.0000],
        [0.0974, 0.1426, 0.0655, 0.1600, 0.0771, 0.1042, 0.2682, 0.0849]],
       grad_fn=<SelectBackward0>)

In [151]:
k, q = torch.randn(B, T, head_size), torch.randn(B, T, head_size)
wei = k @ q.transpose(-1, -2) * head_size**-0.5

In [152]:
k.var()

tensor(0.9402)

In [153]:
q.var()

tensor(0.9806)

In [154]:
wei.var()

tensor(0.9635)