# Bigram Language Model

In [22]:
TEXT_DATA_PATH = "tiny_shiekspear.txt"
with open(TEXT_DATA_PATH, "r") as file:
    text_data = file.read()
    
print(text_data[:500])  # Print the first 500 characters of the text data


First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor


In [23]:
chars = sorted(list(set(text_data)))
VOCAB_SIZE = len(chars)

print(f"Unique characters: {''.join(chars)}")
print(f"Vocab size: {VOCAB_SIZE}")

Unique characters: 
 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
Vocab size: 65


In [24]:
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}

def encode(text):
    return [stoi[c] for c in text]

def decode(indices):
    return ''.join([itos[i] for i in indices])

In [25]:
encoded = encode("hello!")
print(f"Encoded: {encoded}")
decoded = decode(encoded)
print(f"Decoded: {decoded}")

Encoded: [46, 43, 50, 50, 53, 2]
Decoded: hello!


In [26]:
# convert to torch tensor of encoded text and print
import torch

train_data_tensor = torch.tensor(encode(text_data), dtype=torch.long)
train_data_tensor.shape

torch.Size([1115394])

In [27]:
train_data_tensor[:100]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])

In [28]:
text_data[:100]

'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou'

In [29]:
# Train and validation split
TRAIN_SPLIT = 0.9
train_data = train_data_tensor[:int(len(train_data_tensor)*0.9)]
val_data = train_data_tensor[int(len(train_data_tensor)*0.9):]
train_data.shape, val_data.shape

(torch.Size([1003854]), torch.Size([111540]))

In [30]:
# Box size = context window
# visualize a block of text (size = block_size + 1)

BLOCK_SIZE = 8
block = train_data[:BLOCK_SIZE + 1]
print(f"Block indices: {block.tolist()}")


Block indices: [18, 47, 56, 57, 58, 1, 15, 47, 58]


In [31]:
# For one given block of text, we generate all possible continuations from the start
# This gives us block_size examples that we can use to train the model

X = train_data[: BLOCK_SIZE]
y = train_data[1:BLOCK_SIZE+1] # y is just train data offset by one, as the task of the model is just to predict the next token given the previous sequence of tokens (or just last token in case of a bigram model)

print(f"For an input block: {X.tolist()}")
for t in range(BLOCK_SIZE):
    print(f"For input: {X[:t+1].tolist()}, Next token would be: {y[t]}")
    

For an input block: [18, 47, 56, 57, 58, 1, 15, 47]
For input: [18], Next token would be: 47
For input: [18, 47], Next token would be: 56
For input: [18, 47, 56], Next token would be: 57
For input: [18, 47, 56, 57], Next token would be: 58
For input: [18, 47, 56, 57, 58], Next token would be: 1
For input: [18, 47, 56, 57, 58, 1], Next token would be: 15
For input: [18, 47, 56, 57, 58, 1, 15], Next token would be: 47
For input: [18, 47, 56, 57, 58, 1, 15, 47], Next token would be: 58


In [32]:
# To generate a batch of example tensor using the block code above:
random_idx = torch.randint(0, len(train_data) - BLOCK_SIZE - 1, (1,)).item()
X = train_data[random_idx: random_idx+BLOCK_SIZE]
y = train_data[random_idx+1:random_idx+BLOCK_SIZE+1] # y is just train data offset by one, as the task of the model is just to predict the next token given the previous sequence of tokens (or just last token in case of a bigram model)

print(f"For an input block: {X.tolist()}")
for t in range(BLOCK_SIZE):
    print(f"For input: {X[:t+1].tolist()}, Next token would be: {y[t]}")
    
    

For an input block: [58, 43, 50, 50, 11, 0, 14, 59]
For input: [58], Next token would be: 43
For input: [58, 43], Next token would be: 50
For input: [58, 43, 50], Next token would be: 50
For input: [58, 43, 50, 50], Next token would be: 11
For input: [58, 43, 50, 50, 11], Next token would be: 0
For input: [58, 43, 50, 50, 11, 0], Next token would be: 14
For input: [58, 43, 50, 50, 11, 0, 14], Next token would be: 59
For input: [58, 43, 50, 50, 11, 0, 14, 59], Next token would be: 58


In [33]:
# Stacking
# To generate a batch of example tensor using the block code above:
BATCH_SIZE = 4

indices = torch.randint(0, len(train_data) - BLOCK_SIZE - 1, (BATCH_SIZE,))
X = torch.stack([train_data[idx: idx+BLOCK_SIZE] for idx in indices])
y = torch.stack([train_data[idx+1: idx+BLOCK_SIZE+1] for idx in indices])

print(X)
print(y)

tensor([[ 1, 21,  1, 42, 53,  1, 58, 46],
        [58,  1, 52, 53, 58,  1, 59, 57],
        [43, 39, 50,  1, 42, 39, 47, 50],
        [44,  1, 63, 53, 59,  1, 46, 43]])
tensor([[21,  1, 42, 53,  1, 58, 46, 39],
        [ 1, 52, 53, 58,  1, 59, 57,  1],
        [39, 50,  1, 42, 39, 47, 50, 63],
        [ 1, 63, 53, 59,  1, 46, 43, 50]])


In [34]:
# This X and y batch are both (B,T) matrices, 
#   where the B dim is the blocks across a BATCH
#   T dim is across a block through TIME
# For any given block Bx, for any given sequence Bx[start:end], the next token in the sequence would be By[end]
for block in range(BATCH_SIZE):
    for t in range(BLOCK_SIZE):
        print(f"For input tensor: {X[block, :t+1]}, Target is: {y[block, t]}")

For input tensor: tensor([1]), Target is: 21
For input tensor: tensor([ 1, 21]), Target is: 1
For input tensor: tensor([ 1, 21,  1]), Target is: 42
For input tensor: tensor([ 1, 21,  1, 42]), Target is: 53
For input tensor: tensor([ 1, 21,  1, 42, 53]), Target is: 1
For input tensor: tensor([ 1, 21,  1, 42, 53,  1]), Target is: 58
For input tensor: tensor([ 1, 21,  1, 42, 53,  1, 58]), Target is: 46
For input tensor: tensor([ 1, 21,  1, 42, 53,  1, 58, 46]), Target is: 39
For input tensor: tensor([58]), Target is: 1
For input tensor: tensor([58,  1]), Target is: 52
For input tensor: tensor([58,  1, 52]), Target is: 53
For input tensor: tensor([58,  1, 52, 53]), Target is: 58
For input tensor: tensor([58,  1, 52, 53, 58]), Target is: 1
For input tensor: tensor([58,  1, 52, 53, 58,  1]), Target is: 59
For input tensor: tensor([58,  1, 52, 53, 58,  1, 59]), Target is: 57
For input tensor: tensor([58,  1, 52, 53, 58,  1, 59, 57]), Target is: 1
For input tensor: tensor([43]), Target is: 39


In [35]:
# get batch -> x, y for a random batch (of blocks)
# Note 1

def get_batch(data, batch_size, block_size):
    indices = torch.randint(0, len(data) - block_size - 1, (batch_size,))
    X = torch.stack([data[idx: idx+block_size] for idx in indices])
    y = torch.stack([data[idx+1: idx+block_size+1] for idx in indices])
    
    return X, y

get_batch(train_data, BATCH_SIZE, BLOCK_SIZE)

(tensor([[32, 21, 27, 10,  0, 13, 56, 43],
         [39, 57, 58, 47, 51, 43,  1, 58],
         [23, 17,  1, 27, 18,  1, 37, 27],
         [ 5,  1, 59, 52, 58, 53,  1, 58]]),
 tensor([[21, 27, 10,  0, 13, 56, 43,  1],
         [57, 58, 47, 51, 43,  1, 58, 53],
         [17,  1, 27, 18,  1, 37, 27, 30],
         [ 1, 59, 52, 58, 53,  1, 58, 46]]))

In [None]:
# Bigram language model that uses nn.Embedding to generate Logits, cross entropy loss ### Comeback and try making a trigram model too
# Check dimenstionality of input and output
# Note 2
import torch
from torch import nn
import torch.nn.functional as F

class BigramLanguageModel(nn.Module):
    def __init__(self, input_vocab_size, output_vocab_size) -> None:
        super().__init__()
        self.embedding_lookup_table = nn.Embedding(input_vocab_size, output_vocab_size)

        # nn.embedding is essentially a matrix,
        # I don't know how it works
        # But here's my guess
        # It is a (input_values, output_values) matrix, where the input token equals the possible input variation
        # In a bigram model, where we look at one past character to predict the next one, input_values = output_values = charater_set_length
        # In a trigram model, where we look at two past characters to predict on next character, output_value = charater_set_length, input_values = charater_set_length ** 2
        # In this trigram model, the embedding matric would me (charater_set_length ** 2, charater_set_length)
        # In LLM, I am guessing this is a (vocab_size, vocab_size) matrix
        
        
        # Now as to what it does, the forward pass using this as a lookup table of probabilities,
        # For any given row, we treat the column values as the probabilities of which output token should come next
        # This way when we back prop, we are optimizing for the lookup table to resemble a probability distribution matrix as done is makemore
        
        # Apparently I am precisely correct about how nn.Embedding does. This might as well have been implemented with a normal (input_values, output_values) Tensor with gradients and it would've worked just the same


    def forward(self, idx, target=None):
        # x is shape (B, T), where B is batch size, T is time steps (or block size)
        x = self.embedding_lookup_table(idx)
        # x is now (B, T, C) where C is the character set size (vocab size)
        if not target:
            return x, None
        return x , F.cross_entropy()
            


In [37]:
# for the forward pass to work, cross_entropy expects input, target. BUTTTTT Andrej mentioned something the input from B,T,C to B,C, T, will run and test
model = BigramLanguageModel(VOCAB_SIZE, VOCAB_SIZE)

X, y = get_batch(train_data, BATCH_SIZE, BLOCK_SIZE)

In [38]:
# Pass one token, which is the last token of the input sequence, because this is a bigram model
# But the input sequnce in a moving window, as we printed above
# So the actual lookup indices should be a (B*BLOCK_SIZE, C)
# But I don't know how to implement that, so I am justs going to get one example per block for the sack of making the forward pass work

print(model.embedding_lookup_table(X[:, -1]).shape), print(y[:, -1].shape)
# This gives me (BATCH_SIZE, VOCAB_SIZE), where the 65 is the vocab size logtis that I am assuming are the probabilites of what the next token should be

torch.Size([4, 65])
torch.Size([4])


(None, None)

In [39]:
# torch.distributions.Polynomial() # I don't know how this will work in pytorch, but I am guessing after we get the embedding logits,
# We we sample for a distribution of that embedding, which will give us the index of the next token, that will be our actual output

In [40]:
seq = model.embedding_lookup_table(X[:, -1])
target = y[:, -1]
F.cross_entropy(seq, target)

# This works, but again, this is (B, C) against (B), I have no idea how to make this work in (B, T, C) against (B, T)

tensor(4.7363, grad_fn=<NllLossBackward0>)

In [41]:
# Oh, I keep struggling because I keep trying to write the most general solution ever
# This is a bigram model, model.embedding_lookup_table(X) for a (B, T) matrix will give me (B, T, C). This is because in bigram models, only the current token is needed to predict the next token
# No need to do the whole generate completel sequence thing. This will not work for Trigram, or atleast not exactly like this
print(model.embedding_lookup_table(X).shape), print(y.shape)


torch.Size([4, 8, 65])
torch.Size([4, 8])


(None, None)

In [42]:
seq = model.embedding_lookup_table(X)
target = y
F.cross_entropy(seq, target)

# F.cross_entropy(seq, target) given an error, RuntimeError: Expected target size [4, 65], got [4, 8]
# I am guessing this is because, my target is indices, not probablity values to match with. So cross entropy thinks that in my input (B, T, C). T is the probabilities, and C is the classes.
# essentially (batch, probability, class)
# So for this I will transform seq but only the two inner dimensions, so that is goes from (B, T, C) to (B, C, T)

RuntimeError: Expected target size [4, 65], got [4, 8]

In [None]:
seq.transpose(-2, -1).shape
# Looks good

torch.Size([4, 65, 8])

In [None]:
F.cross_entropy(seq.transpose(-2, -1), target)

tensor(4.8080, grad_fn=<NllLoss2DBackward0>)

In [None]:
# Boom!

In [None]:
import torch
from torch import nn
import torch.nn.functional as F

class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size) -> None:
        super().__init__()
        self.embedding_lookup_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, target=None):
        # This is how I did it:
        # x = self.embedding_lookup_table(idx)
        # if target is None:
        #     return x, None
        # return x , F.cross_entropy(x.transpose(-2, -1), target)

        # this is how Andrej did it
        logits = self.embedding_lookup_table(idx)
        B, T, C = logits.shape
        logits = logits.view(B*T, C)
        targets = target.view(B*T)
        loss = F.cross_entropy(logits, targets)

        return logits, loss
    
    def my_forward(self, idx, target=None):
        x = self.embedding_lookup_table(idx)
        if target is None:
            return x, None
        return x , F.cross_entropy(x.transpose(-2, -1), target)

X, y = get_batch(train_data, BATCH_SIZE, BLOCK_SIZE)
model = BigramLanguageModel(VOCAB_SIZE)
y_pred , loss = model.forward(X, y)
y_pred.shape, loss

# Well my and Andrej's methods are giving different results.... welp lemme chatgpt what is the differenct


(torch.Size([32, 65]), torch.Size([]))

In [None]:
y_pred , loss = model.forward(X, y)
y_pred.shape, loss

(torch.Size([32, 65]), tensor(4.6674, grad_fn=<NllLossBackward0>))

In [None]:
y_pred , loss = model.my_forward(X, y)
y_pred.shape, loss

(torch.Size([4, 8, 65]), tensor(4.6674, grad_fn=<NllLoss2DBackward0>))

In [None]:
# I have no idea what changed (Edit: I do now, different batch in the first run), but by implementing them both, very clearly they are matehmatically equivalent
# I'm a genius!
# Look Ma! No hands!
# Now let's cleanup, and implement a max_token limited generate function to let the model babble

In [None]:
import torch
from torch import nn
import torch.nn.functional as F

class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size) -> None:
        super().__init__()
        self.embedding_lookup_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, target=None):
        x = self.embedding_lookup_table(idx)
        if target is None:
            return x, None
        return x , F.cross_entropy(x.transpose(-2, -1), target)

    def generate(self, idx, max_new_tokens): # idx is the start seed token
        # idx is [B, T], ideally where T=1, so a batch of input seed tokens
        for _ in range(max_new_tokens):
            logits, _ = self(idx) # Generate logits for what should come next for each B
            logits = logits[:, -1,:]
            probs = F.softmax(logits, dim=1) # We softmax the logits to convert them into probabilities
            idx_next = torch.multinomial(probs, num_samples=1) # We pick the next token from a distribution 
            idx = torch.cat((idx, idx_next), dim = 1) # We concatenate the newly generated token to the end of the starting sequence
        return idx



X, y = get_batch(train_data, BATCH_SIZE, BLOCK_SIZE)
model = BigramLanguageModel(VOCAB_SIZE)
y_pred , loss = model.forward(X, y)
y_pred.shape, loss

(torch.Size([4, 8, 65]), tensor(4.3515, grad_fn=<NllLoss2DBackward0>))

In [None]:
print(decode(model.generate(torch.zeros(1, 1, dtype=torch.long), 1000)[0].tolist()))


Hm!pBQGR
h:
kqX3I!oMbB&TSsDMIizI!rF
xjpAwI-Ry.GjLGwatYyBquxjFOBYZeOEYFObrQOB&Lr-fK&tu-yTeB3U,SfnjGYF&'GN!$OBTZ 3V hfoQOBTpd'tpA-&w-ctxbmDdW:W$I'cHdlfPJolw!Ii.CnuB!ooCuWoqM?R-!B.anOB3o$pNLjYF'EEYUtPnM;OwNEJSdkMt3u'Z;OBLTSFNDk:v,;&fk:vpvrhiEAx.ddoyc!jAy.'tRY;OBky $Vv;UpdjMtxSuTthbaTH oZhy!?LQY;OMc ;OB!Ikgx
&TOBvEoB&FKoleMGYVK
G:gPb-yGthAqfoU,OBTZelR&OM?t&MUvLqN'vvbsv.Gs;hUNJ
ZTlSBzgpH!J&DS&U,qUvFAo3ErGgjai.
v SjfI3Id;,sDYfPxVQF'.yd'JSuk:wICpO BOJApdPOgwjM?W:t'Qy.&hank:VXbPMI!Wsm-,Krhcl?VY$aUu?ulyo&g'P,Lin3JGjdXYXNE$lfTI'Q;SuvlSdBu!glbf'w;jGR3MlwmbMbBOltzVyA,SB-BO;vJ3ILrypDkAelgD'fo &:.FVnD:anjrhnM3lw!,,weuVWgKoRRqahynDYEQTy sd-lw3$fZfjCux?qdkZm!-sszzmTRvvi$n;
Zx
Q?iRPahuQ;OBgxwevAbNkYFZuF!m!vb
;OBhEOB3Iu tZtuDeYYZfr-GtwbaqfgMZANkazuVXYFsvv.FPFIV'wNEiSXBth,Bvm,tD.PRyUbowI$NSPHD
;cK
Rq-' nCHvb' xXpavJ$r-ouA3JYWbMGAHe' yLNYUc
khcscsPnBHminrh,SLQTE&tZnNEoe?QFjrfoM;FsV3tzgX:;OBYkAYByNE;
Wk!AH$qvbIdXAU
$z.EAYvbE'hiSPKNkkgqpe'tsxqZbGv,Sdg:gvTbOUbTLyGfYxFTlxpELw?EYesFZOcMdiY3lDJVeSuN'eYFrF,wftf

In [None]:
# Rubbish, but expected because the model is untrained. Let's train!
model = BigramLanguageModel(VOCAB_SIZE)

In [None]:
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
batch_size = 32


for steps in range(10000):

    xb, yb = get_batch(train_data, batch_size, BLOCK_SIZE)
    
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    
print(loss.item())

2.4207818508148193


In [None]:
print(decode(model.generate(torch.ones(1, 1, dtype=torch.long), 1000)[0].tolist()))

 m fo, g thr.
DWhirdrind Vaxce? fathe ar ie VEThe fo in my: Whe GEENESSThat
D:
Ithay ee s e:
Singlithe angrs,
Yorou

GHarierel magho than uryootofrdin, m me e blofonof.
Thers onk thought thearessk, nang le;
Finceilerisbu orth I fer s thistheaven.
A:
Waigsuralairarwhinomanstt se fongheenon fr. d t berve'eck y loueldind hirelaimucknar I thoupoy N:

Yo umy sw,
thaecty yo won, alirshigorthe storirn hinto 'lise hbbouttlthe,
USh-the?
An sthe ly.

BULTerin Wes p thonwaltovouberon f siery sirofodsucthe dre ows y ot then.
uterorthiviggobour an.


Shasifoofoinoutitins hindi'd hin iusithathaped fullalofo, he'sthote y'ORINGHalle bu hech ten Gllilst my, marron'shehee,
Priate theses t yowich ve lerenerer t pue


ID nwac foushendorof home th at.
ARO ear gin bl hese,
FOFiree

ABrine llo knge Fob ashiabers tin'sowh ftof o, M:
Helo lthese ISe:
Ork theth hay t:
YOfren foend horsofod.
Tit cr or d d pun:
METINRKERDanngeawaclita nn'su orghad I waighefin!
NGENond 'lar ougorir my ke yor
COLONTotho te d,
QUS:


In [None]:
# Looks good enough, let'smove this over to transformer_dev.py

# Trigram Language Model

In [117]:
import torch
from torch import nn
import torch.nn.functional as F
import time

# Parameters
TEXT_DATA_PATH = "tiny_shiekspear.txt"
VOCAB_SIZE = 65
TRAIN_SPLIT = 0.9
BLOCK_SIZE = 8
BATCH_SIZE = 64
MAX_NEW_TOKENS = 1000
EPOCHS = 1000
LEARNING_RATE = 1e-3
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# DEVICE = torch.device("cpu")
with open(TEXT_DATA_PATH, "r") as file:
    text_data = file.read()

chars = sorted(list(set(text_data)))
# Also add all character pairs into the encoder and decoder to be able to make a trigram language model
vocab = [a+b for a in chars for b in chars]
stoi = {ch: i for i, ch in enumerate(vocab)}
itos = {i: ch for i, ch in enumerate(vocab)}

input_vocab_size = len(vocab)
output_vocab_size = len(chars)


In [118]:
def encode(text):
    return [stoi[text[idx : idx + 2]] for idx in range(len(text) - 1)]

def decode(indices):
    output = [itos[indices[0]]]
    output += [itos[idx][1] for idx in indices[1:]]
    return ''.join(output)

In [119]:
encode("Hello")

[1343, 2845, 3300, 3303]

In [120]:
decode(encode("Hello"))

'Hello'

In [121]:
# convert to torch tensor of encoded text and print
import torch

train_data_tensor = torch.tensor(encode(text_data), dtype=torch.long)
train_data_tensor.shape

torch.Size([1115393])

In [122]:
train_data_tensor[:100]

tensor([1217, 3111, 3697, 3763, 3771,   80, 1022, 3113, 3817, 3119, 4203, 2847,
        3390,  650,   14,  953, 2839, 2913, 3501, 3683, 2796,  126, 4008, 2796,
         119, 3566, 3693, 3486, 2708, 2838, 2837, 2731,  104, 2587, 3443, 4096,
         109, 2919, 3891, 3698, 3816, 3033, 2851, 3646,  391,  111, 3033, 2834,
        2591, 3641,  116, 3358, 2796,  122, 3759, 3553, 2834, 2584, 3193,  520,
           0,   13,  895, 3300, 3260,  650,   31, 2069, 3553, 2834, 2584, 3191,
         391,  122, 3759, 3553, 2834, 2584, 3193,  520,    0,   18, 1217, 3111,
        3697, 3763, 3771,   80, 1022, 3113, 3817, 3119, 4203, 2847, 3390,  650,
          37, 2458, 3504, 3836])

In [123]:
text_data[:100]

'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou'

In [124]:
# Train and validation split
TRAIN_SPLIT = 0.9
train_data = train_data_tensor[:int(len(train_data_tensor)*0.9)]
val_data = train_data_tensor[int(len(train_data_tensor)*0.9):]
train_data.shape, val_data.shape

(torch.Size([1003853]), torch.Size([111540]))

In [125]:
# Box size = context window
# visualize a block of text (size = block_size + 1)

BLOCK_SIZE = 8
block = train_data[:BLOCK_SIZE + 1]
print(f"Block indices: {block.tolist()}")


Block indices: [1217, 3111, 3697, 3763, 3771, 80, 1022, 3113, 3817]


In [126]:
# For one given block of text, we generate all possible continuations from the start
# This gives us block_size examples that we can use to train the model

X = train_data[: BLOCK_SIZE]
y = train_data[1:BLOCK_SIZE+1] # y is just train data offset by one, as the task of the model is just to predict the next token given the previous sequence of tokens (or just last token in case of a bigram model)

print(f"For an input block: {X.tolist()}")
for t in range(BLOCK_SIZE):
    print(f"For input: {X[:t+1].tolist()}, Next token would be: {y[t]}")
    

For an input block: [1217, 3111, 3697, 3763, 3771, 80, 1022, 3113]
For input: [1217], Next token would be: 3111
For input: [1217, 3111], Next token would be: 3697
For input: [1217, 3111, 3697], Next token would be: 3763
For input: [1217, 3111, 3697, 3763], Next token would be: 3771
For input: [1217, 3111, 3697, 3763, 3771], Next token would be: 80
For input: [1217, 3111, 3697, 3763, 3771, 80], Next token would be: 1022
For input: [1217, 3111, 3697, 3763, 3771, 80, 1022], Next token would be: 3113
For input: [1217, 3111, 3697, 3763, 3771, 80, 1022, 3113], Next token would be: 3817


In [127]:
# To generate a batch of example tensor using the block code above:
random_idx = torch.randint(0, len(train_data) - BLOCK_SIZE - 1, (1,)).item()
X = train_data[random_idx: random_idx+BLOCK_SIZE]
y = train_data[random_idx+1:random_idx+BLOCK_SIZE+1] # y is just train data offset by one, as the task of the model is just to predict the next token given the previous sequence of tokens (or just last token in case of a bigram model)

print(f"For an input block: {X.tolist()}")
for t in range(BLOCK_SIZE):
    print(f"For input: {X[:t+1].tolist()}, Next token would be: {y[t]}")
    
    

For an input block: [2847, 3381, 126, 4008, 2838, 2849, 3557, 3107]
For input: [2847], Next token would be: 3381
For input: [2847, 3381], Next token would be: 126
For input: [2847, 3381, 126], Next token would be: 4008
For input: [2847, 3381, 126, 4008], Next token would be: 2838
For input: [2847, 3381, 126, 4008, 2838], Next token would be: 2849
For input: [2847, 3381, 126, 4008, 2838, 2849], Next token would be: 3557
For input: [2847, 3381, 126, 4008, 2838, 2849, 3557], Next token would be: 3107
For input: [2847, 3381, 126, 4008, 2838, 2849, 3557, 3107], Next token would be: 3425


In [128]:
# Stacking
# To generate a batch of example tensor using the block code above:
BATCH_SIZE = 4

indices = torch.randint(0, len(train_data) - BLOCK_SIZE - 1, (BATCH_SIZE,))
X = torch.stack([train_data[idx: idx+BLOCK_SIZE] for idx in indices])
y = torch.stack([train_data[idx+1: idx+BLOCK_SIZE+1] for idx in indices])

print(X)
print(y)

tensor([[2796,  111, 3043, 3496, 3358, 2803,  520,    0],
        [3816, 3029, 2593, 3771,  128, 4148, 3504, 3836],
        [3228, 2796,  123, 3823, 3446,  111, 3037, 3106],
        [2851, 3683, 2795,   52, 3433, 3497, 3423, 2796]])
tensor([[ 111, 3043, 3496, 3358, 2803,  520,    0,   13],
        [3029, 2593, 3771,  128, 4148, 3504, 3836,  123],
        [2796,  123, 3823, 3446,  111, 3037, 3106, 3327],
        [3683, 2795,   52, 3433, 3497, 3423, 2796,  118]])


In [129]:
# This X and y batch are both (B,T) matrices, 
#   where the B dim is the blocks across a BATCH
#   T dim is across a block through TIME
# For any given block Bx, for any given sequence Bx[start:end], the next token in the sequence would be By[end]
for block in range(BATCH_SIZE):
    for t in range(BLOCK_SIZE):
        print(f"For input tensor: {X[block, :t+1]}, Target is: {y[block, t]}")

For input tensor: tensor([2796]), Target is: 111
For input tensor: tensor([2796,  111]), Target is: 3043
For input tensor: tensor([2796,  111, 3043]), Target is: 3496
For input tensor: tensor([2796,  111, 3043, 3496]), Target is: 3358
For input tensor: tensor([2796,  111, 3043, 3496, 3358]), Target is: 2803
For input tensor: tensor([2796,  111, 3043, 3496, 3358, 2803]), Target is: 520
For input tensor: tensor([2796,  111, 3043, 3496, 3358, 2803,  520]), Target is: 0
For input tensor: tensor([2796,  111, 3043, 3496, 3358, 2803,  520,    0]), Target is: 13
For input tensor: tensor([3816]), Target is: 3029
For input tensor: tensor([3816, 3029]), Target is: 2593
For input tensor: tensor([3816, 3029, 2593]), Target is: 3771
For input tensor: tensor([3816, 3029, 2593, 3771]), Target is: 128
For input tensor: tensor([3816, 3029, 2593, 3771,  128]), Target is: 4148
For input tensor: tensor([3816, 3029, 2593, 3771,  128, 4148]), Target is: 3504
For input tensor: tensor([3816, 3029, 2593, 3771, 

In [130]:
# get batch -> x, y for a random batch (of blocks)
# Note 1

def get_batch(data, batch_size, block_size):
    indices = torch.randint(0, len(data) - block_size - 1, (batch_size,))
    X = torch.stack([data[idx: idx+block_size] for idx in indices])
    y = torch.stack([data[idx+1: idx+block_size+1] for idx in indices])
    
    return X, y

get_batch(train_data, BATCH_SIZE, BLOCK_SIZE)

(tensor([[3107, 3425, 2926,  123, 3816, 3046, 3693, 3504],
         [2585, 3300, 3251,   86, 1366,  105, 2656, 3687],
         [2837, 2731,  118, 3497, 3421, 2708, 2801,  391],
         [2097, 1135, 1960,  650,   35, 2321, 3029, 2593]]),
 tensor([[3425, 2926,  123, 3816, 3046, 3693, 3504, 3880],
         [3300, 3251,   86, 1366,  105, 2656, 3687, 3107],
         [2731,  118, 3497, 3421, 2708, 2801,  391,  126],
         [1135, 1960,  650,   35, 2321, 3029, 2593, 3776]]))

In [136]:
import torch
from torch import nn
import torch.nn.functional as F

class TrigramLanguageModel(nn.Module):
    def __init__(self, vocab_size) -> None:
        super().__init__()
        self.embedding_lookup_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, target=None):
        x = self.embedding_lookup_table(idx)
        if target is None:
            return x, None
        return x , F.cross_entropy(x.transpose(-2, -1), target)

    def generate(self, idx, max_new_tokens): # idx is the start seed token
        # idx is [B, T], ideally where T=1, so a batch of input seed tokens
        for _ in range(max_new_tokens):
            logits, _ = self(idx) # Generate logits for what should come next for each B
            logits = logits[:, -1,:]
            probs = F.softmax(logits, dim=1) # We softmax the logits to convert them into probabilities
            idx_next = torch.multinomial(probs, num_samples=1) # We pick the next token from a distribution 
            idx = torch.cat((idx, idx_next), dim = 1) # We concatenate the newly generated token to the end of the starting sequence
        return idx

X, y = get_batch(train_data, BATCH_SIZE, BLOCK_SIZE)
model = TrigramLanguageModel(len(vocab))
y_pred , loss = model.forward(X, y)
y_pred.shape, loss

(torch.Size([4, 8, 4225]), tensor(8.8089, grad_fn=<NllLoss2DBackward0>))

In [137]:
encode("\n\n")

[0]

In [142]:
print(decode(model.generate(torch.zeros(1,1, dtype=torch.long), max_new_tokens=100).tolist()[0]))



ZuHxQPIrzla3AYLWOXSCapdF!UzvBMZ,rNpsJjfKEMSmqeX&fPX$QVeuR?vySE gpjN,QnP&EoVRlLKCltczxPIq-.WgLYCx NlN


# Self-Attention

In [18]:
import torch

B, T, C = 4, 8, 2

#x = torch.randn(B, T, C)
x = torch.randint(0, 10, (B, T, C)).float()
x.shape

torch.Size([4, 8, 2])

In [None]:
# Pass informtion from (T-1), (T-2)... (T-N) for any given Token (T), such that no information diffuse from future tokens to current token
xbow = torch.zeros((B,T,C))

for b in range(B):
    for t in range(T):
        xbow_prev = x[b, :t+1]
        xbow[b,t] = torch.mean(xbow_prev, dim=0)

In [29]:
x[0]

tensor([[7., 4.],
        [1., 1.],
        [0., 7.],
        [1., 4.],
        [1., 3.],
        [5., 3.],
        [8., 4.],
        [8., 4.]])

In [30]:
xbow[0]

tensor([[7.0000, 4.0000],
        [4.0000, 2.5000],
        [2.6667, 4.0000],
        [2.2500, 4.0000],
        [2.0000, 3.8000],
        [2.5000, 3.6667],
        [3.2857, 3.7143],
        [3.8750, 3.7500]])

In [36]:
# But that nested for loop is very slow in python. So we need some matrixy trick to get there.
# Using a lower triangular matrix of ones, multiplied with our original matrix x, gives us cumulative sums going down the T dim

wei =torch.tril(torch.ones(T, T))
print(wei)

(wei @ x)[0]


tensor([[1., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1.]])


tensor([[ 7.,  4.],
        [ 8.,  5.],
        [ 8., 12.],
        [ 9., 16.],
        [10., 19.],
        [15., 22.],
        [23., 26.],
        [31., 30.]])

In [41]:
wei.sum(dim = 0, keepdim=True)

tensor([[8., 7., 6., 5., 4., 3., 2., 1.]])

In [57]:
# Now to get cumulaive averages instead of cum-sums, we just need to normalize the tril matrix across rows
wei = torch.tril(torch.ones(T, T))
wei = wei / wei.sum(dim=1, keepdim=True)
wei

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])

In [58]:
(wei @ x)[0]

tensor([[7.0000, 4.0000],
        [4.0000, 2.5000],
        [2.6667, 4.0000],
        [2.2500, 4.0000],
        [2.0000, 3.8000],
        [2.5000, 3.6667],
        [3.2857, 3.7143],
        [3.8750, 3.7500]])

In [59]:
xbow[0]

tensor([[7.0000, 4.0000],
        [4.0000, 2.5000],
        [2.6667, 4.0000],
        [2.2500, 4.0000],
        [2.0000, 3.8000],
        [2.5000, 3.6667],
        [3.2857, 3.7143],
        [3.8750, 3.7500]])

In [61]:
# works!
# more emperically
torch.allclose(wei @ x, xbow)

True

In [66]:
# Another method to do this is, using softmax
import torch
import torch.nn.functional as F

tril = torch.tril(torch.ones(T,T))
wei = torch.zeros(T,T)
wei = wei.masked_fill(tril == 0, float("-inf"))
wei = F.softmax(wei, dim = 1)
(wei @ x)[0]


tensor([[7.0000, 4.0000],
        [4.0000, 2.5000],
        [2.6667, 4.0000],
        [2.2500, 4.0000],
        [2.0000, 3.8000],
        [2.5000, 3.6667],
        [3.2857, 3.7143],
        [3.8750, 3.7500]])

In [68]:
"""
# Positional embeddings
Before implementing self attention, a few things need to be imlemented
1. In the bigram model, the embedding module is being used as a probability lookup table. It should be used as an embedding layer, where the embedding output 
    is a feature vector of length embedding_length, which a linear layer than converts into vocab size logits. This vector will represent what a certain vector means.
2. We will create a second embedding module of size (block_size, embedding length). This will be used as position embedding, so it gives us the same vector for a certain position of 
    a token, regardless of its value. This vector will represent where in a sentence does it exist.
3. We will Add both of these vectors togethere before passing them to the linear layer.


# Self Attention:
Every token will emit two vector: Query and key
1. Query represents what I am looking for
2. Key represent what do I contain

For any given token T, We will dot product the query[T] with the key of all other tokens. 
In case of an encoder, this will be query[T] * (key[:T] and key[T+1:])
In case of a decoder, we donot leak information from future tokens, so for that query[T] * key[:T]
During training, this decoder quirk will be implementeed using the tril technique above.

SOOOO to summarize:
(B, T) is the input matrix, where a B row is a block and a T column is a token
For a given T token,
x = positions_embeddings[T] + token_embeddings[T]
wei = query[T] _dot_product_ key[:T]
wei = trill_masking(wei)

y = wei @ x
Which we then pass to a Linear layer
output = Softmax(y_linear)

SOOO to summarize. For Self Attention, for any given token, to predict the next token, I need to know:
1. What this token represents (Token Embeddings)
2. Where in a block this token resides (Position Embeddings)
3. What this token needs from all other tokens (Query)
4. What all other tokens before T have (keys[:T])
5. What this token will communicate if other token find it interesting (values[T])
"""

'\n# Positional embeddings\nBefore implementing self attention, a few things need to be imlemented\n1. In the bigram model, the embedding module is being used as a probability lookup table. It should be used as an embedding layer, where the embedding output \n    is a feature vector of length embedding_length, which a linear layer than converts into vocab size logits. This vector will represent what a certain vector means.\n2. We will create a second embedding module of size (block_size, embedding length). This will be used as position embedding, so it gives us the same vector for a certain position of \n    a token, regardless of its value. This vector will represent where in a sentence does it exist.\n3. We will Add both of these vectors togethere before passing them to the linear layer.\n\n\n# Self Attention:\nEvery token will emit two vector: Query and key\n1. Query represents what I am looking for\n2. Key represent what do I contain\n\nFor any given token T, We will dot product th

In [72]:
# So, let's implement one self-attention head

import torch
from torch import nn
import torch.nn.functional as F

B, T, C = 4, 8, 32
x = torch.randn(B, T, C)

# Attention head
head_size = 16

key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)

k = key(x) # (B, T, head_size)
q = key(x) # (B, T, head_size)
wei = q @ k.transpose(-2, -1) # (B, T, T)

tril = torch.tril(torch.ones(T,T))
wei = wei.masked_fill(tril == 0, float("-inf"))
wei = F.softmax(wei, dim = 1)
out = wei @ x
out.shape

torch.Size([4, 8, 32])

In [216]:
# So, Adding value linear layer too instead of using the raw x values

import torch
from torch import nn
import torch.nn.functional as F

B, T, C = 4, 8, 32
x = torch.randn(B, T, C)

# Attention head
head_size = 16

key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, 65, bias=False)
k = query(x) # (B, T, head_size)
q = key(x) # (B, T, head_size)
v = value(x)

wei = q @ k.transpose(-2, -1)  * C **-0.5 # (B, T, T)

tril = torch.tril(torch.ones(T,T))
wei = wei.masked_fill(tril[:T, :T] == 0, float("-inf"))
wei = F.softmax(wei, dim = -1)
out = wei @ v
out.shape

torch.Size([4, 8, 65])

In [217]:
v.shape

torch.Size([4, 8, 65])

In [218]:
wei.shape

torch.Size([4, 8, 8])

In [200]:
nn.Linear(32, 65)(out)

RuntimeError: mat1 and mat2 shapes cannot be multiplied (32x16 and 32x65)

In [130]:
wei.std()

tensor(0.2143, grad_fn=<StdBackward0>)

In [12]:
# putting it all together

import time

import torch
import numpy as np
from torch import nn
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter

# Parameters
TEXT_DATA_PATH = "tiny_shiekspear.txt"
TRAIN_SPLIT = 0.9
VOCAB_SIZE = 65
BLOCK_SIZE = 8

BATCH_SIZE = 4
N_EMBED = 32
HEAD_SIZE = 16

EPOCHS = 30000
EVAL_ITERS = 100
EVAL_INTERVAL = 250
LEARNING_RATE = 1e-3

MAX_NEW_TOKENS = 1000

#DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
DEVICE = torch.device("cpu")

with open(TEXT_DATA_PATH, "r") as file:
    text_data = file.read()

chars = sorted(list(set(text_data)))
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}


def encode(text):
    return [stoi[c] for c in text]


def decode(indices):
    return "".join([itos[i] for i in indices])


def get_batch(data, batch_size, block_size):
    indices = torch.randint(0, len(data) - block_size - 1, (batch_size,))
    X = torch.stack([data[idx : idx + block_size] for idx in indices])
    y = torch.stack([data[idx + 1 : idx + block_size + 1] for idx in indices])
    return X, y


@torch.no_grad()
def estimate_loss(model: nn.Module, eval_iters, train_data, val_data):
    out = {}
    model.eval()

    for split in ["train", "val"]:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, y = (
                get_batch(train_data, BATCH_SIZE, BLOCK_SIZE)
                if split == "train"
                else get_batch(val_data, BATCH_SIZE, BLOCK_SIZE)
            )
            logits, loss = model(X, y)

            losses[k] = loss

        out[split] = losses.mean()
    model.train()
    return out


train_data_tensor = torch.tensor(encode(text_data), dtype=torch.long).to(DEVICE)
train_data = train_data_tensor[: int(len(train_data_tensor) * TRAIN_SPLIT)]
val_data = train_data_tensor[int(len(train_data_tensor) * TRAIN_SPLIT) :]


class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size, block_size, n_embed, head_size) -> None:
        super().__init__()
        self.vocab_size = vocab_size
        self.block_size = block_size
        self.n_embed = n_embed

        self.token_embeddings = nn.Embedding(vocab_size, n_embed)
        self.positional_embeddings = nn.Embedding(block_size, n_embed)

        self.queries = nn.Linear(n_embed, head_size, bias=False)
        self.keys = nn.Linear(n_embed, head_size, bias=False)
        self.values = nn.Linear(n_embed, head_size, bias=False)
        self.proj = nn.Linear(head_size, n_embed)
        self.lm_head = nn.Linear(n_embed, vocab_size) 
        self.register_buffer("tril", torch.tril(torch.ones(block_size, block_size)))

    def forward(self, x, targets=None):
        B, T = x.shape
        token_emb = self.token_embeddings(x)  # (B, T, C)
        position_emb = self.positional_embeddings(
            torch.arange(T)
        )  # (T, C)

        x = token_emb + position_emb

        q = self.queries(x)  # (B, T, C)
        k = self.keys(x)  # (B, T, C)
        v = self.values(x)  # (B, T, C)

        wei = q @ k.transpose(-2, -1) * self.n_embed **-0.5  #  (B, T, T)
        # In the case of an encoder, this masked_fill will not be here, so that all tokens can freely communicate with each_other
        wei = wei.masked_fill(
            self.tril[:T, :T] == 0, float("-inf")
        )  # tril will be braodcaseted in the following way
        # (B, T, T) and  (T, T) need broadcasting
        # Pytorch will right align them
        # (B, T, T)
        #    (T, T)
        # And it will then add a new dim
        # (B, T, T)
        # (1, T, T)
        # and then broadcast
        # (B, T, T)
        # (B, T, T)
        wei = F.softmax(wei, dim=-1)
        # Use the values instead of default private token encodings
        x = wei @ v # (B, T, T) @ (B, T, C)

        x = self.proj(x)
        logits = self.lm_head(x)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):

            idx_cond = idx[
                :, -self.block_size :
            ]  # Because we can only pass in the last block, This is where the context window limit comes from for transformer models

            logits, _ = self(idx_cond)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

In [13]:
model = BigramLanguageModel(VOCAB_SIZE, BLOCK_SIZE, N_EMBED, HEAD_SIZE).to(DEVICE)
model

BigramLanguageModel(
  (token_embeddings): Embedding(65, 32)
  (positional_embeddings): Embedding(8, 32)
  (queries): Linear(in_features=32, out_features=16, bias=False)
  (keys): Linear(in_features=32, out_features=16, bias=False)
  (values): Linear(in_features=32, out_features=16, bias=False)
  (proj): Linear(in_features=16, out_features=32, bias=True)
  (lm_head): Linear(in_features=32, out_features=65, bias=True)
)

In [14]:
X, y = get_batch(train_data, BATCH_SIZE, BLOCK_SIZE)
X.shape, y.shape

(torch.Size([4, 8]), torch.Size([4, 8]))

In [15]:
logits, loss = model(X, y)
loss, logits.shape

(tensor(4.1742, grad_fn=<NllLossBackward0>), torch.Size([32, 65]))

In [16]:
# Works! Integrating this to the bigram file

In [17]:
print(
    decode(
        model.generate(torch.ones(1, 1, dtype=torch.long), MAX_NEW_TOKENS)[0].tolist()
    )
)


 u Slui3NHyM:
.Rb;&l:UO?.&NVCd?FZfRCdKW?Wy? W;iReZE!Tmny!v'
3KhY,YFFEoORgEHkYnJQIi;q-HqPoG!'lKNvyNa3iAtbzu hzamCKBzNKUu;3Y
h?JKTS; $WZbNniYri vKRjj
NHr?jquCn!JKn$sCLf-AI.L
YCXDaF&UxngT,nqkCHPQbBvxsj::YU!PjlGMBM-fKR sZJ;;KktpPdVhmnI&E? x-CFDvUJB,TJs$q -VnLCAfvV,xLWFvmvHOh!EGAsB3BsWeVlJDRJUUnbaYQ:hOR-KJ-AtOzrGtGqGZ?&WcsWvLYOBe
lVZMStd'kdFjc&emF-OdoeEk.ewwqQQJ,WDmJvxU?G&ejxMaZWrE 'lhu,.YMeBOYAUV!BDbFjoCHafxz.zWe-!CI?YgXgtDWDOsfc?vjqHVwl'uouiNoqObKsqDGJmCgNHo'F:ik&qwn?rLluJVhbGR
.-'uxHv,-vb;nvpr-K:J;s&PhDSwSzitGG?lrRWbnGXQ!yrHhFsu.
Xd3hWRJwM$Mh:KeQAsDktgVs$iFMRgrXz$J'$MxpKsxL;hLGEw!.oKSaqXZuO$k,?KvrwvK;lwv
yNHqrenHaiEt-YaV:WSmflN-votidMOgGoyHaL3RLirfa!IZ MX-jtsWtIlkhZbfqpF-?JIUPkV,Q:$usmN&GxI ewZstkQeKUBZ-SQD:CU3Jr':-UhCKcjMijDT
dWL,RrGoS$YYArdTnn VVNs;C-L
xI
RmM&ZZ Xqf
JoWj'LwjlkxvFaJRLUNM 3OHjP
Bsyrfx;ibIym:vQ?.N$Z'wJ$zb PTlmc$O:IDwISGl,Qjpq?XAjXfRztFIL&UftMPSjywjoRfl3e!RC
LK
mJb ;gIUqalBpK'EoDXJs3iVcQBH?33

Ycf?hax':McnkLplTgfiWl$rQCVzkJ YQt!ZpiTT$L JJT'Yvtrw:pfw&WRRZLAAkSiRbLnrkeMGRmh.

# Multi-head Attention:

In [None]:
# All we need to do to implement multi-head-attention is to rewrite the bigram-language-model--with-self-attention file such that the language model and the head are separate classes
# And then we run miltiple heads in parallel, and then use scaled do