In [14]:
TEXT_DATA_PATH = "tiny_shiekspear.txt"
# Read the whole corpus into memory as one string. The encoding is passed
# explicitly so the result does not depend on the platform's default locale.
with open(TEXT_DATA_PATH, "r", encoding="utf-8") as file:
    text_data = file.read()

print(text_data[:500])  # Print the first 500 characters of the text data


First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor


In [78]:
# Character-level vocabulary: every distinct character of the corpus, in sorted order.
chars = sorted(set(text_data))
VOCAB_SIZE = len(chars)

print(f"Unique characters: {''.join(chars)}")
print(f"Vocab size: {VOCAB_SIZE}")

Unique characters: 
 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
Vocab size: 65


In [16]:
# Lookup tables between characters and their integer ids (an id is the
# character's position in the sorted `chars` list).
itos = dict(enumerate(chars))
stoi = {ch: i for i, ch in itos.items()}


def encode(text):
    """Return the list of integer ids for the characters of `text`.

    Raises KeyError if `text` contains a character that is not a key of `stoi`.
    """
    return list(map(stoi.__getitem__, text))


def decode(indices):
    """Return the string spelled by the integer ids in `indices`.

    Raises KeyError if an id is not a key of `itos`.
    """
    return ''.join(itos[i] for i in indices)

In [17]:
# Round-trip check: decoding an encoded string should give the string back.
encoded = encode("hello!")
decoded = decode(encoded)
print(f"Encoded: {encoded}")
print(f"Decoded: {decoded}")

Encoded: [46, 43, 50, 50, 53, 2]
Decoded: hello!


In [20]:
# Encode the whole corpus once into a tensor of int64 token ids and show its shape
import torch

train_data_tensor = torch.tensor(encode(text_data), dtype=torch.long)
train_data_tensor.shape

torch.Size([1115394])

In [21]:
# First 100 token ids of the encoded corpus
train_data_tensor[:100]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])

In [22]:
# First 100 raw characters of the corpus, for comparison with the first 100 token ids
text_data[:100]

'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou'

In [38]:
# Train and validation split
# The first TRAIN_SPLIT fraction of the token stream is used for training and the
# rest for validation. The split index is derived from TRAIN_SPLIT (computed once)
# so that changing the constant actually changes the split.
TRAIN_SPLIT = 0.9
split_idx = int(len(train_data_tensor) * TRAIN_SPLIT)
train_data = train_data_tensor[:split_idx]
val_data = train_data_tensor[split_idx:]
train_data.shape, val_data.shape

(torch.Size([1003854]), torch.Size([111540]))

In [27]:
# Block size = context window length
# Visualize one chunk of text of size BLOCK_SIZE + 1 (the extra token supplies the
# target for the last input position)

BLOCK_SIZE = 8
block = train_data[:BLOCK_SIZE + 1]
print(f"Block indices: {block.tolist()}")


Block indices: [18, 47, 56, 57, 58, 1, 15, 47, 58]


In [45]:
# A single block of BLOCK_SIZE + 1 tokens yields BLOCK_SIZE training examples:
# every prefix of the input is a context, and the token right after it is the target.

# y is X shifted by one position, because the model's task is to predict the next token
# given the previous tokens (or only the last token, in the case of a bigram model).
X, y = train_data[:BLOCK_SIZE], train_data[1:BLOCK_SIZE + 1]

print(f"For an input block: {X.tolist()}")
for t in range(BLOCK_SIZE):
    context = X[:t + 1].tolist()
    print(f"For input: {context}, Next token would be: {y[t]}")
    

tensor([18, 47, 56, 57, 58,  1, 15, 47])
tensor([47, 56, 57, 58,  1, 15, 47, 58])
For an input block: [18, 47, 56, 57, 58, 1, 15, 47]
For input: [18], Next token would be: 47
For input: [18, 47], Next token would be: 56
For input: [18, 47, 56], Next token would be: 57
For input: [18, 47, 56, 57], Next token would be: 58
For input: [18, 47, 56, 57, 58], Next token would be: 1
For input: [18, 47, 56, 57, 58, 1], Next token would be: 15
For input: [18, 47, 56, 57, 58, 1, 15], Next token would be: 47
For input: [18, 47, 56, 57, 58, 1, 15, 47], Next token would be: 58


In [59]:
# Build one training block that starts at a random position in the training data.
random_idx = torch.randint(0, len(train_data) - BLOCK_SIZE - 1, (1,)).item()
block_end = random_idx + BLOCK_SIZE
X = train_data[random_idx:block_end]
# y is X shifted by one position: the model predicts the next token given the previous
# tokens (or only the last token, in the case of a bigram model).
y = train_data[random_idx + 1:block_end + 1]

print(f"For an input block: {X.tolist()}")
for t in range(BLOCK_SIZE):
    context = X[:t + 1].tolist()
    print(f"For input: {context}, Next token would be: {y[t]}")
    
    

For an input block: [53, 44, 1, 59, 57, 10, 1, 58]
For input: [53], Next token would be: 44
For input: [53, 44], Next token would be: 1
For input: [53, 44, 1], Next token would be: 59
For input: [53, 44, 1, 59], Next token would be: 57
For input: [53, 44, 1, 59, 57], Next token would be: 10
For input: [53, 44, 1, 59, 57, 10], Next token would be: 1
For input: [53, 44, 1, 59, 57, 10, 1], Next token would be: 58
For input: [53, 44, 1, 59, 57, 10, 1, 58], Next token would be: 46


In [70]:
# Stacking
# Draw BATCH_SIZE random block starts and stack the resulting blocks row by row,
# giving input and target tensors of shape (BATCH_SIZE, BLOCK_SIZE).
BATCH_SIZE = 4

indices = torch.randint(0, len(train_data) - BLOCK_SIZE - 1, (BATCH_SIZE,))
input_rows, target_rows = [], []
for idx in indices:
    input_rows.append(train_data[idx: idx + BLOCK_SIZE])
    target_rows.append(train_data[idx + 1: idx + BLOCK_SIZE + 1])
X = torch.stack(input_rows)
y = torch.stack(target_rows)

print(X)
print(y)

tensor([[42, 39, 63,  0, 13, 57,  1, 47],
        [ 1, 21,  1, 41, 39, 52, 52, 53],
        [ 1, 61, 47, 50, 50,  1, 54, 56],
        [46, 43,  1, 49, 47, 52, 45,  1]])
tensor([[39, 63,  0, 13, 57,  1, 47, 57],
        [21,  1, 41, 39, 52, 52, 53, 58],
        [61, 47, 50, 50,  1, 54, 56, 39],
        [43,  1, 49, 47, 52, 45,  1, 46]])


In [71]:
# X and y are both (B, T) matrices,
#   where the B dim indexes the blocks within a BATCH
#   and the T dim runs along a block through TIME.
# For any row b, the context X[b, :t+1] has target y[b, t], i.e. the token that follows the context.
# The row index is named `b` so that it does not overwrite the `block` tensor created earlier.
for b in range(BATCH_SIZE):
    for t in range(BLOCK_SIZE):
        print(f"For input tensor: {X[b, :t+1]}, Target is: {y[b, t]}")

For input tensor: tensor([42]), Target is: 39
For input tensor: tensor([42, 39]), Target is: 63
For input tensor: tensor([42, 39, 63]), Target is: 0
For input tensor: tensor([42, 39, 63,  0]), Target is: 13
For input tensor: tensor([42, 39, 63,  0, 13]), Target is: 57
For input tensor: tensor([42, 39, 63,  0, 13, 57]), Target is: 1
For input tensor: tensor([42, 39, 63,  0, 13, 57,  1]), Target is: 47
For input tensor: tensor([42, 39, 63,  0, 13, 57,  1, 47]), Target is: 57
For input tensor: tensor([1]), Target is: 21
For input tensor: tensor([ 1, 21]), Target is: 1
For input tensor: tensor([ 1, 21,  1]), Target is: 41
For input tensor: tensor([ 1, 21,  1, 41]), Target is: 39
For input tensor: tensor([ 1, 21,  1, 41, 39]), Target is: 52
For input tensor: tensor([ 1, 21,  1, 41, 39, 52]), Target is: 52
For input tensor: tensor([ 1, 21,  1, 41, 39, 52, 52]), Target is: 53
For input tensor: tensor([ 1, 21,  1, 41, 39, 52, 52, 53]), Target is: 58
For input tensor: tensor([1]), Target is: 61

In [72]:
# get batch -> x, y for a random batch (of blocks)
# Note 1

def get_batch(data, batch_size, block_size):
    """Sample a random batch of (input, target) blocks from a 1-D token tensor.

    Args:
        data: 1-D tensor of token ids to sample from.
        batch_size: number of blocks in the batch.
        block_size: number of tokens in each block.

    Returns:
        (X, y), both of shape (batch_size, block_size); y holds the same
        windows as X shifted one position to the right in `data`.
    """
    max_start = len(data) - block_size - 1
    starts = torch.randint(0, max_start, (batch_size,))

    inputs, targets = [], []
    for start in starts:
        stop = start + block_size
        inputs.append(data[start:stop])
        targets.append(data[start + 1:stop + 1])

    return torch.stack(inputs), torch.stack(targets)

# Smoke test: draw one random (X, y) batch from the training data and display it
get_batch(train_data, BATCH_SIZE, BLOCK_SIZE)

(tensor([[42,  1, 58, 46, 43, 51,  1, 41],
         [53, 50, 43, 52, 58,  6,  1, 39],
         [53, 51, 43,  1, 54, 47, 58, 63],
         [56, 53, 54, 43,  7, 58, 56, 47]]),
 tensor([[ 1, 58, 46, 43, 51,  1, 41, 53],
         [50, 43, 52, 58,  6,  1, 39, 52],
         [51, 43,  1, 54, 47, 58, 63,  8],
         [53, 54, 43,  7, 58, 56, 47, 41]]))

In [80]:
# Bigram language model that uses nn.Embedding to generate logits, with cross-entropy loss ### Come back and try making a trigram model too
# Check dimensionality of input and output
# Note 2
import torch
from torch import nn
import torch.nn.functional as F

class BigramLanguageModel(nn.Module):
    """Bigram language model: one lookup table from the current token to next-token logits.

    Args:
        input_vocab_size: number of rows of the table (distinct input tokens).
        output_vocab_size: number of columns of the table (distinct output tokens).
    """

    def __init__(self, input_vocab_size, output_vocab_size) -> None:
        super().__init__()
        self.embedding_lookup_table = nn.Embedding(input_vocab_size, output_vocab_size)

        # Notes on nn.Embedding:
        # It is a trainable (input_values, output_values) matrix that is indexed by row.
        # - Bigram model (one past character predicts the next one):
        #   input_values = output_values = character_set_length.
        # - Trigram model (two past characters predict the next one):
        #   output_values = character_set_length, input_values = character_set_length ** 2,
        #   so the table would be (character_set_length ** 2, character_set_length).
        # - Guess (unverified): in an LLM this is a (vocab_size, vocab_size) matrix.
        #
        # The forward pass uses the table as a lookup of next-token scores: the row selected
        # by an input token holds one score (logit) per possible output token. Backprop through
        # cross entropy therefore pushes each row towards the distribution of what follows that
        # token, as done in makemore.
        #
        # This could equally have been implemented with a plain (input_values, output_values)
        # tensor with gradients and it would have worked just the same.

    def forward(self, idx, target=None):
        """Look up next-token logits and, if targets are given, the cross-entropy loss.

        Args:
            idx: (B, T) integer tensor of token ids; B is the batch size and T the number
                of time steps (block size).
            target: optional (B, T) integer tensor of next-token ids.

        Returns:
            (logits, loss): logits has shape (B, T, C), where C is the output vocab size;
            loss is None when `target` is None, otherwise a scalar tensor.
        """
        x = self.embedding_lookup_table(idx)
        # `target is None` rather than `not target`: truth-testing a tensor with more than
        # one element raises a RuntimeError.
        if target is None:
            return x, None
        # cross_entropy expects the class dimension at index 1, so (B, T, C) -> (B, C, T).
        return x, F.cross_entropy(x.transpose(-2, -1), target)
            


In [82]:
# For the forward pass to work, cross_entropy expects (input, target). But Andrej mentioned
# something about rearranging the input from (B, T, C) to (B, C, T); will run and test.
model = BigramLanguageModel(VOCAB_SIZE, VOCAB_SIZE)

X, y = get_batch(train_data, BATCH_SIZE, BLOCK_SIZE)

In [94]:
# Pass one token, the last token of each input sequence, because this is a bigram model.
# But the input sequence is a moving window, as we printed above,
# so the actual lookup indices should be (B*BLOCK_SIZE, C).
# I don't know how to implement that yet, so I am just going to take one example per block
# for the sake of making the forward pass work.

# Two separate print calls: `print(a), print(b)` builds a tuple of the prints' return
# values, which the notebook then echoes as a stray `(None, None)`.
print(model.embedding_lookup_table(X[:, -1]).shape)
print(y[:, -1].shape)
# This gives (BATCH_SIZE, VOCAB_SIZE): VOCAB_SIZE logits per block, which I am assuming
# are the scores for what the next token should be

torch.Size([4, 8, 65])
torch.Size([4])


(None, None)

In [89]:
# torch.distributions.Polynomial() # (no such class exists; the sampling call used further down is torch.multinomial) I don't know how this will work in PyTorch, but I am guessing that after we get the embedding logits,
# we sample from a distribution built from those logits, which gives us the index of the next token; that will be our actual output

In [93]:
# Logits for the last token of each block, scored against the last target of each block
seq = model.embedding_lookup_table(X[:, -1])
target = y[:, -1]
F.cross_entropy(seq, target)

# This works, but again, this is (B, C) against (B,); I have no idea yet how to make this work for (B, T, C) against (B, T)

tensor(5.5493, grad_fn=<NllLossBackward0>)

In [95]:
# Oh, I keep struggling because I keep trying to write the most general solution ever.
# This is a bigram model: model.embedding_lookup_table(X) for a (B, T) matrix gives (B, T, C),
# because in a bigram model only the current token is needed to predict the next token.
# No need to generate every prefix of the sequence. This will not work for a trigram model, or at least not exactly like this.
# Printed with two separate calls so the cell does not also echo a `(None, None)` tuple.
print(model.embedding_lookup_table(X).shape)
print(y.shape)


torch.Size([4, 8, 65])
torch.Size([4, 8])


(None, None)

In [97]:
seq = model.embedding_lookup_table(X)
target = y

# Deliberate demonstration of the shape mismatch. The error is caught and printed so the
# notebook still runs from top to bottom.
try:
    F.cross_entropy(seq, target)
except RuntimeError as err:
    print(f"RuntimeError: {err}")

# F.cross_entropy(seq, target) fails with: RuntimeError: Expected target size [4, 65], got [4, 8]
# For inputs with more than two dimensions, cross_entropy expects the class dimension at
# index 1: input (N, C, d1, ...) with class-index targets of shape (N, d1, ...).
# Given seq as (B, T, C) it reads T as the class dimension and C as an extra dimension,
# so it asks for a target of shape (B, C) = (4, 65).
# So I will swap only the two inner dimensions of seq, so that it goes from (B, T, C) to (B, C, T)

RuntimeError: Expected target size [4, 65], got [4, 8]

In [100]:
# Swap the last two dimensions: (B, T, C) -> (B, C, T)
seq.transpose(-2, -1).shape
# Looks good

torch.Size([4, 65, 8])

In [101]:
# With the class dimension moved to index 1, the (B, T) targets are accepted
F.cross_entropy(seq.transpose(-2, -1), target)

tensor(4.8080, grad_fn=<NllLoss2DBackward0>)

In [None]:
# Boom!

In [116]:
import torch
from torch import nn
import torch.nn.functional as F

class BigramLanguageModel(nn.Module):
    """Bigram language model with two equivalent ways of computing the loss.

    `forward` flattens the logits and targets before calling cross_entropy (how Andrej
    did it); `my_forward` moves the class dimension to index 1 instead (how I did it).
    Both give the same loss for the same batch.

    Args:
        vocab_size: number of distinct tokens; the lookup table is (vocab_size, vocab_size).
    """

    def __init__(self, vocab_size) -> None:
        super().__init__()
        self.embedding_lookup_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, target=None):
        """Return (logits, loss) using the flatten-then-cross-entropy formulation.

        Args:
            idx: (B, T) integer tensor of token ids.
            target: optional (B, T) integer tensor of next-token ids.

        Returns:
            When `target` is None: logits of shape (B, T, C) and a loss of None.
            Otherwise: logits flattened to (B*T, C) and the scalar cross-entropy loss.
        """
        logits = self.embedding_lookup_table(idx)
        # Without targets there is nothing to score; previously this path crashed on
        # `target.view` even though `target` defaults to None.
        if target is None:
            return logits, None

        # this is how Andrej did it: merge batch and time so every position is one sample
        B, T, C = logits.shape
        logits = logits.view(B*T, C)
        targets = target.view(B*T)
        loss = F.cross_entropy(logits, targets)

        return logits, loss

    def my_forward(self, idx, target=None):
        """Return (logits, loss) using the transpose formulation.

        Args:
            idx: (B, T) integer tensor of token ids.
            target: optional (B, T) integer tensor of next-token ids.

        Returns:
            Logits of shape (B, T, C), and either None (no target) or the scalar
            cross-entropy loss.
        """
        x = self.embedding_lookup_table(idx)
        if target is None:
            return x, None
        # cross_entropy expects the class dimension at index 1: (B, T, C) -> (B, C, T)
        return x , F.cross_entropy(x.transpose(-2, -1), target)

# Fresh batch and fresh model, then the loss from Andrej's formulation
X, y = get_batch(train_data, BATCH_SIZE, BLOCK_SIZE)
model = BigramLanguageModel(VOCAB_SIZE)
y_pred , loss = model.forward(X, y)
y_pred.shape, loss

# Well, my method and Andrej's are giving different results... let me ask ChatGPT what the difference is


(torch.Size([32, 65]), torch.Size([]))

In [117]:
# Andrej's formulation on the current batch: logits come back flattened to (B*T, C)
y_pred , loss = model.forward(X, y)
y_pred.shape, loss

(torch.Size([32, 65]), tensor(4.6674, grad_fn=<NllLossBackward0>))

In [118]:
# My formulation on the same batch and model: logits stay (B, T, C)
y_pred , loss = model.my_forward(X, y)
y_pred.shape, loss

(torch.Size([4, 8, 65]), tensor(4.6674, grad_fn=<NllLoss2DBackward0>))

In [None]:
# I had no idea what changed (Edit: I do now, it was a different batch in the first run), but having implemented them both, they are very clearly mathematically equivalent
# I'm a genius!
# Look Ma! No hands!
# Now let's clean up, and implement a generate function limited by max_new_tokens to let the model babble

In [159]:
import torch
from torch import nn
import torch.nn.functional as F

class BigramLanguageModel(nn.Module):
    """Bigram language model: a (vocab_size, vocab_size) table of next-token logits.

    Args:
        vocab_size: number of distinct tokens.
    """

    def __init__(self, vocab_size) -> None:
        super().__init__()
        self.embedding_lookup_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, target=None):
        """Return next-token logits and, if targets are given, the cross-entropy loss.

        Args:
            idx: (B, T) integer tensor of token ids.
            target: optional (B, T) integer tensor of next-token ids.

        Returns:
            (logits, loss): logits has shape (B, T, C); loss is None when `target` is
            None, otherwise a scalar tensor.
        """
        x = self.embedding_lookup_table(idx)
        if target is None:
            return x, None
        # cross_entropy expects the class dimension at index 1: (B, T, C) -> (B, C, T)
        return x , F.cross_entropy(x.transpose(-2, -1), target)

    @torch.no_grad()  # sampling needs no gradients, so skip autograd bookkeeping
    def generate(self, idx, max_new_tokens): # idx is the start seed token
        """Extend each sequence in `idx` by `max_new_tokens` sampled tokens.

        Args:
            idx: (B, T) integer tensor of seed token ids, ideally with T=1.
            max_new_tokens: number of tokens to append to every sequence.

        Returns:
            (B, T + max_new_tokens) integer tensor: the seeds followed by the samples.
        """
        for _ in range(max_new_tokens):
            logits, _ = self(idx) # Generate logits for what should come next for each B
            logits = logits[:, -1, :] # Keep only the last time step -> (B, C)
            probs = F.softmax(logits, dim=-1) # Softmax over the vocab dim turns logits into probabilities
            idx_next = torch.multinomial(probs, num_samples=1) # Sample the next token from that distribution
            idx = torch.cat((idx, idx_next), dim=1) # Append the new token to the end of the running sequence
        return idx



# Smoke test of the cleaned-up model: one forward pass with targets on a fresh batch
X, y = get_batch(train_data, BATCH_SIZE, BLOCK_SIZE)
model = BigramLanguageModel(VOCAB_SIZE)
y_pred , loss = model.forward(X, y)
y_pred.shape, loss

(torch.Size([4, 8, 65]), tensor(4.3515, grad_fn=<NllLoss2DBackward0>))

In [169]:
# Seed with a single token of id 0 (the newline character), sample 1000 more tokens and decode them
print(decode(model.generate(torch.zeros(1, 1, dtype=torch.long), 1000)[0].tolist()))


Hm!pBQGR
h:
kqX3I!oMbB&TSsDMIizI!rF
xjpAwI-Ry.GjLGwatYyBquxjFOBYZeOEYFObrQOB&Lr-fK&tu-yTeB3U,SfnjGYF&'GN!$OBTZ 3V hfoQOBTpd'tpA-&w-ctxbmDdW:W$I'cHdlfPJolw!Ii.CnuB!ooCuWoqM?R-!B.anOB3o$pNLjYF'EEYUtPnM;OwNEJSdkMt3u'Z;OBLTSFNDk:v,;&fk:vpvrhiEAx.ddoyc!jAy.'tRY;OBky $Vv;UpdjMtxSuTthbaTH oZhy!?LQY;OMc ;OB!Ikgx
&TOBvEoB&FKoleMGYVK
G:gPb-yGthAqfoU,OBTZelR&OM?t&MUvLqN'vvbsv.Gs;hUNJ
ZTlSBzgpH!J&DS&U,qUvFAo3ErGgjai.
v SjfI3Id;,sDYfPxVQF'.yd'JSuk:wICpO BOJApdPOgwjM?W:t'Qy.&hank:VXbPMI!Wsm-,Krhcl?VY$aUu?ulyo&g'P,Lin3JGjdXYXNE$lfTI'Q;SuvlSdBu!glbf'w;jGR3MlwmbMbBOltzVyA,SB-BO;vJ3ILrypDkAelgD'fo &:.FVnD:anjrhnM3lw!,,weuVWgKoRRqahynDYEQTy sd-lw3$fZfjCux?qdkZm!-sszzmTRvvi$n;
Zx
Q?iRPahuQ;OBgxwevAbNkYFZuF!m!vb
;OBhEOB3Iu tZtuDeYYZfr-GtwbaqfgMZANkazuVXYFsvv.FPFIV'wNEiSXBth,Bvm,tD.PRyUbowI$NSPHD
;cK
Rq-' nCHvb' xXpavJ$r-ouA3JYWbMGAHe' yLNYUc
khcscsPnBHminrh,SLQTE&tZnNEoe?QFjrfoM;FsV3tzgX:;OBYkAYByNE;
Wk!AH$qvbIdXAU
$z.EAYvbE'hiSPKNkkgqpe'tsxqZbGv,Sdg:gvTbOUbTLyGfYxFTlxpELw?EYesFZOcMdiY3lDJVeSuN'eYFrF,wftf

In [190]:
# Rubbish, but expected because the model is untrained. Let's train!
# Re-create the model so that training starts from a newly initialized lookup table
model = BigramLanguageModel(VOCAB_SIZE)

In [205]:
# Training loop: every step draws a new random batch and applies one AdamW update.
batch_size = 32
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

for steps in range(10000):
    # Drop the gradients left over from the previous step before computing new ones.
    optimizer.zero_grad(set_to_none=True)

    xb, yb = get_batch(train_data, batch_size, BLOCK_SIZE)
    logits, loss = model(xb, yb)

    loss.backward()
    optimizer.step()

# Loss of the final batch only
print(loss.item())

2.4207818508148193


In [206]:
# Seed with a single token of id 1 (the space character), sample 1000 more tokens and decode them
print(decode(model.generate(torch.ones(1, 1, dtype=torch.long), 1000)[0].tolist()))

 m fo, g thr.
DWhirdrind Vaxce? fathe ar ie VEThe fo in my: Whe GEENESSThat
D:
Ithay ee s e:
Singlithe angrs,
Yorou

GHarierel magho than uryootofrdin, m me e blofonof.
Thers onk thought thearessk, nang le;
Finceilerisbu orth I fer s thistheaven.
A:
Waigsuralairarwhinomanstt se fongheenon fr. d t berve'eck y loueldind hirelaimucknar I thoupoy N:

Yo umy sw,
thaecty yo won, alirshigorthe storirn hinto 'lise hbbouttlthe,
USh-the?
An sthe ly.

BULTerin Wes p thonwaltovouberon f siery sirofodsucthe dre ows y ot then.
uterorthiviggobour an.


Shasifoofoinoutitins hindi'd hin iusithathaped fullalofo, he'sthote y'ORINGHalle bu hech ten Gllilst my, marron'shehee,
Priate theses t yowich ve lerenerer t pue


ID nwac foushendorof home th at.
ARO ear gin bl hese,
FOFiree

ABrine llo knge Fob ashiabers tin'sowh ftof o, M:
Helo lthese ISe:
Ork theth hay t:
YOfren foend horsofod.
Tit cr or d d pun:
METINRKERDanngeawaclita nn'su orghad I waighefin!
NGENond 'lar ougorir my ke yor
COLONTotho te d,
QUS:


In [None]:
# Looks good enough, let's move this over to transformer_dev.py