In [1]:
# We always start with a dataset to train on. Let's download the tiny shakespeare dataset.
# (`!` hands the line to the shell; curl writes the download to input.txt.)
!curl -o input.txt https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1089k  100 1089k    0     0  5451k      0 --:--:-- --:--:-- --:--:-- 5473k


In [2]:
# load the whole corpus into memory as one string so we can inspect it
with open('input.txt', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

In [3]:
# sanity check: how many characters does the corpus contain?
print("length of dataset in characters:", len(text))


length of dataset in characters: 1115394


In [4]:
# Let's look at the first 1000 characters to get a feel for the raw text
print(text[:1000])


First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



In [5]:
# build the character-level vocabulary: every distinct character of the corpus, in sorted order
chars = sorted(set(text))
vocab_size = len(chars)
print(''.join(chars)) # the whole vocabulary shown as a single string
print(vocab_size)




 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [6]:
# Character-level tokenizer: map every character of the vocabulary to an integer and back.
stoi = {ch: i for i, ch in enumerate(chars)}  # string -> integer id
itos = {i: ch for i, ch in enumerate(chars)}  # integer id -> string


def encode(s):
    """Encoder: take a string, return the list of integer ids of its characters.

    Raises KeyError if `s` contains a character that is not a key of `stoi`.
    """
    return [stoi[c] for c in s]


def decode(l):
    """Decoder: take an iterable of integer ids, return the string they spell.

    Raises KeyError if an id is not a key of `itos`.
    """
    return ''.join(itos[i] for i in l)


# now let's encode and decode some text (a round trip should give the text back)
print(encode("hii there"))
print(decode(encode("hii there")))


# This is character-level tokenization: the text is treated as a sequence of characters.
# There are other schemes, e.g. word-level or sub-word-level tokenization:
#   - Google uses SentencePiece, a sub-word tokenizer.
#   - OpenAI uses Byte-Pair Encoding (BPE), also sub-word level; tiktoken is their library for it.
#
# It is a trade-off: with characters the vocabulary has only 65 tokens, while a BPE vocabulary
# has maybe some 50000 tokens, but the encoded sequence is much shorter -- "hii there" becomes
# something like [71, 4178, 612], i.e. 3 tokens instead of 9.
# So we can have a long sequence of integers with a small vocabulary, or a short sequence of
# integers with a large vocabulary. Sub-word encodings are the usual choice in practice, but we
# keep things simple and use character-level encoding for now.


[46, 47, 47, 1, 58, 46, 43, 56, 43]
hii there


In [7]:
# now we encode the entire text dataset and store it in a torch tensor
import torch # using pytorch to store the data
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:1000]) # the same 1000 characters printed earlier, now as integer ids

# next we split this tensor into a training part and a validation part


torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
      

In [8]:
# hold out the last 10% of the tokens for validation; the first 90% is used for training
n = int(0.9*len(data))
train_data, val_data = data[:n], data[n:]

# comparing the loss on val_data with the loss on train_data shows how much the model overfits

In [9]:
# now we feed text sequences into the model to train on them
# we never train on the whole dataset at once, that would be far too compute intensive
# instead we sample small random chunks of the dataset, each at most block_size long (8 here)

block_size = 8 # maximum context length: how many characters may be used to predict the next one
train_data[:block_size+1] # block_size+1 characters contain block_size (input, next-character) examples, one per adjacent pair



tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [10]:
x = train_data[:block_size]
y = train_data[1:block_size+1]
for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f"when input is {context} the target is {target}") ## one chunk yields a training example for every context length from 1 up to block_size


# contexts longer than block_size have to be truncated, because the transformer never receives more than block_size tokens
# besides the time dimension there is a second dimension we care about: the batch dimension
# a batch stacks several independent chunks so they are processed together in a single forward pass
# GPUs are very good at parallel computation, so batching keeps them busy

when input is tensor([18]) the target is 47
when input is tensor([18, 47]) the target is 56
when input is tensor([18, 47, 56]) the target is 57
when input is tensor([18, 47, 56, 57]) the target is 58
when input is tensor([18, 47, 56, 57, 58]) the target is 1
when input is tensor([18, 47, 56, 57, 58,  1]) the target is 15
when input is tensor([18, 47, 56, 57, 58,  1, 15]) the target is 47
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target is 58


In [11]:
torch.manual_seed(1337) # seed torch's random number generator so the sampled batches are the same on every run

batch_size = 4 # how many independent sequences are stacked into one batch
block_size = 8 # maximum context length (in characters) used to predict the next character

def get_batch(split):
    """Sample a random batch of input windows and their next-token targets.

    Reads the module-level `train_data`, `val_data`, `block_size` and
    `batch_size`.

    Args:
        split: 'train' selects `train_data`; any other value selects `val_data`.

    Returns:
        (x, y), both of shape (batch_size, block_size); y is x shifted one
        position to the right within the source tensor.
    """
    source = val_data if split != 'train' else train_data
    # one random start offset per sequence; the upper bound leaves room for the shifted target window
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start + block_size])
        targets.append(source[start + 1:start + block_size + 1])
    return torch.stack(inputs), torch.stack(targets)

# draw one training batch and show both tensors
xb, yb = get_batch('train')
print("inputs:", xb.shape, xb, sep="\n")
print("targets:", yb.shape, yb, sep="\n")

print('----')

# spell out every (context, target) example packed into the batch
for seq_in, seq_out in zip(xb, yb): # batch dimension
    for t, target in enumerate(seq_out): # time dimension
        context = seq_in[:t+1]
        print(f"when input is {context.tolist()} the target is {target}")





inputs:
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
targets:
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])
----
when input is [24] the target is 43
when input is [24, 43] the target is 58
when input is [24, 43, 58] the target is 5
when input is [24, 43, 58, 5] the target is 57
when input is [24, 43, 58, 5, 57] the target is 1
when input is [24, 43, 58, 5, 57, 1] the target is 46
when input is [24, 43, 58, 5, 57, 1, 46] the target is 43
when input is [24, 43, 58, 5, 57, 1, 46, 43] the target is 39
when input is [44] the target is 53
when input is [44, 53] the target is 56
when input is [44, 53, 56] the target is 1
when input is [44, 53, 56, 1] the target is 58
when input is [44, 53, 56, 1, 58] the target i

In [12]:
## now we have the data, so we can start building the model
## instead of a transformer, we start with the simplest possible neural language model: the bigram model
## the bigram model predicts the next character from the current character alone (it ignores all earlier context)


import torch
import torch.nn as nn

from torch.nn import functional as F

class BigramLanguageModel(nn.Module):
    """Bigram language model.

    The logits for the next token are read straight out of a
    (vocab_size, vocab_size) embedding table, indexed by the current token.
    """

    def __init__(self, vocab_size):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Score the next token at every position and optionally compute the loss.

        Args:
            idx: (B, T) integer tensor of token ids.
            targets: optional integer tensor with B*T elements holding the
                next-token id for every position of `idx`.

        Returns:
            (logits, loss). Without `targets`, logits have shape (B, T, C) and
            loss is None. With `targets`, logits are returned flattened to
            (B*T, C) and loss is the cross-entropy over all B*T positions.
        """
        logits = self.token_embedding_table(idx)  # (B, T, C) with C == vocab_size
        if targets is None:
            return logits, None

        # cross_entropy expects (N, C) scores and (N,) class ids, so fold batch and time together
        B, T, C = logits.shape
        logits = logits.view(B*T, C)
        loss = F.cross_entropy(logits, targets.view(B*T))
        return logits, loss

    @torch.no_grad()  # sampling is pure inference: do not record an autograd graph
    def generate(self, idx, max_new_tokens):
        """Extend every sequence in `idx` by `max_new_tokens` sampled tokens.

        Args:
            idx: (B, T) integer tensor of token ids forming the current context.
            max_new_tokens: number of tokens to sample and append.

        Returns:
            (B, T + max_new_tokens) integer tensor: the context followed by
            the sampled continuation.
        """
        for _ in range(max_new_tokens):
            # get the predictions (no targets, so the loss is None and is ignored)
            logits, _ = self(idx)
            # focus only on the last time step
            logits = logits[:, -1, :]  # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1)  # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
        return idx
# instantiate the model and run the demo batch through it
m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)

# sample 100 characters from the still-untrained model, starting from a single token with id 0
start_idx = torch.zeros((1, 1), dtype=torch.long)
sampled = m.generate(idx=start_idx, max_new_tokens=100)
print(decode(sampled[0].tolist()))
    
    


torch.Size([32, 65])
tensor(5.0364, grad_fn=<NllLossBackward0>)

lfJeukRuaRJKXAYtXzfJ:HEPiu--sDioi;ILCo3pHNTmDwJsfheKRxZCFs
lZJ XQc?:s:HEzEnXalEPklcPU cL'DpdLCafBheH


In [13]:
# create a PyTorch optimizer: AdamW over all parameters of the model, with learning rate 1e-3
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [14]:
batch_size = 32 # train with larger batches than the 4 used in the demo; get_batch reads this global
for _ in range(10000):
    # clear the gradients left over from the previous step
    optimizer.zero_grad(set_to_none=True)

    # sample a fresh batch and compute the loss on it
    xb, yb = get_batch('train')
    logits, loss = m(xb, yb)

    # backpropagate and update the parameters
    loss.backward()
    optimizer.step()

# loss of the final mini-batch only, so this is a noisy estimate
print(loss.item())



2.362441062927246


In [15]:
# sample 500 characters from the trained bigram model, again starting from a single token with id 0
print(decode(m.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=500)[0].tolist()))


M:
IUSh t,
F th he d ke alved.
Thupld, cipbll t
I: ir w, l me sie hend lor ito'l an e

I:
Gochosen ea ar btamandd halind
Aust, plt t wadyotl
I bel qunganonoth he m he de avellis k'l, tond soran:

WI he toust are bot g e n t s d je hid t his IAces I my ig t
Ril'swoll e pupat inouleacends-athiqu heamer te
Wht s

MI wect!-lltherotheve t fe;
WAnd py;

PO t s ld tathat, ir V
IO thesecin teot tit ado ilorer.
Ply, d'stacoes, ld omat mealellly yererer EMEvesas ie IZEd pave mautoofareanerllleyomerer but?


*The Mathematical trick in self-attention*

In [16]:
# consider the following toy example: a small random tensor standing in for token embeddings

torch.manual_seed(1337)
B,T,C = 4,8,2 # batch size, sequence length, embedding dimensionality (n_embd)[channels]
x = torch.randn(B,T,C)

x.shape # (4, 8, 2)

torch.Size([4, 8, 2])

In [17]:
# version 1 (explicit loops)
# goal: xbow[b, t] = mean_{i <= t} x[b, i]
# every position gets the average of itself and all earlier positions, never of later ones
# averaging is a very lossy way to let tokens communicate (all ordering information is gone),
# but it is a good starting point and will be improved on later
# "bow" is short for "bag of words"
xbow = torch.zeros((B,T,C))
for b in range(B):
    for t in range(T):
        # x[b, :t+1] has shape (t+1, C): positions 0..t of sequence b
        xbow[b, t] = x[b, :t+1].mean(dim=0)





In [None]:
# version 2 (matrix multiply)
# `tril` is defined here so that the cells below that display and reuse it also work when the
# notebook is run top to bottom on a fresh kernel
tril = torch.tril(torch.ones(T, T))
# row t has 1/(t+1) in columns 0..t, so multiplying by it averages positions 0..t
wei = tril / tril.sum(1, keepdim=True)
# (T, T) @ (B, T, C): matmul broadcasts wei over the batch dimension,
# so this is a batched matrix multiply (B, T, T) @ (B, T, C) -----> (B, T, C)
xbow2 = wei @ x
torch.allclose(xbow, xbow2)



True

In [33]:
# display the lower-triangular mask of ones (requires `tril` to be defined already;
# the execution count shows this cell was run out of order)
tril

tensor([[1., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1.]])

In [38]:
wei = torch.zeros((T,T)) # a (T, T) matrix of zeros (8x8 here)
wei = wei.masked_fill(tril == 0, float('-inf')) # wherever tril is 0 (future positions) the score becomes -inf, so a later softmax gives those positions weight 0
# wei = F.softmax(wei, dim=-1) # would normalize each row so that it sums to 1
wei 

tensor([[0., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., -inf, -inf],
        [0., 0., 0., 0., 0., 0., 0., -inf],
        [0., 0., 0., 0., 0., 0., 0., 0.]])

In [30]:
# compare the loop result and the matmul result for the first batch element
xbow[0], xbow2[0]

(tensor([[ 0.1808, -0.0700],
         [-0.0894, -0.4926],
         [ 0.1490, -0.3199],
         [ 0.3504, -0.2238],
         [ 0.3525,  0.0545],
         [ 0.0688, -0.0396],
         [ 0.0927, -0.0682],
         [-0.0341,  0.1332]]),
 tensor([[ 0.1808, -0.0700],
         [-0.0894, -0.4926],
         [ 0.1490, -0.3199],
         [ 0.3504, -0.2238],
         [ 0.3525,  0.0545],
         [ 0.0688, -0.0396],
         [ 0.0927, -0.0682],
         [-0.0341,  0.1332]]))

In [32]:
# version 3 (softmax)
# start from all-zero scores, set future positions to -inf, and let softmax turn every row
# into uniform weights over the positions that are still allowed
tril = torch.tril(torch.ones(T, T))
wei = torch.zeros((T,T)).masked_fill(tril == 0, float('-inf'))
wei = wei.softmax(dim=-1)
xbow3 = wei @ x
torch.allclose(xbow, xbow3)



True

In [54]:
# version 4 (self-attention)
torch.manual_seed(1337)
B,T,C = 4,8,32 # batch size, time, embedding dimensionality (n_embd)[channels]
x = torch.randn(B,T,C)

# let's see a single Head perform the self-attention
head_size = 16 # output dimension of this head's key/query/value projections (not the number of heads)
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)
k = key(x) # (B, T, 16)
q = query(x) # (B, T, 16)
wei = q @ k.transpose(-2, -1) # (B, T, 16) @ (B, 16, T) -----> (B, T, T); transpose swaps the last two dims of k. NOTE: the scores are not scaled by head_size**-0.5 here



tril = torch.tril(torch.ones(T, T))
# wei stands for weights; here they are the data-dependent query-key affinities, not zeros as in version 3
# wei = torch.zeros((T,T))
wei = wei.masked_fill(tril == 0, float('-inf')) # mask out future positions
wei = F.softmax(wei, dim=-1) # each row becomes a distribution over the allowed positions

v = value(x) # (B, T, 16)
out = wei @ v # (B, T, T) @ (B, T, 16) -----> (B, T, 16)
#out = wei @ x
out.shape # (4, 8, 16): the last dimension is head_size, not C






torch.Size([4, 8, 16])

In [55]:
# attention weights of the first batch element: rows sum to 1 and are zero above the diagonal
wei[0]

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1574, 0.8426, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2088, 0.1646, 0.6266, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5792, 0.1187, 0.1889, 0.1131, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0294, 0.1052, 0.0469, 0.0276, 0.7909, 0.0000, 0.0000, 0.0000],
        [0.0176, 0.2689, 0.0215, 0.0089, 0.6812, 0.0019, 0.0000, 0.0000],
        [0.1691, 0.4066, 0.0438, 0.0416, 0.1048, 0.2012, 0.0329, 0.0000],
        [0.0210, 0.0843, 0.0555, 0.2297, 0.0573, 0.0709, 0.2423, 0.2391]],
       grad_fn=<SelectBackward0>)

In [56]:
# variance check with standard-normal keys and queries and unscaled dot products
# (compare k.var(), q.var() and wei.var() in the cells below)
k = torch.randn(B, T, head_size)
q = torch.randn(B, T, head_size)
wei = q @ k.transpose(-2, -1) # (B, T, head_size) @ (B, head_size, T) -----> (B, T, T)


In [57]:
# variance of the keys: close to 1, since k was drawn with torch.randn
k.var()

tensor(1.0449)

In [58]:
# variance of the queries: close to 1, since q was drawn with torch.randn
q.var()

tensor(1.0700)

In [59]:
# variance of the unscaled dot products: far above 1, on the order of head_size (16)
wei.var()

tensor(17.4690)

In [62]:
# softmax over small-magnitude scores: the result is fairly diffuse
torch.softmax(torch.tensor([0.1, -0.2, 0.3, -0.2, 0.5]), dim=-1)

tensor([0.1925, 0.1426, 0.2351, 0.1426, 0.2872])

In [63]:
# the same scores multiplied by 8: softmax sharpens and puts most of the mass on the largest score
torch.softmax(torch.tensor([0.1, -0.2, 0.3, -0.2, 0.5])*8, dim=-1)

tensor([0.0326, 0.0030, 0.1615, 0.0030, 0.8000])

In [19]:
# first batch element of x
# NOTE(review): the stored output is the (8, 2) toy tensor (C=2); in top-to-bottom order x has
# already been replaced by the C=32 tensor from version 4 at this point
x[0]

tensor([[ 0.1808, -0.0700],
        [-0.3596, -0.9152],
        [ 0.6258,  0.0255],
        [ 0.9545,  0.0643],
        [ 0.3612,  1.1679],
        [-1.3499, -0.5102],
        [ 0.2360, -0.2398],
        [-0.9211,  1.5433]])

In [None]:
xbow[0] # the first row of xbow[0] equals the first row of x[0], because the mean over a single row is that row
## the second row of xbow[0] is the mean of the first two rows of x[0], and so on

tensor([[ 0.1808, -0.0700],
        [-0.0894, -0.4926],
        [ 0.1490, -0.3199],
        [ 0.3504, -0.2238],
        [ 0.3525,  0.0545],
        [ 0.0688, -0.0396],
        [ 0.0927, -0.0682],
        [-0.0341,  0.1332]])

In [None]:
torch.tril(torch.ones(3, 3)) # tril keeps the lower triangle (including the diagonal) of the 3x3 matrix of ones and zeroes the rest

tensor([[1., 0., 0.],
        [1., 1., 0.],
        [1., 1., 1.]])

In [26]:
# toy illustration of the trick: multiplying by a row-normalised lower-triangular matrix
# replaces every row of b with the average of that row and all rows above it
torch.manual_seed(42) # fix the seed so the random matrix b is the same on every run
a = torch.tril(torch.ones(3, 3)) # lower-triangular ones: row i can only "see" rows 0..i
a = a / a.sum(dim=1, keepdim=True) # divide each row by its sum so that every row sums to 1
b = torch.randint(0, 10, (3, 2)).float() # 3x2 matrix of random integers from 0 to 9 (the upper bound 10 is exclusive)
c = a @ b # (3, 3) @ (3, 2) -> (3, 2): row i of c is the mean of rows 0..i of b
print('a=', a, sep='\n')
print('--')
print('b=', b, sep='\n')
print('--')
print('c=', c, sep='\n')


a=
tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
--
b=
tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
--
c=
tensor([[2.0000, 7.0000],
        [4.0000, 5.5000],
        [4.6667, 5.3333]])
