# Building a GPT with Shakespeare's Works

In [1]:
# load the whole Shakespeare corpus into memory as one string
with open('dataset/input.txt', mode='r', encoding='utf-8') as f:
    text = f.read()

# report how many characters the corpus contains
dataset_length = len(text)
print("length of dataset: {}".format(dataset_length))

length of dataset: 1115394


In [2]:
# peek at the opening of the corpus (its first 1000 characters) to see what the raw text looks like
print(text[:1000])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



In [3]:
# the vocabulary is every distinct character that occurs in the text, in sorted order
unique_chars = set(text)
vocab = sorted(unique_chars)
vocab_size = len(vocab)

# show the characters themselves, then how many there are
print(''.join(vocab))
print("vocabulary size: {}".format(vocab_size))


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
vocabulary size: 65


## Creating Character Tokenizer

In [4]:
# the two lookup tables of the character-level tokenizer:
#   itos: integer id -> character   (position of the character in vocab)
#   stoi: character  -> integer id  (the inverse table)
itos = dict(enumerate(vocab))
stoi = {ch: i for i, ch in itos.items()}

# show the first ten entries of each table

# char to int
print(list(stoi.items())[:10])

# int to char
print(list(itos.items())[:10])

[('\n', 0), (' ', 1), ('!', 2), ('$', 3), ('&', 4), ("'", 5), (',', 6), ('-', 7), ('.', 8), ('3', 9)]
[(0, '\n'), (1, ' '), (2, '!'), (3, '$'), (4, '&'), (5, "'"), (6, ','), (7, '-'), (8, '.'), (9, '3')]


In [5]:
def encode(s):
    """Encode a string into a list of integer token ids.

    Every character of ``s`` is looked up in the ``stoi`` table, so a
    character that is missing from the table raises ``KeyError``.
    """
    return [stoi[c] for c in s]


def decode(l):
    """Decode an iterable of integer token ids back into a string.

    Every id is looked up in the ``itos`` table and the resulting characters
    are concatenated; an id that is missing from the table raises ``KeyError``.
    """
    # the parameter keeps its original name so existing callers are unaffected
    return ''.join(itos[i] for i in l)

# round trip: encoding and then decoding should give back the original string
hello_ids = encode("Hello World")
print(hello_ids)
print(decode(hello_ids))

[20, 43, 50, 50, 53, 1, 35, 53, 56, 50, 42]
Hello World


In [6]:
# encoding entire dataset as a torch tensor
import torch
# tokenize the whole corpus and keep it as one long 1-D tensor of integer ids
token_ids = encode(text)
data = torch.tensor(token_ids, dtype=torch.long)
print(data.shape, data.dtype)
# inspect the first 1000 token ids of the dataset
print(data[:1000])

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
      

## Splitting into Training and Testing Data

In [7]:
# train on the first 90% of the tokens and hold out the final 10% for validation
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

In [8]:
# block_size is the maximum number of preceding characters used as context to predict the next character
block_size = 8 # context window size
# a chunk of block_size + 1 tokens: block_size inputs plus the one extra token needed as the last target
train_data[:block_size+1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [9]:
# inputs are the first block_size tokens; targets are the same tokens shifted one step ahead
x = train_data[:block_size]
y = train_data[1:block_size+1]

# every prefix of x is one training example whose label is the token that follows it
for t in range(block_size):
    context, target = x[:t+1], y[t]
    print(f"when input is {context} the target: {target}")

when input is tensor([18]) the target: 47
when input is tensor([18, 47]) the target: 56
when input is tensor([18, 47, 56]) the target: 57
when input is tensor([18, 47, 56, 57]) the target: 58
when input is tensor([18, 47, 56, 57, 58]) the target: 1
when input is tensor([18, 47, 56, 57, 58,  1]) the target: 15
when input is tensor([18, 47, 56, 57, 58,  1, 15]) the target: 47
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target: 58


## Creating Batches

In [10]:
# fix the RNG seed so the sampled batches are reproducible
torch.manual_seed(1337)
# number of independent sequences to process in parallel
batch_size = 4


def get_batch(split):
    """Sample a random batch of (input, target) sequences.

    Reads the module-level ``train_data`` / ``val_data`` tensors and the
    current ``block_size`` / ``batch_size`` values at call time.

    Args:
        split: ``'train'`` selects ``train_data``; any other value selects
            ``val_data``.

    Returns:
        A tuple ``(x, y)`` of stacked tensors with one row per sampled
        sequence. Each row of ``y`` is the matching row of ``x`` shifted one
        position further along the source data.
    """
    if split == 'train':
        source = train_data
    else:
        source = val_data
    # random start offsets; the upper bound leaves room for the shifted target
    starts = torch.randint(len(source) - block_size, (batch_size,)).tolist()
    x = torch.stack([source[s:s + block_size] for s in starts])
    y = torch.stack([source[s + 1:s + block_size + 1] for s in starts])
    return x, y


# draw one training batch and show both halves of it
xb, yb = get_batch('train')
for label, batch_tensor in (('inputs:', xb), ('targets:', yb)):
    print(label)
    print(batch_tensor.shape)
    print(batch_tensor)

print('----')

# spell out every (context, target) pair that is packed into the batch
for b in range(batch_size):      # batch dimension
    for t in range(block_size):  # time dimension
        context, target = xb[b, :t+1], yb[b, t]
        print(f"when input is {context.tolist()} the target: {target}")

inputs:
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
targets:
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])
----
when input is [24] the target: 43
when input is [24, 43] the target: 58
when input is [24, 43, 58] the target: 5
when input is [24, 43, 58, 5] the target: 57
when input is [24, 43, 58, 5, 57] the target: 1
when input is [24, 43, 58, 5, 57, 1] the target: 46
when input is [24, 43, 58, 5, 57, 1, 46] the target: 43
when input is [24, 43, 58, 5, 57, 1, 46, 43] the target: 39
when input is [44] the target: 53
when input is [44, 53] the target: 56
when input is [44, 53, 56] the target: 1
when input is [44, 53, 56, 1] the target: 58
when input is [44, 53, 56, 1, 58] the target: 46
when input is [44, 53

In [11]:
# the sampled batch of token ids; this is what will be fed to the transformer as input
print(xb)

tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])


In [12]:
import torch.nn as nn
from torch.nn import functional as F
# seed the RNG so that parameter initialisation and sampling are reproducible
torch.manual_seed(1337)


class BigramLanguageModel(nn.Module):
    """Bigram language model: the next token is predicted from the current token alone.

    The only parameters are a ``(vocab_size, vocab_size)`` embedding table
    whose row ``i`` holds the logits of the token that follows token ``i``.
    """

    # constructing model
    def __init__(self, vocab_size):
        super().__init__()
        # each token reads out logits for the next token in the lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, if targets are given, the loss.

        Args:
            idx: integer tensor of token ids, shape (B, T).
            targets: optional integer tensor of the same shape as ``idx``.

        Returns:
            ``(logits, loss)``. Without targets, ``logits`` has shape
            (B, T, C) and ``loss`` is ``None``. With targets, ``logits`` is
            flattened to (B*T, C) and ``loss`` is the cross-entropy between
            the flattened logits and targets.
        """
        # idx and targets are both tensors of integers
        logits = self.token_embedding_table(idx)  # (batch, time, channels)

        if targets is None:
            return logits, None

        # F.cross_entropy is called with (N, C) logits and (N,) targets,
        # so batch and time are folded into a single dimension
        B, T, C = logits.shape
        logits = logits.view(B*T, C)
        targets = targets.view(B*T)
        loss = F.cross_entropy(logits, targets)
        return logits, loss

    # sampling never needs gradients; disabling autograd avoids building a
    # graph for every forward pass made during generation
    @torch.no_grad()
    def generate(self, idx, max_new_tokens):
        """Extend ``idx`` by sampling ``max_new_tokens`` tokens one at a time.

        Args:
            idx: integer tensor of shape (B, T) holding the current context.
            max_new_tokens: number of tokens to append to every sequence.

        Returns:
            Integer tensor of shape (B, T + max_new_tokens).
        """
        # the whole history is passed to the model on every step, although
        # only the logits of the last position are used for the prediction
        for _ in range(max_new_tokens):
            # get the predictions (no targets, so the loss is None and ignored)
            logits, _loss = self(idx)  # <-- goes to forward function
            # focus only on the last time step (the prediction for the next token)
            logits = logits[:, -1, :]  # (B, C)
            # turn the logits into a probability distribution over the vocabulary
            probs = F.softmax(logits, dim=-1)
            # sample one token per sequence from that distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            # append the sampled token to the running sequence
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
        return idx
    
# build the (still untrained) model and push one batch through it
m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print("Shape:", logits.shape)
print("Loss:", loss)

# the prompt is a single sequence holding one token with id 0 (the newline character)
idx = torch.zeros(1, 1, dtype=torch.long)

# sample 100 new tokens, take the only sequence in the batch and decode its ids to text
generated = m.generate(idx, max_new_tokens=100)
print("First 100 tokens from the model:", decode(generated[0].tolist()))

Shape: torch.Size([32, 65])
Loss: tensor(4.8786, grad_fn=<NllLossBackward0>)


First 100 tokens from the model: 
Sr?qP-QWktXoL&jLDJgOLVz'RIoDqHdhsV&vLLxatjscMpwLERSPyao.qfzs$Ys$zF-w,;eEkzxjgCKFChs!iWW.ObzDnxA Ms$3


Notice how the generated tokens are random garbage: our model hasn't been trained yet.

Now we will train the model so that the outputs are, hopefully, less random!

In [13]:
# optimizer that will update the model's parameters during training;
# a model this small tolerates a comparatively high learning rate
optimizer = torch.optim.Adam(params=m.parameters(), lr=1e-3)

In [19]:
# train on larger batches than the 4-sequence batch used for the demos above
batch_size = 32

for steps in range(10000):
    # sample a fresh random batch from the training split
    xb, yb = get_batch('train')

    # forward pass
    logits, loss = m(xb, yb)

    # backward pass: clear the previous gradients, backpropagate the loss,
    # then let the optimizer update the parameters
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# loss of the last training batch
print(loss.item())

2.4987847805023193


The loss started around 4.7 when first training the model and now we've been able to decrease it to around 2.5!

In [21]:
# sample from the model again to see what training has changed

# generate 300 new tokens from the same prompt, take the only sequence and decode its ids to text
sampled = m.generate(idx, max_new_tokens=300)
print("First 300 tokens from the model:", decode(sampled[0].tolist()))

First 300 tokens from the model: 
For, I thy whundlyo d yome PUG XEThrirsanro; shes dur'd, s at CEn CURDYWey t havee to d.
Bucee n r owis poboungalknajus fo tze yonout eit r thom t, ch ar t g
I LOfarsmalle thenierd p ourry, be horar OLI'd TII thithoubepred dar toris, thaums, athacknthene, traMe ars ame nen; frs thenis courit,
An wil


While it still isn't poetry yet... it's much improved!

This was the simplest possible approach, since the current token isn't being looked at in the context of what came before it. Let's change this so that the tokens "talk to each other", which should improve the accuracy!

## Trick for self-attention

In [27]:
# toy example: for every position t, average the features of tokens 0..t
B, T, C = 4, 8, 2  # batch, time, channels
x = torch.randn(B, T, C)

# xbow ("bag of words") holds the running mean over the time dimension
xbow = torch.zeros(B, T, C)
for b in range(B):      # every sequence in the batch
    for t in range(T):  # every position in the sequence
        # current token and everything before it: (t+1, C)
        xprev = x[b, :t+1]
        # average over the time dimension
        xbow[b, t] = xprev.mean(dim=0)

In [29]:
# raw features of the first sequence in the batch
x[0]

tensor([[-0.2426,  0.0536],
        [-1.7227,  0.6309],
        [ 1.1348, -0.2185],
        [-0.2470, -0.7073],
        [-1.4003,  0.2871],
        [ 0.3844, -0.6153],
        [-0.3662, -1.0029],
        [ 0.7314, -0.3779]])

In [30]:
# running averages for the same sequence: row t is the mean of rows 0..t of x[0]
xbow[0]

tensor([[-0.2426,  0.0536],
        [-0.9826,  0.3423],
        [-0.2768,  0.1553],
        [-0.2694, -0.0603],
        [-0.4955,  0.0092],
        [-0.3489, -0.0949],
        [-0.3514, -0.2246],
        [-0.2160, -0.2438]])

This is a very naive approach, but good for our first try. This is inefficient since we introduce a lot of calculations for all the tokens and we also lose out on a lot of information by just using an average. Let's try this now with matrix multiplication as our next iteration, which will be significantly more efficient.

In [34]:
# small demonstration: multiplying by a row-normalised lower-triangular
# matrix produces running averages of the rows of b
torch.manual_seed(42)
a = torch.ones(3, 3).tril()
# divide each row by its sum so that every row adds up to 1
a = a / a.sum(dim=1, keepdim=True)
b = torch.randint(0, 10, (3, 2)).float()
c = torch.matmul(a, b)

print('a=',a)
print('----')
print('b=',b)
print('----')
print('c=',c)

a= tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
----
b= tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
----
c= tensor([[2.0000, 7.0000],
        [4.0000, 5.5000],
        [4.6667, 5.3333]])


In [41]:
# version 2: the same running average as a single matrix multiplication

# row t spreads equal weight over positions 0..t, so every row sums to one
weights = torch.ones(T, T).tril()
weights = weights / weights.sum(dim=1, keepdim=True)
# the (T, T) weights are broadcast over the batch: (B, T, T) @ (B, T, C) -> (B, T, C)
xbow2 = torch.matmul(weights, x)

# check that the result matches the loop-based version
torch.allclose(xbow, xbow2)

True

In [40]:
# loop-based and matrix-multiplication results for the first sequence, side by side
xbow[0], xbow2[0]

(tensor([[-0.2426,  0.0536],
         [-0.9826,  0.3423],
         [-0.2768,  0.1553],
         [-0.2694, -0.0603],
         [-0.4955,  0.0092],
         [-0.3489, -0.0949],
         [-0.3514, -0.2246],
         [-0.2160, -0.2438]]),
 tensor([[-0.2426,  0.0536],
         [-0.9826,  0.3423],
         [-0.2768,  0.1553],
         [-0.2694, -0.0603],
         [-0.4955,  0.0092],
         [-0.3489, -0.0949],
         [-0.3514, -0.2246],
         [-0.2160, -0.2438]]))

This shows we can use batched matrix multiplication to do a weighted aggregation of the current token and the tokens preceding it.

In [None]:
# version 3: the same averaging expressed with a softmax

# lower-triangular mask: position t may only look at positions 0..t
tril = torch.ones(T, T).tril()
# start from all-zero scores, i.e. every visible token gets the same weight ...
weights = torch.zeros(T, T)
# ... and set the scores of the future positions (where tril == 0) to -inf
weights = weights.masked_fill(tril == 0, float('-inf'))

# softmax exponentiates every score and divides by the row sum, so the
# -inf entries become 0 and the remaining entries share the weight equally
weights = weights.softmax(dim=-1)

xbow3 = torch.matmul(weights, x)
torch.allclose(xbow, xbow3)