<a href="https://colab.research.google.com/github/nimishsoni/GPT-Model/blob/main/GPT_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## GPT Model trained on tiny-Shakespeare data
Key Components of the model are:
1. word embedding
2. Positional encoding
3. multi-headed self-attention
4. Transformer encoder block
5. Decoder block

In [1]:
# Download the tiny-Shakesspeare text data from Github repo
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2023-12-13 12:17:19--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt’


2023-12-13 12:17:19 (18.4 MB/s) - ‘input.txt’ saved [1115394/1115394]



In [3]:
import torch
import torch.nn as nn
from torch.nn import functional as F

Import data and Explore

In [4]:
# Read the tiny-Shakespeare txt file
with open('input.txt',mode='r',encoding='utf-8',closefd=True) as f:
  text = f.read()

In [5]:
# View file and its stats
print(type(text),len(text),text[1:100])

<class 'str'> 1115394 irst Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [6]:
# List all unique characters used in the text
char1 = set(text)
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(''.join(chars),vocab_size,char1)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz 65 {'R', 'C', 'I', 'q', '\n', 'W', ':', 'p', 'O', '$', 'U', 'u', 'e', 'g', '3', 'c', 'L', 'n', 'F', 'V', 'E', 'x', 'Z', 'Y', 'v', '.', 'M', '!', 'w', 'X', 'o', 'B', 'H', 'Q', 'a', 't', '-', 'f', 'T', 'h', 'i', 'l', 's', 'b', 'm', ' ', 'D', 'y', 'k', 'K', 'z', ';', 'd', "'", ',', 'J', 'N', '&', 'S', 'j', 'G', 'P', 'r', '?', 'A'}


In [24]:
# hyperparameters
batch_size = 32 # how many independent sequences will we process in parallel?
block_size = 64 # what is the maximum context length for predictions?
max_iters = 10000
eval_interval = 50
learning_rate = 5e-4
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd = 64
n_head = 8
n_layer = 4
dropout = 0.1

Tokenize the text data

In [9]:
# Create a mapping of characters and integers
stoi = {ch:i for i,ch in enumerate(chars)}
itos = {i:ch for i,ch in enumerate(chars)}
#print(stoi['A'],'\n', itos)

# Encoder takes string as input and provides integers as output.
encode = lambda s: [stoi[c] for c in s]

# Decoder takes list of integers as input an provides string as output
decode = lambda l: ''.join([itos[i] for i in l])

print(encode('Hi. I am Nimish!!'))
print(decode(encode('Hi. I am Nimish!!')))

[20, 47, 8, 1, 21, 1, 39, 51, 1, 26, 47, 51, 47, 57, 46, 2, 2]
Hi. I am Nimish!!


In [11]:
# Lets encode entire text dataset and store it into a torch.tensor
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape,'\n',data.dtype,'\n',data[:100])

torch.Size([1115394]) 
 torch.int64 
 tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])


Data prep for training
Split the data in to Train and Val (10%)
Divide the data in to chunks/blocks and batches

In [12]:
# Divide the data in to train (90%) and validation set (10%)
n = int(0.9*len(data))
train_data = data[:n]
val_data = data[n:]

In [26]:
# Sample - Creates blocks of input and corresponding target from training data
#block_size = 8
x = train_data[:block_size]
y = train_data[1:block_size+1]
print(x,y)
for t in range(block_size):
  context = x[:t+1]
  target = y[t]


tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50]) tensor([47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44, 53,
        56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,  1,
        44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1, 57,
        54, 43, 39, 49,  8,  0,  0, 13, 50, 50])


In [14]:
# data loading
def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data

    # Create batch of 4 randomly generated integers within length of text data
    ix = torch.randint(len(data) - block_size, (batch_size,))

    # Create batch of 4 blocks (each of block size 8) from randomly selected integers for parallel processing
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])

    #move x, y parameters to GPU if available or on CPU
    x, y = x.to(device), y.to(device)
    return x, y

In [15]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [16]:
class Head(nn.Module):
    """ one head of self-attention """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False) # Key represents the information each element of sequence holds
        self.query = nn.Linear(n_embd, head_size, bias=False) # Defines the query or search the model is asking for
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B,T,C = x.shape
        k = self.key(x)   # (B,T,C)
        q = self.query(x) # (B,T,C)
        # compute attention scores ("affinities")
        wei = q @ k.transpose(-2,-1) * C**-0.5 # (B, T, C) @ (B, C, T) -> (B, T, T)

        # masking out the attention scores in the upper triangular portion of the matrix, the model is forced to only focus on the elements that appear before the current position in the sequence.
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x) # (B,T,C)
        # the model gives more weight to elements that have higher attention scores, meaning they are more relevant to the current query.
        out = wei @ v # (B, T, T) @ (B, T, C) -> (B, T, C)
        return out

In [17]:
class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

class FeedFoward(nn.Module):
    """ a simple linear layer followed by a non-linearity """

    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    """ Transformer block: communication followed by computation """

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

In [18]:
# Simple bigram model
class BigramLanguageModel(nn.Module):

    def __init__(self):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd) # Word Embedding
        self.position_embedding_table = nn.Embedding(block_size, n_embd) #Positional Embedding
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd) # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x = self.blocks(x) # (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

In [25]:
model = BigramLanguageModel()
m = model.to(device)
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=2000)[0].tolist()))


0.211777 M parameters
step 0: train loss 4.3437, val loss 4.3353
step 50: train loss 3.1526, val loss 3.1879
step 100: train loss 2.8173, val loss 2.8344
step 150: train loss 2.6620, val loss 2.6657
step 200: train loss 2.5899, val loss 2.5967
step 250: train loss 2.5478, val loss 2.5503
step 300: train loss 2.5194, val loss 2.5209
step 350: train loss 2.4971, val loss 2.5043
step 400: train loss 2.4756, val loss 2.4786
step 450: train loss 2.4538, val loss 2.4593
step 500: train loss 2.4335, val loss 2.4442
step 550: train loss 2.4160, val loss 2.4283
step 600: train loss 2.3981, val loss 2.4099
step 650: train loss 2.3793, val loss 2.3907
step 700: train loss 2.3625, val loss 2.3754
step 750: train loss 2.3517, val loss 2.3624
step 800: train loss 2.3357, val loss 2.3481
step 850: train loss 2.3224, val loss 2.3374
step 900: train loss 2.2962, val loss 2.3138
step 950: train loss 2.2936, val loss 2.3071
step 1000: train loss 2.2762, val loss 2.2919
step 1050: train loss 2.2504, val l