# **Building a character level Generative Pretrained Transformer (GPT) Model from scratch trained on Shakespeare Dataset**

## Importing Essential Libraries

In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
torch.manual_seed(1337)

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Enabling CUDA for Faster Computation and Parallelization

In [None]:
# Prefer the GPU whenever CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

## Reading input Text Data

In [None]:
# Reading Input data
filename = 'input.txt'

!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
with open(filename, 'r') as f:
    text = f.read()
f.close()

--2023-06-17 14:07:00--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt.2’


2023-06-17 14:07:00 (28.5 MB/s) - ‘input.txt.2’ saved [1115394/1115394]



## Tokenization of the Vocabulary List

In [None]:
# Character-level tokenization: every distinct character of the corpus is a token.
vocab = sorted(set(text))
word2idx = {ch: i for i, ch in enumerate(vocab)}
idx2word = dict(enumerate(vocab))


def encode(s):
    """Map a string to the list of integer ids of its characters."""
    return [word2idx[ch] for ch in s]


def decode(i):
    """Map an iterable of integer ids back to the string they spell."""
    return ''.join(idx2word[token] for token in i)


# Encode the whole corpus once; the first 90% is used for training and the
# remaining tail is held out.
data = torch.tensor(encode(text), dtype=torch.long)
split = int(0.9 * len(data))
trainData, testData = data[:split], data[split:]

## Defining Function to create Batch Data

In [None]:
# Batch Generator
def createBatch(split):
    """Sample a random batch of (input, target) token windows.

    Args:
        split: 'train' reads from trainData; any other value reads from
            testData.

    Returns:
        A pair (x, y) of stacked windows, batch_size rows of block_size
        tokens each, moved to `device`. Row k of y is row k of x shifted one
        position ahead in the source data.
    """
    if split == 'train':
        source = trainData
    else:
        source = testData

    # Random window starts; the upper bound leaves room for the shifted target.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs = torch.stack([source[s:s + block_size] for s in starts])
    targets = torch.stack([source[s + 1:s + block_size + 1] for s in starts])
    return inputs.to(device), targets.to(device)

## Defining Loss Function

In [None]:
#%% Defining Loss Function
@torch.no_grad()
def estimateLoss(model):
    """Estimate the mean loss of `model` on freshly sampled batches.

    For each of the splits 'train' and 'val', draws eval_iters batches with
    createBatch and averages the loss the model returns for them.

    Args:
        model: module called as model(x, y) and returning (logits, loss).

    Returns:
        dict mapping 'train' and 'val' to a 0-dim tensor holding the mean loss.
    """
    # Remember the current mode so evaluation does not silently switch a model
    # that was already in eval mode back to training mode.
    was_training = model.training
    model.eval()
    out = {}
    try:
        for split in ['train', 'val']:
            losses = torch.zeros(eval_iters)
            for i in range(eval_iters):
                x, y = createBatch(split)
                _, loss = model(x, y)
                losses[i] = loss.item()
            out[split] = losses.mean()
    finally:
        # Restore the original mode even if a batch raised.
        model.train(was_training)
    return out

## Defining Class for a simple Bigram Model with just Embedding Layer

In [None]:
#%% Bigram Model
class BiGramModel(nn.Module):
    """Bigram language model made of a single embedding table.

    Each token id indexes a row of `vocab_size` values that are used directly
    as the logits for the next token.
    """

    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)
        # Not read by any method of this class; kept so existing attribute
        # access keeps working.
        self.batch_size = 32

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            idx: integer tensor of token ids, shape (B, T).
            targets: optional integer tensor with B*T elements.

        Returns:
            (logits, loss). Without targets, logits has shape (B, T, C) and
            loss is None. With targets, logits is flattened to (B*T, C) and
            loss is the cross-entropy against the flattened targets.
        """
        logits = self.token_embedding_table(idx)
        # Identity check: `== None` relies on tensor comparison fallbacks.
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    @torch.no_grad()  # sampling needs no autograd graph
    def generate(self, idx, max_new_tokens):
        """Extend `idx` by sampling `max_new_tokens` tokens, one at a time.

        Args:
            idx: integer tensor of shape (B, T) holding the starting context.
            max_new_tokens: number of tokens to append to every row.

        Returns:
            Integer tensor of shape (B, T + max_new_tokens).
        """
        # idx is of shape (B,T)
        for _ in range(max_new_tokens):
            logits, _ = self(idx)
            # becomes (B,C) as only the last time step is selected
            logits = logits[:, -1, :]

            # converting logits to probabilities
            probs = F.softmax(logits, dim=-1)

            # sampling from the prob distribution
            idx_next = torch.multinomial(probs, num_samples=1)

            # append sampled index to the running sequence (B, T+1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

## Defining a Class for the Self-Attention Model (the Decoder Part of the Transformer Architecture)

In [None]:
#%% SelfAttentionModel
class SelfAttentionModel(nn.Module):
    """Decoder-only transformer language model over character tokens.

    Token and position embeddings are summed, passed through a stack of
    `Block` modules followed by a LayerNorm, and projected to vocabulary
    logits by a linear head.

    Args:
        num_layers: number of `Block` modules in the stack (default 6, the
            value that was previously hard-coded).
        num_heads: attention heads per block (default 6, likewise).
    """

    def __init__(self, num_layers=6, num_heads=6):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_eDim)
        self.positional_encoding_table = nn.Embedding(block_size, n_eDim)
        # Same layout as the former six hand-written Block(...) entries plus
        # the trailing LayerNorm, so sub-module indices are unchanged for the
        # default arguments.
        self.blocks = nn.Sequential(
            *[Block(n_eDim, num_heads=num_heads) for _ in range(num_layers)],
            nn.LayerNorm(n_eDim),
        )
        self.lm_head = nn.Linear(n_eDim, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            idx: integer tensor of token ids, shape (B, T). T must not exceed
                block_size, the size of the position embedding table.
            targets: optional integer tensor with B*T elements.

        Returns:
            (logits, loss). Without targets, logits has shape
            (B, T, vocab_size) and loss is None. With targets, logits is
            flattened to (B*T, vocab_size) and loss is the cross-entropy
            against the flattened targets.
        """
        B, T = idx.shape
        token_embds = self.token_embedding_table(idx)  # (B,T,C)
        # Build the position indices on the input's own device so the model
        # does not depend on the module-level `device` variable.
        positions = torch.arange(T, device=idx.device)
        pos_embds = self.positional_encoding_table(positions)  # (T,C)
        x = token_embds + pos_embds  # (B,T,C)
        x = self.blocks(x)  # (B,T,C)
        logits = self.lm_head(x)  # (B,T,vocab_size)
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    @torch.no_grad()  # sampling needs no autograd graph
    def generate(self, idx, max_new_tokens):
        """Extend `idx` by sampling `max_new_tokens` tokens, one at a time.

        Args:
            idx: integer tensor of shape (B, T) holding the starting context.
            max_new_tokens: number of tokens to append to every row.

        Returns:
            Integer tensor of shape (B, T + max_new_tokens).
        """
        # idx is of shape (B,T)
        for _ in range(max_new_tokens):
            # Only the last block_size tokens fit the position embedding table.
            idx_crop = idx[:, -block_size:]
            logits, _ = self(idx_crop)
            # becomes (B,C) as only the last time step is selected
            logits = logits[:, -1, :]

            # converting logits to probabilities
            probs = F.softmax(logits, dim=-1)

            # sampling from the prob distribution
            idx_next = torch.multinomial(probs, num_samples=1)

            # append sampled index to the running sequence (B, T+1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

## Defining a Class for the single self-Attention head

In [None]:
#%% Self Attention Head
class AttentionHead(nn.Module):
    """A single head of masked (causal) self-attention.

    Args:
        head_size: output width of the key, query and value projections.
    """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_eDim, head_size, bias=False)
        self.query = nn.Linear(n_eDim, head_size, bias=False)
        self.value = nn.Linear(n_eDim, head_size, bias=False)
        # Lower-triangular mask, registered as a buffer so it moves with the
        # module across devices without being a trainable parameter.
        self.register_buffer('tril', torch.tril(torch.ones((block_size, block_size))))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Attend over `x` of shape (B, T, C); returns (B, T, head_size)."""
        B, T, C = x.shape
        k = self.key(x)    # (B,T,head_size)
        q = self.query(x)  # (B,T,head_size)
        # Scale by the key dimension (head_size). The previous code scaled by
        # C, the embedding width, which shrinks the scores more than
        # scaled dot-product attention prescribes.
        scale = k.shape[-1] ** -0.5
        wei = q @ k.transpose(-2, -1) * scale  # (B,T,hs) @ (B,hs,T) --> (B,T,T)
        # Positions above the diagonal get -inf, so after the softmax each
        # position puts zero weight on later positions.
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))  # (B,T,T)
        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)
        v = self.value(x)  # (B,T,head_size)
        out = wei @ v      # (B,T,head_size)
        return out

## Defining a class for the Feed Forward layer

In [None]:
#%% Feed Forward Layer
class FeedForward(nn.Module):
    """Two-layer MLP: widen to 4*n_eDim, apply ReLU, project back, then dropout."""

    def __init__(self, n_eDim):
        super().__init__()
        hidden_dim = 4 * n_eDim
        layers = [
            nn.Linear(n_eDim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, n_eDim),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run `x` through the MLP; the last dimension stays n_eDim wide."""
        transformed = self.net(x)
        return transformed

## Defining a Class for the multiple self-Attention heads

In [None]:
#%% Multi-Head Attention Model
class MultiHeadAttention(nn.Module):
    """Several AttentionHead modules run side by side, concatenated and projected.

    Args:
        num_heads: number of attention heads.
        head_size: output width of each head.
    """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.mheads = nn.ModuleList([AttentionHead(head_size) for _ in range(num_heads)])
        # The concatenated heads are num_heads*head_size wide. Using that as the
        # input width (rather than assuming it equals n_eDim) keeps the layer
        # valid when n_eDim is not an exact multiple of num_heads; when it is,
        # the shape is the same as before.
        self.proj = nn.Linear(num_heads * head_size, n_eDim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply every head to `x`, concatenate on the last dim, project, drop out."""
        out = torch.cat([h(x) for h in self.mheads], dim=-1)
        out = self.proj(out)
        out = self.dropout(out)
        return out

## Defining a class that assembles attention and feedforward heads

In [None]:
#%% Decoder block
class Block(nn.Module):
    """One transformer block: self-attention then feed-forward.

    Each sub-layer sees a LayerNorm-ed copy of its input, and its output is
    added back onto that input (residual connection).
    """

    def __init__(self, n_eDim, num_heads):
        super().__init__()
        # Each head gets an equal integer share of the embedding width.
        self.sa = MultiHeadAttention(num_heads, n_eDim // num_heads)
        self.ffw = FeedForward(n_eDim)
        self.lnorm1 = nn.LayerNorm(n_eDim)
        self.lnorm2 = nn.LayerNorm(n_eDim)

    def forward(self, x):
        """Return `x` after the attention and feed-forward residual updates."""
        after_attention = x + self.sa(self.lnorm1(x))
        return after_attention + self.ffw(self.lnorm2(after_attention))

## Defining a class for Batch normalization in 1D

In [None]:
#%% Layer/Batch Normalization
class BatchNorm1D:
    """Hand-rolled normalization with a learnable-style scale and shift.

    Despite the name, statistics are taken along dimension 1 of the input,
    i.e. separately for every index of dimension 0, not across the batch.

    Args:
        dim: length of the gamma (scale) and beta (shift) vectors.
        eps: constant added to the variance before the square root.
        momentum: accepted for signature compatibility; not used.
    """

    def __init__(self, dim, eps = 1e-5, momentum = 0.1):
        self.eps = eps
        self.gamma = torch.ones(dim)
        self.beta = torch.zeros(dim)

    def __call__(self, x):
        """Normalize `x` along dimension 1, then apply gamma and beta."""
        # torch spells the keyword `keepdim`; the former `keepDim` raised
        # a TypeError on every call.
        xmean = x.mean(1, keepdim=True)  # mean along dim 1
        xvar = x.var(1, keepdim=True)    # variance along dim 1
        xhat = (x - xmean) / torch.sqrt(xvar + self.eps)
        # The result is also kept on the instance as `out`.
        self.out = self.gamma * xhat + self.beta
        return self.out

    def parameters(self):
        """Return the scale and shift tensors as [gamma, beta]."""
        return [self.gamma, self.beta]

## Block of code that performs GPT model training

In [None]:
#%%
# Hyperparameters
block_size = 64        # tokens per training window (context length)
batch_size = 256       # windows per batch
max_iter_num = 5000    # number of optimizer steps
iter_interval = 500    # steps between loss evaluations
eval_iters = 200       # batches averaged per split in estimateLoss
n_eDim = 384           # embedding width
num_heads = 6          # kept for reference; not passed to SelfAttentionModel() below
num_layers = 6         # kept for reference; not passed to SelfAttentionModel() below
vocab_size = len(vocab)
lr = 3e-4
dropout = 0.2          # probability used by the nn.Dropout layers

x, y = createBatch('train')

m = SelfAttentionModel()
model = m.to(device)
# One forward pass before training, as a quick check that shapes line up.
logits, loss = m(x, y)

optimizer = torch.optim.AdamW(m.parameters(), lr=lr)
# `step` rather than `iter`, so the builtin iter() is not shadowed.
for step in tqdm(range(max_iter_num)):

    # Verbose: report at every interval and also on the last step, so the
    # loss at the end of training is not left unreported.
    if step % iter_interval == 0 or step == max_iter_num - 1:
        losses = estimateLoss(m)
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # create batches
    xb, yb = createBatch('train')

    # evaluate loss and take one optimizer step
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

  0%|          | 0/5000 [00:00<?, ?it/s]

step 0: train loss 4.3689, val loss 4.3670


 10%|█         | 500/5000 [04:27<31:13,  2.40it/s]

step 500: train loss 1.6473, val loss 1.8088


 20%|██        | 1000/5000 [08:52<27:56,  2.39it/s]

step 1000: train loss 1.4336, val loss 1.6295


 30%|███       | 1500/5000 [13:16<24:32,  2.38it/s]

step 1500: train loss 1.3363, val loss 1.5687


 40%|████      | 2000/5000 [17:40<20:39,  2.42it/s]

step 2000: train loss 1.2800, val loss 1.5380


 50%|█████     | 2500/5000 [22:03<17:12,  2.42it/s]

step 2500: train loss 1.2406, val loss 1.5256


 60%|██████    | 3000/5000 [26:27<13:49,  2.41it/s]

step 3000: train loss 1.2047, val loss 1.5208


 70%|███████   | 3500/5000 [30:51<10:18,  2.43it/s]

step 3500: train loss 1.1692, val loss 1.5166


 80%|████████  | 4000/5000 [35:14<06:49,  2.44it/s]

step 4000: train loss 1.1359, val loss 1.5219


 90%|█████████ | 4500/5000 [39:37<03:27,  2.42it/s]

step 4500: train loss 1.1066, val loss 1.5228


100%|██████████| 5000/5000 [44:00<00:00,  1.89it/s]


## Block of code that prints the model summary and weight dictionaries

In [None]:
# Print the name and size of every tensor in the model's state_dict
print("Model's state_dict:")
for tensor_name, tensor in model.state_dict().items():
    print(tensor_name, "\t", tensor.size())

# Print every entry of the optimizer's state_dict
print("Optimizer's state_dict:")
for entry_name, entry in optimizer.state_dict().items():
    print(entry_name, "\t", entry)

## Saving the trained Model

In [None]:
# NOTE(review): PATH is assigned but not used by the save call below, which
# writes to its own hard-coded location.
PATH = 'drive/MyDrive/Colab Notebooks'
# Serializes the whole module object (not only its state_dict) to Drive.
torch.save(obj=m, f='/content/drive/My Drive/model.pth')

## Generating Text from the trained GPT model

In [None]:
#%% Generate from the Model
# Switch to eval mode first: after the training loop the model is still in
# training mode, so dropout would otherwise stay active while sampling.
# The model is left in eval mode afterwards.
m.eval()

# Start from a single token with id 0 and sample 1000 more.
context = torch.zeros((1, 1), dtype=torch.long, device=device)
with torch.no_grad():  # no gradients are needed for sampling
    generated = m.generate(context, max_new_tokens=1000)
print(decode(generated[0].tolist()))


What we'll say this issue forward not vex'd.

PAULINA:
No, gentle unsulen begians,
Yet father of mine. For this faith, I pray?

SAMPSON:
Great Claudio, to thy bed;
To her willy set's and a phins child not of war,
Still our mise-waters: come away in, for Warwick's name
For his ladition; a great day;
Impropering to be sured to his term iron,
Whereof treason? I will beat your black counsel
cracked withal
The crims of ears. But how a idle due that's
the wish chines, the spoils of my Ments did, and my
friend, the blood-stark is a piwer to power.

Clown:
Then goes, Aumerle, sir, to give thee to grief;
Let me command, come; to six her to antooth help
of his is wit, they shall say 'twere let me
Ascripture my guilt wasls; come, now back.

KING EDWARD IV:
Son, Claudio hath done me at the Enforce!

FRIZABETH:
Ay, back again to you the reason's weak;
And so ill she would green courtesy to nother
Many, knightly ears, but advanted. City,
Since it is Christen'd from Warwick, and said
That I'll not s