In [1]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import tiktoken
import numpy as np
import torch.nn.functional as F

In [2]:
# Settings that are identical for every model size defined below.
_GPT_SHARED_SETTINGS = {
    "vocab_size": 50257,    # Vocabulary size
    "context_len": 1024,    # Context length
    "drop_rate": 0.1,       # Dropout rate
    "qkv_bias": False,      # Query-Key-Value bias
    "batch_size": 2,        # Batch size
}


def _gpt_config(n_layers, n_heads, emb_dim):
    """Return a new, independent config dict for one model size.

    The result holds the shared settings plus the three size-specific
    entries: number of layers, number of attention heads, embedding dimension.
    """
    return {
        **_GPT_SHARED_SETTINGS,
        "n_layers": n_layers,
        "n_heads": n_heads,
        "emb_dim": emb_dim,
    }


GPT_CONFIG_SMALL = _gpt_config(n_layers=12, n_heads=12, emb_dim=768)
GPT_CONFIG_MEDIUM = _gpt_config(n_layers=24, n_heads=16, emb_dim=1024)
GPT_CONFIG_LARGE = _gpt_config(n_layers=36, n_heads=20, emb_dim=1280)
GPT_CONFIG_XL = _gpt_config(n_layers=48, n_heads=25, emb_dim=1600)

In [3]:
class GPTDataset(Dataset):
    """Sliding-window next-token dataset built from a single text.

    The text is tokenized once. Every sample is a pair ``(x, y)`` of
    ``context_size`` token ids where ``y`` is ``x`` shifted one token to the
    right, i.e. ``y[j]`` is the token that follows ``x[j]`` in the text.

    Args:
        text: The raw text to tokenize.
        tokenizer: Any object with an ``encode(text)`` method returning a
            sequence of integer token ids.
        context_size: Number of tokens per sample (must be >= 1).
        stride: Distance in tokens between the starts of consecutive
            windows (must be >= 1).

    Raises:
        ValueError: If ``context_size`` or ``stride`` is not positive, or if
            the text encodes to ``context_size`` tokens or fewer, in which
            case not even one (input, target) pair can be formed.
    """

    def __init__(self, text, tokenizer, context_size=4, stride=1):
        if context_size < 1:
            raise ValueError(f"context_size must be >= 1, got {context_size}")
        if stride < 1:
            raise ValueError(f"stride must be >= 1, got {stride}")

        token_ids = torch.tensor(tokenizer.encode(text), dtype=torch.long)
        if token_ids.numel() <= context_size:
            raise ValueError(
                f"text encodes to {token_ids.numel()} tokens, but at least "
                f"{context_size + 1} are needed for context_size={context_size}"
            )

        # Inputs are windows over all tokens but the last, targets are the same
        # windows over all tokens but the first. This yields one window per
        # start index in range(0, len(tokens) - context_size, stride).
        self.x = token_ids[:-1].unfold(0, context_size, stride).contiguous()
        self.y = token_ids[1:].unfold(0, context_size, stride).contiguous()

    def __len__(self):
        """Return the number of (input, target) windows."""
        return len(self.x)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` pair stored at position ``idx``."""
        return self.x[idx], self.y[idx]

    def load_data(self, batch_size=32, shuffle=True):
        """Wrap this dataset in a ``DataLoader``.

        The loader drops the last incomplete batch and loads in the main
        process (``num_workers=0``).
        """
        return DataLoader(self, batch_size=batch_size, shuffle=shuffle, drop_last=True, num_workers=0)

In [4]:
class EmbeddingLayer(nn.Module):
    """Sum of a token embedding and a learned absolute position embedding.

    Args:
        vocab_size: Number of rows in the token embedding table.
        embed_size: Width of each embedding vector.
        max_len: Number of rows in the position embedding table, i.e. the
            longest sequence length that can be embedded.
    """

    def __init__(self, vocab_size, embed_size, max_len):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, embed_size)
        self.position_embedding = nn.Embedding(max_len, embed_size)

    def forward(self, x):
        """Embed token ids ``x``; positions are taken from ``x.shape[1]``.

        The position embedding is broadcast across the leading dimension
        when added to the token embedding.
        """
        tok_embed = self.token_embedding(x)
        # Build the position ids on the same device as the input so the lookup
        # also works when the module has been moved off the CPU.
        positions = torch.arange(x.shape[1], device=x.device)
        pos_embed = self.position_embedding(positions)
        return tok_embed + pos_embed

In [5]:
class MultiHeadAttention(nn.Module):
    """Causal multi-head self-attention.

    Args:
        d_in: Width of the input vectors.
        d_out: Width of the output vectors; split evenly across the heads.
        context_length: Side length of the pre-built causal mask, i.e. the
            longest sequence the mask covers.
        n_heads: Number of attention heads.
        drop_rate: Dropout probability applied to the attention weights.
        qkv_bias: Whether the query/key/value projections have a bias term.

    Raises:
        ValueError: If ``d_out`` is not divisible by ``n_heads``.
    """

    def __init__(self, d_in, d_out, context_length, n_heads, drop_rate, qkv_bias):
        super().__init__()
        if d_out % n_heads != 0:
            raise ValueError(f"d_out ({d_out}) must be divisible by n_heads ({n_heads})")
        self.d_out = d_out
        self.n_heads = n_heads
        self.head_dim = d_out // n_heads
        self.q = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.k = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.v = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.dropout = nn.Dropout(drop_rate)
        self.out_proj = nn.Linear(d_out, d_out)
        # Ones strictly above the diagonal mark the "future" positions a query
        # must not attend to.
        self.register_buffer("mask", torch.triu(torch.ones(context_length, context_length), diagonal=1))

    def forward(self, x):
        """Apply masked self-attention to ``x`` of shape (B, T, d_in).

        Returns a tensor of shape (B, T, d_out).
        """
        B, T, _ = x.shape
        # Project, then split the feature axis into heads: (B, n_heads, T, head_dim).
        Q = self.q(x).view(B, T, self.n_heads, self.head_dim).transpose(1, 2)
        K = self.k(x).view(B, T, self.n_heads, self.head_dim).transpose(1, 2)
        V = self.v(x).view(B, T, self.n_heads, self.head_dim).transpose(1, 2)
        attention_scores = Q @ K.transpose(-2, -1)
        mask = self.mask[:T, :T]
        # -inf scores become zero weights after the softmax.
        attention_scores = attention_scores.masked_fill(mask.bool(), -torch.inf)
        attention_weights = torch.softmax(attention_scores / (self.head_dim ** 0.5), dim=-1)
        attention_weights = self.dropout(attention_weights)
        context_vectors = attention_weights @ V
        # Merge the heads back using the projection width d_out; the input
        # width d_in may differ from it.
        context_vectors = context_vectors.transpose(1, 2).contiguous().view(B, T, self.d_out)
        return self.out_proj(context_vectors)

In [6]:
class LayerNorm(nn.Module):
    """Layer normalization over the last dimension with learnable scale and shift.

    Args:
        cfg: Config dict; only ``cfg["emb_dim"]`` is read, and it sets the
            size of the scale (``weight``) and shift (``bias``) vectors.
    """

    def __init__(self, cfg):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(cfg["emb_dim"]))
        self.bias = nn.Parameter(torch.zeros(cfg["emb_dim"]))
        self.eps = 1e-5

    def forward(self, x):
        """Normalize ``x`` along its last dimension, then scale and shift."""
        mean = x.mean(dim=-1, keepdim=True)
        # Biased (population) variance with eps inside the square root: this is
        # the standard layer-norm definition and stays finite even when the
        # last dimension has a single element.
        var = x.var(dim=-1, keepdim=True, unbiased=False)
        normalized = (x - mean) / torch.sqrt(var + self.eps)
        return normalized * self.weight + self.bias

In [7]:
class GELU(nn.Module):
    """GELU activation using the tanh approximation.

    Computes ``0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x**3)))``
    element-wise.
    """

    def __init__(self):
        super().__init__()
        # Plain Python float, computed once: no tensor is allocated per call,
        # and the constant takes the precision of the input dtype instead of
        # being fixed to float32.
        self._sqrt_2_over_pi = (2.0 / torch.pi) ** 0.5

    def forward(self, x):
        """Apply the activation element-wise; the output has the shape of ``x``."""
        inner = self._sqrt_2_over_pi * (x + 0.044715 * torch.pow(x, 3))
        return 0.5 * x * (1 + torch.tanh(inner))

In [8]:
class MLP(nn.Module):
    """Feed-forward sub-network: widen to 4x ``emb_dim``, apply GELU, project back.

    Args:
        cfg: Config dict; only ``cfg["emb_dim"]`` is read.
    """

    def __init__(self, cfg):
        super().__init__()
        width = cfg["emb_dim"]
        hidden_width = 4 * width
        expand = nn.Linear(width, hidden_width)
        activation = GELU()
        project = nn.Linear(hidden_width, width)
        self.layers = nn.Sequential(expand, activation, project)

    def forward(self, x):
        """Run ``x`` through the expand -> GELU -> project stack."""
        out = self.layers(x)
        return out

In [9]:
class TransformerBlock(nn.Module):
    """One pre-norm transformer block: attention and feed-forward, each with a residual.

    Args:
        cfg: Config dict providing ``emb_dim``, ``context_len``, ``n_heads``,
            ``drop_rate`` and ``qkv_bias``.
    """

    def __init__(self, cfg):
        super().__init__()
        width = cfg["emb_dim"]
        # Input and output widths of the attention layer are both emb_dim.
        self.att = MultiHeadAttention(
            d_in=width,
            d_out=width,
            context_length=cfg["context_len"],
            n_heads=cfg["n_heads"],
            drop_rate=cfg["drop_rate"],
            qkv_bias=cfg["qkv_bias"],
        )
        self.ff = MLP(cfg)
        self.norm_1 = LayerNorm(cfg)
        self.norm_2 = LayerNorm(cfg)
        # A single dropout module is shared by both sub-layers.
        self.dropout = nn.Dropout(cfg["drop_rate"])

    def forward(self, x):
        """Apply both sub-layers to ``x`` and return a tensor of the same shape."""
        # Attention sub-layer: normalize, attend, drop, add the residual.
        x = x + self.dropout(self.att(self.norm_1(x)))
        # Feed-forward sub-layer, same pattern.
        x = x + self.dropout(self.ff(self.norm_2(x)))
        return x

In [10]:
class GPT2(nn.Module):
    """GPT-2 style decoder: embeddings, a stack of transformer blocks, final norm, output head.

    Args:
        cfg: Config dict providing ``vocab_size``, ``emb_dim``, ``context_len``,
            ``drop_rate`` and ``n_layers`` (plus whatever ``TransformerBlock``
            reads from it).
    """

    def __init__(self, cfg):
        super().__init__()
        vocab_size, width = cfg["vocab_size"], cfg["emb_dim"]
        self.embedding_layer = EmbeddingLayer(vocab_size, width, cfg["context_len"])
        self.drop_emb = nn.Dropout(cfg["drop_rate"])
        block_stack = [TransformerBlock(cfg) for _ in range(cfg["n_layers"])]
        self.blocks = nn.Sequential(*block_stack)
        self.ln_f = LayerNorm(cfg)
        # Bias-free projection from the embedding width to one score per vocabulary entry.
        self.out_head = nn.Linear(width, vocab_size, bias=False)

    def forward(self, x):
        """Map token ids ``x`` to output scores whose last dimension is ``vocab_size``."""
        hidden = self.embedding_layer(x)
        hidden = self.drop_emb(hidden)
        hidden = self.blocks(hidden)
        hidden = self.ln_f(hidden)
        logits = self.out_head(hidden)
        return logits

In [11]:
# Read the whole text file into memory as a single string (decoded as UTF-8).
# The path is relative, so it resolves against the notebook's working directory.
with open("../the-verdict.txt", "r", encoding="utf-8") as file:
    raw_text = file.read()

In [12]:
# Seed PyTorch's global RNG first so that the shuffled batch order and the
# random weight initialisation below are the same on every Restart & Run All.
torch.manual_seed(123)

tokenizer = tiktoken.get_encoding("gpt2")
dataset = GPTDataset(raw_text, tokenizer, GPT_CONFIG_SMALL["context_len"])
# NOTE: `dataloader` is an iterator over the DataLoader, so every
# next(dataloader) call consumes one batch.
dataloader = iter(dataset.load_data(GPT_CONFIG_SMALL["batch_size"]))
model = GPT2(GPT_CONFIG_SMALL)



In [13]:
# Take one batch, run a forward pass, and measure the cross-entropy of the
# model's outputs against the next-token targets.
x, y = next(dataloader)
out = model(x)
B, T, C = out.shape
# cross_entropy wants one row of scores per target, so merge the first two
# dimensions of the outputs and flatten the targets to match.
out, y = out.reshape(B * T, C), y.reshape(B * T)
loss = F.cross_entropy(input=out, target=y)
print(loss)

tensor(10.9871, grad_fn=<NllLossBackward0>)


In [14]:
def _n_params(module):
    """Return the total number of elements over all parameters of ``module``."""
    return sum(p.numel() for p in module.parameters())


# Parameter count of each top-level component of the model.
emb_params = _n_params(model.embedding_layer)
block_params = _n_params(model.blocks)
ln_params = _n_params(model.ln_f)
out_params = _n_params(model.out_head)

for label, count in (
    ("Embedding layer:", emb_params),
    ("Transformer blocks:", block_params),
    ("Final layer norm:", ln_params),
    ("Output head:", out_params),
):
    print(label, count)

# Cross-check: counting over the whole model must equal the per-component sum.
total_direct = _n_params(model)
total_summed = sum((emb_params, block_params, ln_params, out_params))

print("\nTotal parameters (direct):", total_direct)
print("Total parameters (summed):", total_summed)
assert total_direct == total_summed, "Parameter counts don't match!"

Embedding layer: 39383808
Transformer blocks: 85026816
Final layer norm: 1536
Output head: 38597376

Total parameters (direct): 163009536
Total parameters (summed): 163009536


In [15]:
# Rough size of the parameters alone, counting 4 bytes for each one.
BYTES_PER_MB = 1024 * 1024
bytes_per_param = 4
total_bytes = bytes_per_param * total_direct
total_mb = total_bytes / BYTES_PER_MB
total_gb = total_mb / 1024

print(f"\nModel size in memory: {total_mb:.2f} MB")
print(f"Model size in memory: {total_gb:.2f} GB")


Model size in memory: 621.83 MB
Model size in memory: 0.61 GB


In [16]:
# Instantiate one model per config, in order of size, then report how many
# parameters each one has.
small_model, medium_model, large_model, xl_model = (
    GPT2(size_cfg)
    for size_cfg in (GPT_CONFIG_SMALL, GPT_CONFIG_MEDIUM, GPT_CONFIG_LARGE, GPT_CONFIG_XL)
)

for size_name, sized_model in (
    ("small", small_model),
    ("medium", medium_model),
    ("large", large_model),
    ("xl", xl_model),
):
    print(f"Number of parameters ({size_name}): {sum(p.numel() for p in sized_model.parameters())}")

Number of parameters (small): 163009536
Number of parameters (medium): 406212608
Number of parameters (large): 838220800
Number of parameters (xl): 1637792000


In [17]:
def generate_text_simple(model, idx, max_new_tokens, context_size):
    """Extend ``idx`` by ``max_new_tokens`` sampled tokens, one per step.

    At every step the last ``context_size`` columns of the running sequence
    are passed to ``model`` with gradients disabled. The outputs at the final
    position are turned into probabilities with a softmax, one token id per
    row is drawn with ``torch.multinomial``, and it is appended along dim 1.

    Args:
        model: Callable mapping a (batch, tokens) id tensor to scores indexed
            as (batch, tokens, vocab).
        idx: Starting token ids, indexed as (batch, tokens).
        max_new_tokens: Number of tokens to append to each row.
        context_size: Maximum number of trailing tokens shown to the model.

    Returns:
        The token id tensor with ``max_new_tokens`` extra columns.
    """
    tokens = idx
    for _step in range(max_new_tokens):
        window = tokens[:, -context_size:]
        with torch.no_grad():
            last_logits = model(window)[:, -1, :]
        next_probs = torch.softmax(last_logits, dim=-1)
        sampled = torch.multinomial(next_probs, num_samples=1)
        tokens = torch.cat([tokens, sampled], dim=1)
    return tokens

In [18]:
# Same settings as the small config except for a 5-token context window.
GPT_CONFIG_TEST = dict(
    vocab_size=50257,    # Vocabulary size
    n_layers=12,         # Number of layers
    n_heads=12,          # Number of attention heads
    emb_dim=768,         # Embedding dimension
    context_len=5,       # Context length
    drop_rate=0.1,       # Dropout rate
    qkv_bias=False,      # Query-Key-Value bias
    batch_size=2,        # Batch size
)

In [19]:
# Rebuild the tokenizer, data pipeline and model for the short-context test
# config. This rebinds `tokenizer`, `dataset`, `dataloader`, `model`, `x`, `y`.
tokenizer = tiktoken.get_encoding("gpt2")
test_context_len = GPT_CONFIG_TEST["context_len"]
test_batch_size = GPT_CONFIG_TEST["batch_size"]

dataset = GPTDataset(raw_text, tokenizer, test_context_len)
dataloader = iter(dataset.load_data(test_batch_size))
model = GPT2(GPT_CONFIG_TEST)
# Fetch the first batch only after the model has been built.
x, y = next(dataloader)

In [22]:
# Show the prompt batch as decoded text.
print(x.shape)
print("\n=== Input Sequences ===")
print("-" * 50)
for seq_no, prompt_ids in enumerate(x, start=1):
    print(f"Sequence {seq_no}:")
    print(tokenizer.decode(prompt_ids.tolist()))
    print()

# Switch to eval mode before sampling, then append 5 new tokens to each prompt.
model.eval()
idx_out = generate_text_simple(
    model,
    x,
    max_new_tokens=5,
    context_size=GPT_CONFIG_TEST["context_len"],
)

# Show each prompt together with its sampled continuation.
print(idx_out.shape)
print("=== Generated Outputs ===")
print("-" * 50)
for seq_no, generated_ids in enumerate(idx_out, start=1):
    print(f"Generated sequence {seq_no}:")
    decoded_out = tokenizer.decode(generated_ids.tolist())
    print(decoded_out)
    print()

torch.Size([2, 5])

=== Input Sequences ===
--------------------------------------------------
Sequence 1:
.

"Well

Sequence 2:
ered his art, it

torch.Size([2, 10])
=== Generated Outputs ===
--------------------------------------------------
Generated sequence 1:
.

"Well solidarity unseen Assets Litop

Generated sequence 2:
ered his art, it lamented patioussian PBS manufacturers

