In [1]:
# Let's first load GPT-2 (the 124M-parameter model from the OpenAI GPT-2 paper) and then use its pretrained weights.

In [2]:
# Compare with the "Attention Is All You Need" paper and notice that the GPT-2 transformer is a decoder-only model (the encoder, and the cross-attention that consumes its output, are both missing).

In [3]:
# Match the Hugging Face transformers naming scheme!

In [9]:
from dataclasses import dataclass
import torch 
import torch.nn as nn 
from torch.nn import functional as F
import math 
import inspect
master_process = True
# lets first build the SKELETON!!!!
# --------------------------------------------------------------------------
class CausalSelfAttention(nn.Module):
    """Multi-head self-attention in which a position only sees itself and earlier positions.

    Each input position is projected to a query, a key and a value vector.
    Scaled query/key dot products give the attention scores, a lower-triangular
    mask removes the scores for later positions, and the softmax of what is
    left weights the value vectors.

    The constructor reads ``n_embd``, ``n_head`` and ``block_size`` from
    ``config``; ``n_embd`` must be divisible by ``n_head``.
    """

    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0
        width = config.n_embd
        max_len = config.block_size
        # one fused linear layer yields queries, keys and values for every head
        self.c_attn = nn.Linear(width, 3 * width)
        # projection applied to the re-assembled head outputs
        self.c_proj = nn.Linear(width, width)
        self.n_head = config.n_head
        self.n_embd = width
        # lower-triangular matrix of ones, stored as a buffer called "bias";
        # entry (i, j) is 1 when position i is allowed to attend to position j
        causal_mask = torch.tril(torch.ones(max_len, max_len))
        self.register_buffer("bias", causal_mask.view(1, 1, max_len, max_len))

    def forward(self, x):
        """Apply causal self-attention to ``x`` of shape (B, T, C); returns (B, T, C)."""
        batch, seq_len, channels = x.size()
        head_dim = channels // self.n_head  # C = n_head * head_dim

        def to_heads(t):
            # (B, T, C) -> (B, n_head, T, head_dim): heads act as an extra batch dim
            return t.view(batch, seq_len, self.n_head, head_dim).transpose(1, 2)

        q, k, v = (to_heads(t) for t in self.c_attn(x).split(self.n_embd, dim=2))

        # (B, n_head, T, T) score matrix, scaled by 1/sqrt(head_dim)
        scale = 1.0 / math.sqrt(k.size(-1))
        scores = (q @ k.transpose(-2, -1)) * scale

        # masked positions get -inf, so softmax assigns them zero weight
        visible = self.bias[:, :, :seq_len, :seq_len]
        scores = scores.masked_fill(visible == 0, float("-inf"))
        weights = F.softmax(scores, dim=-1)

        # weighted sum of values: (B, nh, T, T) @ (B, nh, T, hs) -> (B, nh, T, hs)
        mixed = weights @ v

        # put the heads back side by side: (B, nh, T, hs) -> (B, T, C)
        merged = mixed.transpose(1, 2).contiguous().view(batch, seq_len, channels)
        return self.c_proj(merged)

class MLP(nn.Module):
    """Feed-forward sub-layer: expand, apply GELU, project back.

    The input of width ``config.n_embd`` is expanded to four times that width
    by ``c_fc``, passed through the tanh approximation of GELU, and projected
    back to ``config.n_embd`` by ``c_proj``. Unlike ReLU, GELU is smooth and
    keeps a small non-zero response (and gradient) for negative inputs.
    """

    def __init__(self, config):
        super().__init__()
        width = config.n_embd
        hidden = 4 * width
        self.c_fc = nn.Linear(width, hidden)
        self.gelu = nn.GELU(approximate='tanh')  # tanh approximation of GELU
        self.c_proj = nn.Linear(hidden, width)

    def forward(self, x):
        """Return ``c_proj(gelu(c_fc(x)))``; the last dimension of ``x`` is preserved."""
        return self.c_proj(self.gelu(self.c_fc(x)))

class Block(nn.Module):
    """One transformer block: pre-norm attention followed by a pre-norm MLP.

    LayerNorm is applied to the *input* of each sub-layer and the sub-layer
    output is added back onto the untouched residual stream, so the residual
    path itself never passes through a normalisation layer.

    Attention is where positions exchange information with each other; the MLP
    then processes every position independently. Stacking blocks repeatedly
    refines the representation carried by the residual stream.
    """

    def __init__(self, config):
        super().__init__()
        width = config.n_embd
        self.ln_1 = nn.LayerNorm(width)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = nn.LayerNorm(width)
        self.mlp = MLP(config)

    def forward(self, x):
        """Return ``x`` after the attention and MLP residual updates."""
        # communicate: normalise, attend, add back onto the residual stream
        attn_out = self.attn(self.ln_1(x))
        x = x + attn_out
        # think: normalise, run the per-position MLP, add back again
        mlp_out = self.mlp(self.ln_2(x))
        return x + mlp_out

@dataclass
class GPTConfig:
    """Model hyperparameters; the defaults are the GPT-2 124M settings."""

    block_size: int = 1_024   # maximum sequence length (rows of the position embedding)
    vocab_size: int = 50_257  # number of token ids (rows of the token embedding)
    n_layer: int = 12         # number of transformer blocks
    n_head: int = 12          # attention heads per block
    n_embd: int = 768         # embedding / residual stream width

# try to match the hugging face transformers/gpt2 schema (which we saw earlier)
# to do this, we will reflect the transformer container from play.ipynb 
# by using nn.ModuleDict - this allows you to index into the submodules using keys
class GPT(nn.Module):
    """Decoder-only GPT-2 language model.

    The sub-module names (``transformer.wte``, ``transformer.wpe``,
    ``transformer.h.<i>``, ``transformer.ln_f``, ``lm_head``) follow the
    Hugging Face GPT-2 state dict, which is what lets ``from_pretrained`` copy
    the published weights across key by key.
    """

    def __init__(self, config):
        """Build the model from ``config`` (reads ``vocab_size``, ``block_size``,
        ``n_embd`` and ``n_layer``)."""
        super().__init__()
        self.config = config

        # nn.ModuleDict lets sub-modules be addressed by key, nn.ModuleList by
        # integer index (h.0 ... h.<n_layer - 1>).
        self.transformer = nn.ModuleDict(dict(
            # token embedding table: one row per vocabulary entry
            wte=nn.Embedding(config.vocab_size, config.n_embd),
            # position embedding table: one row per position up to block_size
            wpe=nn.Embedding(config.block_size, config.n_embd),
            # the stack of transformer blocks
            h=nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            # final LayerNorm applied after the last block
            ln_f=nn.LayerNorm(config.n_embd),
        ))

        # language-model head: projects n_embd features to vocab_size logits
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, optionally, the cross-entropy loss.

        Args:
            idx: integer token ids of shape (B, T) with ``T <= config.block_size``.
            targets: optional token ids of shape (B, T) to score the logits against.

        Returns:
            ``(logits, loss)`` where ``logits`` has shape (B, T, vocab_size) and
            ``loss`` is ``None`` when ``targets`` is not given, otherwise the
            cross-entropy over all B*T positions.

        Raises:
            AssertionError: if ``T`` exceeds ``config.block_size``.
        """
        _, T = idx.size()
        assert T <= self.config.block_size, f"Cannot forward sequence of length {T}, block size is only {self.config.block_size}"

        # positions 0..T-1, created on the same device as the token ids
        pos = torch.arange(0, T, dtype=torch.long, device=idx.device)  # (T,)
        pos_emb = self.transformer.wpe(pos)  # (T, n_embd)
        tok_emb = self.transformer.wte(idx)  # (B, T, n_embd)

        # pos_emb is broadcast over the batch dimension: every row of the
        # batch receives the same position embeddings
        x = tok_emb + pos_emb
        for block in self.transformer.h:
            x = block(x)
        x = self.transformer.ln_f(x)
        logits = self.lm_head(x)  # (B, T, vocab_size)

        loss = None
        if targets is not None:
            # flatten to (B*T, vocab_size) vs (B*T,) as F.cross_entropy expects
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return logits, loss

    @classmethod
    def from_pretrained(cls, model_type):
        """Build a model and fill it with the Hugging Face GPT-2 weights.

        Args:
            model_type: one of ``'gpt2'``, ``'gpt2-medium'``, ``'gpt2-large'``,
                ``'gpt2-xl'``.

        Returns:
            A model whose parameters were copied from
            ``GPT2LMHeadModel.from_pretrained(model_type)``.

        Raises:
            AssertionError: if ``model_type`` is unknown, or if the two state
                dicts disagree in number of keys or in tensor shapes.
        """
        assert model_type in {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}
        from transformers import GPT2LMHeadModel
        print(f"loading weights from pretrained gpt: {model_type}")

        # n_layer, n_head and n_embd are determined by model_type
        config_args = {
            'gpt2':         dict(n_layer=12, n_head=12, n_embd=768),  # 124M params
            'gpt2-medium':  dict(n_layer=24, n_head=16, n_embd=1024), # 350M params
            'gpt2-large':   dict(n_layer=36, n_head=20, n_embd=1280), # 774M params
            'gpt2-xl':      dict(n_layer=48, n_head=25, n_embd=1600), # 1558M params
        }[model_type]
        config_args['vocab_size'] = 50257  # always 50257 for GPT model checkpoints
        config_args['block_size'] = 1024   # always 1024 for GPT model checkpoints

        # freshly initialised model whose tensors are overwritten below
        model = cls(GPTConfig(**config_args))
        sd = model.state_dict()
        # the causal mask is a buffer, not a learned parameter: leave it alone
        sd_keys = [k for k in sd if not k.endswith('.attn.bias')]

        model_hf = GPT2LMHeadModel.from_pretrained(model_type)
        sd_hf = model_hf.state_dict()
        # drop the Hugging Face mask buffers as well
        sd_keys_hf = [k for k in sd_hf if not k.endswith(('.attn.masked_bias', '.attn.bias'))]

        # The OpenAI checkpoints use a "Conv1D" module where this model uses a
        # plain nn.Linear, so these weights must be transposed on import.
        transposed = ('attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight')
        assert len(sd_keys_hf) == len(sd_keys), f"mismatched keys: {len(sd_keys_hf)} != {len(sd_keys)}"
        with torch.no_grad():
            for k in sd_keys_hf:
                if k.endswith(transposed):
                    # Conv1D weight: shapes must match after transposition
                    assert sd_hf[k].shape[::-1] == sd[k].shape
                    sd[k].copy_(sd_hf[k].t())
                else:
                    # everything else is copied as-is
                    assert sd_hf[k].shape == sd[k].shape
                    sd[k].copy_(sd_hf[k])

        return model

    def configure_optimizers(self, weight_decay, learning_rate, device_type):
        """Create an AdamW optimizer with weight decay on matrices only.

        Trainable parameters with two or more dimensions (matmul weights and
        embeddings) go into a group that uses ``weight_decay``; the remaining
        ones (biases, LayerNorm parameters) go into a group with no decay.

        Args:
            weight_decay: decay coefficient for the >=2-D parameter group.
            learning_rate: learning rate passed to AdamW.
            device_type: the fused AdamW kernel is requested only for ``"cuda"``.

        Returns:
            The configured ``torch.optim.AdamW`` instance.

        Note:
            Progress messages are printed when the module-level
            ``master_process`` flag is truthy.
        """
        # only parameters that require grad are optimised
        trainable = [p for _, p in self.named_parameters() if p.requires_grad]
        decay_params = [p for p in trainable if p.dim() >= 2]
        nodecay_params = [p for p in trainable if p.dim() < 2]
        optim_groups = [
            {'params': decay_params, 'weight_decay': weight_decay},
            {'params': nodecay_params, 'weight_decay': 0.0},
        ]
        num_decay_params = sum(p.numel() for p in decay_params)
        num_nodecay_params = sum(p.numel() for p in nodecay_params)
        if master_process:
            print(f"num decayed parameter tensors: {len(decay_params)}, with {num_decay_params:,} parameters")
            print(f"num non-decayed parameter tensors: {len(nodecay_params)}, with {num_nodecay_params:,} parameters")

        # Use the fused AdamW kernel when this torch build offers it and we are on CUDA.
        fused_available = 'fused' in inspect.signature(torch.optim.AdamW).parameters
        use_fused = fused_available and device_type == "cuda"
        if master_process:
            print(f"using fused AdamW: {use_fused}")
        # Forward `fused` only when it is actually selected: a torch build whose
        # AdamW has no such parameter would reject the keyword outright, which
        # would defeat the availability check above.
        extra_args = {'fused': True} if use_fused else {}
        optimizer = torch.optim.AdamW(
            optim_groups, lr=learning_rate, betas=(0.9, 0.95), eps=1e-8, **extra_args
        )
        return optimizer

# Pick the compute backend: CUDA if present, otherwise Apple's MPS backend
# (when this torch build has one and it is usable), otherwise the CPU.
if torch.cuda.is_available():
    device = 'cuda'
elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
    device = "mps"
else:
    device = 'cpu'
print(f"using device: {device}")

import tiktoken

# sampling settings
num_return_sequences = 1  # how many copies of the prompt are continued in parallel
max_length = 200          # number of sampling steps (new tokens per sequence)
temperature = 0.7         # logits are divided by this before the softmax

# Load the pretrained 124M model, switch it to eval mode (good practice
# whenever the model is not being trained) and move it to the chosen device.
model = GPT.from_pretrained('gpt2').eval().to(device)

# text the model is asked to continue
story_prompt = "Once upon a time in a small village nestled between ancient mountains, there lived a curious child who discovered a mysterious door hidden in the forest. When they opened it,"

# Encode the prompt with the GPT-2 tokenizer and stack one copy per requested
# sequence: tokens has shape (num_return_sequences, prompt_length).
enc = tiktoken.get_encoding('gpt2')
tokens = (
    torch.tensor(enc.encode(story_prompt), dtype=torch.long)
    .unsqueeze(0)
    .repeat(num_return_sequences, 1)
)
x = tokens.to(device)


# Generate the story: sample one token per step and append it to x.
with torch.no_grad():
    for i in range(max_length):
        # The model asserts that its input is at most block_size tokens long,
        # so once the running sequence outgrows that, condition only on the
        # most recent block_size tokens.
        x_cond = x if x.size(1) <= model.config.block_size else x[:, -model.config.block_size:]

        # Forward pass
        logits, _ = model(x_cond)

        # Keep only the logits for the last position: (B, vocab_size)
        logits = logits[:, -1, :]

        # Temperature sampling: dividing by temperature < 1 sharpens the distribution
        probs = F.softmax(logits / temperature, dim=-1)

        # Draw one token id per sequence: (B, 1)
        next_tokens = torch.multinomial(probs, num_samples=1)

        # Append the new tokens to the running sequences
        x = torch.cat([x, next_tokens], dim=1)

        # Print progress
        if i % 20 == 0:
            print(f"Generated {i}/{max_length} tokens")

# Decode and print the story
print("\n==== GENERATED STORY ====\n")
for i in range(num_return_sequences):
    generated_story = enc.decode(x[i].tolist())
    print(generated_story)
    print("\n" + "="*50 + "\n")


using device: mps
loading weights from pretrained gpt: gpt2
Generated 0/200 tokens
Generated 20/200 tokens
Generated 40/200 tokens
Generated 60/200 tokens
Generated 80/200 tokens
Generated 100/200 tokens
Generated 120/200 tokens
Generated 140/200 tokens
Generated 160/200 tokens
Generated 180/200 tokens

==== GENERATED STORY ====

Hello, I'm a large language model, but I don't know quite what to do with it.

I don't know any of the languages that the designers use. Do you think it's still possible to have several languages in one interface?

It's possible. It's very likely that there will still be some languages that will be very similar to what we do now without being confused with the languages that are still used in the past.

People who have this habit of building new ones are not necessarily great people. If there were a question about the future of languages, would a question like "What are the future languages going to be?" be pretty easy?

I think that there's a lot of possibili