# Lecture 19: LLM-Architecture

In [50]:
import torch
import torch.nn as nn
import tiktoken

In [51]:
# Hyperparameters shared by every model built in this notebook.
GPT_CONFIG_124M = {
    # Rows of the token-embedding table and width of the output logits.
    "vocab_size": 50257,
    # Rows of the positional-embedding table.
    "context_length": 1024,
    # Width of every embedding vector.
    "emb_dim": 768,
    # Not read by the placeholder model; presumably the attention-head count.
    "n_heads": 12,
    # How many transformer blocks are stacked.
    "n_layers": 12,
    # Probability handed to nn.Dropout.
    "drop_rate": 0.1,
    # Not read by the placeholder model; presumably toggles bias terms in the
    # query/key/value projections of a real attention layer.
    "qkv_bias": False,
}

## Building the basic structure of the GPT model (dummy version)

In [52]:
class DummyGPTModel(nn.Module):
    """GPT-style model skeleton built from placeholder components.

    Token and positional embeddings are summed, passed through dropout, a stack
    of ``DummyTransformerBlock`` modules and a ``DummyLayerNorm``, and finally
    projected to vocabulary-sized logits by a bias-free linear layer.
    """

    def __init__(self, cfg):
        """Create the embedding layers, the block stack and the output head.

        Args:
            cfg: Mapping providing "vocab_size", "context_length", "emb_dim",
                "drop_rate" and "n_layers".
        """
        super().__init__()
        vocab_size, emb_dim = cfg["vocab_size"], cfg["emb_dim"]

        # The layers with weights are created in the same order as before
        # (token embedding, positional embedding, output head), so a seeded
        # run initialises them identically.
        self.token_embedding = nn.Embedding(vocab_size, emb_dim)
        self.positional_embedding = nn.Embedding(cfg["context_length"], emb_dim)
        self.dropout_embedding = nn.Dropout(cfg["drop_rate"])

        blocks = (DummyTransformerBlock(cfg) for _ in range(cfg["n_layers"]))
        self.transformer_blocks = nn.Sequential(*blocks)

        self.final_norm = DummyLayerNorm(emb_dim)
        self.out_head = nn.Linear(emb_dim, vocab_size, bias=False)

    def forward(self, in_idx):
        """Map a batch of token ids to logits over the vocabulary.

        Args:
            in_idx: Two-dimensional tensor of token ids (batch, sequence).

        Returns:
            Tensor of logits whose last dimension has ``vocab_size`` entries.
        """
        # Unpacking into two names also enforces that in_idx is two-dimensional.
        _, num_tokens = in_idx.shape
        positions = torch.arange(num_tokens, device=in_idx.device)

        # The positional embeddings broadcast across the batch dimension.
        hidden = self.token_embedding(in_idx) + self.positional_embedding(positions)
        hidden = self.dropout_embedding(hidden)
        hidden = self.transformer_blocks(hidden)
        return self.out_head(self.final_norm(hidden))


class DummyTransformerBlock(nn.Module):
    """Placeholder transformer block that hands its input back unchanged."""

    def __init__(self, cfg):
        """Accept the model config; nothing is read from it in this placeholder."""
        super().__init__()

    def forward(self, x):
        """Return ``x`` exactly as received."""
        return x


class DummyLayerNorm(nn.Module):
    """Placeholder normalisation layer that hands its input back unchanged."""

    def __init__(self, normalized_shape, eps=1e-5):
        """Accept ``normalized_shape`` and ``eps``; both are ignored in this placeholder."""
        super().__init__()

    def forward(self, x):
        """Return ``x`` exactly as received; no normalisation is applied."""
        return x

## Step 1: Tokenization

In [53]:
# Tokenizer obtained from tiktoken's "gpt2" encoding.
tokenizer = tiktoken.get_encoding("gpt2")

txt1 = "Every effort moves you"
txt2 = "Every day holds a"

# Encode each text into a 1-D tensor of token ids. The list gets its own name
# so that `batch` only ever refers to the stacked 2-D tensor.
token_id_rows = [torch.tensor(tokenizer.encode(text)) for text in (txt1, txt2)]

# torch.stack needs rows of equal length; both sample texts encode to four tokens.
batch = torch.stack(token_id_rows, dim=0)
print(batch)


tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])


## Step 2: Create an instance of the GPT-Dummy-Model

In [54]:
# Seed before constructing the model so the randomly initialised weights are
# reproducible. The model stays in its default training mode, so dropout is
# active and also draws from the seeded generator.
torch.manual_seed(123)
model = DummyGPTModel(GPT_CONFIG_124M)

# Call the module itself rather than .forward(): nn.Module.__call__ dispatches
# to forward() and also runs any registered hooks.
logits = model(batch)
print(f"Output shape:\n{logits}\n{logits.shape}")

Output shape:
tensor([[[-0.9289,  0.2748, -0.7557,  ..., -1.6070,  0.2702, -0.5888],
         [-0.4476,  0.1726,  0.5354,  ..., -0.3932,  1.5285,  0.8557],
         [ 0.5680,  1.6053, -0.2155,  ...,  1.1624,  0.1380,  0.7425],
         [ 0.0448,  2.4787, -0.8843,  ...,  1.3219, -0.0864, -0.5856]],

        [[-1.5474, -0.0542, -1.0571,  ..., -1.8061, -0.4494, -0.6747],
         [-0.8422,  0.8243, -0.1098,  ..., -0.1434,  0.2079,  1.2046],
         [ 0.1355,  1.1858, -0.1453,  ...,  0.0869, -0.1590,  0.1552],
         [ 0.1666, -0.8138,  0.2307,  ...,  2.5035, -0.3055, -0.3083]]],
       grad_fn=<UnsafeViewBackward0>)
torch.Size([2, 4, 50257])


## *interpretation of tensor shape*
### 2 - batch size (number of input sequences in the batch)
### 4 - number of tokens per sequence
### 50257 - vocabulary size
### --> the output logits are unnormalized scores over all tokens in the vocabulary (a softmax turns them into probabilities); the position of the highest value is the token that is most likely to be the next token in the sequence

## Each row within one sub-block is the result of one next-token prediction task
### 1st input -> "Every" ---> predicting the word "effort" (1st row in the sub-block)
### 2nd input -> "Every effort" ---> predicting the word "moves" (2nd row in the sub-block)
### 3rd input -> "Every effort moves" ---> predicting the word "you" (3rd row in the sub-block)
### 4th input -> "Every effort moves you" ---> predicting the word "forward" (4th row in the sub-block)