In [1]:
from importlib.metadata import version

# Report the installed version of each package this notebook relies on.
for label, package in (
    ("matplotlib version:", "matplotlib"),
    ("torch version:", "torch"),
    ("tiktoken version:", "tiktoken"),
):
    print(label, version(package))

matplotlib version: 3.10.1
torch version: 2.5.1
tiktoken version: 0.9.0


# Exercise 4.1

feed forward 모듈과 attention 모듈의 파라미터 수를 각각 구하라.

In [2]:
from ch04 import FeedForward, MultiHeadAttention, GPT_124M_CONFIG

In [3]:
import torch.nn as nn

def count_parameter(m: nn.Module):
    """Return the total number of scalar elements over all parameters of ``m``.

    Args:
        m: module whose ``parameters()`` are counted.

    Returns:
        Sum of ``numel()`` over every parameter yielded by ``m.parameters()``;
        ``0`` when the module has no parameters.
    """
    total = 0
    for param in m.parameters():
        total += param.numel()
    return total

In [4]:
# Feed-forward module built directly from the 124M config.
ff = FeedForward(cfg=GPT_124M_CONFIG)

# Attention keyword arguments, all read from the same 124M config;
# input and output widths are both the embedding dimension.
attn_kwargs = {
    "d_in": GPT_124M_CONFIG.emb_dim,
    "d_out": GPT_124M_CONFIG.emb_dim,
    "num_heads": GPT_124M_CONFIG.n_heads,
    "qkv_bias": GPT_124M_CONFIG.qkv_bias,
    "dropout": GPT_124M_CONFIG.drop_ratio,
}
attn = MultiHeadAttention(**attn_kwargs)

In [5]:
# Count, then report, the parameters of each module in turn.
ff_param_count = count_parameter(ff)
print(f"Feed Forward count: {ff_param_count}")
attn_param_count = count_parameter(attn)
print(f"MHA count: {attn_param_count}")

Feed Forward count: 4722432
MHA count: 2360064


# Exercise 4.2

GPT-2 medium, GPT-2 large, GPT-2 XL 모델을 직접 만드시오.
일단 허깅페이스 config부터 가져오자.
 

In [6]:
from ch04 import GPTConfig

In [7]:
# Settings shared by every GPT-2 size below; only the embedding width,
# the number of heads and the number of layers differ between sizes.
_GPT2_SHARED = dict(
    vocab_size=50257,
    max_context_length=1024,  # `n_positions` in the Hugging Face config
    drop_ratio=0.1,
    qkv_bias=False,
)

# Source: https://huggingface.co/openai-community/gpt2-medium/blob/main/config.json
GPT_MEDIUM_CONFIG = GPTConfig(emb_dim=1024, n_heads=16, n_layers=24, **_GPT2_SHARED)

# Source: https://huggingface.co/openai-community/gpt2-large/blob/main/config.json
GPT_LARGE_CONFIG = GPTConfig(emb_dim=1280, n_heads=20, n_layers=36, **_GPT2_SHARED)

# Source: https://huggingface.co/openai-community/gpt2-xl/blob/main/config.json
GPT_XL_CONFIG = GPTConfig(emb_dim=1600, n_heads=25, n_layers=48, **_GPT2_SHARED)

In [9]:
from ch04 import GPTModel

In [10]:
# Build the GPT-2 medium-sized model from its config (uses the GPTModel imported from ch04).
medium = GPTModel(GPT_MEDIUM_CONFIG)

아래의 large, XL 모델은 크기가 커서 실행을 생략한다.

In [None]:
# Build the GPT-2 large-sized model; this cell was left unexecuted in the saved notebook.
large = GPTModel(GPT_LARGE_CONFIG)

: 

In [None]:
# Build the GPT-2 XL-sized model; this cell was left unexecuted in the saved notebook.
xl = GPTModel(GPT_XL_CONFIG)

# Exercise 4.3

지금까지는 dropout 비율을 하나만 사용했는데, 이를 각 모듈별로 나누어서 설정해 보자.

In [8]:
from dataclasses import dataclass

@dataclass
class ExtendedGPTConfig:
    """GPT hyperparameters with a separate dropout rate per dropout site.

    The three ``drop_rate_*`` fields are read independently by this
    notebook's model definitions: one for the embedding dropout, one passed
    to the attention module, and one for the dropout on each residual branch.
    """

    # Number of rows in the token-embedding table and width of the output logits.
    vocab_size: int
    # Number of rows in the positional-embedding table.
    max_context_length: int
    # Embedding width; also used as the attention input and output width.
    emb_dim: int
    # Number of attention heads passed to the attention module.
    n_heads: int
    # Number of stacked transformer blocks.
    n_layers: int
    # Dropout probability applied to the combined token/position embeddings.
    drop_rate_emb: float
    # Dropout probability forwarded to the attention module.
    drop_rate_attn: float
    # Dropout probability applied to each branch before the residual addition.
    drop_rate_shortcut: float
    # Forwarded to the attention module as its ``qkv_bias`` argument.
    qkv_bias: bool
    
# 124M-sized configuration with every dropout site set to 0.1.
# NOTE(review): this rebinds the name GPT_124M_CONFIG that was imported from
# ch04 at the top of the notebook. The earlier cell that builds `attn` reads
# `GPT_124M_CONFIG.drop_ratio`, which this class does not define, so that cell
# fails if it is re-run after this one.
GPT_124M_CONFIG = ExtendedGPTConfig(
    vocab_size=50257,
    max_context_length=1024,
    emb_dim=768,
    n_heads=12,
    n_layers=12,
    drop_rate_emb=0.1,
    drop_rate_attn=0.1,
    drop_rate_shortcut=0.1,
    qkv_bias=False
)


In [9]:
from ch04 import LayerNorm

In [10]:
import torch

class TransformerBlock(nn.Module):
    """Transformer block with separate attention and shortcut dropout rates.

    Each of the two sub-layers (attention, then feed-forward) is applied to a
    layer-normalised copy of its input, passed through the shared shortcut
    dropout, and added back onto the un-normalised input.

    Args:
        cfg: configuration supplying ``emb_dim``, ``n_heads``, ``qkv_bias``,
            ``drop_rate_attn`` and ``drop_rate_shortcut`` (plus whatever
            ``FeedForward`` reads from it).
    """

    def __init__(self, cfg: ExtendedGPTConfig):
        super().__init__()

        width = cfg.emb_dim

        # Attention keeps the embedding width and uses its own dropout rate.
        self.attn = MultiHeadAttention(
            d_in=width,
            d_out=width,
            num_heads=cfg.n_heads,
            qkv_bias=cfg.qkv_bias,
            dropout=cfg.drop_rate_attn,
        )
        self.ff = FeedForward(cfg)

        self.norm1 = LayerNorm(width)
        self.norm2 = LayerNorm(width)

        # A single dropout module is shared by both residual branches.
        self.drop_shortcut = nn.Dropout(cfg.drop_rate_shortcut)

    def forward(self, x: torch.Tensor):
        """Apply the attention and feed-forward sub-layers with residual adds.

        Args:
            x: input activations (presumably ``(batch, seq, emb_dim)`` -
                TODO confirm against the attention module).

        Returns:
            The result of the second residual addition.
        """
        # Attention sub-layer: normalise -> attend -> dropout -> add input back.
        attn_branch = self.drop_shortcut(self.attn(self.norm1(x)))
        x = attn_branch + x

        # Feed-forward sub-layer: same pattern on the updated activations.
        ff_branch = self.drop_shortcut(self.ff(self.norm2(x)))
        return ff_branch + x

In [11]:
class GPTModel(nn.Module):
    """GPT-style model whose dropout rates are configured per module.

    Token ids are embedded, summed with learned positional embeddings, passed
    through embedding dropout, ``cfg.n_layers`` transformer blocks and a final
    layer norm, and then projected to vocabulary logits.

    NOTE: this definition rebinds the name ``GPTModel`` that was imported
    from ``ch04`` earlier in the notebook.

    Args:
        cfg: configuration supplying ``vocab_size``, ``max_context_length``,
            ``emb_dim``, ``n_layers`` and ``drop_rate_emb`` (plus whatever
            ``TransformerBlock`` reads from it).
    """

    # The annotation is quoted so this cell can be run without
    # ExtendedGPTConfig already being bound in the kernel namespace.
    def __init__(self, cfg: "ExtendedGPTConfig"):
        super().__init__()

        self.tok_emb = nn.Embedding(cfg.vocab_size, cfg.emb_dim)
        self.pos_emb = nn.Embedding(cfg.max_context_length, cfg.emb_dim)
        self.drop_emb = nn.Dropout(cfg.drop_rate_emb)

        self.trf_blocks = nn.Sequential(
            *[TransformerBlock(cfg) for _ in range(cfg.n_layers)]
        )

        self.final_norm = LayerNorm(cfg.emb_dim)
        # Output projection to vocabulary size, without a bias term.
        self.out_head = nn.Linear(
            cfg.emb_dim, cfg.vocab_size, bias=False
        )

    def forward(self, x: torch.Tensor):
        """Compute logits for a batch of token ids.

        Args:
            x: 2-D tensor of integer token ids, ``(batch, seq_len)``.

        Returns:
            Output of the vocabulary projection (presumably
            ``(batch, seq_len, vocab_size)`` - TODO confirm that the
            transformer blocks and layer norm preserve shape).
        """
        _batch_size, seq_len = x.shape

        tok_embeds = self.tok_emb(x)
        positions = torch.arange(seq_len, device=x.device)
        pos_embeds = self.pos_emb(positions)

        # Positional information is ADDED to the token embeddings (broadcast
        # over the batch dimension). Multiplying them, as the previous version
        # did, is not the GPT input formulation.
        x_embeds = tok_embeds + pos_embeds

        hidden = self.drop_emb(x_embeds)
        hidden = self.trf_blocks(hidden)
        hidden = self.final_norm(hidden)
        return self.out_head(hidden)

In [12]:
# Build the 124M model from the per-module dropout config defined in this notebook.
model = GPTModel(GPT_124M_CONFIG)