# Llama 3.2 1B
- all modules and utils are left in the notebook for educational purposes
- model params (change from Llama 3.1 8B):
    - vocab size 128256 -> 128256
    - input embedding dim 4096 -> 2048
    - context length 8192 -> 131_072
    - masked grouped-query attention w/ 32 heads -> 32 heads
    - number of layers (n_layers) 32 -> 16
    - RoPE -> RoPE scaling
    - final feedforward layer: Swish + SwiGLU + Linear as gate, hidden layer dim 14336 -> 8192
    - add back weight tying:
        reuses / shares weights between the token (input) embedding layer and the final output layer
- NOTE: 
    - GPT applies the positional embeddings to the inputs
    - Llama applies rotations to the query and key vectors in the self-attention mechanism itself

In [None]:
# Hyperparameters for Llama 3.2 1B.
# Entries marked "NEW" differ from the Llama 3.1 8B configuration this
# notebook starts from.
LLAMA32_1B_CONFIG = {
    "vocab_size": 128_256,      # Vocabulary size
    "context_length": 131_072,  # Context length
    "emb_dim": 2048,            # NEW: Half the embedding dimension
    "n_heads": 32,              # Number of attention heads
    "n_layers": 16,             # NEW: Half the number of layers
    "hidden_dim": 8192,         # NEW: Almost half the size of the intermediate dimension in FeedForward
    "n_kv_groups": 8,           # Key-Value groups for grouped-query attention
    "rope_base": 500_000,       # The base in RoPE's "theta" (Llama 3.x uses 500k, not 50k)
    "dtype": torch.bfloat16,    # Lower-precision dtype to save memory
    "rope_freq": {              # RoPE frequency scaling
        "factor": 32.0,         # NEW: Adjustment of the rescaling factor
        "low_freq_factor": 1.0,
        "high_freq_factor": 4.0,
        "original_context_length": 8192,
    },
}

# Hyperparameters for Llama 3.2 3B.
# Same layout as LLAMA32_1B_CONFIG; only the model-size entries
# (emb_dim, n_heads, n_layers) differ.
LLAMA32_3B_CONFIG = {
    "vocab_size": 128_256,      # Vocabulary size
    "context_length": 131_072,  # Context length (128 * 1024 tokens, same as the 1B config)
    "emb_dim": 3072,            # Embedding dimension
    "n_heads": 24,              # Number of attention heads
    "n_layers": 28,             # Number of layers
    "hidden_dim": 8192,         # Size of the intermediate dimension in FeedForward
    "n_kv_groups": 8,           # Key-Value groups for grouped-query attention
    "rope_base": 500_000,       # The base in RoPE's "theta" (Llama 3.x uses 500k, not 50k)
    "dtype": torch.bfloat16,    # Lower-precision dtype to save memory
    "rope_freq": {              # RoPE frequency scaling
        "factor": 32.0,
        "low_freq_factor": 1.0,
        "high_freq_factor": 4.0,
        "original_context_length": 8192,
    },
}

In [None]:
# file name to import functions from 
from helper import import_defs_from_notebook
# Notebook name handed to import_defs_from_notebook
fullname = "converting-gpt-to-llama2"

# Definitions requested from that notebook
names = [
    "precompute_rope_params",
    "compute_rope",
    "SiLU",
    "FeedForward",
    "RMSNorm",
    "MultiHeadAttention",
]

imported_module = import_defs_from_notebook(fullname, names)

In [None]:
# precompute_rope_params is deliberately NOT taken from the imported module:
# it needs to be redefined in this notebook.

# Bind the reusable building blocks; any name missing from the module
# ends up as None instead of raising.
compute_rope, SiLU, FeedForward, RMSNorm = (
    getattr(imported_module, attr, None)
    for attr in ("compute_rope", "SiLU", "FeedForward", "RMSNorm")
)

# MultiHeadAttention is kept only for comparison purposes
MultiHeadAttention = getattr(imported_module, "MultiHeadAttention", None)