In [1]:
import math
import os

import torch
import torch.nn as nn
import torch.nn.functional as F

# RMSNorm (Llama uses RMSNorm)

In [2]:
# --- RMSNorm ---
class RMSNorm(nn.Module):
    """Root-mean-square normalization with a learnable per-feature scale.

    Computes ``x * rsqrt(mean(x**2, dim=-1) + eps) * weight``.

    Args:
        dim: Number of features in the last dimension; one scale weight is
            learned per feature (initialized to 1).
        eps: Constant added to the mean square before the reciprocal square
            root so an all-zero row does not divide by zero.
    """

    def __init__(self, dim, eps=1e-6):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def forward(self, x):
        """Normalize ``x`` over its last dimension and apply the scale.

        Args:
            x: Tensor whose last dimension has ``dim`` features.

        Returns:
            Tensor of the same shape as ``x``.
        """
        in_dtype = x.dtype
        # Compute the statistics in at least float32: squaring and averaging
        # in bfloat16/float16 loses most of the mantissa. float64 inputs keep
        # their precision because promote_types picks the wider type.
        compute_dtype = torch.promote_types(in_dtype, torch.float32)
        x_hp = x.to(compute_dtype)
        mean_sq = x_hp.pow(2).mean(-1, keepdim=True)
        normed = x_hp * torch.rsqrt(mean_sq + self.eps)
        # Cast back before scaling so the output dtype follows the same
        # promotion rules as the original ``x * ... * weight`` expression.
        return normed.to(in_dtype) * self.weight


## Rotary Positional Embeddings (RoPE)

In [3]:
# --- RoPE ---
class RotaryEmbedding(nn.Module):
    """Builds the cos/sin angle tables used by rotary position embeddings.

    Args:
        dim: Size of the feature dimension being rotated; one frequency is
            produced per pair of features (``dim // 2`` for even ``dim``).
        base: Base of the geometric frequency progression,
            ``inv_freq[i] = base ** (-2i / dim)``.
    """

    def __init__(self, dim, base=500_000):
        super().__init__()
        self.dim = dim
        self.base = base
        self.register_buffer("inv_freq", self._compute_inv_freq(), persistent=False)

    def _compute_inv_freq(self, device=None):
        """Return the float32 inverse frequencies ``base ** (-2i / dim)``."""
        exponents = torch.arange(0, self.dim, 2, device=device).float() / self.dim
        return 1.0 / (self.base ** exponents)

    def forward(self, seq_len, device, dtype):
        """Return ``(cos, sin)`` tables of shape ``(seq_len, len(inv_freq))``.

        Args:
            seq_len: Number of positions (0 .. seq_len - 1) to generate.
            device: Device on which the tables are created.
            dtype: Dtype of the returned tables.
        """
        # Positions and angles are always built in float32. bfloat16 cannot
        # represent integers above 256 exactly, so building ``arange`` directly
        # in a low-precision ``dtype`` would map distinct positions to the same
        # angle. Only the final cos/sin values are cast to ``dtype``.
        positions = torch.arange(seq_len, device=device, dtype=torch.float32)

        inv_freq = self.inv_freq
        if inv_freq.dtype != torch.float32:
            # ``Module.to(dtype)`` also casts floating-point buffers; rebuild
            # the frequencies rather than upcasting already-rounded values.
            inv_freq = self._compute_inv_freq(device)
        inv_freq = inv_freq.to(device)

        freqs = torch.einsum("i,j->ij", positions, inv_freq)
        return torch.cos(freqs).to(dtype), torch.sin(freqs).to(dtype)

def apply_rope(x, cos, sin):
    """Rotate feature pairs of ``x`` by the angles encoded in ``cos``/``sin``.

    The last dimension of ``x`` is read as interleaved pairs
    ``(x[..., 2i], x[..., 2i + 1])``. Each pair is rotated, and the result is
    returned in split layout: all rotated first components, followed by all
    rotated second components (the output is not re-interleaved).

    Args:
        x: Tensor to rotate.
        cos: Cosine table, broadcastable against ``x[..., ::2]``.
        sin: Sine table, broadcastable against ``x[..., ::2]``.

    Returns:
        Tensor with the same shape and dtype as ``x``.
    """
    # Match the tables to the activation dtype before multiplying.
    cos_t, sin_t = cos.to(x.dtype), sin.to(x.dtype)
    even = x[..., ::2]
    odd = x[..., 1::2]
    rotated_even = even * cos_t - odd * sin_t
    rotated_odd = even * sin_t + odd * cos_t
    return torch.cat([rotated_even, rotated_odd], dim=-1)


# Llama Feed-Forward Network (SwiGLU)

In [4]:
# --- FeedForward ---
class FeedForward(nn.Module):
    """Gated feed-forward block: ``down(silu(gate(x)) * up(x))``.

    Args:
        config: Mapping providing ``"emb_dim"`` (input/output width) and
            ``"hidden_dim"`` (width of the gated inner projection).
    """

    def __init__(self, config):
        super().__init__()
        emb_dim, hidden_dim = config["emb_dim"], config["hidden_dim"]
        # All three projections are bias-free.
        self.gate = nn.Linear(emb_dim, hidden_dim, bias=False)
        self.up = nn.Linear(emb_dim, hidden_dim, bias=False)
        self.down = nn.Linear(hidden_dim, emb_dim, bias=False)

    def forward(self, x):
        """Apply the gated projection to ``x`` (last dimension ``emb_dim``)."""
        gate_act = F.silu(self.gate(x))
        up_proj = self.up(x)
        return self.down(gate_act * up_proj)

# Grouped-Query Multi-Head Attention (GQA)

In [5]:
# --- MultiHeadAttention ---
class MultiHeadAttention(nn.Module):
    """Causal grouped-query self-attention with rotary position embeddings.

    Queries use ``n_heads`` heads; keys and values use ``n_kv_groups`` heads
    that are repeated so each group of query heads shares one key/value head.

    Args:
        config: Mapping providing ``"emb_dim"``, ``"n_heads"``,
            ``"n_kv_groups"`` and ``"rope_base"``.

    Raises:
        ValueError: If ``emb_dim`` is not divisible by ``n_heads``, if
            ``n_heads`` is not divisible by ``n_kv_groups``, or if the
            resulting head dimension is odd (the rotary embedding rotates
            features in pairs).
    """

    def __init__(self, config):
        super().__init__()
        self.emb_dim = config["emb_dim"]
        self.n_heads = config["n_heads"]
        self.n_kv = config["n_kv_groups"]

        # Fail fast with a clear message; otherwise these misconfigurations
        # only surface later as shape errors inside forward().
        if self.emb_dim % self.n_heads != 0:
            raise ValueError(
                f"emb_dim ({self.emb_dim}) must be divisible by n_heads ({self.n_heads})"
            )
        if self.n_heads % self.n_kv != 0:
            raise ValueError(
                f"n_heads ({self.n_heads}) must be divisible by n_kv_groups ({self.n_kv})"
            )
        self.head_dim = self.emb_dim // self.n_heads
        if self.head_dim % 2 != 0:
            raise ValueError(
                f"head_dim ({self.head_dim}) must be even for rotary embeddings"
            )

        self.q_proj = nn.Linear(self.emb_dim, self.emb_dim, bias=False)
        self.k_proj = nn.Linear(self.emb_dim, self.n_kv * self.head_dim, bias=False)
        self.v_proj = nn.Linear(self.emb_dim, self.n_kv * self.head_dim, bias=False)
        self.o_proj = nn.Linear(self.emb_dim, self.emb_dim, bias=False)

        self.rope = RotaryEmbedding(self.head_dim, base=config["rope_base"])

    def forward(self, x):
        """Apply causal self-attention.

        Args:
            x: Tensor of shape ``(B, T, emb_dim)``.

        Returns:
            Tensor of shape ``(B, T, emb_dim)``.
        """
        B, T, C = x.shape

        # Project and split into heads: (B, T, heads, head_dim)
        q = self.q_proj(x).view(B, T, self.n_heads, self.head_dim)
        k = self.k_proj(x).view(B, T, self.n_kv, self.head_dim)
        v = self.v_proj(x).view(B, T, self.n_kv, self.head_dim)

        # RoPE. Request the tables in float32: position indices above 256 are
        # not exactly representable in bfloat16, so building them in the
        # activation dtype would merge distinct positions. apply_rope casts
        # the tables to the activation dtype when it uses them.
        cos, sin = self.rope(T, x.device, torch.float32)
        cos, sin = cos[None, :, None, :], sin[None, :, None, :]
        q = apply_rope(q, cos, sin)
        k = apply_rope(k, cos, sin)

        # Repeat each KV head so it lines up with its group of query heads.
        group_size = self.n_heads // self.n_kv
        k = k.repeat_interleave(group_size, dim=2)
        v = v.repeat_interleave(group_size, dim=2)

        # (B, heads, T, head_dim)
        q, k, v = q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2)

        attn_scores = (q @ k.transpose(-2, -1)) / math.sqrt(self.head_dim)

        # Causal mask: position i may only attend to positions <= i.
        causal_mask = torch.tril(torch.ones(T, T, device=x.device, dtype=torch.bool))
        attn_scores = attn_scores.masked_fill(~causal_mask, float("-inf"))

        # Softmax in at least float32 for stability under bf16/fp16, then
        # return to the value dtype for the weighted sum.
        softmax_dtype = torch.promote_types(attn_scores.dtype, torch.float32)
        attn_probs = F.softmax(attn_scores, dim=-1, dtype=softmax_dtype).to(v.dtype)

        out = attn_probs @ v
        out = out.transpose(1, 2).contiguous().view(B, T, C)
        return self.o_proj(out)

## Transformer Blocks → Llama 3.2 Model

In [6]:
# --- Transformer Block ---
class TransformerBlock(nn.Module):
    """One pre-norm layer: attention and feed-forward, each with a residual.

    Args:
        config: Mapping providing ``"emb_dim"``; it is also passed unchanged
            to ``MultiHeadAttention`` and ``FeedForward``.
    """

    def __init__(self, config):
        super().__init__()
        emb_dim = config["emb_dim"]
        self.attn_norm = RMSNorm(emb_dim)
        self.ffn_norm = RMSNorm(emb_dim)
        self.attn = MultiHeadAttention(config)
        self.ffn = FeedForward(config)

    def forward(self, x):
        """Return ``x`` after the attention and feed-forward residual updates."""
        # Each sub-layer sees a normalized input; its output is added back
        # onto the un-normalized residual stream.
        after_attn = x + self.attn(self.attn_norm(x))
        after_ffn = after_attn + self.ffn(self.ffn_norm(after_attn))
        return after_ffn

# =========================
# 6️⃣ Llama32Model
# =========================
class Llama32Model(nn.Module):
    """Decoder-only language model: embedding, blocks, final norm, LM head.

    Args:
        config: Mapping providing ``"vocab_size"``, ``"emb_dim"`` and
            ``"n_layers"``; it is also passed unchanged to every
            ``TransformerBlock``.
    """

    def __init__(self, config):
        super().__init__()
        vocab_size, emb_dim = config["vocab_size"], config["emb_dim"]
        self.embed = nn.Embedding(vocab_size, emb_dim)
        layers = [TransformerBlock(config) for _ in range(config["n_layers"])]
        self.blocks = nn.ModuleList(layers)
        self.norm = RMSNorm(emb_dim)
        self.lm_head = nn.Linear(emb_dim, vocab_size, bias=False)

    def forward(self, input_ids):
        """Map token ids to logits with a trailing ``vocab_size`` dimension."""
        hidden = self.embed(input_ids)
        for layer in self.blocks:
            hidden = layer(hidden)
        return self.lm_head(self.norm(hidden))

In [7]:
# --- Config ---
# Hyperparameters passed to Llama32Model and, through it, to every sub-module.
# Comments note which visible class reads each key; keys marked "not read"
# are not consumed by any code visible in this notebook.
LLAMA32_CONFIG = {
    "vocab_size": 128_256,  # Llama32Model: embedding rows and lm_head outputs
    "context_length": 131_072,  # not read by the model code shown here
    "emb_dim": 2048,  # width of the residual stream (all modules)
    "n_heads": 32,  # MultiHeadAttention: number of query heads
    "n_layers": 16,  # Llama32Model: number of TransformerBlocks
    "hidden_dim": 8192,  # FeedForward: inner (gated) projection width
    "n_kv_groups": 8,  # MultiHeadAttention: number of key/value heads
    "rope_base": 500_000.0,  # MultiHeadAttention -> RotaryEmbedding base
    "dtype": torch.bfloat16,  # not read by the model code shown here
    # NOTE(review): presumably RoPE frequency-scaling settings; RotaryEmbedding
    # as shown does not read them — confirm whether scaling is intended.
    "rope_freq": {
        "factor": 32.0,
        "low_freq_factor": 1.0,
        "high_freq_factor": 4.0,
        "original_context_length": 8192,
    }
}


## dummy input test

In [8]:
# Prefer the GPU when one is available.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Build the model, then place it on the target device in bfloat16.
model = Llama32Model(LLAMA32_CONFIG).to(device=device, dtype=torch.bfloat16)

# Random token ids: a batch of 2 sequences, 128 tokens each.
dummy_shape = (2, 128)
dummy_input = torch.randint(
    low=0,
    high=LLAMA32_CONFIG["vocab_size"],
    size=dummy_shape,
    device=device,
)

# Smoke test only: no gradients are needed for a shape check.
with torch.no_grad():
    logits = model(dummy_input)

print("Output shape:", logits.shape)


Output shape: torch.Size([2, 128, 128256])


## Tokenizer from Hugging Face

In [18]:
from transformers import AutoTokenizer

model_name = "meta-llama/Llama-3.2-1B"  # or Llama-3.2-1B-Instruct

# Read the Hugging Face access token from the HF_TOKEN environment variable
# instead of hard-coding it in the notebook. An unset or empty variable becomes
# None, which lets the library fall back to any cached login.
# `token=` replaces the deprecated `use_auth_token=` argument.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    token=os.environ.get("HF_TOKEN") or None,
)

print("Tokenizer loaded!")


Tokenizer loaded!


## Encode a sample sentence into tokens and decode it back to text

In [11]:
text = "Hello world! This is a test of the LLaMA 3.2 tokenizer."

# Tokenize into a PyTorch batch containing the single sentence.
encoded = tokenizer(text, return_tensors="pt")
token_ids = encoded["input_ids"]
print("Token IDs:", token_ids)

# Decode the first (only) row back to text, dropping special tokens.
decoded = tokenizer.decode(token_ids[0], skip_special_tokens=True)
print("Decoded text:", decoded)


Token IDs: tensor([[128000,   9906,   1917,      0,   1115,    374,    264,   1296,    315,
            279,    445,   8921,   4940,    220,     18,     13,     17,  47058,
             13]])
Decoded text: Hello world! This is a test of the LLaMA 3.2 tokenizer.


## tokenizer test for edge cases

In [17]:
from transformers import AutoTokenizer

# Read the Hugging Face access token from the HF_TOKEN environment variable
# instead of hard-coding it. An unset or empty variable becomes None, which
# lets the library fall back to any cached login.
TOKEN = os.environ.get("HF_TOKEN") or None
model_name = "meta-llama/Llama-3.2-1B"

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    token=TOKEN
)

# 🔑 Explicitly set PAD = EOS (both token and id)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = "right"

# ---- Edge case batch test ----
# Empty string, whitespace only, a plain word, and a multi-byte emoji.
texts = ["", " ", "Hello", "🚀"]

encoded_batch = tokenizer(
    texts,
    return_tensors="pt",
    padding=True,
    truncation=True
)

# One padded row per input text.
assert encoded_batch["input_ids"].shape[0] == len(texts)

decoded_batch = [
    tokenizer.decode(ids, skip_special_tokens=True)
    for ids in encoded_batch["input_ids"]
]

print("pad_token:", tokenizer.pad_token)
print("pad_token_id:", tokenizer.pad_token_id)
print("input_ids:\n", encoded_batch["input_ids"])
print("decoded:", decoded_batch)


pad_token: <|end_of_text|>
pad_token_id: 128001
input_ids:
 tensor([[128000, 128001, 128001, 128001],
        [128000,    220, 128001, 128001],
        [128000,   9906, 128001, 128001],
        [128000,   9468,    248,    222]])
decoded: ['', ' ', 'Hello', '🚀']


In [13]:
# Clear this cell's output when running on Google Colab. The google.colab
# package is not importable elsewhere, so outside Colab this cell does nothing
# instead of raising ImportError during Restart & Run All.
try:
    from google.colab import output
except ImportError:
    pass
else:
    output.clear()
