In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

In [None]:
# --- Config ---
# Hyperparameters for the from-scratch model defined in this notebook.
# NOTE(review): "context_length", "dtype" and "rope_freq" are not read by the
# model classes visible in this notebook.
LLAMA32_CONFIG = dict(
    vocab_size=128_256,      # embedding rows / LM-head output width
    context_length=4096,
    emb_dim=2048,            # width of the residual stream
    n_heads=32,              # query heads per attention layer
    n_layers=16,             # number of stacked transformer blocks
    hidden_dim=8192,         # inner width of the feed-forward network
    n_kv_groups=8,           # key/value heads shared by the query heads
    rope_base=500_000.0,     # base passed to RotaryEmbedding
    dtype=torch.bfloat16,
    rope_freq=dict(
        factor=32.0,
        low_freq_factor=1.0,
        high_freq_factor=4.0,
        original_context_length=8192,
    ),
)


# RMSNorm (Llama uses RMSNorm)

In [None]:
# --- RMSNorm ---
class RMSNorm(nn.Module):
    """Root-mean-square normalisation over the last dimension with a learnable gain.

    Args:
        dim: Size of the last (feature) dimension of the inputs.
        eps: Constant added to the mean square before the reciprocal square
            root. The default is 1e-6; the Hugging Face checkpoint printed
            later in this notebook reports eps=1e-05, so pass ``eps``
            explicitly if the two need to match.
    """

    def __init__(self, dim, eps=1e-6):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def forward(self, x):
        """Return ``x / sqrt(mean(x**2, -1) + eps) * weight``.

        For float16 / bfloat16 inputs the statistics are computed in float32:
        squaring in float16 overflows to ``inf`` for |x| > ~255 (which would
        zero the output), and bfloat16 keeps only 8 significant bits. The
        normalised tensor is cast back to the input dtype before the gain is
        applied, so dtype promotion with ``weight`` is the same as before.
        """
        in_dtype = x.dtype
        if in_dtype in (torch.float16, torch.bfloat16):
            x = x.to(torch.float32)
        mean_sq = x.pow(2).mean(-1, keepdim=True)
        normed = x * torch.rsqrt(mean_sq + self.eps)
        return normed.to(in_dtype) * self.weight


## Rotary Positional Embeddings (RoPE)

In [None]:
# --- RoPE ---
class RotaryEmbedding(nn.Module):
    """Produces the cosine / sine tables used by rotary position embeddings.

    Args:
        dim: Per-head feature size; one frequency is generated for every
            second index in ``range(0, dim, 2)``.
        base: Base of the geometric frequency progression.
    """

    def __init__(self, dim, base=500_000):
        super().__init__()
        self.dim = dim
        self.base = base
        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
        # Kept as a non-persistent buffer (not saved in the state dict).
        self.register_buffer("inv_freq", inv_freq, persistent=False)

    def forward(self, seq_len, device, dtype):
        """Return ``(cos, sin)``, each of shape ``(seq_len, ceil(dim / 2))`` in ``dtype``.

        The angles are computed in float32 whenever ``dtype`` is float16 or
        bfloat16 and only the final tables are cast down. Building the
        position index directly in bfloat16 is lossy (integers above 256 are
        not all representable, so e.g. positions 256 and 257 become equal).
        The frequencies are rebuilt from ``dim`` / ``base`` here rather than
        read from the ``inv_freq`` buffer, because casting the module with
        ``.to(torch.bfloat16)`` also rounds floating-point buffers.
        """
        low_precision = dtype in (torch.float16, torch.bfloat16)
        compute_dtype = torch.float32 if low_precision else dtype

        exponents = torch.arange(0, self.dim, 2, device=device).to(compute_dtype) / self.dim
        inv_freq = 1.0 / (self.base ** exponents)

        t = torch.arange(seq_len, device=device, dtype=compute_dtype)
        freqs = torch.einsum("i,j->ij", t, inv_freq)
        return torch.cos(freqs).to(dtype), torch.sin(freqs).to(dtype)

def apply_rope(x, cos, sin):
    """Rotate even/odd feature pairs of ``x`` by the angles encoded in ``cos`` / ``sin``.

    ``cos`` and ``sin`` are cast to ``x``'s dtype and must broadcast against
    the even- and odd-indexed slices of ``x``'s last dimension. The result
    holds all first rotated components followed by all second rotated
    components along the last dimension (i.e. it is not re-interleaved).
    """
    cos, sin = cos.to(x.dtype), sin.to(x.dtype)
    even = x[..., 0::2]
    odd = x[..., 1::2]
    first_half = even * cos - odd * sin
    second_half = even * sin + odd * cos
    return torch.cat((first_half, second_half), dim=-1)


# Llama Feed-Forward Network (SwiGLU)

In [None]:
# --- FeedForward ---
class FeedForward(nn.Module):
    """Gated feed-forward network: ``down(silu(gate(x)) * up(x))``.

    Reads ``config["emb_dim"]`` (input/output width) and
    ``config["hidden_dim"]`` (inner width). All three projections are
    bias-free linear layers.
    """

    def __init__(self, config):
        super().__init__()
        d_model, d_hidden = config["emb_dim"], config["hidden_dim"]
        self.gate = nn.Linear(d_model, d_hidden, bias=False)
        self.up = nn.Linear(d_model, d_hidden, bias=False)
        self.down = nn.Linear(d_hidden, d_model, bias=False)

    def forward(self, x):
        """Apply the gated projection to the last dimension of ``x``."""
        gate_act = F.silu(self.gate(x))
        up_proj = self.up(x)
        return self.down(gate_act * up_proj)

# Grouped-Query Multi-Head Attention (GQA)

In [None]:
# --- MultiHeadAttention ---
class MultiHeadAttention(nn.Module):
    """Causal grouped-query self-attention with rotary position embeddings.

    Reads ``"emb_dim"``, ``"n_heads"``, ``"n_kv_groups"`` and ``"rope_base"``
    from ``config``. Queries use ``n_heads`` heads; keys and values use
    ``n_kv_groups`` heads that are repeated so each group of query heads
    shares one key/value head.

    Raises:
        ValueError: if the head counts are not positive, ``emb_dim`` is not
            divisible by ``n_heads``, ``n_heads`` is not divisible by
            ``n_kv_groups``, or the resulting head size is odd (the rotary
            embedding pairs up features).
    """

    def __init__(self, config):
        super().__init__()
        self.emb_dim = config["emb_dim"]
        self.n_heads = config["n_heads"]
        self.n_kv = config["n_kv_groups"]

        # Fail here with a clear message instead of with a shape error deep
        # inside forward().
        if self.n_heads <= 0 or self.n_kv <= 0:
            raise ValueError("n_heads and n_kv_groups must be positive")
        if self.emb_dim % self.n_heads != 0:
            raise ValueError(
                f"emb_dim ({self.emb_dim}) must be divisible by n_heads ({self.n_heads})"
            )
        if self.n_heads % self.n_kv != 0:
            raise ValueError(
                f"n_heads ({self.n_heads}) must be divisible by n_kv_groups ({self.n_kv})"
            )
        self.head_dim = self.emb_dim // self.n_heads
        if self.head_dim % 2 != 0:
            raise ValueError(f"head_dim ({self.head_dim}) must be even for rotary embeddings")

        self.q_proj = nn.Linear(self.emb_dim, self.emb_dim, bias=False)
        self.k_proj = nn.Linear(self.emb_dim, self.n_kv * self.head_dim, bias=False)
        self.v_proj = nn.Linear(self.emb_dim, self.n_kv * self.head_dim, bias=False)
        self.o_proj = nn.Linear(self.emb_dim, self.emb_dim, bias=False)

        self.rope = RotaryEmbedding(self.head_dim, base=config["rope_base"])

    def forward(self, x):
        """Attend over ``x`` of shape ``(B, T, emb_dim)`` and return the same shape."""
        B, T, C = x.shape
        dtype = x.dtype

        # Project QKV and split into heads: (B, T, heads, head_dim)
        q = self.q_proj(x).view(B, T, self.n_heads, self.head_dim)
        k = self.k_proj(x).view(B, T, self.n_kv, self.head_dim)
        v = self.v_proj(x).view(B, T, self.n_kv, self.head_dim)

        # RoPE: tables are (T, head_dim/2); add batch and head axes to broadcast
        cos, sin = self.rope(T, x.device, dtype)
        cos, sin = cos[None, :, None, :], sin[None, :, None, :]
        q = apply_rope(q, cos, sin)
        k = apply_rope(k, cos, sin)

        # Expand KV so every query head has a matching key/value head
        group_size = self.n_heads // self.n_kv
        k = k.repeat_interleave(group_size, dim=2)
        v = v.repeat_interleave(group_size, dim=2)

        # (B, heads, T, head_dim)
        q, k, v = q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2)

        # Scaled dot-product scores: (B, heads, T, T)
        attn_scores = (q @ k.transpose(-2, -1)) / math.sqrt(self.head_dim)

        # Causal mask: position i may only attend to positions <= i
        causal_mask = torch.tril(torch.ones(T, T, device=x.device, dtype=torch.bool))
        attn_scores = attn_scores.masked_fill(~causal_mask, float("-inf"))

        # Softmax in float32 for half-precision inputs (the exponentials and
        # their sum are inaccurate in float16 / bfloat16), then cast back.
        if attn_scores.dtype in (torch.float16, torch.bfloat16):
            attn_probs = F.softmax(attn_scores.float(), dim=-1).to(attn_scores.dtype)
        else:
            attn_probs = F.softmax(attn_scores, dim=-1)

        out = attn_probs @ v
        out = out.transpose(1, 2).contiguous().view(B, T, C)
        return self.o_proj(out)

## Transformer Blocks → Llama 3.2 Model

In [None]:
# --- Transformer Block ---
class TransformerBlock(nn.Module):
    """One decoder layer: attention and feed-forward, each applied to an
    RMS-normalised input and added back onto the residual stream."""

    def __init__(self, config):
        super().__init__()
        width = config["emb_dim"]
        self.attn_norm = RMSNorm(width)
        self.ffn_norm = RMSNorm(width)
        self.attn = MultiHeadAttention(config)
        self.ffn = FeedForward(config)

    def forward(self, x):
        """Return ``x`` after the attention and feed-forward residual updates."""
        after_attn = x + self.attn(self.attn_norm(x))
        return after_attn + self.ffn(self.ffn_norm(after_attn))

# =========================
# 6️⃣ Llama32Model
# =========================
class Llama32Model(nn.Module):
    """Token embedding -> ``n_layers`` transformer blocks -> final RMSNorm -> LM head.

    Reads ``"vocab_size"``, ``"emb_dim"`` and ``"n_layers"`` from ``config``
    (the blocks read their own keys). The embedding and the output
    projection are separate parameters.
    """

    def __init__(self, config):
        super().__init__()
        vocab_size, width = config["vocab_size"], config["emb_dim"]
        self.embed = nn.Embedding(vocab_size, width)
        self.blocks = nn.ModuleList(
            [TransformerBlock(config) for _ in range(config["n_layers"])]
        )
        self.norm = RMSNorm(width)
        self.lm_head = nn.Linear(width, vocab_size, bias=False)

    def forward(self, input_ids):
        """Map token ids to logits with a trailing ``vocab_size`` dimension."""
        hidden = self.embed(input_ids)
        for layer in self.blocks:
            hidden = layer(hidden)
        return self.lm_head(self.norm(hidden))

# Test dummy input + Load the model on hardware

In [None]:
import torch

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Using device:", device)


Using device: cuda


In [None]:
# Smoke test: build the from-scratch model in bfloat16 on the selected device
# and push one batch of random token ids through it.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = Llama32Model(LLAMA32_CONFIG).to(device=device, dtype=torch.bfloat16)

# 2 sequences of 128 ids drawn from [0, vocab_size)
dummy_input = torch.randint(0, LLAMA32_CONFIG["vocab_size"], (2, 128), device=device)

with torch.no_grad():  # forward pass only, no autograd graph
    logits = model(dummy_input)

print("Output shape:", logits.shape)


Output shape: torch.Size([2, 128, 128256])


# Tokenizer from Hugging Face

In [None]:
from transformers import AutoTokenizer

import os

model_name = "meta-llama/Llama-3.2-1B"  # or Llama-3.2-1B-Instruct

# `use_auth_token` is deprecated in transformers; `token` is the replacement
# (and is what the later tokenizer cell in this notebook already uses).
# The access token is read from the HF_TOKEN environment variable so that no
# credential is stored in the notebook. When it is unset, None is passed and
# the library's own default credential lookup applies.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    token=os.environ.get("HF_TOKEN") or None,
)

print("Tokenizer loaded!")


# Encode a sample sentence into tokens and decode it back to text

In [None]:
text = "Hello world! This is a test of the LLaMA 3.2 tokenizer."

# Tokenize into a batch of one sequence, returned as PyTorch tensors.
encoded = tokenizer(text, return_tensors="pt")
print("Token IDs:", encoded["input_ids"])

# Map the ids of that single sequence back to text, dropping special tokens.
decoded = tokenizer.decode(encoded["input_ids"][0], skip_special_tokens=True)
print("Decoded text:", decoded)


Token IDs: tensor([[128000,   9906,   1917,      0,   1115,    374,    264,   1296,    315,
            279,    445,   8921,   4940,    220,     18,     13,     17,  47058,
             13]])
Decoded text: Hello world! This is a test of the LLaMA 3.2 tokenizer.


## tokenizer test for edge cases

In [None]:
from transformers import AutoTokenizer

import os

# Read the access token from the environment instead of keeping it in the
# notebook; None lets the library fall back to its default credential lookup.
TOKEN = os.environ.get("HF_TOKEN") or None
model_name = "meta-llama/Llama-3.2-1B"

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    token=TOKEN
)

# Explicitly set PAD = EOS (both token and id); padding=True below needs a
# pad token to be defined.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = "right"

# ---- Edge case batch test ----
# Empty string, single space, plain word, and the rocket emoji (U+1F680).
# The emoji is written as an escape so the input does not depend on how the
# notebook file is encoded; the recorded output below (three ids after BOS)
# corresponds to this single 4-byte character.
texts = ["", " ", "Hello", "\U0001F680"]

encoded_batch = tokenizer(
    texts,
    return_tensors="pt",
    padding=True,
    truncation=True
)

decoded_batch = [
    tokenizer.decode(ids, skip_special_tokens=True)
    for ids in encoded_batch["input_ids"]
]

print("pad_token:", tokenizer.pad_token)
print("pad_token_id:", tokenizer.pad_token_id)
print("input_ids:\n", encoded_batch["input_ids"])
print("decoded:", decoded_batch)


pad_token: <|end_of_text|>
pad_token_id: 128001
input_ids:
 tensor([[128000, 128001, 128001, 128001],
        [128000,    220, 128001, 128001],
        [128000,   9906, 128001, 128001],
        [128000,   9468,    248,    222]])
decoded: ['', ' ', 'Hello', 'üöÄ']


### Access to the Llama 3.2 weights + simple test model prediction

In [None]:
!pip install transformers accelerate --quiet


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

import os

# Hugging Face token, taken from the environment so it is never saved in the
# notebook. None lets the library use its default credential lookup.
HF_TOKEN = os.environ.get("HF_TOKEN") or None
MODEL_NAME = "meta-llama/Llama-3.2-3B-Instruct"

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)

# Load tokenizer (`token` replaces the deprecated `use_auth_token` argument)
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    token=HF_TOKEN
)

# Ensure a pad token is defined for batch handling
tokenizer.pad_token = tokenizer.eos_token

# Load model
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    token=HF_TOKEN,
    torch_dtype=torch.bfloat16  # use BF16 on GPUs that support it
).to(device)

# Inference mode for the prediction cells that follow
model.eval()
print("Model and tokenizer loaded!")


## Model prediction

In [None]:
# Tokenize the prompt as PyTorch tensors and move them to the model's device.
input_text = "Once upon a time,"
inputs = tokenizer(input_text, return_tensors="pt")
inputs = inputs.to(device)


In [None]:
# One greedy step: take the logits at the last position and pick the
# highest-scoring vocabulary entry.
with torch.no_grad():
    logits = model(**inputs).logits
    next_token_logits = logits[:, -1, :]
    next_token_id = next_token_logits.argmax(dim=-1)

predicted_token = tokenizer.decode(next_token_id)
print("Next token prediction:", predicted_token)


Next token prediction:  in


In [None]:
def predict_next_tokens(prompt, max_new_tokens=10):
    """
    Generate tokens iteratively using greedy decoding.

    Uses the notebook-level ``tokenizer``, ``model`` and ``device``. The full
    sequence is re-run through the model at every step.

    Args:
        prompt (str): Input text.
        max_new_tokens (int): Maximum number of tokens to predict. Decoding
            stops earlier as soon as the tokenizer's end-of-sequence token is
            produced, so nothing is generated past the end of the text.

    Returns:
        str: Generated text (prompt + new tokens), decoded with special
        tokens skipped.
    """
    # Encode prompt
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
    eos_token_id = getattr(tokenizer, "eos_token_id", None)

    with torch.no_grad():
        for _ in range(max_new_tokens):
            logits = model(input_ids).logits  # [batch, seq_len, vocab_size]
            next_token_id = torch.argmax(logits[:, -1, :], dim=-1)  # greedy
            # Append predicted token
            input_ids = torch.cat([input_ids, next_token_id.unsqueeze(-1)], dim=-1)
            # Stop once the model signals end-of-sequence
            if eos_token_id is not None and bool((next_token_id == eos_token_id).all()):
                break

    # Decode the entire sequence
    return tokenizer.decode(input_ids[0], skip_special_tokens=True)


In [None]:
# Greedy-decode up to 20 new tokens after the prompt and show the full text.
prompt = "Once upon a time,"
generated = predict_next_tokens(prompt=prompt, max_new_tokens=20)
print("Generated text:\n", generated)


Generated text:
 Once upon a time, in a small village nestled in the rolling hills of the countryside, there lived a young girl named Sophia


## Prepare a small dataset

In [None]:
from torch.utils.data import DataLoader, Dataset

class SmallDataset(Dataset):
    """In-memory dataset of token-id tensors, one per input text.

    Every text is tokenized up front with truncation and ``"max_length"``
    padding, so each item is a 1-D tensor built from the tokenizer's
    ``"input_ids"`` for that text.

    Args:
        texts: Iterable of strings to tokenize.
        tokenizer: Callable returning a mapping with an ``"input_ids"`` entry.
        max_length: Length passed to the tokenizer for truncation/padding.
    """

    def __init__(self, texts, tokenizer, max_length=64):
        self.tokenizer = tokenizer
        self.examples = [
            torch.tensor(
                tokenizer(
                    t, truncation=True, max_length=max_length, padding="max_length"
                )["input_ids"]
            )
            for t in texts
        ]

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        return self.examples[idx]

# Sample dataset
# Four short sentences used as a toy fine-tuning corpus.
texts = [
    "Once upon a time, there was a brave knight.",
    "The stock market showed significant gains today.",
    "Machine learning can predict trends in healthcare.",
    "Artificial intelligence is transforming science.",
]

# Tokenize with the dataset's default max_length, then batch two examples at
# a time in shuffled order.
dataset = SmallDataset(texts, tokenizer)
dataloader = DataLoader(dataset=dataset, batch_size=2, shuffle=True)


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from torch.optim import AdamW   # <-- correct
import torch


In [None]:
import os

# Access token from the environment (never stored in the notebook); None lets
# the library use its default credential lookup.
HF_TOKEN = os.environ.get("HF_TOKEN") or None
MODEL_NAME = "meta-llama/Llama-3.2-3B-Instruct"
device = "cuda" if torch.cuda.is_available() else "cpu"

# `token` replaces the deprecated `use_auth_token` argument.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=HF_TOKEN)
# Reuse EOS as the pad token so padded batches can be built.
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    token=HF_TOKEN,
    torch_dtype=torch.bfloat16
).to(device)

# Switch to training mode for the fine-tuning loop that follows.
model.train()

# Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)


In [None]:
# Fine-tune for a fixed number of epochs and report the mean batch loss.
num_epochs = 20
for epoch in range(num_epochs):
    total_loss = 0.0
    for batch in dataloader:
        batch = batch.to(device)

        # Every example is padded to max_length with the pad token. Without
        # masking, the loss is dominated by "predict the next pad token",
        # which is trivially learnable and makes the reported loss
        # misleadingly small. Padding positions are therefore excluded from
        # attention and given the label -100, which the model's loss ignores.
        # NOTE: pad == EOS in this notebook, so any EOS inside the text would
        # be masked as well.
        attention_mask = (batch != tokenizer.pad_token_id).long()
        labels = batch.masked_fill(attention_mask == 0, -100)

        optimizer.zero_grad()

        outputs = model(input_ids=batch, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        total_loss += loss.item()

    avg_loss = total_loss / len(dataloader)
    print(f"Epoch {epoch+1}/{num_epochs} - Loss: {avg_loss:.4f}")


Epoch 1/20 - Loss: 3.6103
Epoch 2/20 - Loss: 0.2791
Epoch 3/20 - Loss: 0.1399
Epoch 4/20 - Loss: 0.1112
Epoch 5/20 - Loss: 0.0895
Epoch 6/20 - Loss: 0.0777
Epoch 7/20 - Loss: 0.0679
Epoch 8/20 - Loss: 0.0585
Epoch 9/20 - Loss: 0.0494
Epoch 10/20 - Loss: 0.0371
Epoch 11/20 - Loss: 0.0309
Epoch 12/20 - Loss: 0.0238
Epoch 13/20 - Loss: 0.0258
Epoch 14/20 - Loss: 0.0268
Epoch 15/20 - Loss: 0.0264
Epoch 16/20 - Loss: 0.0265
Epoch 17/20 - Loss: 0.0266
Epoch 18/20 - Loss: 0.0252
Epoch 19/20 - Loss: 0.0254
Epoch 20/20 - Loss: 0.0243


## Weights downloaded and loaded into model.

In [None]:
!pip install huggingface_hub --quiet
!huggingface-cli login


In [None]:
from huggingface_hub import snapshot_download

repo_id = "meta-llama/Llama-3.2-3B-Instruct"
local_dir = "./llama32_weights"

# `token=True` uses the credential saved by the login step above;
# `use_auth_token` is the deprecated spelling of this argument.
# The call is the cell's last expression so the local path is displayed.
snapshot_download(repo_id, local_dir=local_dir, token=True)


Fetching 16 files:   0%|          | 0/16 [00:00<?, ?it/s]

'/content/llama32_weights'

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"

# Tokenizer from the downloaded snapshot; EOS doubles as the pad token.
tokenizer = AutoTokenizer.from_pretrained(local_dir)
tokenizer.pad_token = tokenizer.eos_token

# Model weights from the same directory, loaded in bfloat16 and then moved
# to the selected device.
model = AutoModelForCausalLM.from_pretrained(local_dir, torch_dtype=torch.bfloat16)
model = model.to(device)

# Inference mode; as the last expression this also displays the module tree.
model.eval()


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 3072)
    (layers): ModuleList(
      (0-27): 28 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=3072, out_features=3072, bias=False)
          (k_proj): Linear(in_features=3072, out_features=1024, bias=False)
          (v_proj): Linear(in_features=3072, out_features=1024, bias=False)
          (o_proj): Linear(in_features=3072, out_features=3072, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=3072, out_features=8192, bias=False)
          (up_proj): Linear(in_features=3072, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=3072, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((3072,), eps=1e-05)
    (

In [None]:
# Single greedy step with the locally loaded weights.
prompt = "Once upon a time,"
inputs = tokenizer(prompt, return_tensors="pt")
inputs = inputs.to(device)

with torch.no_grad():
    outputs = model(**inputs)
    # highest-scoring vocabulary entry at the last position
    next_token_id = outputs.logits[:, -1, :].argmax(dim=-1)

print("Next token:", tokenizer.decode(next_token_id))


Next token:  in


In [None]:
def predict_next_tokens(prompt, max_new_tokens=10):
    """
    Generate tokens iteratively using greedy decoding.

    Uses the notebook-level ``tokenizer``, ``model`` and ``device``, so it
    picks up whichever model was loaded most recently. The full sequence is
    re-run through the model at every step.

    Args:
        prompt (str): Input text.
        max_new_tokens (int): Maximum number of tokens to predict. Decoding
            stops earlier as soon as the tokenizer's end-of-sequence token is
            produced, so nothing is generated past the end of the text.

    Returns:
        str: Generated text (prompt + new tokens), decoded with special
        tokens skipped.
    """
    # Encode prompt
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
    eos_token_id = getattr(tokenizer, "eos_token_id", None)

    with torch.no_grad():
        for _ in range(max_new_tokens):
            logits = model(input_ids).logits  # [batch, seq_len, vocab_size]
            next_token_id = torch.argmax(logits[:, -1, :], dim=-1)  # greedy
            # Append predicted token
            input_ids = torch.cat([input_ids, next_token_id.unsqueeze(-1)], dim=-1)
            # Stop once the model signals end-of-sequence
            if eos_token_id is not None and bool((next_token_id == eos_token_id).all()):
                break

    # Decode the entire sequence
    return tokenizer.decode(input_ids[0], skip_special_tokens=True)


In [None]:
# Same greedy generation as before, now with the locally loaded weights.
prompt = "Once upon a time,"
generated = predict_next_tokens(prompt=prompt, max_new_tokens=20)
print("Generated text:\n", generated)


Generated text:
 Once upon a time, in a small village nestled in the rolling hills of the countryside, there lived a young girl named Sophia
