In [None]:
# Google Colab Setup
import sys
import os

IN_COLAB = 'google.colab' in sys.modules

if IN_COLAB:
    print("üîß Running in Google Colab - Setting up environment...")
    if not os.path.exists('transformer_from_scratch'):
        print("üì• Cloning repository...")
        !git clone https://github.com/melhzy/transformer_from_scratch.git
        print("‚úÖ Repository cloned!")
    os.chdir('transformer_from_scratch')
    print("üì¶ Installing dependencies...")
    !pip install -q torch torchvision matplotlib seaborn numpy pandas tqdm tensorboard
    print("‚úÖ Dependencies installed!")
    if '/content/transformer_from_scratch' not in sys.path:
        sys.path.insert(0, '/content/transformer_from_scratch')
    print("‚úÖ Setup complete!")
else:
    print("üíª Running locally - no setup needed.")

In [None]:
# Import libraries
import sys
import os
from pathlib import Path
from typing import Dict, List, Optional, Tuple
import json
import time
from dataclasses import dataclass
import math

if not IN_COLAB:
    # Running locally: make the repository root (parent of the notebook's
    # working directory) importable. Guarded so re-running the cell does not
    # keep prepending duplicate entries.
    project_root = str(Path.cwd().parent)
    if project_root not in sys.path:
        sys.path.insert(0, project_root)

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingLR, LinearLR, SequentialLR
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from tqdm import tqdm

# Import our implementations
from src.transformer import Transformer
from src.modules.embeddings import TokenEmbedding, PositionalEncoding

# Plot defaults used by every figure in this notebook.
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

# Seed the random number generators so weight initialisation, dropout and
# DataLoader shuffling are repeatable across "Restart & Run All".
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

# Prefer the GPU when one is available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"✅ Device: {device}")
print(f"✅ PyTorch version: {torch.__version__}")

if device.type == 'cuda':
    print(f"✅ GPU: {torch.cuda.get_device_name(0)}")
    print(f"✅ GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

## 1. Setup: Model, Data, and LoRA 🛠️

We'll use simplified versions of our implementations for faster training on Colab.

In [None]:
# Re-use LoRA implementations from Tutorial 2
class LoRALayer(nn.Module):
    """Linear layer with a frozen base projection plus a trainable low-rank update.

    The output is ``linear(x, weight, bias) + scaling * B(A(dropout(x)))`` where
    ``scaling = alpha / rank``. ``weight`` and ``bias`` do not require gradients;
    only ``lora_A`` and ``lora_B`` are trainable.

    Args:
        in_features: Size of the last dimension of the input.
        out_features: Size of the last dimension of the output.
        rank: Inner dimension of the low-rank factors.
        alpha: Numerator of the scaling factor applied to the low-rank branch.
        dropout: Dropout probability on the input of the low-rank branch;
            values <= 0 disable dropout.
    """

    def __init__(self, in_features: int, out_features: int, rank: int = 8, alpha: float = 16.0, dropout: float = 0.0):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.rank = rank
        self.alpha = alpha
        self.scaling = alpha / rank

        # Frozen base projection (random weight, zero bias until overwritten).
        base_weight = torch.randn(out_features, in_features)
        base_bias = torch.zeros(out_features)
        self.weight = nn.Parameter(base_weight, requires_grad=False)
        self.bias = nn.Parameter(base_bias, requires_grad=False)

        # Trainable low-rank factors; real initial values come from reset_parameters().
        self.lora_A = nn.Parameter(torch.zeros(rank, in_features))
        self.lora_B = nn.Parameter(torch.zeros(out_features, rank))

        if dropout > 0:
            self.dropout = nn.Dropout(dropout)
        else:
            self.dropout = nn.Identity()

        self.reset_parameters()

    def reset_parameters(self):
        """Kaiming-uniform init for A, zeros for B, so the low-rank update starts at zero."""
        nn.init.zeros_(self.lora_B)
        nn.init.kaiming_uniform_(self.lora_A, a=math.sqrt(5))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the frozen projection of ``x`` plus the scaled low-rank correction."""
        base_out = F.linear(x, self.weight, self.bias)
        down_projected = F.linear(self.dropout(x), self.lora_A)
        up_projected = F.linear(down_projected, self.lora_B)
        return base_out + up_projected * self.scaling


# Simple tokenizer from Tutorial 3
from collections import Counter

class SimpleTokenizer:
    """Lower-casing whitespace tokenizer with a frequency-ranked vocabulary (Tutorial 3).

    IDs 0-3 are reserved, in order, for ``<PAD>``, ``<UNK>``, ``<BOS>`` and ``<EOS>``.
    """

    def __init__(self, vocab_size: int = 10000):
        self.vocab_size = vocab_size
        self.pad_token = "<PAD>"
        self.unk_token = "<UNK>"
        self.bos_token = "<BOS>"
        self.eos_token = "<EOS>"

        specials = (self.pad_token, self.unk_token, self.bos_token, self.eos_token)
        self.token2id = {token: index for index, token in enumerate(specials)}
        self.id2token = {index: token for token, index in self.token2id.items()}
        self.next_id = 4

    def build_vocab(self, texts: List[str]):
        """Add the most frequent words of ``texts`` to the vocabulary.

        At most ``vocab_size - 4`` candidate words are considered, most frequent
        first; words already present keep their existing ID.
        """
        word_freq = Counter(word for text in texts for word in text.lower().split())

        for word, _count in word_freq.most_common(self.vocab_size - 4):
            if word in self.token2id:
                continue
            assigned = self.next_id
            self.token2id[word] = assigned
            self.id2token[assigned] = word
            self.next_id = assigned + 1

    def encode(self, text: str, add_special_tokens: bool = True) -> List[int]:
        """Map ``text`` to token IDs; unknown words become the ``<UNK>`` ID.

        With ``add_special_tokens`` the result is wrapped in ``<BOS>`` ... ``<EOS>``.
        """
        unk_id = self.token2id[self.unk_token]
        ids = [self.token2id.get(word, unk_id) for word in text.lower().split()]
        if not add_special_tokens:
            return ids
        return [self.token2id[self.bos_token], *ids, self.token2id[self.eos_token]]

    def decode(self, ids: List[int], skip_special_tokens: bool = True) -> str:
        """Join the tokens for ``ids`` with spaces.

        IDs missing from the vocabulary render as ``<UNK>``. When
        ``skip_special_tokens`` is set, ``<PAD>``, ``<BOS>`` and ``<EOS>`` are
        dropped (``<UNK>`` is kept).
        """
        special_ids = {
            self.token2id[token]
            for token in (self.pad_token, self.bos_token, self.eos_token)
        }
        kept = (
            token_id for token_id in ids
            if not (skip_special_tokens and token_id in special_ids)
        )
        return " ".join(self.id2token.get(token_id, self.unk_token) for token_id in kept)

    @property
    def pad_token_id(self):
        """ID of the padding token."""
        return self.token2id[self.pad_token]

    @property
    def eos_token_id(self):
        """ID of the end-of-sequence token."""
        return self.token2id[self.eos_token]


# Confirmation that the helper classes above were defined (garbled emoji repaired).
print("✅ Helper classes loaded!")

## 2. Prepare Training Data 📚

Create a small instruction dataset for demonstration.

In [None]:
# Instruction dataset (expanded for better training)
# Each record has an "instruction", an "input" and the target "output".
# The Spanish targets are stored as proper UTF-8 text; the previous values were
# mis-decoded bytes, which would have trained the model on garbage tokens.
instruction_data = [
    {"instruction": "Translate to French", "input": "Hello", "output": "Bonjour"},
    {"instruction": "Translate to French", "input": "Thank you", "output": "Merci"},
    {"instruction": "Translate to Spanish", "input": "Good morning", "output": "Buenos días"},
    {"instruction": "Translate to Spanish", "input": "How are you", "output": "Cómo estás"},
    {"instruction": "Answer the question", "input": "What is 2+2?", "output": "2+2 equals 4"},
    {"instruction": "Answer the question", "input": "What is the capital of France?", "output": "The capital of France is Paris"},
    {"instruction": "Summarize", "input": "The quick brown fox jumps over the lazy dog", "output": "A fox jumps over a dog"},
    {"instruction": "Complete the sentence", "input": "The weather today is", "output": "The weather today is sunny and warm"},
    {"instruction": "Explain", "input": "What is AI?", "output": "AI stands for Artificial Intelligence, which enables machines to learn and perform tasks"},
    {"instruction": "Explain", "input": "What is Python?", "output": "Python is a popular programming language known for its simplicity and versatility"},
]

# Build tokenizer
# Collect every piece of text the model will see so each word gets its own ID.
all_texts = []
for item in instruction_data:
    all_texts.extend([item['instruction'], item['input'], item['output']])

# InstructionDataset wraps each example as "Instruction: ... Input: ... Output:".
# Those marker words do not occur in the raw fields, so without this line all
# three would be encoded as <UNK> and be indistinguishable to the model.
all_texts.append("Instruction: Input: Output:")

tokenizer = SimpleTokenizer(vocab_size=500)
tokenizer.build_vocab(all_texts)

print(f"✅ Tokenizer built: {len(tokenizer.token2id)} tokens")
print(f"✅ Training examples: {len(instruction_data)}")

In [None]:
# Dataset class from Tutorial 3
class InstructionDataset(Dataset):
    """Turns instruction records into causal-LM training pairs (Tutorial 3).

    Each item is laid out as ``<BOS> prompt completion <EOS>``. The labels copy
    the completion and the final ``<EOS>``; the ``<BOS>`` and prompt positions
    are set to -100 so the loss ignores them. Both sequences are cut to
    ``max_length`` entries.
    """

    def __init__(self, data: List[Dict], tokenizer: SimpleTokenizer, max_length: int = 128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``{'input_ids', 'labels'}`` as 1-D long tensors for record ``idx``."""
        record = self.data[idx]
        tok = self.tokenizer

        prompt = f"Instruction: {record['instruction']} Input: {record['input']} Output:"
        prompt_ids = tok.encode(prompt, add_special_tokens=False)
        completion_ids = tok.encode(record['output'], add_special_tokens=False)

        bos_id = tok.token2id[tok.bos_token]
        eos_id = tok.token2id[tok.eos_token]
        input_ids = [bos_id, *prompt_ids, *completion_ids, eos_id]

        # One ignored label for <BOS> plus one per prompt token.
        ignored_prefix = [-100] * (len(prompt_ids) + 1)
        labels = ignored_prefix + completion_ids + [tok.eos_token_id]

        # Slicing leaves sequences that already fit untouched.
        input_ids = input_ids[:self.max_length]
        labels = labels[:self.max_length]

        return {
            'input_ids': torch.tensor(input_ids, dtype=torch.long),
            'labels': torch.tensor(labels, dtype=torch.long)
        }


@dataclass
class DataCollator:
    """Pads a list of dataset items into one rectangular batch.

    Sequences are right-padded to the longest example in the batch, capped at
    ``max_length``. Inputs are padded with the tokenizer's pad ID, labels with
    -100 (ignored by the loss), and an attention mask marks real tokens with 1.
    """
    tokenizer: SimpleTokenizer
    max_length: int = 128

    def __call__(self, examples: List[Dict]) -> Dict[str, torch.Tensor]:
        """Collate ``examples`` (dicts with 1-D ``input_ids`` and ``labels`` tensors).

        Returns:
            Dict with ``input_ids``, ``labels`` and ``attention_mask``, each of
            shape ``(len(examples), batch_max)``.

        Raises:
            ValueError: if ``examples`` is empty.
        """
        if not examples:
            raise ValueError("DataCollator received an empty batch")

        batch_max = min(max(len(ex['input_ids']) for ex in examples), self.max_length)

        input_ids, labels, attention_mask = [], [], []
        for ex in examples:
            # Truncate anything longer than the cap; otherwise the padding
            # length below would be negative and torch.full would raise.
            seq = ex['input_ids'][:batch_max]
            seq_labels = ex['labels'][:batch_max]
            seq_len = len(seq)
            padding_len = batch_max - seq_len

            padded_input = torch.cat([
                seq,
                torch.full((padding_len,), self.tokenizer.pad_token_id, dtype=torch.long)
            ])
            input_ids.append(padded_input)

            padded_labels = torch.cat([
                seq_labels,
                torch.full((padding_len,), -100, dtype=torch.long)
            ])
            labels.append(padded_labels)

            mask = torch.cat([
                torch.ones(seq_len, dtype=torch.long),
                torch.zeros(padding_len, dtype=torch.long)
            ])
            attention_mask.append(mask)

        return {
            'input_ids': torch.stack(input_ids),
            'labels': torch.stack(labels),
            'attention_mask': torch.stack(attention_mask)
        }


# Create dataset and dataloader
# Batches of two examples, reshuffled every epoch and padded by the collator.
train_dataset = InstructionDataset(instruction_data, tokenizer, max_length=128)
collator = DataCollator(tokenizer, max_length=128)
train_dataloader = DataLoader(train_dataset, batch_size=2, collate_fn=collator, shuffle=True)

print(f"✅ DataLoader created: {len(train_dataloader)} batches")

# Test batch: pull one batch to sanity-check the padded shapes.
sample_batch = next(iter(train_dataloader))
print(f"  Batch input_ids shape: {sample_batch['input_ids'].shape}")
print(f"  Batch labels shape: {sample_batch['labels'].shape}")

## 3. Create Model with LoRA 🤖

We'll create a small Transformer and apply LoRA to its attention layers.

In [None]:
# Small model configuration for Colab
# The vocabulary size follows the tokenizer built above.
config = dict(
    vocab_size=len(tokenizer.token2id),
    d_model=128,
    n_heads=4,
    n_layers=2,
    d_ff=512,
    dropout=0.1,
    max_seq_len=128,
)

# Create base model: source and target share one vocabulary; every other
# hyperparameter is forwarded from `config` under the same name.
model = Transformer(
    src_vocab_size=config['vocab_size'],
    tgt_vocab_size=config['vocab_size'],
    **{
        name: config[name]
        for name in ('d_model', 'n_heads', 'n_layers', 'd_ff', 'dropout', 'max_seq_len')
    }
)
model = model.to(device)

# Freeze all parameters of the base model; only parameters added afterwards
# can be trained.
model.requires_grad_(False)

# Apply LoRA to attention projections
lora_alpha = 16.0
lora_rank = 8

def apply_lora_to_attention(model, rank=8, alpha=16.0):
    """Replace the Q and V projections of every attention module with LoRA layers.

    A module is treated as an attention block when it has a ``W_q`` attribute
    that is an ``nn.Linear``; its ``W_q`` and ``W_v`` are swapped in place for
    ``LoRALayer`` instances that carry a copy of the original weight and bias.

    Args:
        model: Module to modify in place.
        rank: LoRA rank passed to each ``LoRALayer``.
        alpha: LoRA alpha passed to each ``LoRALayer``.
    """
    def _wrap(linear):
        # Build the LoRA replacement for one projection.
        lora = LoRALayer(linear.in_features, linear.out_features, rank, alpha)
        lora.weight.data = linear.weight.data.clone()
        if linear.bias is not None:
            lora.bias.data = linear.bias.data.clone()
        # LoRALayer allocates its tensors on the CPU. Move the whole layer to
        # where the projection lives; otherwise lora_A / lora_B (and a default
        # bias) stay on the CPU and the forward pass fails on a GPU model.
        return lora.to(device=linear.weight.device, dtype=linear.weight.dtype)

    # Snapshot the module list: attributes are replaced while walking it.
    for module in list(model.modules()):
        if hasattr(module, 'W_q') and isinstance(module.W_q, nn.Linear):
            module.W_q = _wrap(module.W_q)
            module.W_v = _wrap(module.W_v)

apply_lora_to_attention(model, lora_rank, lora_alpha)

# Count parameters: everything vs. the subset that still requires gradients.
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

print("✅ Model created and LoRA applied!")
print(f"  Total parameters: {total_params:,}")
print(f"  Trainable (LoRA): {trainable_params:,}")
print(f"  Trainable ratio: {trainable_params/total_params*100:.2f}%")

## 4. Training Configuration 🎓

Set up optimizer, scheduler, and training hyperparameters.

### Learning Rate Schedule:

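We use **linear warmup + cosine decay**, a schedule widely used for Transformer training (the original Transformer paper introduced the warmup phase, followed there by inverse-square-root decay):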

$$\text{lr}(t) = \begin{cases}
\text{lr}_{\text{max}} \cdot \frac{t}{T_{\text{warmup}}} & t < T_{\text{warmup}} \\
\text{lr}_{\text{min}} + (\text{lr}_{\text{max}} - \text{lr}_{\text{min}}) \cdot \frac{1 + \cos(\pi \cdot \frac{t - T_{\text{warmup}}}{T_{\text{total}} - T_{\text{warmup}}})}{2} & t \geq T_{\text{warmup}}
\end{cases}$$

In [None]:
# Training configuration
training_config = {
    'num_epochs': 20,
    'learning_rate': 1e-3,
    'weight_decay': 0.01,
    'warmup_steps': 20,
    'gradient_accumulation_steps': 2,
    'max_grad_norm': 1.0,
    'log_interval': 5,
}

# Optimizer (only LoRA parameters)
optimizer = AdamW(
    [p for p in model.parameters() if p.requires_grad],
    lr=training_config['learning_rate'],
    weight_decay=training_config['weight_decay'],
    betas=(0.9, 0.999)
)

# Learning rate scheduler (warmup + cosine)
# The training loop steps the scheduler once per optimizer update, i.e. once per
# `gradient_accumulation_steps` batches, so the schedule length must be counted
# in updates rather than batches. Rounded up so the schedule is never shorter
# than the run (a cosine schedule stepped past T_max starts rising again).
updates_per_epoch = math.ceil(
    len(train_dataloader) / training_config['gradient_accumulation_steps']
)
total_steps = updates_per_epoch * training_config['num_epochs']
warmup_scheduler = LinearLR(
    optimizer,
    start_factor=0.01,
    end_factor=1.0,
    total_iters=training_config['warmup_steps']
)
cosine_scheduler = CosineAnnealingLR(
    optimizer,
    # Keep T_max positive even when warmup covers the whole run.
    T_max=max(1, total_steps - training_config['warmup_steps']),
    eta_min=training_config['learning_rate'] * 0.1
)
scheduler = SequentialLR(
    optimizer,
    schedulers=[warmup_scheduler, cosine_scheduler],
    milestones=[training_config['warmup_steps']]
)

print("✅ Training configuration:")
print(f"  Epochs: {training_config['num_epochs']}")
print(f"  Learning rate: {training_config['learning_rate']}")
print(f"  Warmup steps: {training_config['warmup_steps']}")
print(f"  Total steps: {total_steps}")
print(f"  Gradient accumulation: {training_config['gradient_accumulation_steps']}")

## 5. Training Loop 🚀

Implement complete training loop with:
- Gradient accumulation
- Gradient clipping
- Learning rate scheduling
- Loss tracking

In [None]:
def train_epoch(model, dataloader, optimizer, scheduler, config, epoch):
    """Train for one epoch with gradient accumulation and gradient clipping.

    Args:
        model: Model called as ``model(src, tgt)`` and returning logits.
        dataloader: Yields dicts with ``input_ids`` and ``labels`` tensors.
        optimizer: Optimizer over the trainable parameters.
        scheduler: LR scheduler, stepped once per optimizer update.
        config: Dict with ``gradient_accumulation_steps``, ``max_grad_norm``
            and ``log_interval``.
        epoch: Zero-based epoch index (used for the progress-bar label).

    Returns:
        Mean unscaled loss per batch (0.0 for an empty dataloader).
    """
    model.train()
    total_loss = 0.0
    accum_steps = config['gradient_accumulation_steps']
    num_batches = len(dataloader)
    trainable_params = [p for p in model.parameters() if p.requires_grad]
    optimizer.zero_grad()

    pbar = tqdm(enumerate(dataloader), total=num_batches, desc=f"Epoch {epoch+1}")

    for step, batch in pbar:
        input_ids = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass (language modeling)
        # For decoder-only: use input_ids as both src and tgt
        # NOTE(review): src is passed unmasked and batch['attention_mask'] is
        # never used. Whether the model applies causal/padding masks internally
        # is not visible here - confirm, otherwise the encoder side can see the
        # tokens being predicted.
        src = input_ids[:, :-1]  # Remove last token
        tgt = input_ids[:, :-1]  # Same as src for decoder-only

        logits = model(src, tgt)

        # Compute loss: position t predicts token t+1, hence the shifted labels.
        logits_flat = logits.reshape(-1, logits.size(-1))
        labels_flat = labels[:, 1:].reshape(-1)  # Shift labels

        loss = F.cross_entropy(logits_flat, labels_flat, ignore_index=-100)
        total_loss += loss.item()

        # Scale loss for gradient accumulation
        (loss / accum_steps).backward()

        # Update weights every N steps, and also on the final batch: when the
        # number of batches is not a multiple of N the trailing gradients would
        # otherwise be thrown away by the zero_grad() of the next epoch. (That
        # last, partial group keeps the 1/N scaling, so its update is smaller.)
        is_last_batch = (step + 1) == num_batches
        if (step + 1) % accum_steps == 0 or is_last_batch:
            # Gradient clipping
            torch.nn.utils.clip_grad_norm_(trainable_params, config['max_grad_norm'])

            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

        # Update progress bar
        if (step + 1) % config['log_interval'] == 0:
            avg_loss = total_loss / (step + 1)
            current_lr = scheduler.get_last_lr()[0]
            pbar.set_postfix({'loss': f'{avg_loss:.4f}', 'lr': f'{current_lr:.6f}'})

    return total_loss / max(num_batches, 1)


# Training loop
# Runs every epoch and records the mean loss and the learning rate in effect
# at the end of each epoch for the plots below.
print("\n🚀 Starting training...\n")
history = {'loss': [], 'lr': []}

for epoch in range(training_config['num_epochs']):
    epoch_loss = train_epoch(model, train_dataloader, optimizer, scheduler, training_config, epoch)
    current_lr = scheduler.get_last_lr()[0]

    history['loss'].append(epoch_loss)
    history['lr'].append(current_lr)

    print(f"Epoch {epoch+1}/{training_config['num_epochs']} - Loss: {epoch_loss:.4f}, LR: {current_lr:.6f}")

print("\n✅ Training complete!")

## 6. Visualize Training 📊

In [None]:
# Two side-by-side panels: per-epoch loss (left) and learning rate (right).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))
epochs = range(1, len(history['loss']) + 1)

panels = [
    # (axis, history key, line colour, y-axis label, panel title)
    (ax1, 'loss', 'blue', 'Loss', 'Training Loss'),
    (ax2, 'lr', 'orange', 'Learning Rate', 'Learning Rate Schedule'),
]
for axis, series, colour, y_label, panel_title in panels:
    axis.plot(epochs, history[series], marker='o', linewidth=2, markersize=6, color=colour)
    axis.set_xlabel('Epoch', fontsize=12)
    axis.set_ylabel(y_label, fontsize=12)
    axis.set_title(panel_title, fontsize=14, fontweight='bold')
    axis.grid(alpha=0.3)

# Learning rates are small, so show that axis in scientific notation.
ax2.ticklabel_format(axis='y', style='scientific', scilimits=(0,0))

plt.tight_layout()
plt.show()

# Summary: last-epoch loss and its relative drop from the first epoch.
print(f"Final loss: {history['loss'][-1]:.4f}")
print(f"Loss reduction: {(1 - history['loss'][-1]/history['loss'][0])*100:.1f}%")

## 7. Generate Text with Fine-Tuned Model 🎨

Test our fine-tuned model with greedy decoding.

In [None]:
def generate_text(
    model,
    tokenizer,
    prompt: str,
    max_length: int = 50,
    temperature: float = 1.0
) -> str:
    """
    Greedily generate a continuation of `prompt` with the fine-tuned model.

    Args:
        model: Fine-tuned transformer, called as `model(src, tgt)`
        tokenizer: Tokenizer providing `encode`, `decode` and `eos_token_id`
        prompt: Input prompt
        max_length: Maximum number of new tokens to generate
        temperature: Positive divisor applied to the logits. Decoding always
            takes the argmax, so any positive value selects the same token;
            the parameter is kept for interface compatibility.

    Returns:
        The prompt followed by the generated tokens, decoded with
        `skip_special_tokens=True`.

    Raises:
        ValueError: If `temperature` is not positive.
    """
    if temperature <= 0:
        raise ValueError(f"temperature must be positive, got {temperature}")

    model.eval()

    # Encode prompt. Training sequences are laid out as <BOS> prompt completion
    # <EOS>, so the prompt must stay open-ended: drop the <EOS> that encode()
    # appends, otherwise the model is conditioned on an already finished sequence.
    generated = tokenizer.encode(prompt, add_special_tokens=True)
    if generated and generated[-1] == tokenizer.eos_token_id:
        generated.pop()

    with torch.no_grad():
        for _ in range(max_length):
            # Feed the whole sequence generated so far; the logits at the last
            # position are the prediction for the next token.
            input_tensor = torch.tensor([generated], dtype=torch.long).to(device)

            # Forward pass (same src == tgt convention as in training)
            logits = model(input_tensor, input_tensor)

            # Get next token prediction
            next_token_logits = logits[0, -1, :] / temperature
            next_token_id = torch.argmax(next_token_logits).item()

            # Stop at EOS
            if next_token_id == tokenizer.eos_token_id:
                break

            # Append to generated sequence
            generated.append(next_token_id)

    # Decode
    return tokenizer.decode(generated, skip_special_tokens=True)


# Test generation on training examples
# These prompts are formatted exactly like the training prompts.
print("\n🎨 Testing generation...\n")

test_prompts = [
    "Instruction: Translate to French Input: Hello Output:",
    "Instruction: Answer the question Input: What is 2+2? Output:",
    "Instruction: Translate to Spanish Input: Good morning Output:",
]

for prompt in test_prompts:
    generated = generate_text(model, tokenizer, prompt, max_length=20)
    print(f"Prompt: {prompt}")
    print(f"Generated: {generated}")
    print("-" * 80)

## 8. Save and Load Checkpoints 💾

In [None]:
def save_lora_checkpoint(model, optimizer, epoch, loss, path='lora_checkpoint.pt'):
    """
    Save only LoRA parameters (much smaller than full model).

    Args:
        model: Model whose state-dict entries with 'lora' in their name are saved
        optimizer: Optimizer whose state is stored for resuming training
        epoch: Epoch number recorded in the checkpoint
        loss: Loss value recorded in the checkpoint
        path: Destination file
    """
    # Extract LoRA parameters
    lora_state_dict = {
        k: v for k, v in model.state_dict().items()
        if 'lora' in k.lower()
    }

    checkpoint = {
        'epoch': epoch,
        'lora_state_dict': lora_state_dict,
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': loss,
    }

    torch.save(checkpoint, path)
    print(f"✅ Checkpoint saved: {path}")

    # Print size comparison, using each tensor's real element size instead of
    # assuming 4 bytes per value.
    bytes_per_mb = 1024 * 1024
    full_size = sum(p.numel() * p.element_size() for p in model.parameters()) / bytes_per_mb
    lora_size = sum(v.numel() * v.element_size() for v in lora_state_dict.values()) / bytes_per_mb
    # Avoid dividing by zero for a model without parameters.
    ratio = lora_size / full_size * 100 if full_size else 0.0
    print(f"  Full model: {full_size:.2f} MB")
    print(f"  LoRA only: {lora_size:.2f} MB ({ratio:.1f}%)")


def load_lora_checkpoint(model, optimizer, path='lora_checkpoint.pt'):
    """
    Load LoRA parameters from checkpoint.

    Args:
        model: Model that already contains the LoRA layers
        optimizer: Optimizer whose state is restored from the checkpoint
        path: Checkpoint file written by `save_lora_checkpoint`

    Returns:
        Tuple `(epoch, loss)` stored in the checkpoint.

    Raises:
        RuntimeError: If the checkpoint holds parameters the model does not have.
    """
    # The checkpoint holds only tensors and plain Python values, so restrict
    # unpickling to those instead of allowing arbitrary objects.
    checkpoint = torch.load(path, map_location=device, weights_only=True)

    # Load LoRA parameters (strict=False because the frozen base weights are
    # intentionally absent from the checkpoint)
    result = model.load_state_dict(checkpoint['lora_state_dict'], strict=False)
    # Keys the model does not know mean the checkpoint belongs to a different
    # architecture; do not ignore that silently.
    if result.unexpected_keys:
        raise RuntimeError(
            f"Checkpoint contains parameters missing from the model: {result.unexpected_keys}"
        )
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

    print(f"✅ Checkpoint loaded: {path}")
    print(f"  Epoch: {checkpoint['epoch']}")
    print(f"  Loss: {checkpoint['loss']:.4f}")

    return checkpoint['epoch'], checkpoint['loss']


# Save checkpoint
# Persist the LoRA weights after the last epoch together with its loss.
final_epoch = training_config['num_epochs']
final_loss = history['loss'][-1]
save_lora_checkpoint(model, optimizer, final_epoch, final_loss, 'lora_instruction_tuned.pt')

## 9. Summary & Best Practices 📝

### What We Learned:

✅ Complete instruction tuning pipeline  
✅ Training with LoRA for parameter efficiency  
✅ Learning rate scheduling (warmup + cosine)  
✅ Gradient accumulation for larger effective batch sizes  
✅ Gradient clipping for training stability  
✅ Text generation with fine-tuned model  
✅ Checkpoint saving (LoRA-only for efficiency)  

### Training Best Practices:

1. **Learning Rate**
   - LoRA: 1e-4 to 1e-3 (higher than full fine-tuning)
   - Full fine-tuning: 1e-5 to 5e-5
   - Always use warmup (10-20% of training)

2. **Batch Size**
   - Effective batch size: 16-64 for most tasks
   - Use gradient accumulation if limited GPU memory
   - Formula: `effective_batch_size = batch_size × grad_accum_steps × num_gpus`

3. **Gradient Clipping**
   - Essential for training stability
   - Max norm: 0.5-1.0

4. **Checkpointing**
   - Save LoRA weights only (much smaller)
   - Save every N epochs or best validation loss
   - Include optimizer state for resuming training

5. **Monitoring**
   - Track loss, learning rate, gradient norms
   - Use TensorBoard or Weights & Biases
   - Validate on held-out set regularly

### Common Issues:

- **Loss not decreasing**: Learning rate too high/low, check data quality
- **NaN loss**: Gradient exploding (use gradient clipping), learning rate too high
- **Overfitting**: Reduce model size, increase regularization, add more data
- **OOM errors**: Reduce batch size, enable gradient checkpointing, use smaller model

### Next Steps:

- **Tutorial 5**: Evaluation metrics and model assessment
- Scale to larger models (7B, 13B) with DeepSpeed/FSDP
- Try different LoRA configurations
- Experiment with QLoRA for even lower memory

---

## 📚 Resources

**Papers:**
- [papers/DeepSeek-R1-paper.pdf](../papers/DeepSeek-R1-paper.pdf) - Training methodology
- Instruction Tuning: https://arxiv.org/abs/2109.01652
- LoRA: https://arxiv.org/abs/2106.09685

**Code:**
- Our implementation: [src/transformer.py](../src/transformer.py)
- Hugging Face Transformers: https://github.com/huggingface/transformers
- Hugging Face PEFT: https://github.com/huggingface/peft

**Related:**
- [transformer-foundation/06_complete_transformer.ipynb](../transformer-foundation/06_complete_transformer.ipynb)
- [02_lora_implementation.ipynb](02_lora_implementation.ipynb)
- [03_data_preparation.ipynb](03_data_preparation.ipynb)

---

**Ready to evaluate your model? Continue to Tutorial 5! 🚀**