# Transformer Machine Translation Training in Google Colab

This notebook trains a Transformer machine-translation model (Finnish → English, EUbookshop corpus) on a Google Colab GPU. The positional encoding is selectable: RoPE, relative bias, or sinusoidal.

## 1. Setup and Mount Google Drive

In [None]:
# Attach Google Drive so the project folder stored there is reachable from this runtime
import os
from google.colab import drive

drive.mount('/content/drive')

# Report whether the expected project folder is present
project_path = '/content/drive/MyDrive/anlp_ass1'  # Adjust this path to your project location
project_found = os.path.exists(project_path)
print(f"Project path exists: {project_found}")

# List the top level of the Drive (helps locate the folder when the path above is wrong)
if os.path.exists('/content/drive/MyDrive'):
    drive_contents = os.listdir('/content/drive/MyDrive')
else:
    drive_contents = 'Drive not mounted'
print(f"Contents: {drive_contents}")

## 2. Install Dependencies and Setup Environment

In [None]:
# Install required packages
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip install tqdm numpy matplotlib seaborn nltk sacrebleu

# Check GPU
import torch
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
    print(f"CUDA Version: {torch.version.cuda}")

## 3. Change to Project Directory and Import Modules

In [None]:
# Change to project directory
import sys
import os

project_path = '/content/drive/MyDrive/anlp_ass1'  # Adjust this path

# Fail early with an actionable message when Drive is not mounted or the path is wrong
if not os.path.isdir(project_path):
    raise FileNotFoundError(
        f"Project directory not found: {project_path}. "
        "Mount Google Drive and adjust project_path before running this cell."
    )

os.chdir(project_path)
# Guarded so that re-running this cell does not keep adding duplicate entries to sys.path
if project_path not in sys.path:
    sys.path.append(project_path)

print(f"Current directory: {os.getcwd()}")
print(f"Files in directory: {os.listdir('.')}")

# Import your modules (encoder.py, decoder.py and utils.py live in the project directory)
from encoder import TransformerEncoder
from decoder import TransformerDecoder, Transformer
from utils import (
    load_data, split_data, create_vocabulary, TransformerDataset,
    create_padding_mask, create_look_ahead_mask, calculate_bleu,
    indices_to_sentence, LabelSmoothingLoss
)

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from tqdm import tqdm
import numpy as np
from pathlib import Path

print("All modules imported successfully!")

## 4. Load and Prepare Data

In [None]:
# Configuration
import random

config = {
    'data_dir': 'EUbookshop',
    'src_file': 'EUbookshop.fi',
    'tgt_file': 'EUbookshop.en',
    'max_seq_len': 128,  # Reduced for Colab GPU memory
    'batch_size': 16,    # Reduced for Colab GPU memory
    'd_model': 256,      # Reduced for Colab GPU memory
    'num_heads': 8,
    'num_encoder_layers': 4,  # Reduced for memory
    'num_decoder_layers': 4,  # Reduced for memory
    'd_ff': 1024,        # Reduced for memory
    'dropout': 0.1,
    'learning_rate': 0.0001,
    'num_epochs': 10,
    'pos_encoding_type': 'rope',  # Change to 'relative_bias' or 'sinusoidal' as needed
    'warmup_steps': 4000,
    'label_smoothing': 0.1,
    'clip_grad_norm': 1.0,
    'seed': 42,          # Fixed seed so runs can be reproduced and compared
    'device': 'cuda' if torch.cuda.is_available() else 'cpu'
}

# Seed every random number generator in use before any data shuffling or weight
# initialisation happens. torch.manual_seed also seeds the CUDA generators.
random.seed(config['seed'])
np.random.seed(config['seed'])
torch.manual_seed(config['seed'])

print("Configuration:")
for key, value in config.items():
    print(f"  {key}: {value}")

In [None]:
# Read the parallel corpus (source and target files live under config['data_dir'])
print("Loading data...")
data_dir = config['data_dir']
src_file_path = os.path.join(data_dir, config['src_file'])
tgt_file_path = os.path.join(data_dir, config['tgt_file'])

data_pairs = load_data(src_file_path, tgt_file_path)
print(f"Loaded {len(data_pairs)} sentence pairs")

# Partition into train / validation / test splits
train_data, val_data, test_data = split_data(data_pairs, train_ratio=0.8, val_ratio=0.1)
print(f"Train: {len(train_data)}, Val: {len(val_data)}, Test: {len(test_data)}")

# Vocabularies are built from the training split only
print("Creating vocabularies...")
train_src_sentences = [pair[0] for pair in train_data]
train_tgt_sentences = [pair[1] for pair in train_data]
src_vocab = create_vocabulary(train_src_sentences)
tgt_vocab = create_vocabulary(train_tgt_sentences)

print(f"Source vocabulary size: {len(src_vocab)}")
print(f"Target vocabulary size: {len(tgt_vocab)}")

# Wrap the train and validation splits in datasets that share vocabularies and length limit
max_len = config['max_seq_len']
train_dataset, val_dataset = (
    TransformerDataset(split, src_vocab, tgt_vocab, max_len)
    for split in (train_data, val_data)
)

# Both loaders use the same batch size and worker count; only training is shuffled
loader_options = {'batch_size': config['batch_size'], 'num_workers': 2}
train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_options)

print(f"Training batches: {len(train_loader)}")
print(f"Validation batches: {len(val_loader)}")

## 5. Create Model

In [None]:
# Create model
print(f"Creating model with {config['pos_encoding_type']} positional encoding...")

model = Transformer(
    src_vocab_size=len(src_vocab),
    tgt_vocab_size=len(tgt_vocab),
    d_model=config['d_model'],
    num_heads=config['num_heads'],
    num_encoder_layers=config['num_encoder_layers'],
    num_decoder_layers=config['num_decoder_layers'],
    d_ff=config['d_ff'],
    max_seq_len=config['max_seq_len'],
    dropout=config['dropout'],
    pos_encoding_type=config['pos_encoding_type']
).to(config['device'])

# Count parameters
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")

# Setup optimizer and scheduler
optimizer = optim.Adam(
    model.parameters(),
    lr=config['learning_rate'],
    betas=(0.9, 0.98),
    eps=1e-9
)


def warmup_inverse_sqrt(step):
    """Return the learning-rate multiplier for scheduler step ``step``.

    The multiplier rises linearly from ~0 to 1.0 over ``config['warmup_steps']``
    steps and then decays proportionally to the inverse square root of the step.

    LambdaLR multiplies the optimizer's base learning rate by this value, so the
    multiplier must peak at 1.0 for the learning rate to peak at
    ``config['learning_rate']``. The previous un-normalised form
    ``min(s ** -0.5, s * warmup ** -1.5)`` peaks at ``warmup ** -0.5`` (about
    0.016 for 4000 warmup steps), which kept the effective learning rate
    below 2e-6 for the whole run.
    """
    warmup_steps = max(1, config['warmup_steps'])  # avoid division by zero
    current_step = step + 1  # LambdaLR starts counting at 0
    return min(current_step / warmup_steps, (warmup_steps / current_step) ** 0.5)


scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=warmup_inverse_sqrt)

# Label-smoothed loss; positions holding the target <pad> index are passed as ignore_index
criterion = LabelSmoothingLoss(
    num_classes=len(tgt_vocab),
    smoothing=config['label_smoothing'],
    ignore_index=tgt_vocab['<pad>']
)

print("Model and training setup completed!")

## 6. Training Functions

In [None]:
def train_epoch():
    """Run one pass over ``train_loader`` and return the mean per-batch loss.

    Uses the notebook-level ``model``, ``optimizer``, ``scheduler``, ``criterion``,
    ``config`` and vocabularies. The scheduler is stepped once per batch.
    """
    model.train()
    device = config['device']
    running_loss = 0

    batches = tqdm(train_loader, desc="Training")
    for batch_idx, batch in enumerate(batches):
        src, tgt_input, tgt_output = (tensor.to(device) for tensor in batch)

        # Reset gradients left over from the previous batch
        optimizer.zero_grad()

        # Masks built from the padding index of each vocabulary
        src_mask = create_padding_mask(src, src_vocab['<pad>'])
        tgt_mask = create_look_ahead_mask(tgt_input, tgt_vocab['<pad>'])

        # Forward pass and loss
        logits = model(src, tgt_input, src_mask, tgt_mask)
        loss = criterion(logits, tgt_output)

        # Backward pass with gradient-norm clipping, then parameter and LR updates
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), config['clip_grad_norm'])
        optimizer.step()
        scheduler.step()

        running_loss += loss.item()

        # Refresh the progress-bar statistics every 100 batches
        if not batch_idx % 100:
            mean_so_far = running_loss / (batch_idx + 1)
            lr_now = scheduler.get_last_lr()[0]
            stats = {
                'loss': f'{mean_so_far:.4f}',
                'lr': f'{lr_now:.2e}'
            }
            batches.set_postfix(stats)

    return running_loss / len(train_loader)

def validate():
    """Evaluate on ``val_loader`` without gradients and return the mean per-batch loss.

    Uses the notebook-level ``model``, ``criterion``, ``config`` and vocabularies.
    """
    model.eval()
    device = config['device']
    running_loss = 0

    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Validation"):
            src, tgt_input, tgt_output = (tensor.to(device) for tensor in batch)

            # Masks built from the padding index of each vocabulary
            src_mask = create_padding_mask(src, src_vocab['<pad>'])
            tgt_mask = create_look_ahead_mask(tgt_input, tgt_vocab['<pad>'])

            # Forward pass only; accumulate the scalar batch loss
            logits = model(src, tgt_input, src_mask, tgt_mask)
            running_loss += criterion(logits, tgt_output).item()

    return running_loss / len(val_loader)

def save_checkpoint(epoch, train_loss, val_loss):
    """Write a checkpoint for ``epoch`` and refresh the best-model file when it improves.

    Args:
        epoch: Identifier embedded in the checkpoint file name and stored in the checkpoint.
        train_loss: Training loss recorded in the checkpoint.
        val_loss: Validation loss recorded in the checkpoint and compared with the best so far.

    The lowest validation loss seen so far is remembered on the function object itself
    (``save_checkpoint.best_val_loss``), so it is shared by every call made after this
    definition was last executed.
    """
    # Make sure the output directory exists
    os.makedirs('models', exist_ok=True)

    state = dict(
        epoch=epoch,
        model_state_dict=model.state_dict(),
        optimizer_state_dict=optimizer.state_dict(),
        scheduler_state_dict=scheduler.state_dict(),
        train_loss=train_loss,
        val_loss=val_loss,
        config=config,
        src_vocab=src_vocab,
        tgt_vocab=tgt_vocab,
    )

    checkpoint_path = f'models/checkpoint_epoch_{epoch}.pt'
    torch.save(state, checkpoint_path)
    print(f"Checkpoint saved: {checkpoint_path}")

    # The first call always counts as the best; later calls must beat the stored loss
    first_call = not hasattr(save_checkpoint, 'best_val_loss')
    if first_call or val_loss < save_checkpoint.best_val_loss:
        save_checkpoint.best_val_loss = val_loss
        best_model_path = 'models/best_model.pt'
        torch.save(state, best_model_path)
        print(f"Best model saved: {best_model_path}")

print("Training functions defined!")

## 7. Start Training

In [None]:
# Announce the settings of this run
for banner in (
    f"Starting training with {config['pos_encoding_type']} positional encoding...",
    f"Total epochs: {config['num_epochs']}",
    f"Batch size: {config['batch_size']}",
    f"Model dimension: {config['d_model']}",
    f"Device: {config['device']}",
):
    print(banner)

# One iteration per epoch: train, validate, report, checkpoint
total_epochs = config['num_epochs']
for epoch in range(1, total_epochs + 1):
    print(f"\nEpoch {epoch}/{total_epochs}")

    train_loss = train_epoch()
    val_loss = validate()
    print(f"Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")

    # A checkpoint is written after every epoch
    save_checkpoint(epoch, train_loss, val_loss)

    # Release cached GPU memory between epochs
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

print("\nTraining completed!")

## 8. Compare Positional Encodings (Optional)

In [None]:
# Train one short run per positional encoding and compare the final losses.
# NOTE: train_epoch, validate and save_checkpoint read the notebook-level names
# `model`, `optimizer`, `scheduler` and `config`, so those are rebound/updated here.
pos_encodings = ['rope', 'relative_bias', 'sinusoidal']
comparison_epochs = 3  # Reduced for comparison
results = {}


def comparison_lr_lambda(step):
    """Learning-rate multiplier: linear warmup to 1.0, then inverse-square-root decay.

    LambdaLR multiplies the base learning rate by this value, so it is normalised
    to peak at 1.0 (i.e. the learning rate peaks at config['learning_rate']).
    """
    warmup_steps = max(1, config['warmup_steps'])  # avoid division by zero
    current_step = step + 1  # LambdaLR starts counting at 0
    return min(current_step / warmup_steps, (warmup_steps / current_step) ** 0.5)


# config['num_epochs'] is overwritten while comparing (so that saved checkpoints
# record the shortened run) and restored afterwards, so re-running the main
# training cell still uses the originally configured number of epochs.
original_num_epochs = config['num_epochs']
try:
    for pos_encoding in pos_encodings:
        print(f"\n{'='*50}")
        print(f"Testing {pos_encoding} positional encoding")
        print(f"{'='*50}")

        # Update config
        config['pos_encoding_type'] = pos_encoding
        config['num_epochs'] = comparison_epochs

        # Create new model
        model = Transformer(
            src_vocab_size=len(src_vocab),
            tgt_vocab_size=len(tgt_vocab),
            d_model=config['d_model'],
            num_heads=config['num_heads'],
            num_encoder_layers=config['num_encoder_layers'],
            num_decoder_layers=config['num_decoder_layers'],
            d_ff=config['d_ff'],
            max_seq_len=config['max_seq_len'],
            dropout=config['dropout'],
            pos_encoding_type=config['pos_encoding_type']
        ).to(config['device'])

        # Same optimizer hyper-parameters as the main training run, so the
        # comparison differs only in the positional encoding
        optimizer = optim.Adam(
            model.parameters(),
            lr=config['learning_rate'],
            betas=(0.9, 0.98),
            eps=1e-9
        )
        scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=comparison_lr_lambda)

        # Train for a few epochs
        epoch_losses = []
        for epoch in range(1, comparison_epochs + 1):
            train_loss = train_epoch()
            val_loss = validate()
            epoch_losses.append((train_loss, val_loss))
            print(f"Epoch {epoch}: Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")

            # Clear GPU cache
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

        results[pos_encoding] = epoch_losses

        # Save final model. save_checkpoint also overwrites models/best_model.pt
        # whenever this run's validation loss beats the best loss it has seen so far.
        save_checkpoint(f"{pos_encoding}_final", train_loss, val_loss)
finally:
    config['num_epochs'] = original_num_epochs

# Print comparison
print(f"\n{'='*50}")
print("COMPARISON RESULTS")
print(f"{'='*50}")
for pos_encoding, losses in results.items():
    final_train_loss, final_val_loss = losses[-1]
    print(f"{pos_encoding:15}: Final Train Loss: {final_train_loss:.4f}, Final Val Loss: {final_val_loss:.4f}")

## 9. Load and Test Saved Model

In [None]:
# Load best model
checkpoint = torch.load('models/best_model.pt', map_location=config['device'])

# Rebuild the network from the settings stored inside the checkpoint instead of
# reusing whichever `model` is currently bound: the comparison cell rebinds `model`
# to a different positional encoding, and loading weights into a mismatched
# architecture either fails or silently produces a wrong model.
checkpoint_config = checkpoint['config']
src_vocab = checkpoint['src_vocab']
tgt_vocab = checkpoint['tgt_vocab']

model = Transformer(
    src_vocab_size=len(src_vocab),
    tgt_vocab_size=len(tgt_vocab),
    d_model=checkpoint_config['d_model'],
    num_heads=checkpoint_config['num_heads'],
    num_encoder_layers=checkpoint_config['num_encoder_layers'],
    num_decoder_layers=checkpoint_config['num_decoder_layers'],
    d_ff=checkpoint_config['d_ff'],
    max_seq_len=checkpoint_config['max_seq_len'],
    dropout=checkpoint_config['dropout'],
    pos_encoding_type=checkpoint_config['pos_encoding_type']
).to(config['device'])  # device of the current runtime, not the one used for training

model.load_state_dict(checkpoint['model_state_dict'])
model.eval()

print(f"Loaded best model from epoch {checkpoint['epoch']}")
print(f"Best validation loss: {checkpoint['val_loss']:.4f}")

# Test translation function
def translate_sentence(sentence, max_length=50):
    """Greedily translate ``sentence`` with the notebook-level ``model``.

    Args:
        sentence: Source-language text; it is lower-cased and split on whitespace.
        max_length: Maximum number of target tokens to generate.

    Returns:
        The generated target tokens joined by single spaces, without the
        ``<start>`` and ``<end>`` markers.
    """
    model.eval()

    # Tokenize and convert to indices (unknown words map to <unk>)
    unk_id = src_vocab['<unk>']
    src_indices = (
        [src_vocab['<start>']]
        + [src_vocab.get(token, unk_id) for token in sentence.lower().split()]
        + [src_vocab['<end>']]
    )

    # Batch of one sentence
    src = torch.tensor([src_indices]).to(config['device'])
    src_mask = create_padding_mask(src, src_vocab['<pad>'])

    end_id = tgt_vocab['<end>']
    tgt_indices = [tgt_vocab['<start>']]

    with torch.no_grad():
        for _ in range(max_length):
            tgt = torch.tensor([tgt_indices]).to(config['device'])
            tgt_mask = create_look_ahead_mask(tgt, tgt_vocab['<pad>'])

            output = model(src, tgt, src_mask, tgt_mask)
            # Greedy choice: most likely token at the last target position
            next_token = output[0, -1].argmax().item()

            # Stop without storing <end>. The previous version stored it and then
            # always dropped the final element, which lost the last real token
            # whenever generation stopped at max_length without producing <end>.
            if next_token == end_id:
                break
            tgt_indices.append(next_token)

    # Build the index -> token table once instead of scanning the vocabulary for
    # every generated token; setdefault keeps the first token seen for an index.
    index_to_token = {}
    for token, idx in tgt_vocab.items():
        index_to_token.setdefault(idx, token)

    # Skip the leading <start>
    return ' '.join(index_to_token.get(idx, '<unk>') for idx in tgt_indices[1:])

# A few Finnish phrases to sanity-check the loaded model
test_sentences = [
    "Hyvää huomenta",  # Good morning
    "Kiitos paljon",   # Thank you very much
    "Nähdään myöhemmin"  # See you later
]

print("\nTest Translations:")
for sentence in test_sentences:
    translation = translate_sentence(sentence)
    # Source line, translated line, then an empty line as a separator
    print(f"Finnish: {sentence}", f"English: {translation}", "", sep="\n")