In [1]:
"""
==================================================================================
PYTORCH SEQ2SEQ TRANSLATOR - FIXED VERSION
==================================================================================
English → Spanish Neural Machine Translation

KEY IMPROVEMENTS FROM PREVIOUS VERSION:
1. ✅ Reduced vocabulary (16K → 5K) for better learning
2. ✅ Increased model capacity (128 → 256 hidden units)
3. ✅ Bidirectional encoder for better context understanding
4. ✅ Removed problematic GloVe embeddings
5. ✅ Better training configuration
6. ✅ Quality over quantity approach

Expected Results:
- BLEU Score: 8-12 (vs previous 3-4)
- Coherent Spanish translations
- No infinite loops or garbage output
==================================================================================
"""

# ==================================================================================
# 1. IMPORTS AND SETUP
# ==================================================================================

import re
import pickle
import logging
from pathlib import Path
from io import StringIO

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# PyTorch
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

# NLTK for tokenization
import nltk
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')
from nltk.tokenize import word_tokenize

from collections import Counter
from sklearn.model_selection import train_test_split

# Device setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")



Using device: cuda


# ==================================================================================
# 2. IMPROVED HYPERPARAMETERS
# ==================================================================================

In [2]:


# Data configuration - REDUCED for better learning
MAX_NUM_SENTENCES = 50000  # Down from 100K - focus on quality
MAX_VOCAB_SIZE = 5000      # Down from 16K - ensure each word is learned well
MIN_WORD_FREQ = 2          # Filter rare words

# Model architecture - INCREASED capacity
HIDDEN_DIM = 256    # Up from 128 - more capacity
EMBED_DIM = 128     # Up from 50 - richer representations

# Training configuration
LEARNING_RATE = 0.001
DROPOUT = 0.1
N_EPOCHS = 25
BATCH_SIZE = 64
GRAD_CLIP = 1.0

# Learning rate scheduler
USE_LR_SCHEDULER = True
LR_SCHEDULER_PATIENCE = 3
LR_SCHEDULER_FACTOR = 0.5
LR_SCHEDULER_MIN_LR = 0.0001

print("\n" + "="*80)
print("CONFIGURATION")
print("="*80)
print(f"Data: {MAX_NUM_SENTENCES:,} sentences, {MAX_VOCAB_SIZE:,} vocab")
print(f"Model: {HIDDEN_DIM} hidden units, {EMBED_DIM} embedding dim")
print(f"Training: {N_EPOCHS} epochs, LR={LEARNING_RATE}, batch={BATCH_SIZE}")
print("="*80 + "\n")





CONFIGURATION
Data: 50,000 sentences, 5,000 vocab
Model: 256 hidden units, 128 embedding dim
Training: 25 epochs, LR=0.001, batch=64



# ==================================================================================
# 3. DATA LOADING AND PREPROCESSING
# ==================================================================================

In [3]:


# Download dataset if needed
import os
if not os.access('spa-eng', os.F_OK):
    if not os.access('spa-eng.zip', os.F_OK):
        print("Downloading dataset...")
        import urllib.request
        urllib.request.urlretrieve(
            'http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip',
            'spa-eng.zip'
        )
    print("Extracting dataset...")
    import zipfile
    with zipfile.ZipFile('spa-eng.zip', 'r') as zip_ref:
        zip_ref.extractall('.')
    print("Dataset ready!")

# Load and shuffle data
text_file = "./spa-eng/spa.txt"
with open(text_file, encoding='utf-8') as f:
    lines = f.read().split("\n")[:-1]

np.random.seed(42)
np.random.shuffle(lines)

# Parse sentences with quality filtering
input_sentences = []
output_sentences = []
output_sentences_inputs = []

print("Parsing sentences with quality filters...")

for i, line in enumerate(lines):
    if i >= MAX_NUM_SENTENCES:
        break
    if '\t' not in line:
        continue
    
    input_sentence, output = line.rstrip().split('\t')
    
    # Quality filters
    input_words = input_sentence.split()
    output_words = output.split()
    
    # Skip very short or very long sentences
    if len(input_words) < 2 or len(input_words) > 20:
        continue
    if len(output_words) < 2 or len(output_words) > 20:
        continue
    
    # Add special tokens
    output_sentence = output + ' <eos>'
    output_sentence_input = '<sos> ' + output
    
    input_sentences.append(input_sentence)
    output_sentences.append(output_sentence)
    output_sentences_inputs.append(output_sentence_input)

print(f"Loaded {len(input_sentences):,} quality-filtered sentences")


Parsing sentences with quality filters...
Loaded 49,650 quality-filtered sentences


# ==================================================================================
# 4. VOCABULARY BUILDING
# ==================================================================================

In [4]:


class SimpleVocab:
    """Simple vocabulary class with frequency filtering"""
    def __init__(self, counter, max_size=None, min_freq=1, specials=['<pad>', '<unk>']):
        self.itos = specials.copy()
        self.stoi = {token: idx for idx, token in enumerate(self.itos)}
        self.unk_index = self.stoi.get('<unk>', 1)
        
        # Sort by frequency
        sorted_tokens = sorted(counter.items(), key=lambda x: x[1], reverse=True)
        
        # Build vocabulary
        for token, freq in sorted_tokens:
            if token in self.stoi:
                continue
            if freq < min_freq:
                break
            if max_size and len(self.itos) >= max_size:
                break
            self.stoi[token] = len(self.itos)
            self.itos.append(token)
    
    def __len__(self):
        return len(self.itos)
    
    def __getitem__(self, token):
        return self.stoi.get(token, self.unk_index)
    
    def get_itos(self):
        return self.itos
    
    def get_stoi(self):
        return self.stoi

def simple_tokenizer(text, language='en'):
    """NLTK-based tokenizer"""
    try:
        tokens = word_tokenize(text.lower())
    except:
        tokens = text.lower().split()
    return tokens

# Build vocabularies
print("\nBuilding vocabularies...")

# English vocabulary
en_counter = Counter()
for sentence in input_sentences:
    tokens = simple_tokenizer(sentence, language='en')
    en_counter.update(tokens)

en_vocab = SimpleVocab(
    en_counter,
    max_size=MAX_VOCAB_SIZE,
    min_freq=MIN_WORD_FREQ,
    specials=['<pad>', '<unk>']
)

# Spanish vocabulary
es_counter = Counter()
for sentence in output_sentences + output_sentences_inputs:
    tokens = simple_tokenizer(sentence, language='es')
    es_counter.update(tokens)

es_vocab = SimpleVocab(
    es_counter,
    max_size=MAX_VOCAB_SIZE,
    min_freq=MIN_WORD_FREQ,
    specials=['<pad>', '<unk>', '<sos>', '<eos>']
)

print(f"English vocabulary: {len(en_vocab):,} words")
print(f"Spanish vocabulary: {len(es_vocab):,} words")
print(f"Coverage: ~{len(input_sentences) / len(en_vocab):.1f} sentences per EN word")
print(f"Coverage: ~{len(output_sentences) / len(es_vocab):.1f} sentences per ES word")




Building vocabularies...
English vocabulary: 5,000 words
Spanish vocabulary: 5,000 words
Coverage: ~9.9 sentences per EN word
Coverage: ~9.9 sentences per ES word


# ==================================================================================
# 5. DATASET AND DATALOADERS
# ==================================================================================

In [5]:


def text_to_indices(text, vocab, language='en'):
    """Convert text to indices"""
    tokens = simple_tokenizer(text, language=language)
    return [vocab[token] for token in tokens]

class TranslationDataset(Dataset):
    """Dataset for translation pairs"""
    def __init__(self, input_sentences, output_sentences, output_sentences_inputs,
                 en_vocab, es_vocab):
        self.input_sentences = input_sentences
        self.output_sentences = output_sentences
        self.output_sentences_inputs = output_sentences_inputs
        self.en_vocab = en_vocab
        self.es_vocab = es_vocab
    
    def __len__(self):
        return len(self.input_sentences)
    
    def __getitem__(self, idx):
        enc_input = text_to_indices(
            self.input_sentences[idx],
            self.en_vocab,
            language='en'
        )
        
        dec_input = text_to_indices(
            self.output_sentences_inputs[idx],
            self.es_vocab,
            language='es'
        )
        
        dec_target = text_to_indices(
            self.output_sentences[idx],
            self.es_vocab,
            language='es'
        )
        
        return (
            torch.tensor(enc_input, dtype=torch.long),
            torch.tensor(dec_input, dtype=torch.long),
            torch.tensor(dec_target, dtype=torch.long)
        )

def collate_fn(batch):
    """Collate function with padding"""
    enc_inputs, dec_inputs, dec_targets = zip(*batch)
    
    enc_inputs_padded = pad_sequence(enc_inputs, batch_first=True, padding_value=en_vocab['<pad>'])
    dec_inputs_padded = pad_sequence(dec_inputs, batch_first=True, padding_value=es_vocab['<pad>'])
    dec_targets_padded = pad_sequence(dec_targets, batch_first=True, padding_value=es_vocab['<pad>'])
    
    return enc_inputs_padded, dec_inputs_padded, dec_targets_padded

# Split data
train_input, val_input, train_output, val_output, train_output_inp, val_output_inp = train_test_split(
    input_sentences, output_sentences, output_sentences_inputs,
    test_size=0.2, random_state=42
)

# Create datasets
train_dataset = TranslationDataset(
    train_input, train_output, train_output_inp,
    en_vocab, es_vocab
)

val_dataset = TranslationDataset(
    val_input, val_output, val_output_inp,
    en_vocab, es_vocab
)

# Create dataloaders
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=collate_fn
)

val_loader = DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=collate_fn
)

print(f"\nDataset split:")
print(f"  Training: {len(train_dataset):,} pairs ({len(train_loader)} batches)")
print(f"  Validation: {len(val_dataset):,} pairs ({len(val_loader)} batches)")







Dataset split:
  Training: 39,720 pairs (621 batches)
  Validation: 9,930 pairs (156 batches)



# ==================================================================================
# 6. IMPROVED MODEL ARCHITECTURE
# ==================================================================================

In [6]:

class ImprovedEncoder(nn.Module):
    """
    Improved bidirectional encoder with better capacity
    
    Key improvements:
    - Bidirectional LSTM for context from both directions
    - No pretrained embeddings (learns from scratch)
    - Larger hidden dimension
    - Proper projection of bidirectional states
    """
    def __init__(self, vocab_size, embed_dim, hidden_dim, dropout=0.1):
        super().__init__()
        self.hidden_dim = hidden_dim
        
        # Learnable embeddings (no GloVe)
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.embedding_dropout = nn.Dropout(dropout)
        
        # Bidirectional LSTM
        self.lstm = nn.LSTM(
            embed_dim,
            hidden_dim,
            bidirectional=True,
            batch_first=True
        )
        self.lstm_dropout = nn.Dropout(dropout)
        
        # Project bidirectional hidden states to decoder dimension
        self.fc_hidden = nn.Linear(hidden_dim * 2, hidden_dim)
        self.fc_cell = nn.Linear(hidden_dim * 2, hidden_dim)
    
    def forward(self, x):
        # x: (batch, seq_len)
        embedded = self.embedding_dropout(self.embedding(x))
        
        # BiLSTM outputs
        outputs, (hidden, cell) = self.lstm(embedded)
        outputs = self.lstm_dropout(outputs)
        
        # Combine forward and backward states
        # hidden: (2, batch, hidden_dim) -> (batch, hidden_dim*2)
        hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1)
        cell = torch.cat((cell[-2,:,:], cell[-1,:,:]), dim=1)
        
        # Project to decoder dimension
        hidden = torch.tanh(self.fc_hidden(hidden)).unsqueeze(0)
        cell = torch.tanh(self.fc_cell(cell)).unsqueeze(0)
        
        return outputs, hidden, cell

class Decoder(nn.Module):
    """Standard unidirectional decoder"""
    def __init__(self, vocab_size, embed_dim, hidden_dim, dropout=0.1):
        super().__init__()
        self.hidden_dim = hidden_dim
        
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.embedding_dropout = nn.Dropout(dropout)
        
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.lstm_dropout = nn.Dropout(dropout)
        
        self.fc = nn.Linear(hidden_dim, vocab_size)
    
    def forward(self, x, hidden, cell):
        embedded = self.embedding_dropout(self.embedding(x))
        outputs, (hidden, cell) = self.lstm(embedded, (hidden, cell))
        outputs = self.lstm_dropout(outputs)
        predictions = self.fc(outputs)
        return predictions, hidden, cell

class Attention(nn.Module):
    """Bahdanau attention mechanism"""
    def __init__(self, hidden_dim):
        super().__init__()
        # Note: encoder outputs are bidirectional (hidden_dim*2)
        self.attn = nn.Linear(hidden_dim * 3, hidden_dim)  # *3 because encoder is bidirectional
        self.v = nn.Linear(hidden_dim, 1, bias=False)
    
    def forward(self, hidden, encoder_outputs):
        # hidden: (1, batch, hidden_dim)
        # encoder_outputs: (batch, src_len, hidden_dim*2) from BiLSTM
        
        batch_size = encoder_outputs.shape[0]
        src_len = encoder_outputs.shape[1]
        
        # Repeat hidden for each source position
        hidden = hidden.permute(1, 0, 2).repeat(1, src_len, 1)
        
        # Compute attention energies
        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim=2)))
        attention = self.v(energy).squeeze(2)
        
        return torch.softmax(attention, dim=1)

class DecoderWithAttention(nn.Module):
    """Decoder with attention mechanism"""
    def __init__(self, vocab_size, embed_dim, hidden_dim, attention, dropout=0.1):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.attention = attention
        
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.embedding_dropout = nn.Dropout(dropout)
        
        # LSTM input: embedding + context (encoder is bidirectional)
        self.lstm = nn.LSTM(embed_dim + hidden_dim * 2, hidden_dim, batch_first=True)
        self.lstm_dropout = nn.Dropout(dropout)
        
        self.fc = nn.Linear(hidden_dim, vocab_size)
    
    def forward(self, x, hidden, cell, encoder_outputs):
        embedded = self.embedding_dropout(self.embedding(x))
        
        seq_len = x.shape[1]
        outputs = []
        
        for t in range(seq_len):
            # Attention
            attn_weights = self.attention(hidden, encoder_outputs).unsqueeze(1)
            context = torch.bmm(attn_weights, encoder_outputs)
            
            # Concatenate embedding and context
            rnn_input = torch.cat((embedded[:, t:t+1, :], context), dim=2)
            
            # LSTM step
            output, (hidden, cell) = self.lstm(rnn_input, (hidden, cell))
            output = self.lstm_dropout(output)
            outputs.append(output)
        
        outputs = torch.cat(outputs, dim=1)
        predictions = self.fc(outputs)
        
        return predictions, hidden, cell

class Seq2Seq(nn.Module):
    """Basic Seq2Seq model"""
    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
    
    def forward(self, src, trg):
        _, hidden, cell = self.encoder(src)
        predictions, _, _ = self.decoder(trg, hidden, cell)
        return predictions

class Seq2SeqWithAttention(nn.Module):
    """Seq2Seq with attention"""
    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
    
    def forward(self, src, trg):
        enc_outputs, hidden, cell = self.encoder(src)
        predictions, _, _ = self.decoder(trg, hidden, cell, enc_outputs)
        return predictions

# ==================================================================================
# 7. TRAINING FUNCTIONS
# ==================================================================================

In [7]:


def train_epoch(model, dataloader, optimizer, criterion, device, clip=1.0):
    """Train one epoch"""
    model.train()
    epoch_loss = 0
    epoch_acc = 0
    
    for enc_inp, dec_inp, dec_tgt in dataloader:
        enc_inp = enc_inp.to(device)
        dec_inp = dec_inp.to(device)
        dec_tgt = dec_tgt.to(device)
        
        optimizer.zero_grad()
        
        # Forward pass
        output = model(enc_inp, dec_inp)
        
        # Calculate loss
        output_dim = output.shape[-1]
        output = output.reshape(-1, output_dim)
        dec_tgt = dec_tgt.reshape(-1)
        
        loss = criterion(output, dec_tgt)
        
        # Calculate accuracy
        non_pad = (dec_tgt != es_vocab['<pad>']).sum()
        correct = ((output.argmax(1) == dec_tgt) & (dec_tgt != es_vocab['<pad>'])).sum()
        acc = correct.float() / non_pad.float()
        
        # Backward pass
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
    
    return epoch_loss / len(dataloader), epoch_acc / len(dataloader)

def evaluate(model, dataloader, criterion, device):
    """Evaluate model"""
    model.eval()
    epoch_loss = 0
    epoch_acc = 0
    
    with torch.no_grad():
        for enc_inp, dec_inp, dec_tgt in dataloader:
            enc_inp = enc_inp.to(device)
            dec_inp = dec_inp.to(device)
            dec_tgt = dec_tgt.to(device)
            
            output = model(enc_inp, dec_inp)
            
            output_dim = output.shape[-1]
            output = output.reshape(-1, output_dim)
            dec_tgt = dec_tgt.reshape(-1)
            
            loss = criterion(output, dec_tgt)
            
            non_pad = (dec_tgt != es_vocab['<pad>']).sum()
            correct = ((output.argmax(1) == dec_tgt) & (dec_tgt != es_vocab['<pad>'])).sum()
            acc = correct.float() / non_pad.float()
            
            epoch_loss += loss.item()
            epoch_acc += acc.item()
    
    return epoch_loss / len(dataloader), epoch_acc / len(dataloader)

def calculate_perplexity(loss):
    """Calculate perplexity from loss"""
    return np.exp(loss)

# ==================================================================================
# 8. INFERENCE FUNCTIONS
# ==================================================================================

In [8]:
def greedy_decode(model, src_sentence, max_len=50):
    """Greedy decoding (always pick most probable token)"""
    model.eval()
    
    with torch.no_grad():
        src_indices = text_to_indices(src_sentence, en_vocab, language='en')
        src_tensor = torch.LongTensor(src_indices).unsqueeze(0).to(device)
        
        if isinstance(model, Seq2SeqWithAttention):
            enc_outputs, hidden, cell = model.encoder(src_tensor)
        else:
            _, hidden, cell = model.encoder(src_tensor)
            enc_outputs = None
        
        trg_indices = [es_vocab['<sos>']]
        
        for _ in range(max_len):
            trg_tensor = torch.LongTensor([trg_indices[-1]]).unsqueeze(0).to(device)
            
            if isinstance(model, Seq2SeqWithAttention):
                output, hidden, cell = model.decoder(trg_tensor, hidden, cell, enc_outputs)
            else:
                output, hidden, cell = model.decoder(trg_tensor, hidden, cell)
            
            pred_token = output.argmax(2).item()
            trg_indices.append(pred_token)
            
            if pred_token == es_vocab['<eos>']:
                break
        
        trg_tokens = [es_vocab.get_itos()[idx] for idx in trg_indices[1:-1]]
        return ' '.join(trg_tokens)

def beam_search_decode(model, src_sentence, beam_width=5, max_len=50):
    """Beam search decoding"""
    model.eval()
    
    with torch.no_grad():
        src_indices = text_to_indices(src_sentence, en_vocab, language='en')
        src_tensor = torch.LongTensor(src_indices).unsqueeze(0).to(device)
        
        if isinstance(model, Seq2SeqWithAttention):
            enc_outputs, hidden, cell = model.encoder(src_tensor)
        else:
            _, hidden, cell = model.encoder(src_tensor)
            enc_outputs = None
        
        beams = [([es_vocab['<sos>']], 0.0, hidden, cell)]
        
        for _ in range(max_len):
            all_candidates = []
            
            for seq, score, h, c in beams:
                if seq[-1] == es_vocab['<eos>']:
                    all_candidates.append((seq, score, h, c))
                    continue
                
                trg_tensor = torch.LongTensor([seq[-1]]).unsqueeze(0).to(device)
                
                if isinstance(model, Seq2SeqWithAttention):
                    output, new_h, new_c = model.decoder(trg_tensor, h, c, enc_outputs)
                else:
                    output, new_h, new_c = model.decoder(trg_tensor, h, c)
                
                log_probs = torch.log_softmax(output[0, 0], dim=0)
                top_k_probs, top_k_indices = torch.topk(log_probs, beam_width)
                
                for prob, idx in zip(top_k_probs, top_k_indices):
                    new_seq = seq + [idx.item()]
                    new_score = score + prob.item()
                    all_candidates.append((new_seq, new_score, new_h, new_c))
            
            beams = sorted(all_candidates, key=lambda x: x[1], reverse=True)[:beam_width]
            
            if all(seq[-1] == es_vocab['<eos>'] for seq, _, _, _ in beams):
                break
        
        best_seq = beams[0][0]
        trg_tokens = [es_vocab.get_itos()[idx] for idx in best_seq[1:-1]]
        return ' '.join(trg_tokens)

def quick_test(model, test_phrases):
    """Quick test on common phrases"""
    model.eval()
    print("\n" + "="*80)
    print("QUICK TRANSLATION TEST")
    print("="*80)
    
    for phrase in test_phrases:
        greedy = greedy_decode(model, phrase)
        beam = beam_search_decode(model, phrase, beam_width=5)
        print(f"\nEN: {phrase}")
        print(f"Greedy: {greedy}")
        print(f"Beam:   {beam}")
    print("="*80)

# ==================================================================================
# 9. MODEL INITIALIZATION AND TRAINING
# ==================================================================================

In [9]:
print("\n" + "="*80)
print("INITIALIZING MODELS")
print("="*80)

# Create models
encoder = ImprovedEncoder(len(en_vocab), EMBED_DIM, HIDDEN_DIM, dropout=DROPOUT)
decoder = Decoder(len(es_vocab), EMBED_DIM, HIDDEN_DIM, dropout=DROPOUT)
model_basic = Seq2Seq(encoder, decoder).to(device)

encoder_attn = ImprovedEncoder(len(en_vocab), EMBED_DIM, HIDDEN_DIM, dropout=DROPOUT)
attention = Attention(HIDDEN_DIM)
decoder_attn = DecoderWithAttention(len(es_vocab), EMBED_DIM, HIDDEN_DIM, attention, dropout=DROPOUT)
model_attention = Seq2SeqWithAttention(encoder_attn, decoder_attn).to(device)

# Setup training
criterion = nn.CrossEntropyLoss(ignore_index=es_vocab['<pad>'])

optimizer_basic = optim.Adam(model_basic.parameters(), lr=LEARNING_RATE)
optimizer_attn = optim.Adam(model_attention.parameters(), lr=LEARNING_RATE)

if USE_LR_SCHEDULER:
    scheduler_basic = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer_basic, mode='min', factor=LR_SCHEDULER_FACTOR,
        patience=LR_SCHEDULER_PATIENCE, min_lr=LR_SCHEDULER_MIN_LR
    )
    scheduler_attn = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer_attn, mode='min', factor=LR_SCHEDULER_FACTOR,
        patience=LR_SCHEDULER_PATIENCE, min_lr=LR_SCHEDULER_MIN_LR
    )

print(f"\nBasic Model Parameters: {sum(p.numel() for p in model_basic.parameters() if p.requires_grad):,}")
print(f"Attention Model Parameters: {sum(p.numel() for p in model_attention.parameters() if p.requires_grad):,}")

# Test phrases for monitoring
test_phrases = [
    "Hello",
    "Good morning",
    "I love you",
    "Thank you",
    "Where are you?"
]

print("\n" + "="*80)
print("STARTING TRAINING")
print("="*80)


INITIALIZING MODELS

Basic Model Parameters: 4,013,448
Attention Model Parameters: 4,734,856

STARTING TRAINING


In [11]:

# Training loop example:
history_attention = {'train_loss': [], 'train_acc': [], 'val_loss': [], 'val_acc': []}

for epoch in range(N_EPOCHS):
    train_loss, train_acc = train_epoch(model_attention, train_loader, optimizer_attn, criterion, device)
    val_loss, val_acc = evaluate(model_attention, val_loader, criterion, device)
    
    if USE_LR_SCHEDULER:
        scheduler_attn.step(val_loss)
    
    history_attention['train_loss'].append(train_loss)
    history_attention['train_acc'].append(train_acc)
    history_attention['val_loss'].append(val_loss)
    history_attention['val_acc'].append(val_acc)
    
    print(f"Epoch {epoch+1}/{N_EPOCHS}")
    print(f"  Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f} | PPL: {calculate_perplexity(train_loss):.2f}")
    print(f"  Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f} | PPL: {calculate_perplexity(val_loss):.2f}")
    
    # Quick test every 5 epochs
    if (epoch + 1) % 5 == 0:
        quick_test(model_attention, test_phrases)




Epoch 1/25
  Train Loss: 4.7146 | Train Acc: 0.2816 | PPL: 111.56
  Val Loss: 3.8072 | Val Acc: 0.3803 | PPL: 45.03
Epoch 2/25
  Train Loss: 3.4109 | Train Acc: 0.4197 | PPL: 30.29
  Val Loss: 3.0641 | Val Acc: 0.4651 | PPL: 21.42
Epoch 3/25
  Train Loss: 2.7846 | Train Acc: 0.4878 | PPL: 16.19
  Val Loss: 2.6429 | Val Acc: 0.5145 | PPL: 14.05
Epoch 4/25
  Train Loss: 2.3645 | Train Acc: 0.5373 | PPL: 10.64
  Val Loss: 2.3581 | Val Acc: 0.5515 | PPL: 10.57
Epoch 5/25
  Train Loss: 2.0475 | Train Acc: 0.5791 | PPL: 7.75
  Val Loss: 2.1652 | Val Acc: 0.5795 | PPL: 8.72

QUICK TRANSLATION TEST

EN: Hello
Greedy: se <unk>
Beam:   se <unk>

EN: Good morning
Greedy: buenas <unk>
Beam:   buenas <unk>

EN: I love you
Greedy: te quiero que te <unk>
Beam:   te quiero que <unk>

EN: Thank you
Greedy: gracias que te <unk>
Beam:   <unk> algo?

EN: Where are you?
Greedy: ¿dónde están <unk>
Beam:   ¿dónde están <unk>
Epoch 6/25
  Train Loss: 1.8035 | Train Acc: 0.6149 | PPL: 6.07
  Val Loss: 2.0257 |

: 