In [1]:
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, T5ForConditionalGeneration
from datasets import load_dataset
import torch.nn as nn
from tqdm import tqdm

In [2]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

Using device: cuda


In [3]:
# Load the tokenizer and the seq2seq model from the same "google/flan-t5-base" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base")
# Move the model to the selected device. As the last expression of the cell,
# the returned module is echoed as the cell output.
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=768, out_features=2048, bias=False)
              (wi_1): Linear(in_features=768, out_features=2048, bias=False)
              (wo):

## Load the dataset

In [4]:
!pip install --upgrade datasets fsspec

Collecting fsspec
  Using cached fsspec-2025.5.1-py3-none-any.whl.metadata (11 kB)


In [5]:
# Load the 'en-es' configuration of OPUS-100 and print one training record to show
# its layout: {'translation': {'en': ..., 'es': ...}}.
dataset = load_dataset('Helsinki-NLP/opus-100', 'en-es')
print(dataset['train'][0])

README.md:   0%|          | 0.00/65.4k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/237k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/99.6M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/238k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/1000000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

{'translation': {'en': "It was the asbestos in here, that's what did it!", 'es': 'Fueron los asbestos aquí. ¡Eso es lo que ocurrió!'}}


## Data Preprocessing

In [6]:
def preprocess_data(examples, max_length=128):
    """Tokenize a batch of English/Spanish pairs for seq2seq training.

    Args:
        examples: A batch whose ``'translation'`` entry is a list of dicts
            holding an ``'en'`` and an ``'es'`` string each.
        max_length: Length to which every source and target sequence is
            truncated and padded.

    Returns:
        The tokenizer output for the prompted English inputs, with the token
        ids of the Spanish targets stored under ``'labels'``. Padding positions
        in ``'labels'`` still hold the tokenizer's pad id; they are not masked
        here.
    """
    pairs = examples['translation']
    inputs = [f'Translate from English to Spanish: {pair["en"]}' for pair in pairs]
    targets = [pair['es'] for pair in pairs]

    # `text_target` tokenizes the targets in the same call and stores their ids
    # under "labels"; it replaces the deprecated `as_target_tokenizer()` context
    # manager used previously.
    model_inputs = tokenizer(
        inputs,
        text_target=targets,
        max_length=max_length,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
    )

    return model_inputs

# Apply preprocessing to the dataset
# Only the first 30,000 training pairs are tokenized; the test split is tokenized in full.
train_dataset = dataset['train'].select(range(30000)).map(preprocess_data, batched=True)
test_dataset = dataset['test'].map(preprocess_data, batched=True)

# Show one processed record: the original 'translation' column is kept next to the new fields.
print("Sample preprocessed data:")
print(train_dataset[0])

Map:   0%|          | 0/30000 [00:00<?, ? examples/s]



Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Sample preprocessed data:
{'translation': {'en': "It was the asbestos in here, that's what did it!", 'es': 'Fueron los asbestos aquí. ¡Eso es lo que ocurrió!'}, 'input_ids': [30355, 15, 45, 1566, 12, 5093, 10, 94, 47, 8, 23778, 16, 270, 6, 24, 31, 7, 125, 410, 34, 55, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'labe

## PyTorch data loader

In [7]:
class TranslationDataset(torch.utils.data.Dataset):
    """Map-style dataset that exposes tokenized records as ``torch.long`` tensors."""

    # Fields copied out of each underlying record; any other field is ignored.
    _TENSOR_FIELDS = ('input_ids', 'attention_mask', 'labels')

    def __init__(self, dataset):
        """Wrap ``dataset``, an indexable collection of tokenized records."""
        self.dataset = dataset

    def __len__(self):
        """Return the number of records in the wrapped dataset."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return the record at ``idx`` as a dict of ``torch.long`` tensors."""
        record = self.dataset[idx]
        return {
            field: torch.tensor(record[field], dtype=torch.long)
            for field in self._TENSOR_FIELDS
        }

# Wrap both tokenized splits so that indexing yields dicts of tensors.
train_torch_dataset, test_torch_dataset = (
    TranslationDataset(split) for split in (train_dataset, test_dataset)
)

# Batch both splits in groups of 8; only the training split is shuffled.
train_dataloader = DataLoader(train_torch_dataset, shuffle=True, batch_size=8)
test_dataloader = DataLoader(test_torch_dataset, shuffle=False, batch_size=8)

## Freeze the Model

In [8]:
# List every named parameter with its shape, then report the parameter counts.
print("Model structure:")
for name, param in model.named_parameters():
    print(f"{name}: {param.shape}")
print(f"\nTotal parameters: {sum(map(torch.numel, model.parameters())):,}")
print(f"Trainable parameters before freezing: {sum(p.numel() for p in filter(lambda q: q.requires_grad, model.parameters())):,}")

Model structure:
shared.weight: torch.Size([32128, 768])
encoder.block.0.layer.0.SelfAttention.q.weight: torch.Size([768, 768])
encoder.block.0.layer.0.SelfAttention.k.weight: torch.Size([768, 768])
encoder.block.0.layer.0.SelfAttention.v.weight: torch.Size([768, 768])
encoder.block.0.layer.0.SelfAttention.o.weight: torch.Size([768, 768])
encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight: torch.Size([32, 12])
encoder.block.0.layer.0.layer_norm.weight: torch.Size([768])
encoder.block.0.layer.1.DenseReluDense.wi_0.weight: torch.Size([2048, 768])
encoder.block.0.layer.1.DenseReluDense.wi_1.weight: torch.Size([2048, 768])
encoder.block.0.layer.1.DenseReluDense.wo.weight: torch.Size([768, 2048])
encoder.block.0.layer.1.layer_norm.weight: torch.Size([768])
encoder.block.1.layer.0.SelfAttention.q.weight: torch.Size([768, 768])
encoder.block.1.layer.0.SelfAttention.k.weight: torch.Size([768, 768])
encoder.block.1.layer.0.SelfAttention.v.weight: torch.Size([768, 768])
encoder

In [9]:
# Freeze the shared embedding and the encoder/decoder stacks. Parameters whose
# name contains 'lm_head' or 'final_layer_norm' are skipped and stay trainable.
for name, param in model.named_parameters():
    if 'lm_head' in name or 'final_layer_norm' in name:
        continue
    if 'shared' in name or 'encoder' in name or 'decoder' in name:
        param.requires_grad_(False)

In [10]:
# Count only the parameters that still require gradients after the freezing step.
print(f"Trainable parameters after freezing: {sum(p.numel() for p in filter(lambda q: q.requires_grad, model.parameters())):,}")

Trainable parameters after freezing: 24,675,840


### Key Tips for Transfer Learning
**Freezing the Pretrained Model Layers**: When doing transfer learning, it's often helpful to freeze the main language model layers (like the encoder and decoder). This keeps the model from forgetting what it already knows and helps prevent overfitting. Instead of re-learning everything, the model can focus on learning just the new task.

**Feeding Logits to the Loss Function**: Hugging Face models output raw values (logits) — not probabilities — so the loss function must be told to expect them. In Keras/TensorFlow this means setting `from_logits=True` on the loss; without it, your loss values may be wrong. In PyTorch, as used in this notebook, no flag is needed: `nn.CrossEntropyLoss` expects logits, and the model computes that loss itself when `labels` are passed, returning it as `outputs.loss`.

## Model Training

In [12]:
# Prepare optimizer
# Hand AdamW only the parameters that still require gradients, so the frozen
# weights are not registered with the optimizer at all.
optimizer = torch.optim.AdamW(
    [param for param in model.parameters() if param.requires_grad],
    lr=5e-5,
)

# Training function
def train_epoch(model, dataloader, optimizer, device):
    """Run one optimisation pass over ``dataloader`` and return the mean batch loss.

    Args:
        model: Model called with ``input_ids``, ``attention_mask`` and ``labels``;
            its output must expose ``.loss``.
        dataloader: Iterable of batches holding 'input_ids', 'attention_mask'
            and 'labels' tensors.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.

    Returns:
        The sum of the per-batch losses divided by ``len(dataloader)``.
    """
    model.train()
    running_loss = 0
    batches = tqdm(dataloader, desc="Training")

    for batch in batches:
        # Move the three batch tensors to the target device.
        input_ids, attention_mask, labels = (
            batch[key].to(device) for key in ('input_ids', 'attention_mask', 'labels')
        )

        # Padding positions become -100 so they are ignored by the loss.
        labels.masked_fill_(labels == tokenizer.pad_token_id, -100)

        # Forward pass: the model returns the loss when labels are supplied.
        loss = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels).loss

        # Backward pass and parameter update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        running_loss += batch_loss
        batches.set_postfix({'loss': batch_loss})

    return running_loss / len(dataloader)

# Evaluation function
def evaluate(model, dataloader, device):
    """Compute the mean batch loss over ``dataloader`` without updating the model.

    Args:
        model: Model called with ``input_ids``, ``attention_mask`` and ``labels``;
            its output must expose ``.loss``.
        dataloader: Iterable of batches holding 'input_ids', 'attention_mask'
            and 'labels' tensors.
        device: Device the batch tensors are moved to.

    Returns:
        The sum of the per-batch losses divided by ``len(dataloader)``.
    """
    model.eval()
    running_loss = 0

    with torch.no_grad():
        batches = tqdm(dataloader, desc="Evaluating")
        for batch in batches:
            # Move the three batch tensors to the target device.
            input_ids, attention_mask, labels = (
                batch[key].to(device) for key in ('input_ids', 'attention_mask', 'labels')
            )

            # Padding positions become -100, as in training.
            labels.masked_fill_(labels == tokenizer.pad_token_id, -100)

            batch_loss = model(
                input_ids=input_ids, attention_mask=attention_mask, labels=labels
            ).loss.item()
            running_loss += batch_loss
            batches.set_postfix({'loss': batch_loss})

    return running_loss / len(dataloader)

In [13]:
# Train the model
# Each epoch runs one optimisation pass over train_dataloader, then scores the model.
epochs = 3
for epoch in range(epochs):
    print(f"\nEpoch {epoch + 1}/{epochs}")

    # Training
    train_loss = train_epoch(model, train_dataloader, optimizer, device)
    print(f"Training Loss: {train_loss:.4f}")

    # Evaluation
    # NOTE(review): the value printed as "Validation Loss" is computed on
    # test_dataloader (the test split); the dataset's validation split is not used here.
    eval_loss = evaluate(model, test_dataloader, device)
    print(f"Validation Loss: {eval_loss:.4f}")


Epoch 1/3


Training:   0%|          | 0/3750 [00:00<?, ?it/s]Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
Training: 100%|██████████| 3750/3750 [18:58<00:00,  3.29it/s, loss=2.15]


Training Loss: 1.7241


Evaluating: 100%|██████████| 250/250 [00:43<00:00,  5.80it/s, loss=1.58]


Validation Loss: 1.3596

Epoch 2/3


Training: 100%|██████████| 3750/3750 [19:01<00:00,  3.29it/s, loss=1.58]


Training Loss: 1.6907


Evaluating: 100%|██████████| 250/250 [00:43<00:00,  5.80it/s, loss=1.58]


Validation Loss: 1.3530

Epoch 3/3


Training: 100%|██████████| 3750/3750 [19:00<00:00,  3.29it/s, loss=1.37]


Training Loss: 1.6776


Evaluating: 100%|██████████| 250/250 [00:43<00:00,  5.79it/s, loss=1.57]

Validation Loss: 1.3490





## Inference translation

In [14]:
def translate_text(model, tokenizer, text, device, max_length=128):
    """Translate one English string to Spanish with beam search.

    Args:
        model: Model providing ``eval()`` and ``generate()``.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        text: English text to translate.
        device: Device the encoded inputs are moved to.
        max_length: Limit applied both to the encoded prompt and to generation.

    Returns:
        The decoded first generated sequence, with special tokens removed.
    """
    model.eval()

    # Same instruction prefix as the one used when building the training inputs.
    prompt = f"Translate from English to Spanish: {text}"
    encoded = tokenizer(
        prompt,
        return_tensors="pt",
        max_length=max_length,
        truncation=True,
        padding=True,
    ).to(device)

    # Deterministic decoding: 4 beams, no sampling.
    generation_kwargs = dict(
        attention_mask=encoded.attention_mask,
        max_length=max_length,
        num_beams=4,
        early_stopping=True,
        do_sample=False,
    )
    with torch.no_grad():
        generated = model.generate(encoded.input_ids, **generation_kwargs)

    return tokenizer.decode(generated[0], skip_special_tokens=True)

In [15]:
# Translate a handful of hand-written sentences as a quick sanity check.
print("\n" + "="*50, "TRANSLATION EXAMPLES", "="*50, sep="\n")

# Short everyday English sentences.
test_examples = [
    "Hello, how are you?",
    "I love learning new languages.",
    "The weather is beautiful today.",
    "Can you help me with this problem?",
    "Thank you for your assistance.",
]

for text in test_examples:
    translation = translate_text(model, tokenizer, text, device)
    print(f"English: {text}", f"Spanish: {translation}", "-" * 40, sep="\n")

# Test on actual dataset examples
# For the first example of each of the first three test batches, print the English
# source, the reference translation, and the model's translation.
print("\nTesting on dataset examples:")
print("="*50)

for i, batch in enumerate(test_dataloader):
    if i >= 3:  # Only show first 3 batches
        break

    # Get the first example from the batch
    input_ids = batch['input_ids'][0]
    labels = batch['labels'][0]

    # Decode original input and reference
    # NOTE(review): batches from test_dataloader carry the labels produced by
    # preprocess_data, which does not write -100, so this filter likely removes
    # nothing; padding is presumably dropped by skip_special_tokens — confirm.
    original_input = tokenizer.decode(input_ids, skip_special_tokens=True)
    reference = tokenizer.decode(labels[labels != -100], skip_special_tokens=True)

    # Extract just the English text (remove the prompt)
    english_text = original_input.replace("Translate from English to Spanish: ", "")

    # Generate translation
    # translate_text adds the prompt back and tokenizes the text again.
    translation = translate_text(model, tokenizer, english_text, device)

    print(f"English: {english_text}")
    print(f"Reference: {reference}")
    print(f"Generated: {translation}")
    print("-" * 40)


TRANSLATION EXAMPLES
English: Hello, how are you?
Spanish: Hola, cómo estás?
----------------------------------------
English: I love learning new languages.
Spanish: Lo encanta aprender nuevas idiomas.
----------------------------------------
English: The weather is beautiful today.
Spanish: El trabajo es hermoso hoy.
----------------------------------------
English: Can you help me with this problem?
Spanish: Puedes ayudarme a esta problema?
----------------------------------------
English: Thank you for your assistance.
Spanish: Gracias por su asistencia.
----------------------------------------

Testing on dataset examples:
English: If your country produced ODS for this purpose, please enter the amount so produced in column 6 on Data Form 3.”
Reference: Si su pas produjo SAO para estos usos, srvase anotar en la columna 6 del formulario de datos 3 la cantidad correspondiente”.
Generated: Si su pas ha producido ODS para esta propósito, entrará la cantidad sobre la columna 6 en la fo

## Evaluation with ROUGE and BLEU

In [16]:
!pip install rouge-score nltk

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=3d5bba7ad9736dc931d6eeb78cd4daa09bf5769f04d26a8bdaeffec896b3e5c4
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [22]:
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import nltk
# Download the Punkt tokenizer resources (presumably what nltk.word_tokenize
# loads when computing BLEU below — confirm).
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

## Calculating ROUGE and BLEU Scores

In [23]:
def translate(batch):
    """Generate a translation for the first example of a collated batch.

    Args:
        batch: Mapping with 'input_ids' and 'attention_mask' tensors whose first
            dimension indexes the examples.

    Returns:
        The decoded first generated sequence, with special tokens removed.
    """
    model.eval()

    # Slice with 0:1 (rather than index with 0) to keep the leading batch dimension.
    first_ids, first_mask = (
        batch[key][0:1].to(device) for key in ("input_ids", "attention_mask")
    )

    with torch.no_grad():
        generated = model.generate(
            first_ids,
            attention_mask=first_mask,
            max_length=128,
            num_beams=4,
            early_stopping=True,
        )

    return tokenizer.decode(generated[0], skip_special_tokens=True)

# Function to calculate ROUGE and BLEU scores
def calculate_scores(reference, hypothesis, language="spanish"):
    """Score one hypothesis against one reference with ROUGE and BLEU.

    Args:
        reference: Reference translation text.
        hypothesis: Generated translation text.
        language: Punkt model name passed to ``nltk.word_tokenize`` when
            tokenizing for BLEU. Defaults to Spanish because the texts scored
            in this notebook are Spanish translations.

    Returns:
        A ``(rouge_scores, bleu_score)`` tuple. ``rouge_scores`` is the result
        of ``RougeScorer.score`` for 'rouge1', 'rouge2' and 'rougeL';
        ``bleu_score`` is the smoothed sentence-level BLEU.
    """
    # Initialize scorers
    # NOTE(review): use_stemmer=True presumably applies an English-oriented
    # stemmer; confirm that this is appropriate for Spanish text.
    rouge = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    bleu_smoothing = SmoothingFunction().method4

    # Calculate ROUGE scores
    rouge_scores = rouge.score(reference, hypothesis)

    # Calculate BLEU score. Tokenize with the Punkt model of the target
    # language instead of NLTK's English default.
    reference_tokens = [nltk.word_tokenize(reference, language=language)]
    hypothesis_tokens = nltk.word_tokenize(hypothesis, language=language)
    bleu_score = sentence_bleu(reference_tokens, hypothesis_tokens, smoothing_function=bleu_smoothing)

    return rouge_scores, bleu_score

In [24]:
print("\n" + "="*50, "EVALUATION WITH ROUGE AND BLEU SCORES", "="*50, sep="\n")

# Score the first example of each of the first five test batches.
for i, batch in enumerate(test_dataloader):
    if i >= 5:
        break

    # Model output for the first example of the batch.
    translated_text = translate(batch)

    # Reference translation, decoded from the label ids of that same example.
    labels = batch["labels"][0]
    reference_text = tokenizer.decode(labels[labels != -100], skip_special_tokens=True)

    # English source, recovered by decoding the input ids and removing the prompt.
    input_text = tokenizer.decode(batch["input_ids"][0], skip_special_tokens=True)
    english_text = input_text.replace("Translate from English to Spanish: ", "")

    rouge_scores, bleu_score = calculate_scores(reference_text, translated_text)

    print(
        f"Example {i+1}:",
        f"English: {english_text}",
        f"Reference: {reference_text}",
        f"Translation: {translated_text}",
        f"ROUGE-1: {rouge_scores['rouge1'].fmeasure:.3f}",
        f"ROUGE-2: {rouge_scores['rouge2'].fmeasure:.3f}",
        f"ROUGE-L: {rouge_scores['rougeL'].fmeasure:.3f}",
        f"BLEU Score: {bleu_score:.3f}",
        "-" * 50,
        sep="\n",
    )


EVALUATION WITH ROUGE AND BLEU SCORES
Example 1:
English: If your country produced ODS for this purpose, please enter the amount so produced in column 6 on Data Form 3.”
Reference: Si su pas produjo SAO para estos usos, srvase anotar en la columna 6 del formulario de datos 3 la cantidad correspondiente”.
Translation: Si su pas ha producido ODS para esta propósito, entrará la cantidad sobre la columna 6 en la formularia de datos 3".
ROUGE-1: 0.578
ROUGE-2: 0.372
ROUGE-L: 0.444
BLEU Score: 0.140
--------------------------------------------------
Example 2:
English: So you would never question.
Reference: Para que nunca cuestionases nada.
Translation: As que nunca habra preguntado.
ROUGE-1: 0.400
ROUGE-2: 0.250
ROUGE-L: 0.400
BLEU Score: 0.108
--------------------------------------------------
Example 3:
English: The results of the project are clear.
Reference: Los resultados del proyecto son evidentes.
Translation: Los resultados del proyecto son claros.
ROUGE-1: 0.833
ROUGE-2: 0.800
RO