In [1]:
# Install required packages
%pip install transformers[sentencepiece]
%pip install datasets
%pip install evaluate
%pip install nltk
%pip install tqdm
%pip install rouge_score

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=7685dbf6ba835d412447d04b1955904a5142726f0fb445c6f2177fb55c853e9c
  Stored in directory: /root/.cache/pip/wheels/85/9d/af/01feefbe7d55ef5468796f0c68225b6788e85d9d0a281e7a70
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [2]:
# Import necessary libraries
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import requests
import nltk
nltk.download('punkt')
import evaluate
import re
import pandas as pd
from tqdm.notebook import tqdm

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [3]:
# Load in the English version
url = "https://www.gutenberg.org/cache/epub/46/pg46.txt"
response = requests.get(url)
text = response.text

# Define markers of the start and end of the story
start_marker = "STAVE I:  MARLEY'S GHOST"
end_marker = "*** END OF THE PROJECT GUTENBERG EBOOK A CHRISTMAS CAROL IN PROSE; BEING A GHOST STORY OF CHRISTMAS ***"

# Extract the story text
start_idx = text.find(start_marker)
end_idx = text.find(end_marker)

if start_idx != -1 and end_idx != -1:
    text = text[start_idx + len(start_marker):end_idx].strip()

# Load the Spanish references
spanish_url = "https://raw.githubusercontent.com/maddawg9838/dataset/refs/heads/main/The%20Christmas%20Carol%20Spanish.txt"
response = requests.get(spanish_url)
spanish_reference_text = response.text

In [4]:
# Split by paragraphs (paragraph = text separated by blank lines)
def split_paragraphs(text):
    # Split by two or more newlines
    paragraphs = re.split(r'\n\s*\n', text.strip())
    # Clean whitespace
    paragraphs = [p.replace("\n", " ").strip() for p in paragraphs if len(p.strip()) > 0]
    return paragraphs

english_paragraphs = split_paragraphs(text)
spanish_paragraphs = split_paragraphs(spanish_reference_text)

# Make sure we only translate as many paragraphs as we have references
num_paragraphs = min(len(english_paragraphs), len(spanish_paragraphs))
english_paragraphs = english_paragraphs[:num_paragraphs]
spanish_paragraphs = spanish_paragraphs[:num_paragraphs]

print(f"Number of paragraphs to translate: {num_paragraphs}")

Number of paragraphs to translate: 46


In [5]:
# Load translation model
model_name = "Helsinki-NLP/opus-mt-en-es"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

source.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/826k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]



pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/312M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

In [6]:
def translate_text(text, max_length=512, num_beams=5, length_penalty=1.0):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    outputs = model.generate(
        **inputs,
        max_length=max_length,
        num_beams=num_beams,
        length_penalty=length_penalty,
        early_stopping=True
    )
    translated = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return translated[0]

In [7]:
# Hyperparameter tuning settings
num_beams_list = [3, 5, 7]
length_penalty_list = [0.8, 1.0, 1.2]

# Metrics
bleu_metric = evaluate.load("bleu")
rouge_metric = evaluate.load("rouge")

results = []
best_bleu = -1
best_translation = None
best_params = None

# Start hyperparameter tuning
for num_beams in num_beams_list:
    for length_penalty in length_penalty_list:
        print(f"\nTranslating with num_beams={num_beams}, length_penalty={length_penalty}...")

        translated_paragraphs = []
        for paragraph in tqdm(english_paragraphs, desc="Translating paragraphs", unit="paragraph"):
            translated_paragraph = translate_text(
                paragraph,
                max_length=512,
                num_beams=num_beams,
                length_penalty=length_penalty
            )
            translated_paragraphs.append(translated_paragraph)

        # Compute BLEU (paragraph-wise)
        predictions_eval = translated_paragraphs
        references_eval = [[p] for p in spanish_paragraphs]  # list of lists
        bleu_score = bleu_metric.compute(
            predictions=predictions_eval,
            references=references_eval
        )["bleu"]

        # Compute ROUGE (flatten all paragraphs)
        predictions_flat = [" ".join(translated_paragraphs)]
        references_flat = [" ".join(spanish_paragraphs)]
        rouge_scores = rouge_metric.compute(
            predictions=predictions_flat,
            references=references_flat
        )

        # Store results
        results.append({
            "num_beams": num_beams,
            "length_penalty": length_penalty,
            "BLEU": bleu_score,
            "ROUGE-1": rouge_scores['rouge1'],
            "ROUGE-2": rouge_scores['rouge2'],
            "ROUGE-L": rouge_scores['rougeL']
        })

        # Track best
        if bleu_score > best_bleu:
            best_bleu = bleu_score
            best_translation = " ".join(translated_paragraphs)
            best_params = (num_beams, length_penalty)


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]


Translating with num_beams=3, length_penalty=0.8...


Translating paragraphs:   0%|          | 0/46 [00:00<?, ?paragraph/s]


Translating with num_beams=3, length_penalty=1.0...


Translating paragraphs:   0%|          | 0/46 [00:00<?, ?paragraph/s]


Translating with num_beams=3, length_penalty=1.2...


Translating paragraphs:   0%|          | 0/46 [00:00<?, ?paragraph/s]


Translating with num_beams=5, length_penalty=0.8...


Translating paragraphs:   0%|          | 0/46 [00:00<?, ?paragraph/s]


Translating with num_beams=5, length_penalty=1.0...


Translating paragraphs:   0%|          | 0/46 [00:00<?, ?paragraph/s]


Translating with num_beams=5, length_penalty=1.2...


Translating paragraphs:   0%|          | 0/46 [00:00<?, ?paragraph/s]


Translating with num_beams=7, length_penalty=0.8...


Translating paragraphs:   0%|          | 0/46 [00:00<?, ?paragraph/s]


Translating with num_beams=7, length_penalty=1.0...


Translating paragraphs:   0%|          | 0/46 [00:00<?, ?paragraph/s]


Translating with num_beams=7, length_penalty=1.2...


Translating paragraphs:   0%|          | 0/46 [00:00<?, ?paragraph/s]

In [8]:
# Show results
df_results = pd.DataFrame(results).sort_values(by="BLEU", ascending=False)
print(df_results)
print(f"\nBest BLEU: {best_bleu} with num_beams={best_params[0]}, length_penalty={best_params[1]}")


   num_beams  length_penalty      BLEU   ROUGE-1   ROUGE-2   ROUGE-L
4          5             1.0  0.092672  0.572632  0.204072  0.323743
7          7             1.0  0.092116  0.572630  0.204206  0.320878
5          5             1.2  0.091352  0.571161  0.203414  0.322973
8          7             1.2  0.090702  0.573028  0.203081  0.320579
6          7             0.8  0.090655  0.571963  0.203699  0.320150
3          5             0.8  0.089356  0.571429  0.202300  0.321839
1          3             1.0  0.086751  0.572831  0.201637  0.319383
0          3             0.8  0.086389  0.572766  0.200843  0.318671
2          3             1.2  0.085380  0.572564  0.201076  0.319233

Best BLEU: 0.09267163598788083 with num_beams=5, length_penalty=1.0
