In [1]:
!pip install -q transformers datasets torch scipy scikit-learn accelerate evaluate nltk rouge_score sentencepiece sacrebleu

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone


In [2]:
from datasets import load_dataset
from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    Trainer,
    Seq2SeqTrainingArguments
)
import torch
from torch.utils.data import Dataset
import random

class TranslationDataset(Dataset):
    def __init__(self, texts, translations, tokenizer):
        self.inputs = tokenizer(
            ["translate English to French: " + text for text in texts],
            max_length=512,
            truncation=True,
            padding='max_length',
            return_tensors="pt"
        )
        self.targets = tokenizer(
            translations,
            max_length=512,
            truncation=True,
            padding='max_length',
            return_tensors="pt"
        )

    def __len__(self):
        return len(self.targets["input_ids"])

    def __getitem__(self, idx):
        return {
            "input_ids": self.inputs["input_ids"][idx],
            "attention_mask": self.inputs["attention_mask"][idx],
            "labels": self.targets["input_ids"][idx]
        }

def train_translator():
    # Load model and tokenizer
    tokenizer = T5Tokenizer.from_pretrained("t5-small")
    model = T5ForConditionalGeneration.from_pretrained("t5-small").cuda()

    # Load dataset
    dataset = load_dataset("opus_books", "en-fr", split="train[:10%]", trust_remote_code=True)


    # Take share of the loaded data if needed
    total_examples = len(dataset)
    subsample_size = total_examples

    # Randomly sample indices
    all_indices = list(range(total_examples))
    selected_indices = random.sample(all_indices, subsample_size)

    # Get subsampled data
    texts = [dataset[i]["translation"]["en"] for i in selected_indices]
    translations = [dataset[i]["translation"]["fr"] for i in selected_indices]

    # Create dataset
    train_size = int(len(texts) * 0.8)
    train_dataset = TranslationDataset(texts[:train_size], translations[:train_size], tokenizer)
    eval_dataset = TranslationDataset(texts[train_size:], translations[train_size:], tokenizer)

    # Training configuration
    training_args = Seq2SeqTrainingArguments(
        output_dir="./results",
        evaluation_strategy="epoch",
        report_to="tensorboard",
        learning_rate=1e-4,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        weight_decay=0.01,
        save_total_limit=2,
        num_train_epochs=1,
        predict_with_generate=True,
        logging_dir="./logs",
        logging_steps=5,
        push_to_hub=False,
        save_strategy="epoch"
    )

    # Training
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset
    )

    trainer.train()
    trainer.save_model("./translator")
    return model, tokenizer

def translate_text(text, model, tokenizer):
    inputs = tokenizer("translate English to French: " + text, return_tensors="pt", max_length=512, truncation=True)
    # Move inputs to CUDA
    inputs = {k: v.cuda() for k, v in inputs.items()}

    translation_ids = model.generate(
        inputs["input_ids"],
        max_length=512,
        min_length=10,
        num_beams=4,
        length_penalty=0.6,
        early_stopping=True
    )
    return tokenizer.decode(translation_ids[0], skip_special_tokens=True)

def main():
    # Train model
    model, tokenizer = train_translator()

    # Test model
    test_text = """
    The artificial intelligence has revolutionized many aspects of our daily lives,
    bringing innovations in various fields such as medicine and education.
    """

    translation = translate_text(test_text, model, tokenizer)
    print("\nOriginal text:", test_text)
    print("\nFrench translation:", translation)

In [3]:
main()

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/28.1k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/127085 [00:00<?, ? examples/s]

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,0.1898,0.1671



Original text: 
    The artificial intelligence has revolutionized many aspects of our daily lives,
    bringing innovations in various fields such as medicine and education.
    

French translation: L’intelligence artificielle a révolutionné de nombreux aspects de notre vie quotidienne, apportant des innovations dans divers domaines comme la médecine et l’éducation.
