In [1]:
import torch
from transformers import MarianMTModel, MarianTokenizer, AutoModelForSeq2SeqLM, AutoTokenizer
from datasets import load_dataset
import evaluate

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dataset_name = "Iker/Document-Translation-en-es"

summary_model_name = "t5-small"
translation_model_name = "Helsinki-NLP/opus-mt-en-es"
revision = "main"

tokenizer_model_name = "distilbert-base-uncased"
model_name = "Helsinki-NLP/opus-mt-en-es"

device = 0 if torch.cuda.is_available() else -1

## Base summariser

In [3]:
model = AutoModelForSeq2SeqLM.from_pretrained(summary_model_name)
tokenizer = AutoTokenizer.from_pretrained(summary_model_name)
text = """
The Hugging Face Transformers library provides state-of-the-art general-purpose architectures
for natural language understanding (NLU) and natural language generation (NLG). These architectures
include BERT, GPT, GPT-2, BART, and T5, which can be applied to text classification, summarization, translation, and more.
"""

inputs = tokenizer.encode("summarize: " + text, return_tensors="pt", max_length=1024, truncation=True)
summary_ids = model.generate(inputs, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)

summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

print(summary)

the Hugging Face Transformers library provides state-of-the-art architectures for natural language understanding (NLU) and natural language generation (NLG). these architectures include BERT, GPT, GPT-2, and BART.


In [1]:
import sys
sys.path.append("../")

from fine_tuning.fine_tuning_utils import load_and_train_test_split_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
train, test = load_and_train_test_split_dataset("Iker/Document-Translation-en-es")

Map: 100%|██████████| 10533/10533 [00:00<00:00, 35686.36 examples/s]


In [3]:
train[0]

{'es': 'Mañana tendrá lugar en ses Voles de Palma la Diada per la Llengua, que organiza la Obra Cultural Balear (OCB), que empezará a las 17.30 horas y que se prolongará hasta la noche con grupos de castellers y música en catalán. El presidente de la Obra Cultural Balear, Jaume Mateu, presentó ayer en su sede de Can Alcover, acompañado de la vicepresidenta Marisa Cerdó, los actos que formarán parte de está jornada que se celebrará bajo el lema \'De cada día, una Diada\', que empezará con el grupo Cucorba y que será presentada por la actriz Paula Company. A las 18.30 llegará el turno de los grupos de castellers Al·lots de Llevant i Castellers de Mallorca, a las 19.15, tendrá lugar el acto central con el discurso de Mateu, que irá precedido por parlamentos de los miembros de Tots Plegats. Después actuarán Maria Rosselló y los Botifanfarrons y sobre las 21.30 concluirá la fiesta Música Nostra. "Conviene que el sábado a ses Voltes acudamos todos los que queremos Mallorca, que estemos para 

In [11]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

model = T5ForConditionalGeneration.from_pretrained("SEBIS/legal_t5_small_trans_en_es_small_finetuned")
tokenizer = T5Tokenizer.from_pretrained("SEBIS/legal_t5_small_trans_en_es_small_finetuned")

# Input text to translate
text = "The cat was hungry."

# Use the correct task prefix for translation
input_text = "translate English to German: " + text

# Tokenize the input with the correct prefix
inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)

# Generate the translation
translated_ids = model.generate(inputs["input_ids"], max_length=50, num_beams=4, early_stopping=True)

# Decode the generated tokens to get the translation
translation = tokenizer.decode(translated_ids[0], skip_special_tokens=True)

# Print the translation
print(f"Translation: {translation}")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Translation: El gato padecía hambre.


## Baseline translator

In [11]:
tokenizer = MarianTokenizer.from_pretrained(translation_model_name)
model = MarianMTModel.from_pretrained(translation_model_name)
english_text = "What color is the hrass? Answer in english"
input_ids = tokenizer.encode(english_text, return_tensors="pt")
translated_ids = model.generate(input_ids)

translated_text = tokenizer.decode(translated_ids[0], skip_special_tokens=True)

print("Translated Text (Spanish):", translated_text)

Translated Text (Spanish): ¿De qué color es la hrass? Respuesta en inglés


## Load dataset

In [12]:
dataset = load_dataset(dataset_name)
split_dataset = dataset["train"].train_test_split(test_size=0.1, seed=42)
train_dataset = split_dataset["train"].shuffle(seed=42).select(range(100))
test_dataset = split_dataset["test"].shuffle(seed=42).select(range(100))

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


## x/y split

In [13]:
train_x_data = train_dataset["en"]
train_y_data = train_dataset["es"]
test_x_data = test_dataset["en"]
test_y_data = test_dataset["es"]

In [16]:
len(train_x_data[0])

3100

# Load Pre-Trained Models

In [7]:
model = MarianMTModel.from_pretrained(model_name)
bleu = evaluate.load("sacrebleu")
tokenizer = AutoTokenizer.from_pretrained(tokenizer_model_name)

## Load tokenizer

In [8]:
# I used the original text (english) to tokenize the data, seemed logical to me, but idk
def tokenize_function(examples):
    return tokenizer(examples["en"], padding="max_length", truncation=True)

train_x_data = train_dataset.map(tokenize_function, batched=True)
test_x_data = test_dataset.map(tokenize_function, batched=True)

Map: 100%|██████████| 100/100 [00:00<00:00, 458.86 examples/s]


# Evaluate Model

In [9]:
# Okay this maybe works, maybe not, who knows, help me?
input_ids = torch.tensor(test_x_data["input_ids"]).to(model.device)
print('translating the tokens..')
translated_tokens = model.generate(input_ids=input_ids)
predicted_translations = [tokenizer.decode(t, skip_special_tokens=True) for t in translated_tokens]

references = [[ref] for ref in test_y_data] #The bleu score needs a list of lists
bleu_score = bleu.compute(predictions=predicted_translations, references=references)

print(f"BLEU score: {bleu_score['score']}")

getting input ids..
translating the tokens..


KeyboardInterrupt: 