In [1]:
# Transformers installation
! pip install transformers datasets
# To install from source instead of the last release, comment the command above and uncomment the following one.
# ! pip install git+https://github.com/huggingface/transformers.git

Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-2.21.0-py3-none-any.whl (527 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K

# Translation

In [2]:
pip install transformers datasets evaluate sacrebleu

Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.4.2-py3-none-any.whl.metadata (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.0/58.0 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Collecting portalocker (from sacrebleu)
  Downloading portalocker-2.10.1-py3-none-any.whl.metadata (8.5 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading sacrebleu-2.4.2-py3-none-any.whl (106 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.7/106.7 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Downloading portalocker-2.10.1-py3-none-any.whl (18 kB)
Installing collected packages: 

In [3]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [10]:
from datasets import load_dataset

# Load the FLORES-200 dataset for Chinese to English translation
flores_dataset = load_dataset("facebook/flores", "zho_Hans-eng_Latn")
print(flores_dataset)

DatasetDict({
    dev: Dataset({
        features: ['id', 'URL', 'domain', 'topic', 'has_image', 'has_hyperlink', 'sentence_zho_Hans', 'sentence_eng_Latn'],
        num_rows: 997
    })
    devtest: Dataset({
        features: ['id', 'URL', 'domain', 'topic', 'has_image', 'has_hyperlink', 'sentence_zho_Hans', 'sentence_eng_Latn'],
        num_rows: 1012
    })
})


In [11]:
# Use the 'dev' split
dev_dataset = flores_dataset["dev"]

# Split the 'dev' set into training and testing if needed
dataset_split = dev_dataset.train_test_split(test_size=0.2)

# Access the train and test sets
train_dataset = dataset_split["train"]
test_dataset = dataset_split["test"]

print(f"Training examples: {len(train_dataset)}")
print(f"Test examples: {len(test_dataset)}")

Training examples: 797
Test examples: 200


# Preprocess

In [12]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorForSeq2Seq

# Load the FLORES-200 dataset for Chinese to English translation
dataset = load_dataset("facebook/flores", "zho_Hans-eng_Latn")

# Import and initialize the T5 tokenizer with a smaller model for quicker experimentation
from transformers import AutoTokenizer

checkpoint = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Set the source and target language codes for the dataset
source_lang = "sentence_zho_Hans"  # Column for Chinese (Simplified)
target_lang = "sentence_eng_Latn"   # Column for English (Latin script)
prefix = "translate Chinese to English: "  # Task-specific prefix

def preprocess_function(examples):
    # Prefix the input text with the task description and tokenize
    inputs = [prefix + text for text in examples[source_lang]]
    targets = [text for text in examples[target_lang]]

    # Tokenize the inputs and targets
    model_inputs = tokenizer(
        inputs,
        text_target=targets,
        max_length=128,  # Adjust based on your needs and model capacity
        truncation=True
    )

    return model_inputs

# Apply the preprocessing function to the entire dataset
# Only remove the columns that are unnecessary for training
tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=["id", "URL", "domain", "topic", "has_image", "has_hyperlink"])

# Create a data collator for dynamic padding
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

# Optional: You can save the tokenized dataset for future use
# tokenized_dataset.save_to_disk("path/to/save/tokenized_flores_dataset")

# Inspect the processed dataset (Optional)
print(tokenized_dataset["dev"][0])  # Check the first example in the dev split

Map:   0%|          | 0/1012 [00:00<?, ? examples/s]

{'sentence_zho_Hans': '周一，斯坦福大学医学院的科学家宣布，他们发明了一种可以将细胞按类型分类的新型诊断工具：一种可打印的微型芯片。这种芯片可以使用标准喷墨打印机制造，每片价格可能在一美分左右。', 'sentence_eng_Latn': 'On Monday, scientists from the Stanford University School of Medicine announced the invention of a new diagnostic tool that can sort cells by type: a tiny printable chip that can be manufactured using standard inkjet printers for possibly about one U.S. cent each.', 'input_ids': [13959, 2830, 12, 1566, 10, 3, 2, 6, 2, 6, 2, 10, 2, 6, 2, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [461, 2089, 6, 7004, 45, 8, 19796, 636, 1121, 13, 6852, 2162, 8, 8936, 13, 3, 9, 126, 7028, 1464, 24, 54, 1843, 2640, 57, 686, 10, 3, 9, 5103, 12354, 6591, 24, 54, 36, 9554, 338, 1068, 16, 157, 9245, 6454, 7, 21, 3673, 81, 80, 412, 5, 134, 5, 3151, 284, 5, 1]}


# Evaluate

In [13]:
import evaluate
import numpy as np
from transformers import AutoTokenizer

# Load the evaluation metric
metric = evaluate.load("sacrebleu")

# Initialize the tokenizer (ensure it's the same as used during training)
checkpoint = "t5-small"  # or another model you used
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def postprocess_text(preds, labels):
    """
    Post-process predictions and labels by stripping white spaces and ensuring
    that labels are in a list format to match the expected format for SacreBLEU evaluation.
    """
    # Strip white spaces and format labels
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels

def compute_metrics(eval_preds):
    """
    Compute BLEU score using SacreBLEU metric and additional metrics for deeper insights.
    """
    preds, labels = eval_preds

    # Handle tuple predictions (common with Seq2Seq models)
    if isinstance(preds, tuple):
        preds = preds[0]

    # Decode the predictions and labels using the tokenizer
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)  # Ignore padding tokens
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Post-process the decoded predictions and labels
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    # Calculate SacreBLEU score
    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    # Additional evaluation metrics (optional, for deeper analysis)
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result["exact_match"] = np.mean([pred == label[0] for pred, label in zip(decoded_preds, decoded_labels)])

    # Round the results for readability
    result = {k: round(v, 4) for k, v in result.items()}

    return result

In [14]:
print(dataset)

DatasetDict({
    dev: Dataset({
        features: ['id', 'URL', 'domain', 'topic', 'has_image', 'has_hyperlink', 'sentence_zho_Hans', 'sentence_eng_Latn'],
        num_rows: 997
    })
    devtest: Dataset({
        features: ['id', 'URL', 'domain', 'topic', 'has_image', 'has_hyperlink', 'sentence_zho_Hans', 'sentence_eng_Latn'],
        num_rows: 1012
    })
})


# Train

In [15]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer, EarlyStoppingCallback, AutoModelForSeq2SeqLM
import evaluate
import numpy as np

# Load the FLORES-200 dataset for Chinese to English translation
dataset = load_dataset("facebook/flores", "zho_Hans-eng_Latn")

# Initialize the tokenizer and model with the "t5-base" model
checkpoint = "t5-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

# Preprocess the dataset
source_lang = "sentence_zho_Hans"
target_lang = "sentence_eng_Latn"
prefix = "translate Chinese to English: "

def preprocess_function(examples):
    inputs = [prefix + text for text in examples[source_lang]]
    targets = [text for text in examples[target_lang]]
    model_inputs = tokenizer(inputs, text_target=targets, max_length=128, truncation=True)
    return model_inputs

# Apply the preprocessing function to the 'dev' and 'devtest' datasets
tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=["id", "URL", "domain", "topic", "has_image", "has_hyperlink"])

# Use only a small subset of the dataset for quick training
small_train_dataset = tokenized_dataset["dev"].shuffle(seed=42).select(range(50))  # Use 50 examples for quick training
small_eval_dataset = tokenized_dataset["devtest"].shuffle(seed=42).select(range(10))  # Use 10 examples for quick evaluation

# Create a data collator for dynamic padding
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

# Define the evaluation metric
metric = evaluate.load("sacrebleu")

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]
    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result["exact_match"] = np.mean([pred == label[0] for pred, label in zip(decoded_preds, decoded_labels)])
    result = {k: round(v, 4) for k, v in result.items()}
    return result

# Advanced Training Configuration with fewer epochs and smaller datasets
training_args = Seq2SeqTrainingArguments(
    output_dir="flores_t5_base_model",
    evaluation_strategy="steps",
    eval_steps=100,  # Less frequent evaluation for quicker training
    learning_rate=3e-5,
    per_device_train_batch_size=4,  # Smaller batch size for quicker updates
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=1,  # Only keep the last checkpoint
    num_train_epochs=1,  # Reduce to 1 epoch for quick iteration
    predict_with_generate=True,
    fp16=True,
    logging_dir="./logs",
    logging_steps=50,  # Frequent logging to monitor progress
    save_steps=200,  # Save less frequently
    gradient_accumulation_steps=2,  # Adjust if using very small batch sizes
    load_best_model_at_end=True,
    metric_for_best_model="bleu",
    greater_is_better=True,
    push_to_hub=False,  # Disable push to hub for quick iteration
)

# Trainer instantiation with early stopping callback
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,  # Use the smaller subset
    eval_dataset=small_eval_dataset,  # Use the smaller subset
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Start training the model with reduced steps and epochs
trainer.train()

# Optionally evaluate on the 'devtest' split after training
final_eval_metrics = trainer.evaluate(eval_dataset=small_eval_dataset)
print(f"Final evaluation metrics on small devtest: {final_eval_metrics}")

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Map:   0%|          | 0/997 [00:00<?, ? examples/s]

Map:   0%|          | 0/1012 [00:00<?, ? examples/s]



Step,Training Loss,Validation Loss




Final evaluation metrics on small devtest: {'eval_loss': 3.9038708209991455, 'eval_bleu': 0.0, 'eval_gen_len': 6.5, 'eval_exact_match': 0.0, 'eval_runtime': 1.3837, 'eval_samples_per_second': 7.227, 'eval_steps_per_second': 2.168, 'epoch': 0.9230769230769231}


# Inference

In [16]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import time

# Load the tokenizer and model
checkpoint = "t5-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

# Ensure the model is in evaluation mode
model.eval()

# Use a GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Text to be translated
text = "translate Chinese to English: 你的老师是一个很棒的人。"

# Tokenize the input text
inputs = tokenizer(text, return_tensors="pt").to(device)

# Advanced Generation Parameters (Beam Search)
generate_kwargs = {
    "max_length": 60,
    "num_beams": 5,
    "no_repeat_ngram_size": 3,
    "early_stopping": True,
}

# Measure inference time for performance profiling
start_time = time.time()

# Generate the translation
with torch.no_grad():
    outputs = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        **generate_kwargs
    )

end_time = time.time()
print(f"Inference Time: {end_time - start_time:.2f} seconds")

# Decode the generated token IDs back into a string
translation = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Advanced Post-Processing for Translation
def advanced_postprocess(translation):
    translation = translation.replace(" .", ".")
    translation = translation.replace(" ,", ",")
    translation = translation.replace(" ?", "?")
    return translation

# Apply advanced post-processing
final_translation = advanced_postprocess(translation)

# Output the final translated text
print(f"Translated Text: {final_translation}")

Inference Time: 0.45 seconds
Translated Text: 


# Faster Inference

In [17]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import time

# Load the tokenizer and model
checkpoint = "t5-small"  # Use a smaller model for faster inference
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

# Ensure the model is in evaluation mode
model.eval()

# Use a GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Text to be translated
text = "translate Chinese to English: 你的老师是一个很棒的人。"

# Tokenize the input text
inputs = tokenizer(text, return_tensors="pt").to(device)

# Optimized Generation Parameters
generate_kwargs = {
    "max_length": 40,  # Reduce the maximum length for faster inference
    "num_beams": 1,  # Use greedy decoding for the fastest inference
    "early_stopping": True,
}

# Measure inference time for performance profiling
start_time = time.time()

# Generate the translation
with torch.no_grad():
    outputs = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        **generate_kwargs
    )

end_time = time.time()
print(f"Inference Time: {end_time - start_time:.2f} seconds")

# Decode the generated token IDs back into a string
translation = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Output the final translated text
print(f"Translated Text: {translation}")

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Inference Time: 0.04 seconds
Translated Text: 




# Deploy from Colab to Hugging Face

In [24]:
!pip install huggingface_hub

from huggingface_hub import notebook_login

notebook_login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [25]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, AutoModelForSeq2SeqLM

# Push the model to Hugging Face Hub
model.push_to_hub("meta-flores-translation-chinese-english-model")
tokenizer.push_to_hub("meta-flores-translation-chinese-english-model")

trainer.push_to_hub()


model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/NeuraFusionAI/flores_t5_base_model/commit/c8eb3fe8e27262d12485562a45cf16a31fa01be0', commit_message='End of training', commit_description='', oid='c8eb3fe8e27262d12485562a45cf16a31fa01be0', pr_url=None, pr_revision=None, pr_num=None)

In [31]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer, EarlyStoppingCallback, AutoModelForSeq2SeqLM
import evaluate
import numpy as np

# Load the FLORES-200 dataset for Chinese to English translation
dataset = load_dataset("facebook/flores", "zho_Hans-eng_Latn")

# Initialize the tokenizer and model with the "t5-base" model
checkpoint = "t5-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

# Preprocess the dataset
source_lang = "sentence_zho_Hans"
target_lang = "sentence_eng_Latn"
prefix = "translate Chinese to English: "

def preprocess_function(examples):
    inputs = [prefix + text for text in examples[source_lang]]
    targets = [text for text in examples[target_lang]]
    model_inputs = tokenizer(inputs, text_target=targets, max_length=128, truncation=True)
    return model_inputs

# Apply the preprocessing function to the 'dev' and 'devtest' datasets
tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=["id", "URL", "domain", "topic", "has_image", "has_hyperlink"])

# Define train and evaluation datasets
train_dataset = tokenized_dataset["dev"].shuffle(seed=42).select(range(50))  # Use 50 examples for quick training
eval_dataset = tokenized_dataset["devtest"].shuffle(seed=42).select(range(10))  # Use 10 examples for quick evaluation

# Create a data collator for dynamic padding
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

# Define the evaluation metric
metric = evaluate.load("sacrebleu")

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]
    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result["exact_match"] = np.mean([pred == label[0] for pred, label in zip(decoded_preds, decoded_labels)])
    result = {k: round(v, 4) for k, v in result.items()}
    return result

# Updated Training Configuration
training_args = Seq2SeqTrainingArguments(
    output_dir="meta-flores-translation-chinese-english-model",
    eval_strategy="epoch",  # Updated to use the correct parameter
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=1,
    num_train_epochs=1,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=True,  # Enable push to hub
)

# Trainer instantiation with early stopping callback
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,  # Ensure eval_dataset is defined and passed here
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Start training the model
trainer.train()

# Optionally evaluate on the 'devtest' split after training
final_eval_metrics = trainer.evaluate(eval_dataset=eval_dataset)
print(f"Final evaluation metrics on small devtest: {final_eval_metrics}")

# Push the model to Hugging Face Hub
trainer.push_to_hub()

Map:   0%|          | 0/1012 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Bleu,Gen Len,Exact Match
1,No log,3.932323,0.0,6.5,0.0




events.out.tfevents.1723885081.aea86c2598c7.204.3:   0%|          | 0.00/505 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/NeuraFusionAI/meta-flores-translation-chinese-english-model/commit/ef023d05a51336ca8784e1344637d4015e98fdfd', commit_message='End of training', commit_description='', oid='ef023d05a51336ca8784e1344637d4015e98fdfd', pr_url=None, pr_revision=None, pr_num=None)