In [1]:
%load_ext autoreload
%autoreload 2

import torch
import numpy as np

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
from datasets import load_dataset

In [2]:
# Pretrained checkpoint identifier; the tokenizer and the model are both loaded from it.
model_name = "Helsinki-NLP/opus-mt-tc-big-en-pt"

In [3]:
# Device management: pick CUDA when torch reports it as available, otherwise CPU.
# NOTE: `device` is assigned again at the end of this cell via get_default_device().
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def get_default_device():
    """Return the CUDA device when one is available, otherwise the CPU device."""
    device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
    return torch.device(device_name)

def to_device(data, device):
    """Move a tensor, or a (possibly nested) list/tuple of tensors, to `device`.

    Lists and tuples are handled element by element and always come back as
    a list; any other object is moved through its own `.to(device)` method.
    """
    if not isinstance(data, (list, tuple)):
        return data.to(device)

    moved = []
    for item in data:
        moved.append(to_device(item, device))
    return moved

# Resolve the device through get_default_device() and print it for reference.
device = get_default_device()
print(device)

cuda


In [4]:
# Load the tokenizer that belongs to the `model_name` checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function_en_pt(example):
    """Tokenize a batch of English sources together with their Portuguese targets.

    Every text under `example['en_US']` is prefixed with ">>por<<" and used as
    model input; the texts under `example['pt_PT']` are passed as
    `text_target`. Truncation is enabled with `max_length=128`.

    Returns whatever the module-level `tokenizer` call returns for the batch.
    """
    # The ">>por<<" prefix tells the model which task to perform
    # (here: translate into Portuguese).
    prefixed_sources = []
    for sentence in example['en_US']:
        prefixed_sources.append(f">>por<< {sentence}")

    return tokenizer(
        prefixed_sources,
        text_target=example['pt_PT'],
        truncation=True,
        max_length=128,
    )

class Dataset():
    """Load the train/validation/test splits of the translation dataset and tokenize them.

    After construction the tokenized splits are available as
    `tokenized_train_dataset`, `tokenized_valid_dataset` and
    `tokenized_test_dataset`.
    """

    def __init__(self):
        # Fetch all three raw splits first, in train/validation/test order.
        raw_splits = {
            split_name: load_dataset("Amani27/massive_translation_dataset", split=split_name)
            for split_name in ("train", "validation", "test")
        }

        # Tokenize each split in batches, dropping the original columns so that
        # only the tokenizer output remains.
        tokenized_splits = {
            split_name: split_data.map(
                tokenize_function_en_pt,
                batched=True,
                remove_columns=split_data.column_names,
            )
            for split_name, split_data in raw_splits.items()
        }

        self.tokenized_train_dataset = tokenized_splits["train"]
        self.tokenized_valid_dataset = tokenized_splits["validation"]
        self.tokenized_test_dataset = tokenized_splits["test"]

In [5]:
# Load the pretrained sequence-to-sequence model for the `model_name` checkpoint.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
# Collator built from the tokenizer and the model; it is handed to the trainer as `data_collator`.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [6]:
import evaluate

# Load the "sacrebleu" metric; compute_metrics uses it to score decoded predictions.
metric = evaluate.load("sacrebleu")

def compute_metrics(eval_preds):
    """Compute the sacreBLEU score for a set of evaluation predictions.

    Args:
        eval_preds: A `(predictions, labels)` pair of token-id arrays.
            `predictions` may also be a tuple, in which case its first
            element is taken as the token ids.

    Returns:
        dict: `{"bleu": score}`, where `score` is the "score" entry of the
        metric result.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is a padding/ignore marker, not a real token id, so it cannot be
    # decoded. Generated predictions gathered across batches can be padded
    # with it just like labels, so both arrays are sanitised before decoding.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # The metric is called with one list of references per prediction.
    predictions = [pred.strip() for pred in decoded_preds]
    references = [[label.strip()] for label in decoded_labels]

    result = metric.compute(predictions=predictions, references=references)
    return {"bleu": result["score"]}

In [7]:
# from huggingface_hub import notebook_login

# notebook_login()

In [8]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

# Build the tokenized train/validation/test splits.
dataset = Dataset()

# Training configuration. The first positional argument is the output directory.
args = Seq2SeqTrainingArguments(
    "model_opus-mt-tc-big-en-pt-finetuned",
    learning_rate=3e-5,
    weight_decay=0.01,
    num_train_epochs=3,

    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # No automatic evaluation during training; evaluation is triggered
    # explicitly with trainer.evaluate() before and after trainer.train().
    evaluation_strategy="no",
    save_total_limit=0,
    # Evaluate with generated sequences so compute_metrics can decode them.
    predict_with_generate=True,
    fp16=True,
)

trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=dataset.tokenized_train_dataset,
    eval_dataset=dataset.tokenized_valid_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# NOTE: the former `dataloader_config = DataLoaderConfiguration(...)` line was
# dropped: `DataLoaderConfiguration` is never imported in this notebook (so the
# cell raised NameError on a fresh kernel) and the object was never used.


In [9]:
# Baseline: evaluate on the validation split before any fine-tuning has run.
trainer.evaluate(max_length=128)

  0%|          | 0/64 [00:00<?, ?it/s]

{'eval_loss': 1.8328752517700195,
 'eval_bleu': 27.507837365663647,
 'eval_runtime': 41.3665,
 'eval_samples_per_second': 49.146,
 'eval_steps_per_second': 1.547}

In [10]:
# Fine-tune the model using the trainer configured in the previous cell.
trainer.train()

  0%|          | 0/1080 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[54775]], 'forced_eos_token_id': 44670}


{'loss': 1.0606, 'grad_norm': 2.9432311058044434, 'learning_rate': 1.613888888888889e-05, 'epoch': 1.39}


Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[54775]], 'forced_eos_token_id': 44670}


{'loss': 0.723, 'grad_norm': 2.8999171257019043, 'learning_rate': 2.25e-06, 'epoch': 2.78}
{'train_runtime': 184.6057, 'train_samples_per_second': 187.112, 'train_steps_per_second': 5.85, 'train_loss': 0.8749954612166794, 'epoch': 3.0}


TrainOutput(global_step=1080, training_loss=0.8749954612166794, metrics={'train_runtime': 184.6057, 'train_samples_per_second': 187.112, 'train_steps_per_second': 5.85, 'train_loss': 0.8749954612166794, 'epoch': 3.0})

In [11]:
# Evaluate on the validation split again, after fine-tuning, to compare with the baseline run.
trainer.evaluate(max_length=128)

  0%|          | 0/64 [00:00<?, ?it/s]

{'eval_loss': 0.9480153918266296,
 'eval_bleu': 47.11181030666177,
 'eval_runtime': 39.4964,
 'eval_samples_per_second': 51.473,
 'eval_steps_per_second': 1.62,
 'epoch': 3.0}

In [12]:
import gradio as gr

def translate_text(text):
    """Translate `text` with the module-level model and return the first result.

    Args:
        text: Input text, already carrying any language prefix the model
            expects.

    Returns:
        str: The first generated sequence, decoded with special tokens
        removed.
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)

    # Follow the model's actual device instead of assuming it lives on CUDA
    # whenever CUDA is available; that assumption causes a device mismatch
    # in generate() if the model is still on the CPU.
    inputs = inputs.to(model.device)

    translated = model.generate(**inputs, max_length=128, num_return_sequences=1)
    return tokenizer.decode(translated[0], skip_special_tokens=True)

def chatbot(message):
    """Gradio callback: prepend the ">>por<<" prefix to `message` and return its translation."""
    prefixed_message = ">>por<< " + message
    return translate_text(prefixed_message)

# Wrap `chatbot` in a Gradio interface with a textbox input and a text output, then launch it.
demo_chatbot = gr.Interface(chatbot, "textbox", "text", title="Simple Chatbot", description="Enter text in English.")
demo_chatbot.launch()

Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.




IMPORTANT: You are using gradio version 4.28.3, however version 4.29.0 is available, please upgrade.
--------


In [14]:
# trainer.push_to_hub()

Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[54775]], 'forced_eos_token_id': 44670}


training_args.bin:   0%|          | 0.00/5.05k [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/930M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Miguelcj1/model_opus-mt-tc-big-en-pt-finetuned/commit/f9627da6ec2bf257de55d559fb766c0500cb7d02', commit_message='End of training', commit_description='', oid='f9627da6ec2bf257de55d559fb766c0500cb7d02', pr_url=None, pr_revision=None, pr_num=None)