In [1]:
%load_ext autoreload
%autoreload 2

import torch
import numpy as np

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
from datasets import load_dataset

In [2]:
# Checkpoint identifier shared by the tokenizer and model loaders further down.
model_name = "Helsinki-NLP/opus-mt-en-mul"

In [3]:
# Device management: use CUDA when torch reports it as available, otherwise the CPU.
# Note: `device` is assigned again at the end of this cell via get_default_device().
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def get_default_device():
    """Return the CUDA device when one is available, otherwise the CPU device."""
    backend = 'cuda' if torch.cuda.is_available() else 'cpu'
    return torch.device(backend)

def to_device(data, device):
    """Recursively transfer tensor(s) onto ``device``.

    Lists and tuples are walked element by element and always come back as a
    list; any other object is moved through its own ``.to(device)`` method.
    """
    if not isinstance(data, (list, tuple)):
        return data.to(device)
    moved = []
    for item in data:
        moved.append(to_device(item, device))
    return moved

# Resolve the device through the helper and print it as the cell's output.
device = get_default_device()
print(device)

cuda


In [4]:
# Load the tokenizer for the checkpoint named by `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(example, source_column="en_US", target_column="pt_PT",
                      target_lang="por", max_length=128):
    """Tokenize a batch of translation pairs, tagging each source with its target language.

    Every source sentence is prefixed with ``>>{target_lang}<< `` before
    tokenization. The defaults reproduce the English -> Portuguese setup; other
    pairs can be selected without editing this function, e.g.
    ``ds.map(tokenize_function, batched=True,
    fn_kwargs={"target_column": "fr_FR", "target_lang": "fra"})``
    (likewise ``es_ES``/``spa`` and ``de_DE``/``deu``).

    Args:
        example: Batch mapping column names to lists of strings (the function
            is applied with ``batched=True``).
        source_column: Column holding the source sentences.
        target_column: Column holding the reference translations.
        target_lang: Language code placed inside the ``>>...<<`` prefix.
        max_length: Truncation length passed to the tokenizer.

    Returns:
        The tokenizer output for the batch, produced with ``text_target`` set
        to the reference translations.
    """
    inputs = [f">>{target_lang}<< {ex}" for ex in example[source_column]]
    targets = example[target_column]

    return tokenizer(
        inputs,
        text_target=targets,
        truncation=True,
        max_length=max_length,
    )

class Dataset():
    """Load the train/validation/test splits of the translation corpus and tokenize each.

    The constructor sets ``tokenized_train_dataset``, ``tokenized_valid_dataset``
    and ``tokenized_test_dataset``; each is the corresponding raw split mapped
    through ``tokenize_function`` with the raw columns removed.
    """

    def __init__(self):
        corpus = "Amani27/massive_translation_dataset"
        split_by_attr = (("train", "train"), ("valid", "validation"), ("test", "test"))

        # Fetch every raw split first, then tokenize them in the same order.
        raw_splits = {
            attr: load_dataset(corpus, split=split)
            for attr, split in split_by_attr
        }

        for attr, raw in raw_splits.items():
            tokenized = raw.map(
                tokenize_function,
                batched=True,
                remove_columns=raw.column_names,
            )
            setattr(self, f"tokenized_{attr}_dataset", tokenized)

In [5]:
# Load the pretrained seq2seq weights for the same checkpoint as the tokenizer.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
# Batch collator handed to the trainer; it is given both the tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [6]:
import evaluate

# SacreBLEU scorer used by compute_metrics.
metric = evaluate.load("sacrebleu")

def compute_metrics(eval_preds):
    """Score generated translations against the references with SacreBLEU.

    Args:
        eval_preds: Pair ``(predictions, labels)`` of token-id arrays. When the
            predictions arrive wrapped in a tuple, its first element is used.

    Returns:
        dict: ``{"bleu": <score>}`` where the value is the ``"score"`` field of
        the metric result.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is a padding sentinel, not a real token id, so it cannot be decoded.
    # Swap it for the pad token in both arrays; skip_special_tokens then drops it.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds = [pred.strip() for pred in decoded_preds]
    # One single-element reference list per prediction.
    decoded_labels = [[label.strip()] for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["score"]}

In [7]:
# from huggingface_hub import notebook_login

# notebook_login()

In [8]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

# Load and tokenize the train / validation / test splits.
dataset = Dataset()

args = Seq2SeqTrainingArguments(
    "model_opus-mt-en-mul-finetuned",  # output directory for the run
    learning_rate=3e-4,
    weight_decay=0.1,
    num_train_epochs=3,

    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # No automatic evaluation during training; trainer.evaluate() is called explicitly.
    evaluation_strategy="no",
    # NOTE(review): a limit of 0 may be treated as "no limit" by checkpoint rotation
    # rather than "keep no checkpoints" - confirm against the installed transformers version.
    save_total_limit=0,
    # Evaluate on generated sequences so compute_metrics receives token ids to decode.
    predict_with_generate=True,
    fp16=True,
)

trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=dataset.tokenized_train_dataset,
    eval_dataset=dataset.tokenized_valid_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)


In [9]:
# Baseline: score the pretrained checkpoint on the validation split before fine-tuning.
trainer.evaluate(max_length=128)

  0%|          | 0/255 [00:00<?, ?it/s]

{'eval_loss': 1.9913886785507202,
 'eval_bleu': 19.74986857742965,
 'eval_runtime': 201.7362,
 'eval_samples_per_second': 40.31,
 'eval_steps_per_second': 1.264}

In [10]:
# Fine-tune the model with the arguments configured on the trainer.
trainer.train()

  0%|          | 0/4320 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[64109]], 'forced_eos_token_id': 0}


{'loss': 1.7231, 'grad_norm': 29.116931915283203, 'learning_rate': 0.0002654166666666666, 'epoch': 0.35}


Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[64109]], 'forced_eos_token_id': 0}


{'loss': 1.3859, 'grad_norm': 4.787890434265137, 'learning_rate': 0.00023076388888888886, 'epoch': 0.69}


Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[64109]], 'forced_eos_token_id': 0}


{'loss': 1.2045, 'grad_norm': 4.449230670928955, 'learning_rate': 0.00019604166666666666, 'epoch': 1.04}


Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[64109]], 'forced_eos_token_id': 0}


{'loss': 0.9552, 'grad_norm': 3.2791900634765625, 'learning_rate': 0.00016131944444444445, 'epoch': 1.39}


Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[64109]], 'forced_eos_token_id': 0}


{'loss': 0.9022, 'grad_norm': 2.469863176345825, 'learning_rate': 0.0001265972222222222, 'epoch': 1.74}


Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[64109]], 'forced_eos_token_id': 0}


{'loss': 0.7954, 'grad_norm': 3.01731276512146, 'learning_rate': 9.194444444444444e-05, 'epoch': 2.08}


Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[64109]], 'forced_eos_token_id': 0}


{'loss': 0.6085, 'grad_norm': 2.8664896488189697, 'learning_rate': 5.7222222222222213e-05, 'epoch': 2.43}


Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[64109]], 'forced_eos_token_id': 0}


{'loss': 0.5797, 'grad_norm': 2.5223610401153564, 'learning_rate': 2.2499999999999998e-05, 'epoch': 2.78}
{'train_runtime': 396.7222, 'train_samples_per_second': 348.274, 'train_steps_per_second': 10.889, 'train_loss': 0.9852957902131257, 'epoch': 3.0}


TrainOutput(global_step=4320, training_loss=0.9852957902131257, metrics={'train_runtime': 396.7222, 'train_samples_per_second': 348.274, 'train_steps_per_second': 10.889, 'train_loss': 0.9852957902131257, 'epoch': 3.0})

In [11]:
# Re-score the validation split after fine-tuning, using the same max_length as the earlier evaluation.
trainer.evaluate(max_length=128)

  0%|          | 0/255 [00:00<?, ?it/s]

{'eval_loss': 0.8769590258598328,
 'eval_bleu': 44.52320189794559,
 'eval_runtime': 204.1128,
 'eval_samples_per_second': 39.841,
 'eval_steps_per_second': 1.249,
 'epoch': 3.0}

In [12]:
import gradio as gr

def translate_text(text):
    """Translate ``text`` with the notebook's model and return the decoded string.

    Args:
        text: Source text to translate; the caller is expected to have added
            the ``>>lang<<`` target-language prefix already.

    Returns:
        str: Decoded first generated sequence with special tokens removed.
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)

    # Follow the model rather than probing for CUDA: the model may live on the
    # CPU even when a GPU exists, and mismatched devices make generate() fail.
    inputs = to_device(inputs, model.device)

    translated = model.generate(**inputs, max_length=128, num_return_sequences=1)
    # Only the first sequence is returned, so only that one is decoded.
    return tokenizer.decode(translated[0], skip_special_tokens=True)

def chatbot(message, target_language):
    """Translate ``message`` into the language selected in the UI.

    Returns a prompt asking for a language when ``target_language`` is empty;
    otherwise prefixes the message with the ``>>code<<`` tag looked up in
    ``language_options`` and returns the translation.
    """
    if not target_language:
        return "Please choose a language to translate!"

    lang_code = language_options[target_language]
    return translate_text(">>" + lang_code + "<< " + message)

# Dropdown label -> language code placed inside the ">>code<<" prefix.
language_options = {
    "French": "fra",
    "Spanish": "spa",
    "German": "deu",
    "Portuguese": "por",
}

# Two inputs (free text + target-language dropdown), one text output.
demo_chatbot = gr.Interface(
    fn=chatbot,
    inputs=[
        "textbox",
        gr.Dropdown(
            list(language_options.keys()),
            value="Portuguese",
            label="Target Language",
        ),
    ],
    outputs="text",
    title="Multilingual Chatbot",
    description="Enter text in English and choose a target language for translation.",
)
demo_chatbot.launch()

Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.




In [13]:
# trainer.push_to_hub()

Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[64109]], 'forced_eos_token_id': 0}


IMPORTANT: You are using gradio version 4.28.3, however version 4.29.0 is available, please upgrade.
--------


model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/5.05k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Miguelcj1/model_opus-mt-en-mul-finetuned/commit/f98385d662ec9f0dc54712ab7df0850d6cfa6b5a', commit_message='End of training', commit_description='', oid='f98385d662ec9f0dc54712ab7df0850d6cfa6b5a', pr_url=None, pr_revision=None, pr_num=None)