In [None]:
pip install -q trl evaluate sacrebleu

In [None]:
from transformers import AutoModelForSeq2SeqLM,AutoTokenizer
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, EarlyStoppingCallback
import yaml
from datasets import load_from_disk
from transformers import TrainingArguments
from trl import SFTTrainer
import wandb
import numpy as np
from datasets import Dataset
from datasets import concatenate_datasets
import pandas as pd
import seaborn as sns
from datasets import load_dataset, concatenate_datasets
from transformers import DataCollatorForSeq2Seq
import evaluate

def load_model_and_tokenizer(model_name, tokenizer_name, device_map: str = 'auto'):
    """Load a pretrained seq2seq model together with a tokenizer.

    Args:
        model_name: Checkpoint identifier or path handed to
            ``AutoModelForSeq2SeqLM.from_pretrained``.
        tokenizer_name: Checkpoint identifier or path handed to
            ``AutoTokenizer.from_pretrained``.
        device_map: Accepted for interface compatibility; it is not used by
            this function.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    seq2seq_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    # Turn off the decoder cache on the model config
    # (presumably because it is not needed while training -- TODO confirm).
    seq2seq_model.config.use_cache = False

    # NOTE(review): trust_remote_code=True lets code shipped with the
    # checkpoint repository run locally; only use with trusted checkpoints.
    text_tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, trust_remote_code=True)
    text_tokenizer.padding_side = 'right'

    return seq2seq_model, text_tokenizer

In [None]:
# The same checkpoint id is used for both the model weights and the tokenizer.
model_name = "google/mt5-small"
model, tokenizer = load_model_and_tokenizer(model_name, tokenizer_name=model_name)

In [None]:
# Both CSV files live in the same Kaggle input directory.
DATA_DIR = "/kaggle/input/melio-dataset"

train = pd.read_csv(f"{DATA_DIR}/final_train_df.csv")
test = pd.read_csv(f"{DATA_DIR}/final_val_df.csv")  # validation file, loaded under the name `test`
train.head()

In [None]:
# Sanity check: round-trip a non-ASCII character through encode/decode to see
# what the tokenizer gives back for it.
tokenizer.decode(tokenizer.encode('Å'))

In [None]:
# Instruction prompt fed to the model. The literal starts and ends with a
# newline, so both are part of every generated prompt.
template = """
translate from Dyula to French: {dyu}
"""

# Only the Dyula sentence is interpolated; the French text is the target and
# never appears in the prompt.
train["prompt"] = [template.format(dyu=dyu_sentence) for dyu_sentence in train["dyu"]]


In [None]:
from IPython.display import Markdown
# Render one formatted prompt to eyeball the template output.
Markdown(train["prompt"].iloc[3])

In [None]:
from datasets import Dataset

# Convert the pandas frame (including the new "prompt" column) into a
# `datasets.Dataset` so it can be tokenized with `.map`.
train_ds_raw = Dataset.from_pandas(train, split="train")
train_ds_raw

In [None]:
# Tokenize the prompts without padding so the true token counts can be measured.
tokenized_source_training = train_ds_raw.map(
    lambda batch: tokenizer(batch["prompt"], truncation=True),
    batched=True,
    remove_columns=["fr", "dyu", "prompt"],
)
source_lengths_training = list(map(len, tokenized_source_training["input_ids"]))

print(f"Max source length: {max(source_lengths_training)}")
print(f"95% source length: {int(np.percentile(source_lengths_training, 95))}")

In [None]:
# Same measurement for the French targets: tokenize without padding and
# record the token count of every sentence.
tokenized_target_training = train_ds_raw.map(
    lambda batch: tokenizer(batch["fr"], truncation=True),
    batched=True,
    remove_columns=["fr", "dyu", "prompt"],
)
target_lengths_training = list(map(len, tokenized_target_training["input_ids"]))

print(f"Max target length: {max(target_lengths_training)}")
print(f"95% target length: {int(np.percentile(target_lengths_training, 95))}")

In [None]:
# Pad/truncate prompts to the longest tokenized prompt seen in the training
# data, so no training prompt is truncated by preprocess_function.
max_source_length = max(source_lengths_training)
max_source_length

In [None]:
# Pad/truncate targets to the longest tokenized French sentence in the
# training data.
max_target_length = max(target_lengths_training)
max_target_length

In [None]:
def preprocess_function(sample, padding="max_length"):
    """Tokenize a batch of prompts and French targets for seq2seq training.

    Args:
        sample: Batch mapping with a "prompt" and a "fr" entry.
        padding: Padding strategy forwarded to the tokenizer. With the default
            "max_length", padded label positions are replaced by -100.

    Returns:
        The tokenized prompts with an added "labels" entry holding the target
        token ids.

    Relies on the notebook-level ``tokenizer``, ``max_source_length`` and
    ``max_target_length``.
    """
    encoded = tokenizer(
        sample["prompt"],
        max_length=max_source_length,
        padding=padding,
        truncation=True,
    )
    target = tokenizer(
        text_target=sample["fr"],
        max_length=max_target_length,
        padding=padding,
        truncation=True,
    )

    label_ids = target["input_ids"]
    if padding == "max_length":
        # Swap pad ids for -100 so padded positions are ignored by the loss.
        pad_id = tokenizer.pad_token_id
        label_ids = [
            [-100 if token == pad_id else token for token in sequence]
            for sequence in label_ids
        ]

    encoded["labels"] = label_ids
    return encoded

In [None]:
# Tokenize the whole dataset in batches; the raw text columns are dropped so
# only the tokenizer outputs and "labels" remain.
tokenized_train_ds = train_ds_raw.map(
    preprocess_function, batched=True,
    remove_columns=['fr', 'dyu', 'prompt'])

In [None]:
# Inspect the resulting columns and row count.
tokenized_train_ds

In [None]:
# Hold out 10% of the tokenized examples for evaluation. A fixed seed keeps the
# split identical across kernel restarts, so evaluation scores stay comparable
# between runs.
ds_dict = tokenized_train_ds.train_test_split(test_size=0.1, seed=42)

In [None]:
# 90% portion of the split, used for training.
trainset = ds_dict["train"]
trainset

In [None]:
# 10% held-out portion of the split; passed to the trainer as eval_dataset.
testset = ds_dict["test"]
testset

In [None]:
import nltk
import numpy as np
from nltk.tokenize import sent_tokenize
nltk.download("punkt")

In [None]:
# Evaluation metric: sacreBLEU loaded through the `evaluate` library; its
# "score" field is reported as "bleu" in compute_metrics.
metric = evaluate.load("sacrebleu")

In [None]:
# helper function to postprocess text
def postprocess_text(preds, labels):
    """Strip whitespace from decoded texts and shape them for the metric.

    Args:
        preds: Decoded prediction strings.
        labels: Decoded reference strings.

    Returns:
        ``(predictions, references)`` where predictions is a flat list of
        stripped strings and each reference is wrapped in its own
        single-element list.
    """
    cleaned_predictions = [text.strip() for text in preds]
    # One list of references per prediction, each holding a single reference.
    wrapped_references = [[text.strip()] for text in labels]
    return cleaned_predictions, wrapped_references

def compute_metrics(eval_preds):
    """Compute BLEU and mean generated length for an evaluation pass.

    Args:
        eval_preds: ``(predictions, labels)`` pair of token-id arrays. If
            predictions is a tuple, only its first element is used.

    Returns:
        Dict with "bleu" (the metric's "score") and "gen_len" (mean number of
        non-pad tokens per prediction), both rounded to 4 decimals.

    Relies on the notebook-level ``tokenizer``, ``metric`` and
    ``postprocess_text``.
    """
    predictions, references = eval_preds
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id

    def _clean_and_decode(token_ids):
        # -100 cannot be decoded, so map it to the pad id first. Predictions
        # can contain -100 too (presumably padding added when batches are
        # gathered -- TODO confirm).
        cleaned = np.where(token_ids != -100, token_ids, pad_id)
        return cleaned, tokenizer.batch_decode(cleaned, skip_special_tokens=True)

    predictions, prediction_texts = _clean_and_decode(predictions)
    _, reference_texts = _clean_and_decode(references)

    prediction_texts, reference_texts = postprocess_text(prediction_texts, reference_texts)

    bleu = metric.compute(predictions=prediction_texts, references=reference_texts)["score"]
    gen_len = np.mean([np.count_nonzero(row != pad_id) for row in predictions])

    return {"bleu": round(bleu, 4), "gen_len": round(gen_len, 4)}

In [None]:
from transformers import DataCollatorForSeq2Seq

# Label positions filled with -100 are ignored by the loss; this matches the
# value preprocess_function writes over padded label tokens.
label_pad_token_id = -100

# Batch collator for seq2seq training. Examples are already padded to the
# maximum lengths by preprocess_function (default padding="max_length");
# pad_to_multiple_of=8 asks the collator to pad batches up to a multiple of 8.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8
)

In [None]:
from huggingface_hub import HfFolder
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

In [None]:
batch_size = 8
path = "dyu_to_fr_model"

training_args = Seq2SeqTrainingArguments(
    output_dir=path,
    # --- optimisation ---
    learning_rate=5e-5,
    weight_decay=0.01,
    num_train_epochs=10,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    fp16=False,
    # --- generation during evaluation ---
    predict_with_generate=True,
    # NOTE(review): hard-coded limit; presumably taken from the measured
    # target lengths -- confirm against max_target_length.
    generation_max_length=273,
    # --- logging, evaluation and checkpointing (all step-based) ---
    logging_strategy="steps",
    logging_steps=500,
    evaluation_strategy="steps",
    save_strategy="steps",
    save_total_limit=2,
    load_best_model_at_end=True,
)

In [None]:
# Wire the model, the 90/10 split, the collator and the BLEU metric function
# into a Seq2SeqTrainer.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=trainset,
    eval_dataset=testset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning with the arguments configured in training_args.
trainer.train()

In [None]:
# Directory for the final model and tokenizer; same value as the trainer's
# output_dir (`path`).
LOCAL_SAVE_DIR = "dyu_to_fr_model"

In [None]:
# Save tokenizer and model into the same directory so both can be reloaded
# from it with from_pretrained.
trainer.tokenizer.save_pretrained(LOCAL_SAVE_DIR)
trainer.model.save_pretrained(LOCAL_SAVE_DIR)

In [None]:
# Pick one Dyula sentence from the training frame as a smoke-test input.
text = train['dyu'].iloc[10]
text

In [None]:
from transformers import T5ForConditionalGeneration, AutoTokenizer
import torch

# Reload the fine-tuned model and tokenizer from disk to check that the saved
# artifacts are usable on their own.
model = AutoModelForSeq2SeqLM.from_pretrained(LOCAL_SAVE_DIR)
tokenizer = AutoTokenizer.from_pretrained(LOCAL_SAVE_DIR)

# Build the prompt with the training template so the model sees the same input
# format it was fine-tuned on, and truncate the same way as preprocess_function.
prompt = template.format(dyu=text)
inputs = tokenizer(prompt, max_length=max_source_length, truncation=True, return_tensors="pt")

# Unpacking the encoding passes attention_mask along with input_ids.
# Sampling is enabled, so the generated text can differ between runs.
outputs = model.generate(**inputs, max_new_tokens=40, do_sample=True, top_k=20, top_p=0.7)
output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(output_text)

In [None]:
# Reference French translation for the same row, for comparison with the
# generated output.
train['fr'].iloc[10]