# Fine-tuning

In [None]:
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast, Seq2SeqTrainingArguments, Seq2SeqTrainer
import pandas as pd
from transformers import DataCollatorForSeq2Seq
from transformers import EarlyStoppingCallback

from sklearn.model_selection import train_test_split
import torch

# Load your dataset.
# NOTE(review): the spreadsheet path is blank in this copy — fill it in before running.
data = pd.read_excel('')

# Hold out 20% of the rows, then cut that hold-out in half to obtain the
# validation and test sets. Both splits are stratified on the 'Type' column
# and use a fixed random_state so the partition is reproducible.
train_data, temp_data = train_test_split(
    data,
    test_size=0.2,
    stratify=data['Type'],
    random_state=42,
)
val_data, test_data = train_test_split(
    temp_data,
    test_size=0.5,
    stratify=temp_data['Type'],
    random_state=42,
)

# Load the pretrained mBART-50 many-to-many checkpoint and its matching tokenizer.
model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
# Turn on gradient checkpointing through the model API. Assigning
# `model.config.gradient_checkpointing` after the model has been built is not
# picked up by the already-constructed layers, so checkpointing stayed off.
model.gradient_checkpointing_enable()


# Training configuration handed to the Seq2SeqTrainer.
args = Seq2SeqTrainingArguments(
    # Output locations for checkpoints and logs.
    output_dir="all_mbart_English_unify_PUNCT",
    logging_dir="/kaggle/working/logs",
    # Optimisation settings.
    learning_rate=2e-5,
    num_train_epochs=6,
    gradient_accumulation_steps=2,
    # Log and evaluate on the same 100-step cadence.
    logging_steps=100,
    evaluation_strategy='steps',
    eval_steps=100,
    # When training ends, reload the checkpoint that scored best on "loss".
    load_best_model_at_end=True,
    metric_for_best_model="loss",
)


class DataFrameDataset(torch.utils.data.Dataset):
    """Map-style dataset that serves DataFrame rows as (source, target) text pairs.

    The wrapped frame must provide the columns "Indonesian statement" and
    "English statement"; rows are addressed by position, not by index label.
    """

    def __init__(self, data):
        """Keep a reference to the DataFrame `data` (no copy is made)."""
        self.data = data

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return self.data.shape[0]

    def __getitem__(self, idx):
        """Return the (source, target) pair stored at positional row `idx`."""
        row = self.data.iloc[idx]
        source_text = row["Indonesian statement"]
        target_text = row["English statement"]
        return source_text, target_text

class CustomDataCollator(DataCollatorForSeq2Seq):
    """Collate (source_text, target_text) pairs into a padded seq2seq batch.

    Tokenization goes through the tokenizer's regular ``__call__`` with
    ``text_target`` instead of the deprecated ``prepare_seq2seq_batch``.
    Padding positions in ``labels`` are replaced by ``label_pad_token_id``
    so that they are ignored by the loss.

    Args:
        tokenizer: Tokenizer used for both source and target texts.
        model: Model being trained (forwarded to ``DataCollatorForSeq2Seq``).
        src_lang: Language code to set on the tokenizer for source texts.
            ``None`` keeps whatever ``tokenizer.src_lang`` currently is.
            NOTE(review): the source column is named "Indonesian statement";
            confirm which language code the training data actually needs.
        tgt_lang: Language code to set on the tokenizer for target texts.
            Defaults to "en_XX" because the target column holds English.
        max_length: Maximum number of tokens kept per sequence; longer
            sequences are truncated.
    """

    def __init__(self, tokenizer, model, src_lang=None, tgt_lang="en_XX", max_length=100):
        super().__init__(tokenizer, model=model, max_length=max_length)
        self.src_lang = src_lang
        self.tgt_lang = tgt_lang

    def __call__(self, batch):
        """Tokenize a list of (source, target) string pairs.

        Returns:
            The tokenizer output as PyTorch tensors, with ``labels`` holding
            the target ids and padding masked to ``label_pad_token_id``.
        """
        input_texts, target_texts = zip(*batch)

        # Set the language codes explicitly rather than relying on defaults
        # applied by the deprecated helper.
        if self.src_lang is not None:
            self.tokenizer.src_lang = self.src_lang
        self.tokenizer.tgt_lang = self.tgt_lang

        model_inputs = self.tokenizer(
            list(input_texts),
            text_target=list(target_texts),
            padding='longest',
            truncation=True,  # required for max_length to take effect
            max_length=self.max_length,
            return_tensors='pt',
        )

        # Padded label positions must not contribute to the loss.
        labels = model_inputs["labels"]
        model_inputs["labels"] = labels.masked_fill(
            labels == self.tokenizer.pad_token_id, self.label_pad_token_id
        )
        return model_inputs


data_collator = CustomDataCollator(tokenizer, model)

# Wrap the training and validation frames so the trainer can index them.
train_dataset = DataFrameDataset(train_data)
eval_dataset = DataFrameDataset(val_data)

# Initialize the trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    # Early stopping with a patience of 3 evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)  # this closing parenthesis was missing, which made the whole cell a SyntaxError

# Start training
trainer.train()

# Save the model next to the checkpoints; an empty path is not a valid directory.
trainer.save_model(trainer.args.output_dir)

# Save the tokenizer in the same directory so the pair can be reloaded together
tokenizer.save_pretrained(trainer.args.output_dir)


2024-03-21 12:09:04.198030: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-21 12:09:04.198199: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-21 12:09:04.499225: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/529 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and targets.

Here is a short example:

model_inputs = tokenizer(src_texts, text_target=tgt_texts, ...)

If you either need to use different keyword arguments for the source and target texts, you should do two calls like
this:

model_inputs = tokenizer(src_texts, ...)
labels = tokenizer(text_target=tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.



Step,Training Loss,Validation Loss
100,1.8085,0.181855
200,0.1209,0.088513
300,0.0576,0.057008
400,0.0328,0.04006
500,0.0215,0.0338
600,0.0142,0.027958
700,0.0114,0.024497
800,0.0083,0.022429
900,0.0063,0.020221
1000,0.0055,0.020172


Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and targets.

Here is a short example:

model_inputs = tokenizer(src_texts, text_target=tgt_texts, ...)

If you either need to use different keyword arguments for the source and target texts, you should do two calls like
this:

model_inputs = tokenizer(src_texts, ...)
labels = tokenizer(text_target=tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.

Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
`prepare_seq2seq_batch` is deprecated 

('/kaggle/working/all_mbart_English_unify_PUNCT/tokenizer_config.json',
 '/kaggle/working/all_mbart_English_unify_PUNCT/special_tokens_map.json',
 '/kaggle/working/all_mbart_English_unify_PUNCT/sentencepiece.bpe.model',
 '/kaggle/working/all_mbart_English_unify_PUNCT/added_tokens.json',
 '/kaggle/working/all_mbart_English_unify_PUNCT/tokenizer.json')

In [None]:
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
# Translate one sample sentence with the model held in `model`.
# Switch to inference mode so dropout is disabled while generating.
model.eval()

# Send the inputs to whichever device the model is on instead of assuming CUDA.
device = model.device

tokenizer.src_lang = "fa_IR"

source = "امیلی یک مکانیک و جورج یک وکیل است. او با ماشین کار می کند."
encoded_source = tokenizer(source, return_tensors="pt").to(device)

# Force English as the first generated token so the output is in English.
generated_tokens = model.generate(
    **encoded_source,
    forced_bos_token_id=tokenizer.lang_code_to_id["en_XX"],
    max_length=512
)

print(tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0])


Emily is a mechanic and George is a lawyer. She works with cars.
