# Import libraries

In [1]:
!pip install datasets
!pip install evaluate rouge_score
!pip install accelerate



In [2]:
import os

import accelerate
import evaluate
import numpy as np
import pandas as pd
import torch
from accelerate.utils import DataLoaderConfiguration
from datasets import load_dataset, Dataset, DatasetDict
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

2024-04-08 02:10:51.779665: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-08 02:10:51.779721: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-08 02:10:51.781220: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# Model

In [3]:
# Checkpoint to fine-tune, and the device it runs on (GPU when one is available).
model_path = "VietAI/vit5-base"
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the tokenizer and the seq2seq model, then move the model onto `device`.
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
model = model.to(device)

# Collator used by the trainer to assemble batches as PyTorch tensors.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    return_tensors="pt",
)

  return self.fget.__get__(instance, owner)()


In [4]:
# Read the CSV with pandas, then wrap the frame as a Hugging Face `Dataset`.
dataset_path = "/kaggle/input/dataset/bio_medicine.csv"
df = pd.read_csv(filepath_or_buffer=dataset_path)
dataset = Dataset.from_pandas(df=df)

In [5]:
# Fixed seed so the random splits are identical on every Restart & Run All.
SPLIT_SEED = 42

# First split: 95% of the rows go to `test`, the remaining 5% are kept for training/validation.
# NOTE(review): this leaves very little training data — confirm the heavy down-sampling is intentional.
trainValid, test = dataset.train_test_split(test_size=0.95, seed=SPLIT_SEED).values()

# Second split: 12.5% of the kept rows become the validation set.
train, valid = trainValid.train_test_split(test_size=0.125, seed=SPLIT_SEED).values()

dataset = DatasetDict({"train": train, "valid": valid, "test": test})

In [6]:
def data_processing(dataset):
    """Tokenize one batch of documents and summaries for seq2seq training.

    Written to be passed to ``dataset.map(..., batched=True)``.

    Args:
        dataset: A batch indexed by column name. It must provide the
            ``"Document"`` and ``"Summary"`` columns.

    Returns:
        dict: ``input_ids`` and ``attention_mask`` taken from the tokenized
        documents, and ``labels`` holding the token ids of the summaries.
    """
    # Both sides are truncated to at most 1024 tokens.
    doc_encoding = tokenizer(dataset["Document"], max_length=1024, truncation=True)
    sum_encoding = tokenizer(dataset["Summary"], max_length=1024, truncation=True)

    # Return a plain dict: `DatasetDict` is a container of named `Dataset` splits,
    # not a mapping from column names to lists of token ids.
    return {
        "input_ids": doc_encoding["input_ids"],
        "attention_mask": doc_encoding["attention_mask"],
        "labels": sum_encoding["input_ids"],
    }

In [7]:
# Tokenize every split in batches of 512 rows and drop the raw columns afterwards.
dataset = dataset.map(
    data_processing,
    batched=True,
    batch_size=512,
    remove_columns=["Summary", "Document", "Dataset"],
)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/20 [00:00<?, ?ba/s]

# Training

In [8]:
metric = evaluate.load("rouge")


def compute_metrics(eval_pred):
    """Compute ROUGE scores and the mean generated length for one evaluation pass.

    Args:
        eval_pred: A ``(predictions, labels)`` pair of token-id arrays.
            Positions equal to ``-100`` are treated as padding.

    Returns:
        dict: The ROUGE results plus ``gen_len`` (mean number of non-pad
        tokens per prediction), every value rounded to 4 decimals.
    """
    predictions, labels = eval_pred
    # Predictions may arrive as a tuple with extra outputs; keep the token ids only.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id
    # -100 is not a valid token id. Replace it with the pad id on a *copy*
    # (np.where) so the caller's arrays are not modified, and so the length
    # count below does not treat those positions as generated tokens.
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # ROUGE's default tokenizer keeps only [a-z0-9], which breaks Vietnamese
    # words apart at every accented letter; split on whitespace instead.
    # The (English) stemmer is not applied when a custom tokenizer is given.
    result = metric.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        tokenizer=lambda text: text.lower().split(),
    )

    # Mean number of non-padding tokens per generated sequence.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {name: round(value, 4) for name, value in result.items()}

In [9]:
# Training configuration: evaluate, log and checkpoint once per epoch, keep a single
# checkpoint on disk, and reload the best one when training ends.
args = Seq2SeqTrainingArguments(
    output_dir="viet",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    learning_rate=1e-4,
    weight_decay=1e-2,
    num_train_epochs=5,
    load_best_model_at_end=True,
    save_total_limit=1,
    # Generate text during evaluation so ROUGE can be computed on decoded output.
    predict_with_generate=True,
    generation_max_length=256,
    # Mixed precision needs a GPU; fall back to full precision when the
    # notebook runs on CPU (the model cell allows that fallback).
    fp16=torch.cuda.is_available(),
)

trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["valid"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# NOTE(review): this configuration object is not passed to the trainer in this cell —
# confirm whether it is still needed.
dataloader_config = DataLoaderConfiguration(
    dispatch_batches=None,
    split_batches=False,
    even_batches=True,
    use_seedable_sampler=True,
)


In [10]:
# Allocator options reach PyTorch through this environment variable, written as
# "option:value". (A bare `NAME=a=256` line only creates Python variables.)
# NOTE(review): PyTorch appears to read this when the CUDA allocator is first used,
# so to affect this run the assignment likely has to happen before the model is
# moved to the GPU — confirm and relocate if needed.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Release cached, currently unused GPU memory before training starts.
torch.cuda.empty_cache()

# Last expression of the cell, so the number of bytes currently allocated is displayed.
torch.cuda.memory_allocated()

In [None]:
# Start fine-tuning with the trainer configured in the Training section.
# Kept as the last expression so the training summary is displayed as the cell output.
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mvmdiioenathn[0m. Use [1m`wandb login --relogin`[0m to force relogin




Epoch,Training Loss,Validation Loss
