In [None]:
!pip install transformers datasets evaluate rouge_score accelerate -U

Collecting transformers
  Downloading transformers-4.42.3-py3-none-any.whl (9.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.3/9.3 MB[0m [31m38.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m35.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting accelerate
  Downloading accelerate-0.32.1-py3-none-any.whl (314 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m314.1/314.1 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pya

In [None]:
from huggingface_hub import notebook_login
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
import evaluate
import numpy as np

In [None]:
# SECURITY: never paste an access token into the notebook, not even in a comment.
# A token that has been saved or shared this way must be revoked in the Hugging Face
# settings; enter a fresh one through the login widget below (or a Colab secret).
# Log in to the Hugging Face Hub (training below is configured with push_to_hub=True)
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Download the "hezarai/xlsum-fa" summarization dataset from the Hugging Face Hub
xlsum = load_dataset(
    "hezarai/xlsum-fa",
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Peek at the first training record to see the available fields
xlsum["train"][0]

{'text': 'اين دستور در پی افزايش فعاليت آتشفشان کوه مايون واقع در مرکز فيليپين در روز دوشنبه صادر شد. اين آتشفشان که آخرين بار در سال 2000 فوران کرد در يک ماه گذشته فعال بوده است. مايون فعال ترين آتشفشان در فيليپين است و در 400 سال گذشته در حدود 50 بار فوران کرده است. رناتو سوليدوم، مدير سازمان زلزله نگاری و آتشفشان شناسی فيليپين به خبرگزاری رويتر گفت: "لازم می دانيم سطح هشدار را به درجه چهار افزايش دهيم زيرا امروز صبح شاهد سه مورد انفجار خاکستر بوديم." اين سيستم هشدار دهنده به پنج درجه تقسيم می شود که پنج نشانه خطرناک ترين وضعيت است. "اين بدان معنی است که احتمال فورانی خطرناک وجود دارد." ارتش کاميون هايی برای انتقال روستاييانی که در شعاع 8 کيلومتری آتشفشان زندگی می کنند گسيل کرده است. خدمات',
 'summary': 'مقام های فيليپين دستور تخليه حدودا 20 هزار نفر از ساکنان ناحيه اطراف يک آتشفشان را صادر کرده اند چرا که نگرانند به زودی فوران کند.',
 'title': "آتشفشان فيليپين 'در آستانه فوران'"}

In [None]:
# Name of the pretrained checkpoint; reused later when the model is loaded
checkpoint = "HooshvareLab/pn-summary-b2b-shared"

# Tokenizer that matches the checkpoint
tokenizer = AutoTokenizer.from_pretrained(
    checkpoint,
    legacy=False,
)

In [None]:
# Task prefix prepended to every source document before tokenization
prefix = "summarize: "


def preprocess_function(examples):
    """Tokenize a batch of documents together with their reference summaries.

    Args:
        examples: A batch mapping with a "text" column (source documents) and a
            "summary" column (target summaries), as passed by ``Dataset.map``
            when ``batched=True``.

    Returns:
        The tokenizer output for the prefixed documents (truncated to 512
        tokens), extended with a "labels" entry that holds the token ids of the
        summaries (truncated to 128 tokens).
    """
    sources = []
    for document in examples["text"]:
        sources.append(prefix + document)

    encoded = tokenizer(sources, truncation=True, max_length=512)
    target_encoding = tokenizer(
        text_target=examples["summary"],
        truncation=True,
        max_length=128,
    )
    encoded["labels"] = target_encoding["input_ids"]
    return encoded

In [None]:
# Keep only the first 5000 training and first 500 test examples,
# then drop the reference to the full dataset
train_subset, test_subset = (
    xlsum[split_name].select(range(n_examples))
    for split_name, n_examples in (("train", 5000), ("test", 500))
)
del xlsum

In [None]:
# Tokenize both subsets in batches, then drop the references to the raw subsets
tokenized_train, tokenized_test = (
    subset.map(preprocess_function, batched=True)
    for subset in (train_subset, test_subset)
)
del train_subset, test_subset

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [None]:
# Set up the data collator that batches and pads the tokenized seq2seq examples.
# The optional `model` argument expects a model instance, not a checkpoint name:
# the string that used to be passed here was silently ignored, so it is omitted.
# The collator itself does not move anything to the GPU.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)

In [None]:
# ROUGE metric used to score generated summaries during evaluation
rouge = evaluate.load(
    "rouge",
)

In [None]:
# Define a function to compute metrics
def compute_metrics(eval_pred):
    """Compute ROUGE scores and the mean generated length for one evaluation pass.

    Args:
        eval_pred: A ``(predictions, labels)`` pair of token-id arrays as handed
            over by the trainer. ``predictions`` may also be a tuple whose first
            element holds the generated ids.

    Returns:
        dict: The scores returned by ``rouge.compute`` plus ``gen_len`` (mean
        number of non-padding tokens per prediction), each rounded to 4 decimals.
    """
    predictions, labels = eval_pred
    # Some models return extra outputs next to the generated ids; keep the ids only.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id
    # -100 marks ignored/padded positions and cannot be decoded, so map it back to
    # the pad token in both arrays before decoding.
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # The metric's default tokenizer keeps only Latin letters and digits, which
    # turns Persian text into empty token lists and yields ROUGE = 0.0 everywhere.
    # Whitespace tokenization keeps the Persian words. The English-only stemmer
    # option is dropped because it has no effect with a custom tokenizer.
    result = rouge.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        tokenizer=lambda text: text.split(),
    )

    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    return {k: round(v, 4) for k, v in result.items()}

In [None]:
# Load the pretrained seq2seq model for the checkpoint, then place it on the GPU
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
model = model.to("cuda")

The following encoder weights were not tied to the decoder ['bert/pooler']
The following encoder weights were not tied to the decoder ['bert/pooler']
The following encoder weights were not tied to the decoder ['bert/pooler']
The following encoder weights were not tied to the decoder ['bert/pooler']


In [None]:
# Define training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="finetuned-b2b-with-xlsum",
    eval_strategy="epoch",          # current name of the deprecated `evaluation_strategy`
    learning_rate=2e-5,
    per_device_train_batch_size=8,  # Adjust based on GPU memory
    per_device_eval_batch_size=8,   # Adjust based on GPU memory
    weight_decay=0.01,
    save_total_limit=3,             # Keep at most 3 checkpoints on disk
    num_train_epochs=4,
    predict_with_generate=True,     # Evaluate on generated text so ROUGE can be computed
    fp16=True,                      # Enable mixed precision training
    push_to_hub=True,               # Requires being logged in to the Hugging Face Hub
)

# Wire the model, arguments, data, collator and metric function into the trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Run fine-tuning; the value returned by train() is shown as the cell output
trainer.train()

  decoder_attention_mask = decoder_input_ids.new_tensor(decoder_input_ids != self.config.pad_token_id)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,4.5667,3.401286,0.0,0.0,0.0,0.0,48.702
2,3.9598,3.282422,0.0,0.0,0.0,0.0,54.668
3,3.7221,3.241251,0.0,0.0,0.0,0.0,52.574
4,3.4583,3.231525,0.0,0.0,0.0,0.0,54.022


Non-default generation parameters: {'max_length': 128, 'min_length': 16, 'early_stopping': True, 'num_beams': 3, 'length_penalty': 2.0, 'no_repeat_ngram_size': 2}
  decoder_attention_mask = decoder_input_ids.new_tensor(decoder_input_ids != self.config.pad_token_id)
Non-default generation parameters: {'max_length': 128, 'min_length': 16, 'early_stopping': True, 'num_beams': 3, 'length_penalty': 2.0, 'no_repeat_ngram_size': 2}
  decoder_attention_mask = decoder_input_ids.new_tensor(decoder_input_ids != self.config.pad_token_id)
Non-default generation parameters: {'max_length': 128, 'min_length': 16, 'early_stopping': True, 'num_beams': 3, 'length_penalty': 2.0, 'no_repeat_ngram_size': 2}
  decoder_attention_mask = decoder_input_ids.new_tensor(decoder_input_ids != self.config.pad_token_id)
Non-default generation parameters: {'max_length': 128, 'min_length': 16, 'early_stopping': True, 'num_beams': 3, 'length_penalty': 2.0, 'no_repeat_ngram_size': 2}
  decoder_attention_mask = decoder_inpu

TrainOutput(global_step=2500, training_loss=3.85916748046875, metrics={'train_runtime': 2412.6177, 'train_samples_per_second': 8.29, 'train_steps_per_second': 1.036, 'total_flos': 7043946577920000.0, 'train_loss': 3.85916748046875, 'epoch': 4.0})