In [9]:
import datasets
import pandas as pd
from IPython.display import display, HTML
from datasets import ClassLabel
from transformers import BertTokenizerFast
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments,AutoTokenizer
from transformers import EncoderDecoderModel
from transformers import BertTokenizer
from rouge import Rouge
import numpy as np
from hazm import *

In [27]:
# Load the training split of pn_summary. "force_redownload" makes the
# datasets library fetch the data again instead of reusing a cached copy.
train_load_options = {"split": "train", "download_mode": "force_redownload"}
train_data = datasets.load_dataset("pn_summary", **train_load_options)

pn_summary.py:   0%|          | 0.00/5.55k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

pn_summary.zip:   0%|          | 0.00/89.6M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/82022 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5592 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5593 [00:00<?, ? examples/s]

In [28]:
# Tokenizer shared by the preprocessing, metric and trainer cells.
tokenizer_checkpoint = "HooshvareLab/bert-base-parsbert-uncased"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)

In [29]:
normalizer = Normalizer()

def process_data_to_model_inputs(batch):
    # tokenize the inputs and labels
    for i in range(len(batch['article'])):
      batch['article'][i]=normalizer.normalize(batch['article'][i])
      batch['summary'][i]=normalizer.normalize(batch['summary'][i])

    inputs = tokenizer(
        batch["article"],
        padding="max_length",
        truncation=True,
        max_length=encoder_max_length,
    )
    outputs = tokenizer(
        batch["summary"],
        padding="max_length",
        truncation=True,
        max_length=decoder_max_length,
    )

    batch["input_ids"] = inputs.input_ids
    batch["attention_mask"] = inputs.attention_mask
    batch["labels"] = outputs.input_ids.copy()

    # because BERT automatically shifts the labels, the labels correspond exactly to `decoder_input_ids`.
    # We have to make sure that the PAD token is ignored
    batch["labels"] = [
        [0 if token == tokenizer.pad_token_id else token for token in labels]
        for labels in batch["labels"]
    ]

    return batch

In [30]:
metric = Rouge()
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions

    labels = np.where(labels != 0, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # Compute ROUGE scores
    result = metric.get_scores(decoded_preds, decoded_labels,avg=True)
    print(result)
    with open("result_bert2bert.txt","a") as handler:
        handler.write(f'rouge-1 - recall: {result["rouge-1"]["r"]} - precision: {result["rouge-1"]["p"]} - fscore: {result["rouge-1"]["f"]}\n')
        handler.write(f'rouge-2 - recall: {result["rouge-2"]["r"]} - precision: {result["rouge-2"]["p"]} - fscore: {result["rouge-2"]["f"]}\n')
        handler.write(f'rouge-l - recall: {result["rouge-l"]["r"]} - precision: {result["rouge-l"]["p"]} - fscore: {result["rouge-l"]["f"]}\n\n')
    # Extract F-measure for each ROUGE score
    rouge_result = {
        "rouge1": result["rouge-1"]["f"],
        "rouge2": result["rouge-2"]["f"],
        "rougeL": result["rouge-l"]["f"],
    }

    return rouge_result

In [31]:
# Maximum token lengths used when tokenizing articles (encoder side) and
# summaries (decoder side).
encoder_max_length, decoder_max_length = 512, 128

# Used both as the map() chunk size and as the per-device train/eval batch size.
batch_size = 4

# Not referenced by the cells shown here; kept for any later use.
sample_size = 10000

In [32]:
# Tokenize the training split in batches and drop the raw columns so only the
# model inputs produced by process_data_to_model_inputs remain.
raw_train_columns = ['id', 'title', 'article', 'summary', 'category', 'categories', 'network', 'link']
train_data = train_data.map(
    process_data_to_model_inputs,
    remove_columns=raw_train_columns,
    batch_size=batch_size,
    batched=True,
)

Map:   0%|          | 0/82022 [00:00<?, ? examples/s]

In [33]:
# Expose the three model columns of the training split as torch tensors.
train_tensor_columns = ["input_ids", "attention_mask", "labels"]
train_data.set_format(type="torch", columns=train_tensor_columns)

In [34]:
# Load the validation split of pn_summary (no forced re-download here).
val_data = datasets.load_dataset("pn_summary", split="validation")

In [35]:
# Apply the same preprocessing as for training and drop the raw columns.
raw_val_columns = ['id', 'title', 'article', 'summary', 'category', 'categories', 'network', 'link']
val_data = val_data.map(
    process_data_to_model_inputs,
    remove_columns=raw_val_columns,
    batch_size=batch_size,
    batched=True,
)

Map:   0%|          | 0/5592 [00:00<?, ? examples/s]

In [36]:
# Expose the three model columns of the validation split as torch tensors.
val_tensor_columns = ["input_ids", "attention_mask", "labels"]
val_data.set_format(type="torch", columns=val_tensor_columns)

In [37]:
# Build the encoder-decoder model with the same ParsBERT checkpoint on both
# sides, write it to a local directory, and load it back from there.
parsbert_checkpoint = "HooshvareLab/bert-base-parsbert-uncased"
local_model_dir = "bert2bert"

bert2bert = EncoderDecoderModel.from_encoder_decoder_pretrained(
    parsbert_checkpoint, parsbert_checkpoint
)
bert2bert.save_pretrained(local_model_dir)
bert2bert = EncoderDecoderModel.from_pretrained(local_model_dir)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Some weights of BertLMHeadModel were not initialized from the model checkpoint at HooshvareLab/bert-base-parsbert-uncased and are newly initialized: ['bert.encoder.layer.0.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.0.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.0.crossattention.output.dense.bias', 'bert.encoder.layer.0.crossattention.output.dense.weight', 'bert.encoder.layer.0.crossattention.self.ke

In [38]:
# Special-token ids and generation settings, applied to the model config in
# the order listed. Token ids come from the tokenizer; the vocabulary size is
# copied from the encoder's config.
model_config_overrides = {
    "decoder_start_token_id": tokenizer.cls_token_id,
    "eos_token_id": tokenizer.sep_token_id,
    "pad_token_id": tokenizer.pad_token_id,
    "vocab_size": bert2bert.config.encoder.vocab_size,
    "max_length": 128,
    "min_length": 0,
    "no_repeat_ngram_size": 2,
    "early_stopping": True,
    "length_penalty": 2.0,
    "num_beams": 3,
}
for option_name, option_value in model_config_overrides.items():
    setattr(bert2bert.config, option_name, option_value)

In [39]:
# Training configuration, grouped by concern. Values are unchanged.
training_args = Seq2SeqTrainingArguments(
    # where checkpoints are written
    output_dir="bert2bert_model",
    # schedule
    num_train_epochs=5,
    warmup_steps=1000,
    # batching and precision
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    fp16=True,
    # evaluation and checkpointing every 10000 steps, using generation for
    # the predictions handed to compute_metrics
    evaluation_strategy="steps",
    eval_steps=10000,
    save_steps=10000,
    predict_with_generate=True,
    # logging
    logging_steps=2,
    report_to="none",
)



In [None]:
# Wire the model, tokenizer, data splits and ROUGE metric into the trainer,
# then start fine-tuning.
trainer = Seq2SeqTrainer(
    model=bert2bert,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer.train()

  trainer = Seq2SeqTrainer(
  decoder_attention_mask = decoder_input_ids.new_tensor(decoder_input_ids != self.config.pad_token_id)


Step,Training Loss,Validation Loss
