In [None]:
!pip install evaluate
!pip install datase
!pip install transformers --upgrade
!pip install bert_score
!pip install rouge_score

In [None]:
# Clone the project repository. Replace the [username] and [token] placeholders
# with real credentials at run time and never save them with the notebook.
!git clone https://[username]:[token]@github.com/nicolovergaro/DNLP_project.git

In [None]:
# Work from the cloned repository root: later cells open files by relative path.
%cd /kaggle/working/DNLP_project

In [None]:
# Unpack the dataset archive (presumably the source of the train/test JSON
# files loaded further down -- confirm against the archive contents).
!unzip microMiscPubSumm.zip

In [None]:
import random
import torch
import evaluate
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader, random_split
from transformers import BartTokenizer, BartForConditionalGeneration, TrainingArguments, Trainer,AutoTokenizer,AutoModelForSeq2SeqLM
from tqdm import tqdm
from nltk.tokenize import sent_tokenize

from utils.reproducibility import *
from utils.datasets import *

In [None]:
# download and load the base model and the associated tokenizer
model_name = "sshleifer/distilbart-cnn-12-6"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

# Dataset configuration shared by the train and test splits, so the two can
# never be built with mismatched settings.
# Ablations: set USE_HIGHLIGHTS = False (abstract only) or
# USE_ABSTRACT = False (highlights only).
USE_HIGHLIGHTS = True
USE_ABSTRACT = True
# Positional length arguments forwarded to TitleGenDataset (presumably the
# maximum input / output token lengths -- confirm against utils.datasets).
DATASET_LENGTHS = (1024, 128)
dataset_kwargs = {"use_highlights": USE_HIGHLIGHTS, "use_abstract": USE_ABSTRACT}

make_it_reproducible()
train_ds = TitleGenDataset("microMiscPubSumm_train.json", tokenizer, *DATASET_LENGTHS, **dataset_kwargs)
test_ds = TitleGenDataset("microMiscPubSumm_test.json", tokenizer, *DATASET_LENGTHS, **dataset_kwargs)

# Optional subsampling for quick experiments:
# num_samples = 1800
# micro_train_ds, _ = random_split(train_ds, [num_samples, len(train_ds)-num_samples], generator=get_generator())
# micro_test_ds, _ = random_split(test_ds, [int(num_samples*0.2), len(test_ds)-int(num_samples*0.2)], generator=get_generator())

In [None]:
# load the evaluation metrics used during training and in the final test
bertscore = evaluate.load("bertscore")  # semantic similarity (embedding-based)
rouge = evaluate.load("rouge")  # lexical n-gram overlap


def preprocess_logits_for_metrics(logits, labels):
    """Reduce raw model outputs to predicted token ids before they are cached.

    Keeping only the argmax ids (instead of the full vocabulary-sized logits)
    keeps the evaluation memory footprint small.

    Args:
        logits: either the logits tensor itself, or a tuple/list of model
            outputs whose first element is the logits tensor.
        labels: the reference label ids; returned untouched.

    Returns:
        A ``(pred_ids, labels)`` tuple, where ``pred_ids`` is the argmax of
        the logits over the last (vocabulary) dimension.
    """
    # Only unwrap when a tuple/list is received: indexing a plain tensor with
    # [0] would silently select the first batch element instead.
    if isinstance(logits, (tuple, list)):
        logits = logits[0]
    pred_ids = torch.argmax(logits, dim=-1)
    return pred_ids, labels


def compute_metric(pred):
    """Compute BERTScore recall and ROUGE scores for one evaluation pass.

    Args:
        pred: the evaluation prediction object; ``pred.predictions[0]`` holds
            the predicted token ids produced by
            ``preprocess_logits_for_metrics`` and ``pred.label_ids`` holds the
            reference ids.

    Returns:
        A dict with the mean BERTScore recall ("bertscore") and the ROUGE-1,
        ROUGE-2, ROUGE-L and ROUGE-Lsum scores, each rounded to 4 decimals.
    """
    pad_id = tokenizer.pad_token_id

    # -100 marks positions to be ignored and is not a valid token id, so it
    # must be replaced before decoding. np.where returns new arrays, leaving
    # the arrays stored on `pred` unmodified. Predictions are sanitised too,
    # since padded batches may carry the same filler value.
    label_ids = np.where(pred.label_ids == -100, pad_id, pred.label_ids)
    pred_ids = np.where(pred.predictions[0] == -100, pad_id, pred.predictions[0])

    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    # print one example pair as a quick sanity check
    print("pred:", pred_str[0], "\n original:", label_str[0])

    rg_out = rouge.compute(predictions=pred_str, references=label_str, rouge_types=["rouge1", "rouge2", "rougeL", "rougeLsum"])

    bs_res = bertscore.compute(predictions=pred_str, references=label_str, lang="en")

    return {
        "bertscore": round(np.mean(bs_res["recall"]), 4),
        "R1": round(rg_out["rouge1"], 4),
        "R2": round(rg_out["rouge2"], 4),
        "RL": round(rg_out["rougeL"], 4),
        "RLsum": round(rg_out["rougeLsum"], 4)
    }

In [None]:
# define the training arguments and the trainer itself
PER_DEVICE_BATCH_SIZE = 4
GRAD_ACCUM_STEPS = 1

# Effective batch size = per-device batch * visible GPUs * accumulation steps.
# On Kaggle (2 GPUs) this is 8, which was previously hard-coded.
n_devices = max(1, torch.cuda.device_count())
effective_batch_size = PER_DEVICE_BATCH_SIZE * n_devices * GRAD_ACCUM_STEPS

# Warm up over ~10% of the optimisation steps; cast to int because
# warmup_steps is a step count.
warmup_steps = int(0.1 * len(train_ds) / effective_batch_size)

# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy`; since transformers is installed with --upgrade, confirm
# which spelling the installed version accepts.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=PER_DEVICE_BATCH_SIZE,
    per_device_eval_batch_size=PER_DEVICE_BATCH_SIZE,
    warmup_steps=warmup_steps,
    learning_rate=5e-5,
    weight_decay=1e-2,
    logging_dir="./logs",
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="bertscore",  # key returned by compute_metric
    report_to="none",
    gradient_accumulation_steps=GRAD_ACCUM_STEPS
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    compute_metrics=compute_metric,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics
)

In [None]:
# Run fine-tuning with the trainer configured in the previous cell.
trainer.train()

In [None]:
# Save the model to the Kaggle working directory so it can be downloaded
# and then uploaded to Hugging Face.
trainer.save_model("/kaggle/working/final_model_misc")

# Test 1

In [None]:
# evaluation of the performance on the complete test set (~2500 papers, mainly bio)

md = trainer.model
md.eval()  # make sure dropout is disabled while generating

pad_id = tokenizer.pad_token_id
pred_texts = []
ref_texts = []

for data in tqdm(test_ds):
    input_ids, labels = data["input_ids"], data["labels"]

    # -100 (ignore index) is not a valid token id: swap it for the pad token
    # before decoding. masked_fill returns a copy, so the dataset is untouched.
    labels = torch.as_tensor(labels)
    labels = labels.masked_fill(labels == -100, pad_id)
    ref_texts.append(tokenizer.decode(labels, skip_special_tokens=True))

    # run generation on whatever device the model lives on
    outs = md.generate(input_ids.unsqueeze(dim=0).to(md.device), num_beams=5, min_length=3, max_length=32)
    pred_texts.append(tokenizer.decode(outs[0], skip_special_tokens=True))

# Score all pairs in a single call per metric instead of once per sample.
# use_aggregator=False yields one ROUGE score per pair, so the plain mean
# below matches the previous per-sample averaging.
rg = rouge.compute(predictions=pred_texts, references=ref_texts, rouge_types=["rouge1", "rouge2"], use_aggregator=False)
rouge1 = float(np.mean(rg["rouge1"]))
rouge2 = float(np.mean(rg["rouge2"]))
bs = float(np.mean(bertscore.compute(predictions=pred_texts, references=ref_texts, lang="en")["recall"]))

print(f"""rouge1: {rouge1}
rouge2: {rouge2}
bertscore: {bs}""")

# THExt test

In [None]:
# Qualitative check: generate a title for the highlights + abstract of the THExt paper.
s = "We propose a novel Transformer-based Highlights Extractor (THExt, in short). We achieve performance superior to state-of-the-art highlights extraction methods. Section-level context encoding turns out to be very effective for sentence ranking. Highlights are short sentences used to annotate scientific papers. They complement the abstract content by conveying the main result findings. To automate the process of paper annotation, highlights extraction aims at extracting from 3 to 5 paper sentences via supervised learning. Existing approaches rely on ad hoc linguistic features, which depend on the analyzed context, and apply recurrent neural networks, which are not effective in learning long-range text dependencies. This paper leverages the attention mechanism adopted in transformer models to improve the accuracy of sentence relevance estimation. Unlike existing approaches, it relies on the end-to-end training of a deep regression model. To attend patterns relevant to highlights content it also enriches sentence encodings with a section-level contextualization. The experimental results, achieved on three different benchmark datasets, show that the designed architecture is able to achieve significant performance improvements compared to the state-of-the-art."
input_ids = tokenizer.encode(s, return_tensors="pt")

# Move the inputs to the model's own device instead of assuming "cuda".
outs = md.generate(input_ids.to(md.device), num_beams=5, min_length=3, max_length=32)
pred_text = tokenizer.decode(outs[0], skip_special_tokens=True)

In [None]:
# display the generated title
pred_text