In [None]:
!pip install evaluate
!pip install transformers --upgrade
!pip install bert_score
!pip install rouge_score
!pip uninstall huggingface_hub
!pip install huggingface_hub --upgrade

In [None]:
# Clone the project repository (presumably the source of the `utils` package and
# the dataset archives used by the following cells).
# NOTE(review): credentials embedded in the clone URL end up in the notebook and in
# .git/config — prefer a credential helper or a secret injected at runtime.
!git clone https://[USERNAME]:[TOKEN]@github.com/nicolovergaro/DNLP_project.git

In [None]:
# Work from inside the cloned repository so the relative paths used below resolve.
# NOTE(review): absolute Kaggle path — adjust when running in another environment.
%cd /kaggle/working/DNLP_project

In [None]:
!unzip microCSPubSumm.zip
!unzip microBIOPubSumm.zip
!unzip microAIPubSumm.zip
!unzip microMiscPubSumm.zip

In [None]:
import os
import random

import evaluate
import numpy as np
import pandas as pd
import torch
from huggingface_hub import login, logout
from nltk.tokenize import sent_tokenize
from torch.utils.data import DataLoader, random_split
from tqdm import tqdm
from transformers import BartTokenizer, BartForConditionalGeneration, TrainingArguments, Trainer

from utils.reproducibility import *
from utils.datasets import *

In [None]:
tokenizer = BartTokenizer.from_pretrained("sshleifer/distilbart-cnn-12-6")

make_it_reproducible()
# misc datasets
# NOTE(review): 1024 and 32 are presumably the maximum input and title lengths
# in tokens — confirm against TitleGenDataset.
train_ds = TitleGenDataset("microMiscPubSumm_train.json", tokenizer, 1024, 32)
# reduce the train dataset to (at most) 2000 elements; the size is clamped so that
# a dataset smaller than the requested subset does not make random_split raise
n_train = min(2000, len(train_ds))
micro_train_ds, _ = random_split(train_ds, [n_train, len(train_ds) - n_train], generator=get_generator())
test_ds = TitleGenDataset("microMiscPubSumm_test.json", tokenizer, 1024, 32)
# reduce the test dataset to (at most) 200 elements, clamped in the same way
n_test = min(200, len(test_ds))
micro_test_ds, _ = random_split(test_ds, [n_test, len(test_ds) - n_test], generator=get_generator())

In [None]:
# Metric objects, loaded once here and reused by compute_metric on every evaluation.
rouge = evaluate.load("rouge")
bertscore = evaluate.load("bertscore")

# function to extract the encodings predicted by the model
def preprocess_logits_for_metrics(logits, labels):
    """Reduce the model logits to predicted token ids before they are accumulated.

    Args:
        logits: either the logits tensor itself or a tuple/list of model outputs
            whose first element is the logits tensor.
        labels: the reference label ids, returned untouched.

    Returns:
        A tuple `(pred_ids, labels)` where `pred_ids` is the argmax of the logits
        over the last (vocabulary) dimension.
    """
    # Only unwrap when the outputs really are a tuple/list: indexing a bare tensor
    # with [0] would silently select the first element of the batch instead.
    if isinstance(logits, (tuple, list)):
        logits = logits[0]
    pred_ids = torch.argmax(logits, dim=-1)
    return pred_ids, labels

# function to compute the metric on which the trainer decides the best model
def compute_metric(pred):
    """Decode predictions and references and score them with ROUGE and BERTScore.

    Args:
        pred: evaluation output exposing `label_ids` and `predictions`;
            `predictions[0]` holds the token ids returned by
            `preprocess_logits_for_metrics`.

    Returns:
        A dict with the keys "bertscore" (mean BERTScore recall), "R1", "R2",
        "RL" and "RLsum", each rounded to 4 decimals.
    """
    pad_id = tokenizer.pad_token_id

    # -100 marks ignored positions and is not a valid token id, so it is swapped
    # for the pad token before decoding. np.where builds new arrays, leaving the
    # arrays owned by `pred` unmodified. Predictions are masked as well because the
    # Trainer may pad gathered predictions with -100.
    raw_labels = np.asarray(pred.label_ids)
    raw_preds = np.asarray(pred.predictions[0])
    label_ids = np.where(raw_labels != -100, raw_labels, pad_id)
    pred_ids = np.where(raw_preds != -100, raw_preds, pad_id)

    # extraction of the 2 strings (predicted and original)
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    # for debug purposes (skipped when there is nothing to show)
    if pred_str and label_str:
        print("pred:", pred_str[0], "\n original:", label_str[0])

    rg_out = rouge.compute(predictions=pred_str, references=label_str, rouge_types=["rouge1", "rouge2", "rougeL", "rougeLsum"])

    bs_res = bertscore.compute(predictions=pred_str, references=label_str, lang="en")

    return {
        # the "bertscore" entry is the mean BERTScore *recall*
        "bertscore": round(np.mean(bs_res["recall"]), 4),
        "R1": round(rg_out["rouge1"], 4),
        "R2": round(rg_out["rouge2"], 4),
        "RL": round(rg_out["rougeL"], 4),
        "RLsum": round(rg_out["rougeLsum"], 4)
    }

## First tuning on misc-dataset

In [None]:
# original bart distil model
model = BartForConditionalGeneration.from_pretrained("sshleifer/distilbart-cnn-12-6")

# Training configuration for the first fine-tuning stage: evaluate and checkpoint
# once per epoch, then reload the checkpoint with the best `metric_for_best_model`.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=100,
    learning_rate=5e-5,
    weight_decay=1e-2,
    logging_dir="./logs",
    logging_steps=10,
    # `evaluation_strategy` was renamed to `eval_strategy`; the install cell upgrades
    # transformers to the latest release, which no longer accepts the old name
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="bertscore"  # change to choose based on other metrics like R1, R2, RL, RLsum
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=micro_train_ds,
    eval_dataset=micro_test_ds,
    compute_metrics=compute_metric,
    # shrink the logits to token ids before they are accumulated for the metrics
    preprocess_logits_for_metrics=preprocess_logits_for_metrics
)

In [None]:
# NOTE(review): make_it_reproducible presumably re-seeds the random number
# generators so that repeated runs of this cell train identically — confirm in utils.
make_it_reproducible()
# Fine-tune on the reduced misc training set configured in the Trainer above.
trainer.train()

In [None]:
# save the model held by the trainer locally, under ./misc_model
trainer.save_model("./misc_model")

In [None]:
# login to huggingface to save the model online
# The write token is read from the HF_TOKEN environment variable instead of being
# hard-coded: a token embedded in a notebook is exposed to every reader and must be
# revoked. When the variable is unset, login() falls back to its interactive prompt.
login(token=os.environ.get("HF_TOKEN"))

try:
    trainer.model.push_to_hub("titlist-bart-misc-2000")
finally:
    # always drop the stored credentials, even when the upload fails
    logout()

## Tuning on the AI dataset

In [None]:
# ai datasets (used in full, without the random sub-sampling applied to the misc data)
# NOTE(review): 1024 and 32 are presumably the maximum input and title lengths
# in tokens — confirm against TitleGenDataset.
ai_ds = TitleGenDataset("microAIPubSumm_train.json", tokenizer, 1024, 32)
ai_ds_test = TitleGenDataset("microAIPubSumm_test.json", tokenizer, 1024, 32)

In [None]:
# A token with read access is needed because the model is private. It is read from
# the HF_TOKEN environment variable instead of being hard-coded (a token embedded in
# a notebook is exposed to every reader and must be revoked); when the variable is
# unset, login() falls back to its interactive prompt.
login(token=os.environ.get("HF_TOKEN"))
# start from the model already fine-tuned on the misc dataset
model = BartForConditionalGeneration.from_pretrained("pietrocagnasso/titlist-bart-misc-2000")

# Training configuration for the second fine-tuning stage: evaluate and checkpoint
# once per epoch, then reload the checkpoint with the best `metric_for_best_model`.
training_args = TrainingArguments(
    output_dir="./results_ai",
    num_train_epochs=5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=60,
    learning_rate=5e-5,
    weight_decay=1e-2,
    logging_dir="./logs_ai",
    logging_steps=10,
    # `evaluation_strategy` was renamed to `eval_strategy`; the install cell upgrades
    # transformers to the latest release, which no longer accepts the old name
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="bertscore"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ai_ds,
    eval_dataset=ai_ds_test,
    compute_metrics=compute_metric,
    # shrink the logits to token ids before they are accumulated for the metrics
    preprocess_logits_for_metrics=preprocess_logits_for_metrics
)

In [None]:
# NOTE(review): make_it_reproducible presumably re-seeds the random number
# generators so that repeated runs of this cell train identically — confirm in utils.
make_it_reproducible()
# Fine-tune on the AI training set configured in the Trainer above.
trainer.train()

In [None]:
# save the model held by the trainer locally, under ./ai_model
trainer.save_model("./ai_model")

In [None]:
# login to huggingface to save the model online
# The write token is read from the HF_TOKEN environment variable instead of being
# hard-coded: a token embedded in a notebook is exposed to every reader and must be
# revoked. When the variable is unset, login() falls back to its interactive prompt.
login(token=os.environ.get("HF_TOKEN"))

try:
    trainer.model.push_to_hub("titlist-bart-ai")
finally:
    # always drop the stored credentials, even when the upload fails
    logout()

## Qualitative check on a paper by Cagliero and La Quatra

In [None]:
# Sample paper text used as the input of a qualitative title-generation check.
s = "We propose a novel Transformer-based Highlights Extractor (THExt, in short). We achieve performance superior to state-of-the-art highlights extraction methods. Section-level context encoding turns out to be very effective for sentence ranking. Highlights are short sentences used to annotate scientific papers. They complement the abstract content by conveying the main result findings. To automate the process of paper annotation, highlights extraction aims at extracting from 3 to 5 paper sentences via supervised learning. Existing approaches rely on ad hoc linguistic features, which depend on the analyzed context, and apply recurrent neural networks, which are not effective in learning long-range text dependencies. This paper leverages the attention mechanism adopted in transformer models to improve the accuracy of sentence relevance estimation. Unlike existing approaches, it relies on the end-to-end training of a deep regression model. To attend patterns relevant to highlights content it also enriches sentence encodings with a section-level contextualization. The experimental results, achieved on three different benchmark datasets, show that the designed architecture is able to achieve significant performance improvements compared to the state-of-the-art."

# Encode the text into token ids, returned as a PyTorch tensor.
# NOTE(review): no truncation is requested here — confirm the text stays within the
# model's maximum input length.
ins = tokenizer.encode(s, return_tensors="pt")

In [None]:
# Load the model fine-tuned on the AI dataset once, from the local checkpoint written
# by trainer.save_model("./ai_model"). To evaluate the published model instead, pass
# its Hub repository id here rather than loading a second model on top of this one.
model = BartForConditionalGeneration.from_pretrained("./ai_model")

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Beam search with 10 beams; the generated title is constrained to 5-20 tokens.
outs = model.generate(ins.to(device), num_beams=10, min_length=5, max_length=20)

In [None]:
# Decode the generated token ids back into text, dropping the special tokens.
pred_str = tokenizer.batch_decode(outs, skip_special_tokens=True)

In [None]:
# Show the decoded output generated for the sample text.
print(pred_str)

In [None]:
# NOTE(review): presumably the actual title of the sample paper, printed for a
# side-by-side comparison with the generated one — confirm.
print("Transformer-based highlights extraction from scientific papers")