In [2]:
import pandas as pd
import sys, os.path
from torch import nn
from datasets import Dataset
from torch.utils.data import DataLoader
from tqdm import tqdm
import nltk
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, PegasusTokenizer
import datetime
from transformers import AutoModel, AutoTokenizer, AutoConfig
from huggingface_hub import PyTorchModelHubMixin
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

from functools import cache
from typing import List
import torch
import pandas as pd
from tqdm import tqdm
from transformers import pipeline 

sys.path.append('../glimpse/evaluate/')
from evaluate_common_metrics_samples import evaluate_rouge
from evaluate_bartbert_metrics import evaluate_bartbert

Please run data processing if the folder ../data/processed doesn't contain processed files

In [8]:
def prepare_dataset(dataset_path) -> Dataset: 
    try:
        dataset = pd.read_csv(dataset_path)
    except:
        raise ValueError(f"Unknown dataset {dataset_path}")

    # make a dataset from the dataframe
    dataset = Dataset.from_pandas(dataset)

    return dataset

def evaluate_summarizer(dataset: Dataset) -> Dataset:
    """
    @param dataset: A dataset with the text
    @return: The same dataset with the summaries added
    """

    # generate summaries
    summaries = []
    print("Generating summaries...")

    # (tqdm library for progress bar) 
    for sample in tqdm(dataset):
        text = sample["text"] 
        
        text = text.replace('-----', '\n')
        sentences = nltk.sent_tokenize(text)
        # remove empty sentences
        sentences = [sentence for sentence in sentences if sentence != ""]
        summaries.append(sentences)

    # add summaries to the huggingface dataset
    dataset = dataset.map(lambda example: {"summary": summaries.pop(0)})

    return dataset

def parse_summaries(summaries_dataset:Dataset) -> pd.DataFrame:
    
    try:
        summaries = summaries_dataset
    except:
        raise ValueError(f"Unknown dataset! Error with summaries")

    # check if the dataframe has the right columns
    if not all(
        col in summaries.columns for col in ["index", "id", "text", "gold", "summary", "id_candidate"]
    ):
        raise ValueError(
            "The dataframe must have columns ['index', 'id', 'text', 'gold', 'summary', 'id_candidate']"
        )

    return summaries

def compute_rsa(summaries: pd.DataFrame, model, tokenizer, device):
    results = []
    for name, group in tqdm(summaries.groupby(["id"])):
        rsa_reranker = RSAReranking(
            model,
            tokenizer,
            device=device,
            candidates=group.summary.unique().tolist(),
            source_texts=group.text.unique().tolist(),
            batch_size=32,
            rationality=3,
        )
        
        (
            best_rsa,
            best_base,
            speaker_df,
            listener_df,
            initial_listener,
            language_model_proba_df,
            initial_consensuality_scores,
            consensuality_scores,
        ) = rsa_reranker.rerank(t=2)

        gold = group['gold'].tolist()[0]

        results.append(
            {
                "id": name,
                "best_rsa": best_rsa,  # best speaker score
                "best_base": best_base,  # naive baseline
                "speaker_df": speaker_df,  # all speaker results
                "listener_df": listener_df,  # all listener results (chances of guessing correctly)
                "initial_listener": initial_listener,
                "language_model_proba_df": language_model_proba_df,
                "initial_consensuality_scores": initial_consensuality_scores,
                "consensuality_scores": consensuality_scores,  # uniqueness scores
                "gold": gold,
                "rationality": 3,  # hyperparameter
                "text_candidates" : group
            }
        )

    return results

## Generate Extractive Summaries

In [6]:
dataset_path = "../data/processed/all_reviews_2017.csv"
year = dataset_path.split("/")[-1].split("_")[-1].split(".")[0]
print(f"Using {year} dataset")
# |indexes| of reviews selected
limit = None

# prepare the dataset
dataset = prepare_dataset(dataset_path)

#limit the number of samples
if limit is not None:
    _lim = min(limit, len(dataset))
    dataset = dataset.select(range(_lim))

# generate summaries
dataset = evaluate_summarizer(dataset)

df_dataset = dataset.to_pandas()
df_dataset = df_dataset.explode("summary")
df_dataset = df_dataset.reset_index()
# add an idx with  the id of the summary for each example
df_dataset["id_candidate"] = df_dataset.groupby(["index"]).cumcount()

# removing missing values
if df_dataset.isnull().values.sum() > 0:
    df_dataset.dropna(axis=0,inplace=True)
    assert df_dataset.isnull().values.sum() == 0, "Missing Values!"

Using ita dataset
Generating summaries...


100%|██████████| 18/18 [00:00<00:00, 557.31it/s]


Map:   0%|          | 0/18 [00:00<?, ? examples/s]

In [6]:
# save the dataset
# create output dir if it doesn't exist
output_dir = "../data/candidates/" 
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

df_dataset.to_csv(f"{output_dir}extractive_summaries{year}.csv", index=False, encoding="utf-8")

# Generate Abstractive summaries

In [3]:
GENERATION_CONFIGS = {
    "top_p_sampling": {
        "max_new_tokens": 200,
        "do_sample": True,
        "top_p": 0.95,
        "temperature": 1.0,
        "num_return_sequences": 8,
        "num_beams" : 1,

        #"num_beam_groups" : 4,
    },
    **{
        f"sampling_topp_{str(topp).replace('.', '')}": {
            "max_new_tokens": 200,
            "do_sample": True,
            "num_return_sequences": 8,
            "top_p": 0.95,
        }
        for topp in [0.5, 0.8, 0.95, 0.99]
    },
}

# add base.csv config to all configs
for key, value in GENERATION_CONFIGS.items():
    GENERATION_CONFIGS[key] = {
        # "max_length": 2048,
        "min_length": 0,
        "early_stopping": True,
        **value,
    }


def prepare_dataset(dataset_path) -> Dataset:
    try:
        dataset = pd.read_csv(dataset_path)
    except:
        raise ValueError(f"Unknown dataset {dataset_path}")

    # make a dataset from the dataframe
    dataset = Dataset.from_pandas(dataset)

    return dataset


def evaluate_summarizer(
    model, tokenizer, dataset: Dataset, decoding_config, batch_size: int,
    device: str, trimming: bool
) -> Dataset:
    """
    @param model: The model used to generate the summaries
    @param tokenizer: The tokenizer used to tokenize the text and the summary
    @param dataset: A dataset with the text
    @param decoding_config: Dictionary with the decoding config
    @param batch_size: The batch size used to generate the summaries
    @return: The same dataset with the summaries added
    """
    # create a dataset with the text and the summary

    # create a dataloader
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False, drop_last=trimming)
    # generate summaries
    summaries = []
    print("Generating summaries...")

    for batch in tqdm(dataloader):
        text = batch["text"]

        inputs = tokenizer(
            text,
            max_length=1024,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        
        # move inputs to device
        inputs = {key: value.to(device) for key, value in inputs.items()}

        # generate summaries
        outputs = model.module.generate(
            **inputs,
            **decoding_config,
        )

        
        total_size = outputs.numel()  # Total number of elements in the tensor
        target_size = batch_size * outputs.shape[-1]  # Target size of the last dimension
        pad_size = (target_size - (total_size % target_size)) % target_size  # Calculate the required padding size to make the total number of elements divisible by the target size

        # Pad the tensor with zeros to make the total number of elements divisible by the target size
        if not trimming and pad_size != 0: outputs = torch.nn.functional.pad(outputs, (0, 0, 0, pad_size // outputs.shape[-1]))

        # output : (batch_size * num_return_sequences, max_length)
        try:
            outputs = outputs.reshape(batch_size, -1, outputs.shape[-1])
        except Exception as e:
            print(f"Error reshaping outputs: {e}")
            raise ValueError(f"Cannot reshape tensor of size {outputs.numel()} into shape "
                            f"({batch_size}, -1, {outputs.shape[-1]}).")
        
    
        
        # decode summaries
        for b in range(batch_size):
            summaries.append(
                [
                    tokenizer.decode(
                        outputs[b, i],
                        skip_special_tokens=True,
                    )
                    for i in range(outputs.shape[1])
                ]
            )

    # if trimming the last batch, remove them from the dataset
    if trimming: dataset = dataset.select(range(len(summaries)))
    
    # add summaries to the huggingface dataset
    dataset = dataset.map(lambda example: {"summary": summaries.pop(0)})
    
    return dataset


def sanitize_model_name(model_name: str) -> str:
    """
    Sanitize the model name to be used as a folder name.
    @param model_name: The model name
    @return: The sanitized model name
    """
    return model_name.replace("/", "_")

In [4]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"using {device} device")

# Fixed configuration
model_name = "facebook/bart-large-cnn"
dataset_path = "/kaggle/working/processed/all_reviews_2017.csv"
decoding_config = "top_p_sampling"  # Assuming GENERATION_CONFIGS exists
batch_size = 32
trimming = True
output_dir = "/kaggle/working/candidates/"
limit = None # You can change this value
scripted_run = False

# Load the model
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.unk_token
tokenizer.pad_token_id = tokenizer.unk_token_id

# Multiple GPU
model = model.to(device)
if torch.cuda.device_count()>1:
    print(f"Using {torch.cuda.device_count()} GPUs!")
    model=nn.DataParallel(model)

# Load the dataset
print("Loading dataset...")
dataset = prepare_dataset(dataset_path)

# Limit the number of samples
_lim = min(limit, len(dataset))
dataset = dataset.select(range(_lim))

# Generate summaries
dataset = evaluate_summarizer(
    model,
    tokenizer,
    dataset,
    GENERATION_CONFIGS[decoding_config],
    batch_size=batch_size,
    device=device,
    trimming=trimming,
)


df_dataset = dataset.to_pandas()
df_dataset = df_dataset.explode("summary").reset_index()
df_dataset['id_candidate'] = df_dataset.groupby(['index']).cumcount()

using cuda device


config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Using 2 GPUs!
Loading dataset...
Generating summaries...


100%|██████████| 20/20 [00:34<00:00,  1.71s/it]


Map:   0%|          | 0/20 [00:00<?, ? examples/s]

In [5]:
# Save the dataset
now = datetime.datetime.now()
date = now.strftime("%Y-%m-%d-%H-%M-%S")
model_name_sanitized = sanitize_model_name(model_name)
padding_status = "trimmed" if trimming else "padded"

# save the dataset
# create output dir if it doesn't exist
output_dir = "../data/candidates/" 
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

df_dataset.to_csv(f"{output_dir}abstractive_summaries{year}.csv", index=False, encoding="utf-8")

## Compute RSA with Sentiment Analysis
Label each summary with sentiment, matching between sentiments may make identification of documente during rsa more accurate

In [9]:
def filter_best_rsa_by_sentiment(best_rsa, meta_review, rsareranker):
    """
    MODIFICA: Filter the list of candidate summaries (best_rsa) keeping only those whose sentiment corresponds 
    to that of the metareview
    """
    meta_sentiment = rsareranker.get_sentiment(meta_review)
    filtered_candidates = []
    for candidate in best_rsa:
        candidate_sentiment = rsareranker.get_sentiment(candidate)
        if candidate_sentiment == meta_sentiment:
            filtered_candidates.append(candidate)

    return filtered_candidates if len(filtered_candidates) > 0 else best_rsa



def kl_divergence(p, q):
    """
    Compute the KL divergence between two distributions
    """
    return torch.nan_to_num(p * (p / q).log(), nan=0.0).sum(-1)


def jensen_shannon_divergence(p, q):
    """
    Compute the Jensen-Shannon divergence between two distributions
    """
    m = 0.5 * (p + q)
    return 0.5 * (kl_divergence(p, m) + kl_divergence(q, m))


class RSAReranking:
    """
    Rerank a list of candidates according to the RSA model.
    MODIFICA: Integrazione del sentiment analysis per aggiungere bonus al punteggio.
    """

    def __init__(
            self,
            model,
            tokenizer,
            candidates: List[str],
            source_texts: List[str],
            batch_size: int = 32,
            rationality: int = 1,
            device="cpu",
            sentiment_weight: float = 1.0,
    ):
        """
        :param model: hf model used to compute the likelihoods (supposed to be a seq2seq model), is S0 in the RSA model
        :param tokenizer:
        :param candidates: list of candidate summaries
        :param source_texts: list of source texts (ad es. metareview gold)
        :param batch_size: batch size used to compute the likelihoods (can be high since we don't need gradients and it's a single forward pass)
        :param rationality: rationality parameter of the RSA model
        :param device: device used to compute the likelihoods
        :param sentiment_weight: fattore per integrare il bonus del sentiment
        """
        self.model = model
        self.device = device
        self.tokenizer = tokenizer

        self.candidates = candidates
        self.source_texts = source_texts

        self.batch_size = batch_size
        self.rationality = rationality
        self.sentiment_weight = sentiment_weight 

        # Sentiment analysis pipeline
        self.sentiment_analyzer = pipeline(
            "sentiment-analysis",
            device=0 if device != "cpu" else -1
        )

    def compute_conditionned_likelihood(
            self, x: List[str], y: List[str], mean: bool = True
    ) -> torch.Tensor:
        """
        Compute the likelihood of y given x

        :param x: list of source texts len(x) = batch_size
        :param y: list of candidate summaries len(y) = batch_size
        :param mean: average the likelihoods over the tokens of y or take the sum
        :return: tensor of shape (batch_size) containing the likelihoods of y given x
        """

        assert len(x) == len(y), "x and y must have the same length"

        loss_fn = torch.nn.CrossEntropyLoss(reduction="none")
        batch_size = len(x)

        x_enc = self.tokenizer(x, return_tensors="pt", padding=True, truncation=True)
        y_enc = self.tokenizer(y, return_tensors="pt", padding=True, truncation=True)

        x_ids = x_enc.input_ids.to(self.device)
        y_ids = y_enc.input_ids.to(self.device)

        logits = self.model(
            input_ids=x_ids,
            decoder_input_ids=y_ids,
            attention_mask=x_enc.attention_mask.to(self.device),
            decoder_attention_mask=y_enc.attention_mask.to(self.device),
        ).logits

        shifted_logits = logits[..., :-1, :].contiguous()
        shifted_ids = y_ids[..., 1:].contiguous()

        likelihood = -loss_fn(
            shifted_logits.view(-1, shifted_logits.size(-1)), shifted_ids.view(-1)
        )

        likelihood = likelihood.view(batch_size, -1).sum(-1)
        if mean:
            likelihood /= (y_ids != self.tokenizer.pad_token_id).float().sum(-1)

        return likelihood

    def get_sentiment(self, text: str, max_length: int = 512) -> str:
        """
        MODIFICA: sentiment of a text. If it is longer of max_ length token, segment it into chunks
        """
        tokens = self.tokenizer.tokenize(text)
        if len(tokens) > max_length:
            chunks = [
                self.tokenizer.convert_tokens_to_string(tokens[i:i+max_length])
                for i in range(0, len(tokens), max_length)
            ]
            sentiments = []
            for chunk in chunks:
                result = self.sentiment_analyzer(chunk, truncation=True)[0]
                sentiments.append(result['label'])
            sentiment = max(set(sentiments), key=sentiments.count)
        else:
            sentiment = self.sentiment_analyzer(text, truncation=True)[0]['label']
        return sentiment

    def compute_sentiment_bonus(self, sources: List[str], candidates: List[str]) -> torch.Tensor:
        """
        MODIFICA: Compute sentiment bonus based on the aggregated sentiment
        """
        bonus_list = []
        for s_text, c_text in zip(sources, candidates):
            s_sentiment = self.get_sentiment(s_text)
            c_sentiment = self.get_sentiment(c_text)
            if s_sentiment == c_sentiment:
                bonus_list.append(1.0)
            else:
                bonus_list.append(-1.0)
        return torch.tensor(bonus_list).to(self.device)



    def score(self, x: List[str], y: List[str], **kwargs):
        """
        MODIFICA: Comine the likelihood with the sentiment bonus
        """
        base_score = self.compute_conditionned_likelihood(x, y, **kwargs)
        sentiment_bonus = self.compute_sentiment_bonus(x, y)
        return base_score + self.sentiment_weight * sentiment_bonus

    def likelihood_matrix(self) -> torch.Tensor:
        """
        :return: likelihood matrix : (num_source_texts, num_candidates),
        dove likelihood[i, j] è la likelihood del candidato j per il source text i.
        """
        likelihood_matrix = torch.zeros(
            (len(self.source_texts), len(self.candidates))
        ).to(self.device)

        pairs = []
        for i, source_text in enumerate(self.source_texts):
            for j, candidate in enumerate(self.candidates):
                pairs.append((i, j, source_text, candidate))

        # split batch pairs
        batches = [
            pairs[i: i + self.batch_size]
            for i in range(0, len(pairs), self.batch_size)
        ]

        for batch in tqdm(batches):
            batch_sources = [pair[2] for pair in batch]
            batch_candidates = [pair[3] for pair in batch]

            with torch.no_grad():
                likelihoods = self.score(batch_sources, batch_candidates, mean=True)

            for k, (i, j, _, _) in enumerate(batch):
                likelihood_matrix[i, j] = likelihoods[k].detach()

        return likelihood_matrix

    @cache
    def S(self, t):
        if t == 0:
            return self.initial_speaker_probas
        else:
            listener = self.L(t - 1)
            prod = listener * self.rationality
            return torch.log_softmax(prod, dim=-1)

    @cache
    def L(self, t):
        speaker = self.S(t)
        return torch.log_softmax(speaker, dim=-2)

    def mk_listener_dataframe(self, t):
        self.initial_speaker_probas = self.likelihood_matrix()

        initial_listener_probas = self.L(0)

        uniform_distribution_over_source_texts = torch.ones_like(
            initial_listener_probas
        ) / len(self.source_texts)

        initital_consensuality_score = (
                torch.exp(initial_listener_probas)
                * (initial_listener_probas - torch.log(uniform_distribution_over_source_texts))
        ).sum(0).cpu().numpy()

        initital_consensuality_score = pd.Series(initital_consensuality_score, index=self.candidates)

        initial_listener_probas = initial_listener_probas.cpu().numpy()
        initial_listener_probas = pd.DataFrame(initial_listener_probas)
        initial_listener_probas.index = self.source_texts
        initial_listener_probas.columns = self.candidates

        initial_speaker_probas = self.S(0).cpu().numpy()
        initial_speaker_probas = pd.DataFrame(initial_speaker_probas)
        initial_speaker_probas.index = self.source_texts
        initial_speaker_probas.columns = self.candidates

        listener_df = pd.DataFrame(self.L(t).cpu().numpy())
        consensuality_scores = (
                torch.exp(self.L(t))
                * (self.L(t) - torch.log(uniform_distribution_over_source_texts))
        ).sum(0).cpu().numpy()
        consensuality_scores = pd.Series(consensuality_scores, index=self.candidates)

        S_mat = self.S(t).cpu().numpy()
        speaker_df = pd.DataFrame(S_mat)

        listener_df.index = self.source_texts
        speaker_df.index = self.source_texts
        listener_df.columns = self.candidates
        speaker_df.columns = self.candidates

        return listener_df, speaker_df, initial_listener_probas, initial_speaker_probas, initital_consensuality_score, consensuality_scores

    def rerank(self, t=1):
        """
        Return the best summary (secondo RSA) per source text.
        """
        (
            listener_df,
            speaker_df,
            initial_listener_proba,
            initial_speaker_proba,
            initital_consensuality_score,
            consensuality_scores,
        ) = self.mk_listener_dataframe(t=t)
        best_rsa = speaker_df.idxmax(axis=1).values
        best_base = initial_listener_proba.idxmax(axis=1).values

        return (
            best_rsa,
            best_base,
            speaker_df,
            listener_df,
            initial_listener_proba,
            initial_speaker_proba,
            initital_consensuality_score,
            consensuality_scores,
        )


In [10]:
def compute_rsa(summaries: pd.DataFrame, model, tokenizer, device):
    results = []
    for name, group in tqdm(summaries.groupby(["id"])):
        rsa_reranker = RSAReranking(
            model,
            tokenizer,
            device=device,
            candidates=group.summary.unique().tolist(),
            source_texts=group.text.unique().tolist(),
            batch_size=16,
            rationality=3,
            sentiment_weight=1.0  
        )
        
        (
            best_rsa,
            best_base,
            speaker_df,
            listener_df,
            initial_listener,
            language_model_proba_df,
            initial_consensuality_scores,
            consensuality_scores,
        ) = rsa_reranker.rerank(t=2)

        gold = group['gold'].tolist()[0]
        
        best_rsa_filtered = filter_best_rsa_by_sentiment(best_rsa, gold, rsa_reranker)
        
        results.append(
            {
                "id": name,
                "best_rsa": best_rsa_filtered,  
                "best_base": best_base,  
                "speaker_df": speaker_df,
                "listener_df": listener_df,
                "initial_listener": initial_listener,
                "language_model_proba_df": language_model_proba_df,
                "initial_consensuality_scores": initial_consensuality_scores,
                "consensuality_scores": consensuality_scores,
                "gold": gold,
                "rationality": 3, 
                "text_candidates": group
            }
        )

    return results



# download model
model_name = "google/pegasus-arxiv"
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"using {device} device")

# load the model and the tokenizer
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model = model.to(device)

if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs!")
    model = nn.DataParallel(model)

if "pegasus" in model_name:
    tokenizer = PegasusTokenizer.from_pretrained(model_name)
else:
    tokenizer = AutoTokenizer.from_pretrained(model_name)

output_path = "../output/quality/"
# load the summaries
summaries = parse_summaries(df_dataset)
n_articles = 5
selected_articles = list(summaries.groupby("id").count().index[:n_articles])
mask = [summaries["id"][i] in selected_articles for i in range(len(summaries))]
selected_summaries = summaries[mask]
assert len(selected_summaries.groupby("id").count()) == n_articles, "Error in selecting articles!"
print(f"using a dataset with {len(selected_summaries.groupby('id').count())} articles")

results = compute_rsa(selected_summaries, model, tokenizer, device)

results = {"results": results}
results["metadata/reranking_model"] = model_name
results["metadata/rsa_iterations"] = 3

print("Best Summaries generated succesfully!")

# save dataframe with base summaries
all_base_df = pd.DataFrame(results["results"])
display(all_base_df)
base_df = all_base_df.loc[:, ["id", "best_rsa", "gold"]]
display(base_df)
base_df.to_csv(f"{output_path}base_glimpse_{year}_{n_articles}samples.csv", index=False)



using cuda device
Using 2 GPUs!
using a dataset with 5 articles


  0%|          | 0/5 [00:00<?, ?it/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]


  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:02<00:09,  2.31s/it][A
 40%|████      | 2/5 [00:03<00:05,  1.78s/it][A
 60%|██████    | 3/5 [00:04<00:03,  1.55s/it][A
 80%|████████  | 4/5 [00:05<00:01,  1.25s/it][A
100%|██████████| 5/5 [00:06<00:00,  1.27s/it][A
 20%|██        | 1/5 [00:10<00:42, 10.69s/it]
  0%|          | 0/2 [00:00<?, ?it/s][A
 50%|█████     | 1/2 [00:01<00:01,  1.54s/it][A
100%|██████████| 2/2 [00:02<00:00,  1.47s/it][A
 40%|████      | 2/5 [00:14<00:19,  6.38s/it]
  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:00<00:03,  1.13it/s][A
 40%|████      | 2/5 [00:02<00:03,  1.06s/it][A
 60%|██████    | 3/5 [00:03<00:02,  1.16s/it][A
 80%|████████  | 4/5 [00:04<00:01,  1.28s/it][A
100%|██████████| 5/5 [00:05<00:00,  1.13s/it][A
 60%|██████    | 3/5 [00:19<00:12,  6.17s/it]
  0%|          | 0/4 [00:00<?, ?it/s][A
 25%|██▌       | 1/4 [00:01<00:04,  1.49s/it][A
 50%|█████     | 2/4 [00:03<00:03,  1.53s/it][A
 75%|██████

Best Summaries generated succesfully!


Unnamed: 0,id,best_rsa,best_base,speaker_df,listener_df,initial_listener,language_model_proba_df,initial_consensuality_scores,consensuality_scores,gold,rationality,text_candidates
0,"(https://openreview.net/forum?id=B1jnyXXJx,)",[This paper proposes a regularizer that is cla...,[This paper proposes a regularizer that is cla...,...,...,...,...,The paper proposes a method that helps escape ...,The paper proposes a method that helps escape ...,The paper proposes a method for accelerating o...,3,index ...
1,"(https://openreview.net/forum?id=BJO-BuT1g,)",[This paper introduces an elegant method to tr...,[Paper addresses problem of efficient neural s...,...,...,...,...,It is done by learning a smaller set of parame...,It is done by learning a smaller set of parame...,The reviewers (two of whom stated maximum conf...,3,index ...
2,"(https://openreview.net/forum?id=BkCPyXm1l,)",[Aims to tackle neural network regularization ...,[Aims to tackle neural network regularization ...,...,...,...,...,The authors introduced a regularization scheme...,The authors introduced a regularization scheme...,The reviewers unanimously recommend rejection.,3,index ...
3,"(https://openreview.net/forum?id=S1HEBe_Jl,)",[The submission proposes to modify the typical...,[The submission proposes to modify the typical...,...,...,...,...,The submission proposes to modify the typical ...,The submission proposes to modify the typical ...,Interesting paper but not over the accept bar.,3,index ...
4,"(https://openreview.net/forum?id=S1J0E-71l,)",[Author's paper proposes to use surprisal-driv...,[Author's paper proposes to use surprisal-driv...,...,...,...,...,Review of a new paper on how to use surprising...,Review of a new paper on how to use surprising...,"Based on the feedback, I'm going to be rejecti...",3,index ...


Unnamed: 0,id,best_rsa,gold
0,"(https://openreview.net/forum?id=B1jnyXXJx,)",[This paper proposes a regularizer that is cla...,The paper proposes a method for accelerating o...
1,"(https://openreview.net/forum?id=BJO-BuT1g,)",[This paper introduces an elegant method to tr...,The reviewers (two of whom stated maximum conf...
2,"(https://openreview.net/forum?id=BkCPyXm1l,)",[Aims to tackle neural network regularization ...,The reviewers unanimously recommend rejection.
3,"(https://openreview.net/forum?id=S1HEBe_Jl,)",[The submission proposes to modify the typical...,Interesting paper but not over the accept bar.
4,"(https://openreview.net/forum?id=S1J0E-71l,)",[Author's paper proposes to use surprisal-driv...,"Based on the feedback, I'm going to be rejecti..."


# Rouge Score

In [12]:
# input base_df.astype("str") to get base scores, new_df to get improved ones
metrics = evaluate_rouge(base_df.astype("str"))
df = pd.DataFrame.from_dict(metrics)
# scores for the base model
print("Base Glimpse Scores")
print(df)
df.mean(axis=0)

Base Glimpse Scores
     rouge1    rouge2    rougeL  rougeLsum
0  0.410714  0.126126  0.205357   0.205357
1  0.166667  0.000000  0.111111   0.111111
2  0.025641  0.000000  0.025641   0.025641
3  0.062992  0.000000  0.062992   0.062992
4  0.260536  0.038610  0.130268   0.130268


rouge1       0.185310
rouge2       0.032947
rougeL       0.107074
rougeLsum    0.107074
dtype: float64

# Bartbert Score

In [14]:
display(base_df)
metrics = evaluate_bartbert(base_df.astype("str"))
# make a dataframe with the metric
df = pd.DataFrame(metrics)
# base model bartbert scores
print(df)
df.mean(axis=0)

Unnamed: 0,id,best_rsa,gold
0,"(https://openreview.net/forum?id=B1jnyXXJx,)",[This paper proposes a regularizer that is cla...,The paper proposes a method for accelerating o...
1,"(https://openreview.net/forum?id=BJO-BuT1g,)",[This paper introduces an elegant method to tr...,The reviewers (two of whom stated maximum conf...
2,"(https://openreview.net/forum?id=BkCPyXm1l,)",[Aims to tackle neural network regularization ...,The reviewers unanimously recommend rejection.
3,"(https://openreview.net/forum?id=S1HEBe_Jl,)",[The submission proposes to modify the typical...,Interesting paper but not over the accept bar.
4,"(https://openreview.net/forum?id=S1J0E-71l,)",[Author's paper proposes to use surprisal-driv...,"Based on the feedback, I'm going to be rejecti..."


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

   BERTScore
0   0.852712
1   0.828757
2   0.822575
3   0.814051
4   0.823560


BERTScore    0.828331
dtype: float64