In [1]:
import argparse
import datetime
from pathlib import Path

import pandas as pd
from datasets import Dataset
from tqdm import tqdm

import nltk
import os

In [2]:
from functools import cache
from typing import List
import nltk
import numpy as np
import torch
import pandas as pd
from tqdm import tqdm


def kl_divergence(p, q):
    """
    Compute the KL divergence between two distributions
    """
    return torch.nan_to_num(p * (p / q).log(), nan=0.0).sum(-1)


def jensen_shannon_divergence(p, q):
    """
    Compute the Jensen-Shannon divergence between two distributions
    """
    m = 0.5 * (p + q)
    return 0.5 * (kl_divergence(p, m) + kl_divergence(q, m))


class RSAReranking:
    """
    Rerank a list of candidates according to the RSA model.
    """

    def __init__(
            self,
            model,
            tokenizer,
            candidates: List[str],
            source_texts: List[str],
            batch_size: int = 32,
            rationality: int = 1,
            device="cpu",
    ):
        """
        :param model: hf model used to compute the likelihoods (supposed to be a seq2seq model), is S0 in the RSA model
        :param tokenizer:
        :param candidates: list of candidates summaries
        :param source_texts: list of source texts
        :param batch_size: batch size used to compute the likelihoods (can be high since we don't need gradients and
        it's a single forward pass)
        :param rationality: rationality parameter of the RSA model
        :param device: device used to compute the likelihoods
        """
        self.model = model
        self.device = device
        self.tokenizer = tokenizer

        self.candidates = candidates
        self.source_texts = source_texts

        self.batch_size = batch_size
        self.rationality = rationality
        print("hello this is the test version!")

    def compute_conditionned_likelihood(
            self, x: List[str], y: List[str], mean: bool = True
    ) -> torch.Tensor:
        """
        Compute the likelihood of y given x

        :param x: list of source texts len(x) = batch_size
        :param y: list of candidates summaries len(y) = batch_size
        :param mean: average the likelihoods over the tokens of y or take the sum
        :return: tensor of shape (batch_size) containing the likelihoods of y given x
        """

        assert len(x) == len(y), "x and y must have the same length"

        loss_fn = torch.nn.CrossEntropyLoss(reduction="none")
        batch_size = len(x)

        x = self.tokenizer(x, return_tensors="pt", padding=True, truncation=True)
        y = self.tokenizer(y, return_tensors="pt", padding=True, truncation=True)
        # Concatenate the two inputs
        # Compute the likelihood of y given x

        x_ids = x.input_ids.to(self.device)
        y_ids = y.input_ids.to(self.device)

        logits = self.model(
            input_ids=x_ids,
            decoder_input_ids=y_ids,
            attention_mask=x.attention_mask.to(self.device),
            decoder_attention_mask=y.attention_mask.to(self.device),
        ).logits
        # Compute the likelihood of y given x

        shifted_logits = logits[..., :-1, :].contiguous()
        shifted_ids = y_ids[..., 1:].contiguous()

        likelihood = -loss_fn(
            shifted_logits.view(-1, shifted_logits.size(-1)), shifted_ids.view(-1)
        ) #comment: maybe  quality_score can be implemented here

        likelihood = likelihood.view(batch_size, -1).sum(-1)
        if mean:
            likelihood /= (y_ids != self.tokenizer.pad_token_id).float().sum(-1)

        return likelihood

    def score(self, x: List[str], y: List[str], **kwargs):
        return self.compute_conditionned_likelihood(x, y, **kwargs)

    def likelihood_matrix(self) -> torch.Tensor:
        """
        :return: likelihood matrix : (world_size, num_candidates), likelihood[i, j] is the likelihood of
        candidate j being a summary for source text i.
        """
        likelihood_matrix = torch.zeros(
            (len(self.source_texts), len(self.candidates))
        ).to(self.device)

        pairs = []
        for i, source_text in enumerate(self.source_texts):
            for j, candidate in enumerate(self.candidates):
                pairs.append((i, j, source_text, candidate))

        # split the pairs into batches
        batches = [
            pairs[i: i + self.batch_size]
            for i in range(0, len(pairs), self.batch_size)
        ]

        for batch in tqdm(batches):
            # get the source texts and candidates
            source_texts = [pair[2] for pair in batch]
            candidates = [pair[3] for pair in batch]

            # compute the likelihoods
            with torch.no_grad():
                likelihoods = self.score(
                    source_texts, candidates, mean=True
                )

            # fill the matrix
            for k, (i, j, _, _) in enumerate(batch):
                likelihood_matrix[i, j] = likelihoods[k].detach()

        return likelihood_matrix

    @cache
    def S(self, t):
        if t == 0:
            return self.initial_speaker_probas
        else:
            listener = self.L(t - 1)
            prod = listener * self.rationality # + self.initial_speaker_probas.sum(0, keepdim=True)
            return torch.log_softmax(prod, dim=-1)

    @cache
    def L(self, t):
        speaker = self.S(t)
        return torch.log_softmax(speaker, dim=-2)

    def mk_listener_dataframe(self, t): ## add here computation of uniqueness scores (comment: mattia)
        self.initial_speaker_probas = self.likelihood_matrix()

        initial_listener_probas = self.L(0)

        # compute consensus
        uniform_distribution_over_source_texts = torch.ones_like(
            initial_listener_probas
        ) / len(self.source_texts)

        initital_consensuality_score = (
                torch.exp(initial_listener_probas)
                * (
                        initial_listener_probas - torch.log(uniform_distribution_over_source_texts)
                )
        ).sum(0).cpu().numpy()

        initital_consensuality_score = pd.Series(initital_consensuality_score, index=self.candidates)

        initial_listener_probas = initial_listener_probas.cpu().numpy()

        initial_listener_probas = pd.DataFrame(initial_listener_probas)
        initial_listener_probas.index = self.source_texts
        initial_listener_probas.columns = self.candidates

        initial_speaker_probas = self.S(0).cpu().numpy()
        initial_speaker_probas = pd.DataFrame(initial_speaker_probas)
        initial_speaker_probas.index = self.source_texts
        initial_speaker_probas.columns = self.candidates
        listener_probas = self.L(t)
        listener_df = pd.DataFrame(listener_probas.cpu().numpy())
        print(f"I have {listener_probas.shape} listener probabilities")

        consensuality_scores = (
                torch.exp(listener_probas)
                * (listener_probas - torch.log(uniform_distribution_over_source_texts))
        ).sum(0).cpu().numpy()
        print(f"I have {consensuality_scores.shape} consensuality scores")
        print(f"I have {len(self.candidates)} candidates")
        consensuality_scores = pd.Series(consensuality_scores, index=self.candidates)
        
        # Compute uniqueness score
        listener_probas_transposed = listener_probas.T 
        uniform_distribution = torch.ones_like(listener_probas_transposed) / len(self.source_texts)
        uniqueness_scores = kl_divergence(
            torch.exp(listener_probas_transposed),  # Convert log probabilities to probabilities
            uniform_distribution
        ).sum(0).cpu().numpy()##TODO: check code in deepseek. it does .sum(1). should it be .sum(0) on the transposed?
        uniqueness_scores = pd.Series(uniqueness_scores, index=self.candidates)
        print(f"I have {uniqueness_scores.shape} uniqueness scores")
        print(f"I have {len(self.candidates)} candidates")
        S = self.S(t).cpu().numpy()
        speaker_df = pd.DataFrame(S)

        # add the source texts and candidates as index

        listener_df.index = self.source_texts
        speaker_df.index = self.source_texts

        listener_df.columns = self.candidates
        speaker_df.columns = self.candidates

        

        return listener_df, speaker_df, initial_listener_probas, initial_speaker_probas, initital_consensuality_score, consensuality_scores, uniqueness_scores

    def rerank(self, t=1):
        """
        return the best summary (according to rsa) for each text
        """
        (
            listener_df,
            speaker_df,
            initial_listener_proba,
            initial_speaker_proba,
            initital_consensuality_score,
            consensuality_scores,
            uniqueness_scores
        ) = self.mk_listener_dataframe(t=t) #in this function you should compute the uniqueness scores
        n_unique_sentences = 5
        best_rsa = speaker_df.idxmax(axis=1).values
        best_base = initial_listener_proba.idxmax(axis=1).values
        top_n_unique_list = uniqueness_scores.nlargest(n_unique_sentences).index.tolist()
        top_n_common_list = uniqueness_scores.nsmallest(n_unique_sentences).index.tolist()
        best_uniqueness =list(set(top_n_unique_list).union(set(top_n_common_list))) #uniqueness_scores.idxmax().values ##should you remove axis=1
        #Glimpse unique is combination of top 5 unique and top 5 common.

        return (
            best_rsa,
            best_base,
            best_uniqueness,
            speaker_df,
            listener_df,
            initial_listener_proba,
            initial_speaker_proba,
            initital_consensuality_score,
            consensuality_scores,
            uniqueness_scores
        )


class RSARerankingEmbedder(RSAReranking):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def compute_embeddings(self, x: List[str], y: List[str], **kwargs):
        model_kwargs = kwargs.get("model_kwargs")

        # shape: (batch_size, embedding_dim)
        x_embeddings = self.model.encode(x, **model_kwargs)
        y_embeddings = self.model.encode(y, **model_kwargs)

        # dot product between the embeddings : shape (batch_size)
        dot_products = (x_embeddings * y_embeddings).sum(-1)

        return dot_products

    def score(self, x: List[str], y: List[str], **kwargs):
        return self.compute_embeddings(x, y, **kwargs)



ciao da mattia!

In [3]:
import os
import sys
# Percorso dei file caricati
input_path = "/kaggle/input/myfiles/"

# Lista i file nella directory
print(os.listdir(input_path))
sys.path.append("/kaggle/input/myfiles/")

['rsasumm', 'data']


In [4]:
def prepare_dataset(dataset_path) -> Dataset:
    
    try:
        dataset = pd.read_csv(dataset_path)
    except:
        raise ValueError(f"Unknown dataset {dataset_path}")

    # make a dataset from the dataframe
    dataset = Dataset.from_pandas(dataset)

    return dataset

In [5]:
def evaluate_summarizer(dataset: Dataset) -> Dataset:
    """
    @param dataset: A dataset with the text
    @return: The same dataset with the summaries added
    """
    # create a dataset with the text and the summary

    # create a dataloader

    # generate summaries
    summaries = []
    print("Generating summaries...")

    # (tqdm library for progress bar) 
    for sample in tqdm(dataset):
        text = sample["text"] 
        
        text = text.replace('-----', '\n')
        sentences = nltk.sent_tokenize(text)
        # remove empty sentences
        sentences = [sentence for sentence in sentences if sentence != ""]

        summaries.append(sentences)

    # add summaries to the huggingface dataset
    dataset = dataset.map(lambda example: {"summary": summaries.pop(0)})

    return dataset

In [6]:
# Percorso della nuova cartella
data_dir = "/kaggle/working/data/candidates"

# Creare la cartella se non esiste
os.makedirs(data_dir, exist_ok=True)

### 1. Extract candidates

In [7]:
# load the dataset
limit = 10000
print("Loading dataset...")
dataset_path = "/kaggle/input/myfiles/data/processed/all_reviews_companies.csv"
dataset = prepare_dataset(dataset_path)

# limit the number of samples
if limit is not None:
    _lim = min(limit, len(dataset))
    dataset = dataset.select(range(_lim))

# generate summaries
dataset = evaluate_summarizer(
    dataset,
)

df_dataset = dataset.to_pandas()
df_dataset = df_dataset.explode("summary")
df_dataset = df_dataset.reset_index()
# add an idx with  the id of the summary for each example
df_dataset["id_candidate"] = df_dataset.groupby(["index"]).cumcount()

# save the dataset
# add unique date in name
now = datetime.datetime.now()
date = now.strftime("%Y-%m-%d-%H-%M-%S")
output_dir = "/kaggle/working/data/candidates/"
output_path = (
    Path(output_dir)
    / f"extractive_sentences-_-none-_-{date}.csv" #f"extractive_sentences-{date}.csv"
    
)
output_path = f"/kaggle/working/data/candidates/extractive_sentences-_-none-_-{date}.csv"

# create output dir if it doesn't exist
#if not output_path.parent.exists():
#    output_path.parent.mkdir(parents=True, exist_ok=True)

df_dataset.to_csv(output_path, index=False, encoding="utf-8")

# in case of scripted run, print the output path
print(f"output_path: {output_path}")

Loading dataset...
Generating summaries...


100%|██████████| 100/100 [00:00<00:00, 3269.19it/s]


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

output_path: /kaggle/working/data/candidates/extractive_sentences-_-none-_-2025-02-14-13-19-29.csv


In [8]:
df_dataset.iloc[1]["summary"]

'However, my recent experience has been frustrating.'

### 2. Compute RSA

In [9]:
from pathlib import Path

import pandas as pd
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, PegasusTokenizer
import argparse
from tqdm import tqdm

from pickle import dump
#from rsasumm.rsa_reranker import RSAReranking


In [10]:
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

cuda


In [11]:
debug = False
def parse_summaries(path: Path) -> pd.DataFrame:
    
    try:
        summaries = pd.read_csv(path)
    except:
        raise ValueError(f"Unknown dataset {path}")

    # check if the dataframe has the right columns
    if not all(
        col in summaries.columns for col in ["index", "id", "text", "gold", "summary", "id_candidate"]
    ):
        raise ValueError(
            "The dataframe must have columns ['index', 'id', 'text', 'gold', 'summary', 'id_candidate']"
        )

    return summaries


def compute_rsa(summaries: pd.DataFrame, model, tokenizer, device):
    results = []
    for name, group in tqdm(summaries.groupby(["id"])):
        print(name)
        if debug:
            print("---candidates---")
            print(group.summary.unique().tolist())
            print("---end candidates---")
            #print number of candidates
            print(f"number of candidates:  {len(group.summary.unique().tolist())}")
            #TODO: based on reviews_app.py at line 113, compute uniqueness scores
            #candidates = group.summary.unique().tolist()
            #speaker_df = speaker_df.applymap(lambda x: math.exp(x))
            #for candidate in candidates:
            #    get sentences of the candidate text_sentences=...
            #    text_1_summaries = speaker_df.loc[candidate][text_sentences]
        if not debug:
            #print(len(group.summary.unique().tolist()),group.summary.unique().tolist())
            rsa_reranker = RSAReranking(
                model,
                tokenizer,
                device=device,
                candidates=group.summary.unique().tolist(), #TODO: check what is this.
                source_texts=group.text.unique().tolist(),
                #batch_size=32,
                rationality=3,
            )
            (
                best_rsa,
                best_base,
                best_uniqueness,
                speaker_df,
                listener_df,
                initial_listener,
                language_model_proba_df,
                initial_consensuality_scores,
                consensuality_scores,
                uniqueness_scores,
            ) = rsa_reranker.rerank(t=2) #maybe you should return here the uniqueness scores
        
            gold = group['gold'].tolist()[0]
            results.append(
                {
                    "id": name,
                    "best_rsa": best_rsa,  # best speaker score
                    "best_base": best_base,  # naive baseline
                    "best_uniqueness": best_uniqueness,  # naive baseline
                    "speaker_df": speaker_df,  # all speaker results
                    "listener_df": listener_df,  # all listener results (chances of guessing correctly)
                    "initial_listener": initial_listener,
                    "language_model_proba_df": language_model_proba_df,
                    "initial_consensuality_scores": initial_consensuality_scores,
                    "consensuality_scores": consensuality_scores,  # uniqueness scores # TODO: did you write or it was already there? Answer: no, I did not write it. Ask professor how to extract it from the consensuality_scores
                    "uniqueness_scores": uniqueness_scores,  # uniqueness scores
                    "gold": gold,
                    "rationality": 3,  # hyperparameter
                    "text_candidates" : group
                }
            )
    if not debug:
        return results
    else:
        return None

import torch
device = "cuda" if torch.cuda.is_available() else "cpu"
model_name = "google/pegasus-xsum"
summaries=output_path
# load the model and the tokenizer
print("Loading model...")
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
if "pegasus" in model_name: 
    print("Loading Pegasus Tokenizer")
    tokenizer = PegasusTokenizer.from_pretrained(model_name)
else:
    print("Loading Auto Tokenizer")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
print("Model loaded")
model = model.to(device)
print("Model to device")
# load the summaries
summaries = parse_summaries(summaries)
print("Summaries loaded")
# rerank the summaries
print("Computing RSA...")
results = compute_rsa(summaries, model, tokenizer, device)
results = {"results": results} # wrap the results in a dictionary

results["metadata/reranking_model"] = model_name
results["metadata/rsa_iterations"] = 3



Loading model...


config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/259 [00:00<?, ?B/s]

Loading Pegasus Tokenizer


tokenizer_config.json:   0%|          | 0.00/87.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.52M [00:00<?, ?B/s]

Model loaded
Model to device
Summaries loaded
Computing RSA...


  0%|          | 0/10 [00:00<?, ?it/s]

('Audi',)
hello this is the test version!



  0%|          | 0/15 [00:00<?, ?it/s][A
  7%|▋         | 1/15 [00:01<00:15,  1.12s/it][A
 20%|██        | 3/15 [00:01<00:07,  1.63it/s][A
 27%|██▋       | 4/15 [00:02<00:07,  1.57it/s][A
 33%|███▎      | 5/15 [00:03<00:06,  1.52it/s][A
 40%|████      | 6/15 [00:04<00:07,  1.26it/s][A
 47%|████▋     | 7/15 [00:05<00:06,  1.19it/s][A
 53%|█████▎    | 8/15 [00:06<00:05,  1.30it/s][A
 60%|██████    | 9/15 [00:06<00:04,  1.33it/s][A
 67%|██████▋   | 10/15 [00:07<00:03,  1.40it/s][A
 73%|███████▎  | 11/15 [00:08<00:02,  1.34it/s][A
 80%|████████  | 12/15 [00:09<00:02,  1.15it/s][A
 87%|████████▋ | 13/15 [00:10<00:01,  1.09it/s][A
 93%|█████████▎| 14/15 [00:11<00:00,  1.05it/s][A
100%|██████████| 15/15 [00:12<00:00,  1.22it/s][A
 10%|█         | 1/10 [00:12<01:53, 12.56s/it]

I have torch.Size([10, 46]) listener probabilities
I have (46,) consensuality scores
I have 46 candidates
I have (46,) uniqueness scores
I have 46 candidates
('Avanti Travel Insurance',)
hello this is the test version!



  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:03,  2.62it/s][A
 27%|██▋       | 3/11 [00:01<00:04,  1.82it/s][A
 36%|███▋      | 4/11 [00:02<00:04,  1.69it/s][A
 45%|████▌     | 5/11 [00:02<00:03,  1.64it/s][A
 55%|█████▍    | 6/11 [00:03<00:03,  1.64it/s][A
 64%|██████▎   | 7/11 [00:04<00:02,  1.62it/s][A
 73%|███████▎  | 8/11 [00:04<00:01,  1.63it/s][A
 82%|████████▏ | 9/11 [00:05<00:01,  1.54it/s][A
 91%|█████████ | 10/11 [00:06<00:00,  1.49it/s][A
100%|██████████| 11/11 [00:06<00:00,  1.60it/s][A
 20%|██        | 2/10 [00:19<01:16,  9.51s/it]

I have torch.Size([10, 35]) listener probabilities
I have (35,) consensuality scores
I have 35 candidates
I have (35,) uniqueness scores
I have 35 candidates
('Danske Bank',)
hello this is the test version!



  0%|          | 0/10 [00:00<?, ?it/s][A
 20%|██        | 2/10 [00:01<00:04,  1.94it/s][A
 30%|███       | 3/10 [00:01<00:03,  1.93it/s][A
 40%|████      | 4/10 [00:02<00:03,  1.65it/s][A
 50%|█████     | 5/10 [00:03<00:03,  1.51it/s][A
 60%|██████    | 6/10 [00:03<00:02,  1.56it/s][A
 70%|███████   | 7/10 [00:04<00:01,  1.59it/s][A
 80%|████████  | 8/10 [00:05<00:01,  1.48it/s][A
 90%|█████████ | 9/10 [00:05<00:00,  1.41it/s][A
100%|██████████| 10/10 [00:06<00:00,  1.55it/s][A
 30%|███       | 3/10 [00:26<00:57,  8.19s/it]

I have torch.Size([10, 30]) listener probabilities
I have (30,) consensuality scores
I have 30 candidates
I have (30,) uniqueness scores
I have 30 candidates
('Google',)
hello this is the test version!



  0%|          | 0/15 [00:00<?, ?it/s][A
  7%|▋         | 1/15 [00:00<00:01,  9.02it/s][A
 13%|█▎        | 2/15 [00:01<00:07,  1.65it/s][A
 20%|██        | 3/15 [00:02<00:10,  1.18it/s][A
 27%|██▋       | 4/15 [00:03<00:09,  1.14it/s][A
 33%|███▎      | 5/15 [00:03<00:07,  1.37it/s][A
 40%|████      | 6/15 [00:04<00:06,  1.48it/s][A
 47%|████▋     | 7/15 [00:04<00:05,  1.48it/s][A
 53%|█████▎    | 8/15 [00:05<00:04,  1.49it/s][A
 60%|██████    | 9/15 [00:06<00:03,  1.53it/s][A
 67%|██████▋   | 10/15 [00:06<00:03,  1.46it/s][A
 73%|███████▎  | 11/15 [00:07<00:02,  1.41it/s][A
 80%|████████  | 12/15 [00:08<00:01,  1.57it/s][A
 87%|████████▋ | 13/15 [00:08<00:01,  1.53it/s][A
 93%|█████████▎| 14/15 [00:09<00:00,  1.50it/s][A
100%|██████████| 15/15 [00:10<00:00,  1.48it/s][A
 40%|████      | 4/10 [00:36<00:53,  8.96s/it]

I have torch.Size([10, 45]) listener probabilities
I have (45,) consensuality scores
I have 45 candidates
I have (45,) uniqueness scores
I have 45 candidates
('IKEA',)
hello this is the test version!



  0%|          | 0/20 [00:00<?, ?it/s][A
  5%|▌         | 1/20 [00:00<00:03,  4.89it/s][A
 10%|█         | 2/20 [00:01<00:11,  1.64it/s][A
 15%|█▌        | 3/20 [00:02<00:16,  1.03it/s][A
 20%|██        | 4/20 [00:03<00:16,  1.02s/it][A
 25%|██▌       | 5/20 [00:04<00:16,  1.09s/it][A
 30%|███       | 6/20 [00:05<00:13,  1.01it/s][A
 35%|███▌      | 7/20 [00:06<00:12,  1.04it/s][A
 40%|████      | 8/20 [00:07<00:10,  1.17it/s][A
 45%|████▌     | 9/20 [00:07<00:09,  1.18it/s][A
 50%|█████     | 10/20 [00:08<00:07,  1.25it/s][A
 55%|█████▌    | 11/20 [00:09<00:08,  1.07it/s][A
 60%|██████    | 12/20 [00:10<00:07,  1.02it/s][A
 65%|██████▌   | 13/20 [00:12<00:07,  1.05s/it][A
 70%|███████   | 14/20 [00:12<00:05,  1.09it/s][A
 75%|███████▌  | 15/20 [00:13<00:04,  1.11it/s][A
 80%|████████  | 16/20 [00:14<00:03,  1.15it/s][A
 85%|████████▌ | 17/20 [00:15<00:02,  1.14it/s][A
 90%|█████████ | 18/20 [00:16<00:01,  1.24it/s][A
 95%|█████████▌| 19/20 [00:17<00:00,  1.03it/s]

I have torch.Size([10, 63]) listener probabilities
I have (63,) consensuality scores
I have 63 candidates
I have (63,) uniqueness scores
I have 63 candidates
('Just Eat',)
hello this is the test version!



  0%|          | 0/17 [00:00<?, ?it/s][A
 12%|█▏        | 2/17 [00:01<00:07,  1.95it/s][A
 18%|█▊        | 3/17 [00:01<00:08,  1.67it/s][A
 24%|██▎       | 4/17 [00:02<00:09,  1.33it/s][A
 29%|██▉       | 5/17 [00:03<00:08,  1.34it/s][A
 35%|███▌      | 6/17 [00:04<00:09,  1.17it/s][A
 41%|████      | 7/17 [00:05<00:09,  1.08it/s][A
 47%|████▋     | 8/17 [00:06<00:07,  1.13it/s][A
 53%|█████▎    | 9/17 [00:07<00:07,  1.07it/s][A
 59%|█████▉    | 10/17 [00:08<00:05,  1.18it/s][A
 65%|██████▍   | 11/17 [00:09<00:05,  1.14it/s][A
 71%|███████   | 12/17 [00:10<00:04,  1.12it/s][A
 76%|███████▋  | 13/17 [00:11<00:03,  1.05it/s][A
 82%|████████▏ | 14/17 [00:12<00:03,  1.08s/it][A
 88%|████████▊ | 15/17 [00:13<00:02,  1.04s/it][A
 94%|█████████▍| 16/17 [00:14<00:00,  1.09it/s][A
100%|██████████| 17/17 [00:14<00:00,  1.14it/s][A
 60%|██████    | 6/10 [01:11<00:54, 13.54s/it]

I have torch.Size([10, 52]) listener probabilities
I have (52,) consensuality scores
I have 52 candidates
I have (52,) uniqueness scores
I have 52 candidates
('Lidl GB',)
hello this is the test version!



  0%|          | 0/16 [00:00<?, ?it/s][A
 12%|█▎        | 2/16 [00:00<00:06,  2.01it/s][A
 19%|█▉        | 3/16 [00:02<00:09,  1.32it/s][A
 25%|██▌       | 4/16 [00:03<00:09,  1.21it/s][A
 31%|███▏      | 5/16 [00:04<00:10,  1.05it/s][A
 38%|███▊      | 6/16 [00:05<00:10,  1.02s/it][A
 44%|████▍     | 7/16 [00:06<00:08,  1.10it/s][A
 50%|█████     | 8/16 [00:06<00:07,  1.12it/s][A
 56%|█████▋    | 9/16 [00:08<00:07,  1.00s/it][A
 62%|██████▎   | 10/16 [00:09<00:06,  1.01s/it][A
 69%|██████▉   | 11/16 [00:10<00:06,  1.21s/it][A
 75%|███████▌  | 12/16 [00:12<00:05,  1.37s/it][A
 81%|████████▏ | 13/16 [00:13<00:03,  1.21s/it][A
 88%|████████▊ | 14/16 [00:14<00:02,  1.21s/it][A
 94%|█████████▍| 15/16 [00:15<00:01,  1.07s/it][A
100%|██████████| 16/16 [00:16<00:00,  1.03s/it][A
 70%|███████   | 7/10 [01:28<00:44, 14.73s/it]

I have torch.Size([10, 50]) listener probabilities
I have (50,) consensuality scores
I have 50 candidates
I have (50,) uniqueness scores
I have 50 candidates
('Perfume Click',)
hello this is the test version!



  0%|          | 0/11 [00:00<?, ?it/s][A
 18%|█▊        | 2/11 [00:00<00:04,  2.13it/s][A
 27%|██▋       | 3/11 [00:01<00:04,  1.62it/s][A
 36%|███▋      | 4/11 [00:02<00:05,  1.28it/s][A
 45%|████▌     | 5/11 [00:03<00:05,  1.11it/s][A
 55%|█████▍    | 6/11 [00:04<00:04,  1.06it/s][A
 64%|██████▎   | 7/11 [00:06<00:04,  1.00s/it][A
 73%|███████▎  | 8/11 [00:06<00:02,  1.05it/s][A
 82%|████████▏ | 9/11 [00:07<00:01,  1.06it/s][A
 91%|█████████ | 10/11 [00:08<00:00,  1.01it/s][A
100%|██████████| 11/11 [00:09<00:00,  1.10it/s][A
 80%|████████  | 8/10 [01:38<00:26, 13.31s/it]

I have torch.Size([10, 33]) listener probabilities
I have (33,) consensuality scores
I have 33 candidates
I have (33,) uniqueness scores
I have 33 candidates
('The LEGO Group',)
hello this is the test version!



  0%|          | 0/20 [00:00<?, ?it/s][A
 10%|█         | 2/20 [00:00<00:07,  2.36it/s][A
 15%|█▌        | 3/20 [00:02<00:13,  1.25it/s][A
 20%|██        | 4/20 [00:03<00:16,  1.05s/it][A
 25%|██▌       | 5/20 [00:04<00:16,  1.12s/it][A
 30%|███       | 6/20 [00:05<00:13,  1.01it/s][A
 35%|███▌      | 7/20 [00:06<00:12,  1.04it/s][A
 40%|████      | 8/20 [00:07<00:11,  1.01it/s][A
 45%|████▌     | 9/20 [00:08<00:11,  1.00s/it][A
 50%|█████     | 10/20 [00:09<00:08,  1.13it/s][A
 55%|█████▌    | 11/20 [00:10<00:09,  1.08s/it][A
 60%|██████    | 12/20 [00:12<00:09,  1.21s/it][A
 65%|██████▌   | 13/20 [00:13<00:09,  1.31s/it][A
 70%|███████   | 14/20 [00:14<00:06,  1.14s/it][A
 75%|███████▌  | 15/20 [00:15<00:06,  1.22s/it][A
 80%|████████  | 16/20 [00:17<00:05,  1.29s/it][A
 85%|████████▌ | 17/20 [00:18<00:04,  1.35s/it][A
 90%|█████████ | 18/20 [00:20<00:02,  1.27s/it][A
 95%|█████████▌| 19/20 [00:21<00:01,  1.20s/it][A
100%|██████████| 20/20 [00:21<00:00,  1.09s/it]

I have torch.Size([10, 62]) listener probabilities
I have (62,) consensuality scores
I have 62 candidates
I have (62,) uniqueness scores
I have 62 candidates
('UberEATS',)
hello this is the test version!



  0%|          | 0/18 [00:00<?, ?it/s][A
 11%|█         | 2/18 [00:00<00:07,  2.03it/s][A
 17%|█▋        | 3/18 [00:02<00:10,  1.36it/s][A
 22%|██▏       | 4/18 [00:02<00:10,  1.31it/s][A
 28%|██▊       | 5/18 [00:03<00:11,  1.17it/s][A
 33%|███▎      | 6/18 [00:04<00:09,  1.21it/s][A
 39%|███▉      | 7/18 [00:05<00:09,  1.15it/s][A
 44%|████▍     | 8/18 [00:06<00:09,  1.10it/s][A
 50%|█████     | 9/18 [00:07<00:09,  1.02s/it][A
 56%|█████▌    | 10/18 [00:09<00:08,  1.10s/it][A
 61%|██████    | 11/18 [00:09<00:06,  1.00it/s][A
 67%|██████▋   | 12/18 [00:10<00:05,  1.03it/s][A
 72%|███████▏  | 13/18 [00:11<00:04,  1.11it/s][A
 78%|███████▊  | 14/18 [00:12<00:03,  1.09it/s][A
 83%|████████▎ | 15/18 [00:13<00:02,  1.09it/s][A
 89%|████████▉ | 16/18 [00:14<00:01,  1.10it/s][A
 94%|█████████▍| 17/18 [00:15<00:01,  1.14s/it][A
100%|██████████| 18/18 [00:17<00:00,  1.02it/s][A
100%|██████████| 10/10 [02:19<00:00, 13.91s/it]

I have torch.Size([10, 56]) listener probabilities
I have (56,) consensuality scores
I have 56 candidates
I have (56,) uniqueness scores
I have 56 candidates





In [12]:
summaries

Unnamed: 0,index,id,text,gold,summary,id_candidate
0,0,Danske Bank,Review for Danske Bank\n\nRating: ★☆☆☆\n\nAs a...,Customers are generally very satisfied with th...,Review for Danske Bank\n\nRating: ★☆☆☆\n\nAs a...,0
1,0,Danske Bank,Review for Danske Bank\n\nRating: ★☆☆☆\n\nAs a...,Customers are generally very satisfied with th...,"However, my recent experience has been frustra...",1
2,0,Danske Bank,Review for Danske Bank\n\nRating: ★☆☆☆\n\nAs a...,Customers are generally very satisfied with th...,I needed a temporary overdraft extension of ju...,2
3,0,Danske Bank,Review for Danske Bank\n\nRating: ★☆☆☆\n\nAs a...,Customers are generally very satisfied with th...,What should have been a simple request turned ...,3
4,0,Danske Bank,Review for Danske Bank\n\nRating: ★☆☆☆\n\nAs a...,Customers are generally very satisfied with th...,"Each representative seemed disconnected, and I...",4
...,...,...,...,...,...,...
469,99,UberEATS,Absolutely atrocious customer service - they s...,Customers express widespread dissatisfaction w...,And I don&apos;t mean the drivers who have bee...,4
470,99,UberEATS,Absolutely atrocious customer service - they s...,Customers express widespread dissatisfaction w...,Kafkaesque.,5
471,99,UberEATS,Absolutely atrocious customer service - they s...,Customers express widespread dissatisfaction w...,"Just Eat is infinitely better, as is Foodhub -...",6
472,99,UberEATS,Absolutely atrocious customer service - they s...,Customers express widespread dissatisfaction w...,"No chat, no phone, no email.",7


In [13]:
df_results = pd.DataFrame(results["results"])
df_results


Unnamed: 0,id,best_rsa,best_base,best_uniqueness,speaker_df,listener_df,initial_listener,language_model_proba_df,initial_consensuality_scores,consensuality_scores,uniqueness_scores,gold,rationality,text_candidates
0,"(Audi,)",[They didn’t leave the wheel nut lock key in t...,[They didn’t leave the wheel nut lock key in t...,[I hope to own one from this globally successf...,...,...,...,...,Audi is a car like no other. ...,Audi is a car like no other. ...,Audi is a car like no other. ...,Customers are generally very satisfied with th...,3,index id ...
1,"(Avanti Travel Insurance,)",[Just renewed my holiday insurance with Avanti...,[Just renewed my holiday insurance with Avanti...,[Having received an on-line quote I was unable...,...,...,...,...,Just renewed my holiday insurance with Avanti ...,Just renewed my holiday insurance with Avanti ...,Just renewed my holiday insurance with Avanti ...,Customers are generally very satisfied with Av...,3,index id \ 383 ...
2,"(Danske Bank,)",[Due to this impersonal and inefficient servic...,[Due to this impersonal and inefficient servic...,[I needed a temporary overdraft extension of j...,...,...,...,...,Review for Danske Bank\n\nRating: ★☆☆☆\n\nAs a...,Review for Danske Bank\n\nRating: ★☆☆☆\n\nAs a...,Review for Danske Bank\n\nRating: ★☆☆☆\n\nAs a...,Customers are generally very satisfied with th...,3,index id ...
3,"(Google,)","[I highly recommend you don’t either., We have...","[I highly recommend you don’t either., We have...",[Google itself acknowledges the issue and even...,...,...,...,...,My Google Pixel 8 developed screen issues 9 mo...,My Google Pixel 8 developed screen issues 9 mo...,My Google Pixel 8 developed screen issues 9 mo...,Customers have expressed mixed sentiments abou...,3,index id ...
4,"(IKEA,)",[Had a few issues to get sorted so call ikea h...,[Had a few issues to get sorted so call ikea h...,[To add had a lovely conversation with her at ...,...,...,...,...,Had a few issues to get sorted so call ikea he...,Had a few issues to get sorted so call ikea he...,Had a few issues to get sorted so call ikea he...,Customers are largely dissatisfied with their ...,3,index id ...
5,"(Just Eat,)","[Do not use., My order was not delivered so I ...","[Do not use., They don’t deserve 1* but wouldn...","[This company shouldn’t be trading!, No rxplan...",...,...,...,...,They don’t deserve 1* but wouldn’t allow less....,They don’t deserve 1* but wouldn’t allow less....,They don’t deserve 1* but wouldn’t allow less....,Customers are largely dissatisfied with Just E...,3,index id ...
6,"(Lidl GB,)",[The customer service at this store is excell...,[The customer service at this store is excell...,[Not convinced about the quality of some of th...,...,...,...,...,The customer service at this store is excelle...,The customer service at this store is excelle...,The customer service at this store is excelle...,Customers are largely dissatisfied with this b...,3,index id ...
7,"(Perfume Click,)",[this company is more than happy to rip you of...,[this company is more than happy to rip you of...,"[Thank you for such a wide range available, I ...",...,...,...,...,I love my purchases that arrived by the time i...,I love my purchases that arrived by the time i...,I love my purchases that arrived by the time i...,Customers are extremely satisfied with this co...,3,index id ...
8,"(The LEGO Group,)","[‘What are the codes’….., They had the AUDACIT...","[‘What are the codes’….., They had the AUDACIT...","[Sarah Watkins, Missing a brick from bag 13 of...",...,...,...,...,Missing a brick from bag 13 of Lego Titanic. ...,Missing a brick from bag 13 of Lego Titanic. ...,Missing a brick from bag 13 of Lego Titanic. ...,Customers are generally unhappy with their exp...,3,index id ...
9,"(UberEATS,)","[I contacted uber support, and they told me to...","[I contacted uber support, and they told me to...","[which I honestly think they didn&apos;t do., ...",...,...,...,...,Please don&apos;t order from restaurants that ...,Please don&apos;t order from restaurants that ...,Please don&apos;t order from restaurants that ...,Customers express widespread dissatisfaction w...,3,index id ...


In [14]:
for entry in results["results"]:
    print(f"Documento ID: {entry['id']}")
    print(f"Riassunto Generato (best_rsa): {entry['best_rsa']}")
    print(f"Riassunto Gold: {entry['gold']}")
    print(f"Punteggio RSA: {entry['best_rsa']}")
    print("-" * 80)


Documento ID: ('Audi',)
Riassunto Generato (best_rsa): ['They didn’t leave the wheel nut lock key in the car and the service centre doesn’t answer the phone and main number just keeps putting through to them and said the best thing is for me to go and collect it in person - so I have to waste at least 45 minutes of my time to do a round trip because they hadn’t returned this routine item.'
 'Very professional service!'
 'I purchased a A1 from this dealership which from the outset was faulty.'
 'Highly recommend.' 'Over 2 years later still awaiting both.'
 'In July 2024, I sold my Audi Q5 bought in 2015 in Abu Dhabi.'
 'Visited many different branches and the guys here were the best by far.'
 'coolant water tank, stench in the cabin emanating from the engine space, etc.'
 'Shout out to Sam, Hannah and Jimmy.'
 'The introduction was sufficient but the Audi connect and app could not be set.']
Riassunto Gold: Customers are generally very satisfied with this company. 

Customers mention tha

## Evaluation - ROUGE

In [15]:
!pip install rouge-score
from rouge_score import rouge_scorer

def evaluate_rouge(df):
    # make a list of the tuples (text, summary)

    texts = df.gold.tolist()
    summaries = df.best_rsa.tolist()

    # rouges
    metrics = {"rouge1": [], "rouge2": [], "rougeL": [], "rougeLsum": []}

    rouges = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL", "rougeLsum"], use_stemmer=True)
    
    metrics["rouge1"].extend(
        [
            rouges.score("".join(summary), text)["rouge1"].fmeasure
            for summary, text in zip(summaries, texts)
        ]
    )
    metrics["rouge2"].extend(
        [
            rouges.score("".join(summary), text)["rouge2"].fmeasure
            for summary, text in zip(summaries, texts)
        ]
    )
    metrics["rougeL"].extend(
        [
            rouges.score("".join(summary), text)["rougeL"].fmeasure
            for summary, text in zip(summaries, texts)
        ]
    )
    metrics["rougeLsum"].extend(
        [
            rouges.score("".join(summary), text)["rougeLsum"].fmeasure
            for summary, text in zip(summaries, texts)
        ]
    )

    return metrics

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=b8b63240d46ebdec3d3f187c542fecad9b99e78f36c6315458ef49a2a884e749
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [16]:
df_results
texts = df_results.gold.tolist()
summaries = df_results.best_rsa.tolist()
summaries
metrics = {"rouge1": [], "rouge2": [], "rougeL": [], "rougeLsum": []}

rouges = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL", "rougeLsum"], use_stemmer=True)
    
metrics["rouge1"].extend(
    [
        rouges.score("".join(summary), text)["rouge1"].fmeasure
        for summary, text in zip(summaries, texts)
    ]
)

In [17]:
metrics = evaluate_rouge(df_results)
df_metrics = pd.DataFrame.from_dict(metrics)
print(df_metrics)
df_metrics.mean(axis=0)

     rouge1    rouge2    rougeL  rougeLsum
0  0.269663  0.015094  0.127341   0.142322
1  0.244275  0.015385  0.137405   0.167939
2  0.341772  0.044586  0.145570   0.183544
3  0.092141  0.000000  0.081301   0.070461
4  0.243386  0.000000  0.105820   0.116402
5  0.251163  0.037559  0.130233   0.176744
6  0.176991  0.011869  0.088496   0.094395
7  0.123563  0.005764  0.094828   0.091954
8  0.188679  0.000000  0.113208   0.125786
9  0.127660  0.010753  0.095745   0.095745


rouge1       0.205929
rouge2       0.014101
rougeL       0.111994
rougeLsum    0.126529
dtype: float64

## BERTSCORE

In [18]:
! pip install bert-score
from bert_score import BERTScorer
def evaluate_bartbert(df, device="cuda"):
    # make a list of the tuples (text, summary)

    texts = df.gold.tolist()
    summaries = df.best_rsa.tolist()

    scorer = BERTScorer(lang="en", rescale_with_baseline=False, device=device)

    metrics = {'BERTScore': []}
    for i in range(len(texts)):
        texts[i] = texts[i].replace("\n", " ")
        summ= "".join(summaries[i])
        summ = summ.replace("\n", " ")

        P, R, F1 = scorer.score([summ], [texts[i]])

        metrics['BERTScore'].append(F1.mean().item())

    # compute the mean of the metrics
    # metrics = {k: sum(v) / len(v) for k, v in metrics.items()}

    return metrics

Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bert-score
Successfully installed bert-score-0.3.13


In [19]:
metrics = evaluate_bartbert(df_results)
# make a dataframe with the metric
df_bert_metrics = pd.DataFrame(metrics)
print(df_bert_metrics)
df_bert_metrics.mean(axis=0)
#this happens because we are comparing two complete paragraphs. We should do it on a sentence level and maybe get a similarity matrix and see if each sentence find a twin.

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


   BERTScore
0   0.822756
1   0.829242
2   0.849976
3   0.800878
4   0.834533
5   0.832915
6   0.816143
7   0.765681
8   0.817703
9   0.819433


BERTScore    0.818926
dtype: float64

In [20]:
from nltk.tokenize import sent_tokenize


def evaluate_bertscore(df, device="cuda"):
    # make a list of the tuples (text, summary)

    texts = df.gold.tolist()
    summaries = df.best_rsa.tolist()
    scorer = BERTScorer(lang="en", rescale_with_baseline=True, device=device)
    bert_scores = []
    for text,summary in zip(texts,summaries):
        text_sentences = sent_tokenize(text)
        summary_sentences = summary
        scores = np.zeros((len(text_sentences), len(summary_sentences)))
        for i, sent1 in enumerate(text_sentences):
            for j, sent2 in enumerate(summary_sentences):
                P, R, F1 = scorer.score([sent1], [sent2])
                scores[i, j] = P.item()
        precision= np.max(scores, axis=1).mean()
        bert_scores.append(precision)
    return np.array(bert_scores)
metrics = evaluate_bertscore(df_results)
# make a dataframe with the metric
df_bert_metrics = pd.DataFrame(metrics)
print(df_bert_metrics)
df_bert_metrics.mean(axis=0)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


          0
0  0.254969
1  0.216895
2  0.359022
3  0.291223
4  0.231544
5  0.264558
6  0.228156
7  0.317096
8  0.302271
9  0.283428


0    0.274916
dtype: float64

## SEAHORSE

In [21]:
import pandas as pd
import torch
import torch.nn.functional as F
import torch.utils.data
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
map_questionnumber_to_question = {
    "question1": "SHMetric/Comprehensible",
    "question2": "SHMetric/Repetition",
    "question3": "SHMetric/Grammar",
    "question4": "SHMetric/Attribution",
    "question5": "SHMetric/Main ideas",
    "question6": "SHMetric/Conciseness",
}
def evaluate_classification_task(model, tokenizer, question, df, batch_size):

    texts = df.gold.tolist()
    summaries = df.best_rsa.tolist()
    template = "premise: {premise} hypothesis: {hypothesis}"
    ds = [template.format(premise=text[:20*1024], hypothesis="".join(summary)) for text, summary in zip(texts, summaries)]


    eval_loader = torch.utils.data.DataLoader(ds, batch_size=batch_size)

    metrics = {f"{question}/proba_1": [], f"{question}/proba_0": [], f"{question}/guess": []}

    with torch.no_grad():
        for batch in tqdm(eval_loader):
            # tokenize the batch
            inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt")
            # move the inputs to the device
            inputs = {k: v.to(model.device) for k, v in inputs.items()}

            N_inputs = inputs["input_ids"].shape[0]
            # make decoder inputs to be <pad>
            decoder_input_ids = torch.full((N_inputs, 1), tokenizer.pad_token_id, dtype=torch.long, device=model.device)

            outputs = model(**inputs, decoder_input_ids=decoder_input_ids)
            logits = outputs.logits
            # retrieve logits for the last token and the scores for 0 and 1
            logits = logits[:, -1, [497, 333]]

            # compute the probabilities
            probs = F.softmax(logits, dim=-1)

            # compute the guess
            guess = probs.argmax(dim=-1)

            # append the metrics
            metrics[f"{question}/proba_1"].extend(probs[:, 1].tolist())
            metrics[f"{question}/proba_0"].extend(probs[:, 0].tolist())
            metrics[f"{question}/guess"].extend(guess.tolist())

    # average the metrics

    # metrics = {k: sum(v) / len(v) for k, v in metrics.items()}

    return metrics

In [22]:
metric_seahorse = {}
for index,question in enumerate(map_questionnumber_to_question.values()):
    model_name = f"google/seahorse-large-q{index+1}"
    print(question)   
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name, device_map='auto', torch_dtype=torch.float16)

    tokenizer = AutoTokenizer.from_pretrained(model_name)


    metrics = evaluate_classification_task(model, tokenizer, question, df_results, 16)
    metric_seahorse[question] = metrics[f"{question}/guess"]
metric_seahorse

SHMetric/Comprehensible


config.json:   0%|          | 0.00/884 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/285 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/16.3M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/74.0 [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
100%|██████████| 1/1 [00:02<00:00,  2.37s/it]


SHMetric/Repetition


config.json:   0%|          | 0.00/884 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/285 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/16.3M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/74.0 [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
100%|██████████| 1/1 [00:02<00:00,  2.03s/it]

SHMetric/Grammar





config.json:   0%|          | 0.00/884 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/285 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/16.3M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/74.0 [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
100%|██████████| 1/1 [00:02<00:00,  2.04s/it]


SHMetric/Attribution


config.json:   0%|          | 0.00/884 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/285 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/16.3M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/74.0 [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
100%|██████████| 1/1 [00:02<00:00,  2.06s/it]


SHMetric/Main ideas


config.json:   0%|          | 0.00/884 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/285 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/16.3M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/74.0 [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
100%|██████████| 1/1 [00:02<00:00,  2.05s/it]


SHMetric/Conciseness


config.json:   0%|          | 0.00/884 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/285 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/16.3M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/74.0 [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
100%|██████████| 1/1 [00:02<00:00,  2.06s/it]


{'SHMetric/Comprehensible': [1, 1, 1, 0, 1, 1, 1, 1, 1, 1],
 'SHMetric/Repetition': [1, 1, 1, 0, 1, 1, 0, 0, 1, 1],
 'SHMetric/Grammar': [0, 1, 1, 0, 1, 0, 0, 0, 1, 1],
 'SHMetric/Attribution': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'SHMetric/Main ideas': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'SHMetric/Conciseness': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}

In [23]:
df_seahorse_metrics = pd.DataFrame(metric_seahorse)
df_seahorse_metrics
df_seahorse_metrics.mean(axis=0)

SHMetric/Comprehensible    0.9
SHMetric/Repetition        0.7
SHMetric/Grammar           0.5
SHMetric/Attribution       0.0
SHMetric/Main ideas        0.0
SHMetric/Conciseness       0.0
dtype: float64

## TEST ROBERTA to detect linguistically acceptable sentences

In [24]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("textattack/roberta-base-CoLA")
model = AutoModelForSequenceClassification.from_pretrained("textattack/roberta-base-CoLA")

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at textattack/roberta-base-CoLA were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [25]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, RobertaForCausalLM, RobertaTokenizer
import torch
import re
tokenizer = AutoTokenizer.from_pretrained("textattack/roberta-base-CoLA")
model = AutoModelForSequenceClassification.from_pretrained("textattack/roberta-base-CoLA")

def check_if_sentence(text):
    
    text = text.strip()
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    output = model(**inputs)
    model_output = bool(output.logits.argmax(dim=-1).item())
    print(output.logits.argmax(dim=-1).item())
    sentence_start = text[0].isupper()
    sentence_end = bool(re.search(r'[.!?]$', text))

    if (model_output):
        return True
    else:
        return False
sentence = "Here are some comments"
check_if_sentence(sentence) 


Some weights of the model checkpoint at textattack/roberta-base-CoLA were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


1


True

In [26]:
import torch
from torch import nn
from transformers import AutoModel, AutoTokenizer, AutoConfig
from huggingface_hub import PyTorchModelHubMixin



class QualityModel(nn.Module, PyTorchModelHubMixin):
    def __init__(self, config):
        super(QualityModel, self).__init__()
        self.model = AutoModel.from_pretrained(config["base_model"])
        self.dropout = nn.Dropout(config["fc_dropout"])
        self.fc = nn.Linear(self.model.config.hidden_size, len(config["id2label"]))

    def forward(self, input_ids, attention_mask):
        features = self.model(
            input_ids=input_ids, attention_mask=attention_mask
        ).last_hidden_state
        dropped = self.dropout(features)
        outputs = self.fc(dropped)
        return torch.softmax(outputs[:, 0, :], dim=1)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
config = AutoConfig.from_pretrained("nvidia/quality-classifier-deberta")
tokenizer = AutoTokenizer.from_pretrained("nvidia/quality-classifier-deberta")
model = QualityModel.from_pretrained("nvidia/quality-classifier-deberta").to(device)
model.eval()


# Prepare and process inputs
text_samples = ["I think that the paper is very well written, I like it","The authors localized a phenomenon and demonstrated how to exploit it.","I trust the results because I performed exactly the same experiments for CIFAR-10 with longer non-regularization periods and found that there is no effect (this is also that the authors show in the paper)  but I didn't test on other datasets and obviously didn't think about potential benefits for compression.",".?@fdsa Low quality text.", "I like pizza because it provides several nutritients while being tasty","To tell means express something in words.","We concluded in the previous section that the classifiers output a different kind of informativeness than the human annotations.","Here are some comments","Still, extractive summarization methods are notably sensitive to the sentence segmentation process"," which can occasionally result in peculiar outcomes"]
inputs = tokenizer(
    text_samples, return_tensors="pt", padding="longest", truncation=True
).to(device)
outputs = model(inputs["input_ids"], inputs["attention_mask"])

# Predict and display results
predicted_classes = torch.argmax(outputs, dim=1)
predicted_domains = [
    config.id2label[class_idx.item()] for class_idx in predicted_classes.cpu().numpy()
]
print(predicted_domains)
print(predicted_classes)
print(outputs)
quality_scores_raw = outputs.cpu().detach().numpy()
#to get a quality score, we can sum the probability of "High" and 0.7 times the probability of "Medium". This is a simple heuristic to get a score between 0 and 1. "Medium" is multiplied by 0.7 to give it less importance than "High".
quality_scores = quality_scores_raw[:,0]+ 0.7*quality_scores_raw[:,1]
quality_scores

config.json:   0%|          | 0.00/281 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.26k [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/8.66M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/286 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/735M [00:00<?, ?B/s]

['Medium', 'High', 'Medium', 'Low', 'Medium', 'Medium', 'High', 'Medium', 'Medium', 'Medium']
tensor([1, 0, 1, 2, 1, 1, 0, 1, 1, 1], device='cuda:0')
tensor([[6.7445e-03, 8.8790e-01, 1.0536e-01],
        [4.9476e-01, 4.5453e-01, 5.0715e-02],
        [3.4836e-02, 9.5611e-01, 9.0515e-03],
        [5.8394e-04, 2.6585e-02, 9.7283e-01],
        [2.8301e-02, 9.4528e-01, 2.6421e-02],
        [1.6381e-01, 7.4715e-01, 8.9036e-02],
        [6.1041e-01, 3.5894e-01, 3.0653e-02],
        [4.0138e-03, 8.0881e-01, 1.8718e-01],
        [3.4625e-01, 6.0633e-01, 4.7417e-02],
        [1.7532e-01, 7.5197e-01, 7.2714e-02]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>)


array([0.6282743 , 0.812927  , 0.7041147 , 0.01919315, 0.6899959 ,
       0.68681777, 0.8616661 , 0.57018   , 0.7706837 , 0.7016957 ],
      dtype=float32)