In [219]:
import argparse
import datetime
from pathlib import Path

import pandas as pd
from datasets import Dataset
from tqdm import tqdm

import nltk
import os

In [220]:
def prepare_dataset(dataset_path) -> Dataset:
    
    try:
        dataset = pd.read_csv(dataset_path)
    except:
        raise ValueError(f"Unknown dataset {dataset_path}")

    # make a dataset from the dataframe
    dataset = Dataset.from_pandas(dataset)

    return dataset

In [221]:
def evaluate_summarizer(dataset: Dataset) -> Dataset:
    """
    @param dataset: A dataset with the text
    @return: The same dataset with the summaries added
    """
    # create a dataset with the text and the summary

    # create a dataloader

    # generate summaries
    summaries = []
    print("Generating summaries...")

    # (tqdm library for progress bar) 
    for sample in tqdm(dataset):
        text = sample["text"] 
        
        text = text.replace('-----', '\n')
        sentences = nltk.sent_tokenize(text)
        # remove empty sentences
        sentences = [sentence for sentence in sentences if sentence != ""]

        summaries.append(sentences)

    # add summaries to the huggingface dataset
    dataset = dataset.map(lambda example: {"summary": summaries.pop(0)})

    return dataset

### 1. Extract candidates

In [224]:
# load the dataset
limit = 10
print("Loading dataset...")
dataset_path = "../../data/processed/all_reviews_2017.csv"
dataset = prepare_dataset(dataset_path)

# limit the number of samples
if limit is not None:
    _lim = min(limit, len(dataset))
    dataset = dataset.select(range(_lim))

# generate summaries
dataset = evaluate_summarizer(
    dataset,
)

df_dataset = dataset.to_pandas()
df_dataset = df_dataset.explode("summary")
df_dataset = df_dataset.reset_index()
# add an idx with  the id of the summary for each example
df_dataset["id_candidate"] = df_dataset.groupby(["index"]).cumcount()

# save the dataset
# add unique date in name
now = datetime.datetime.now()
date = now.strftime("%Y-%m-%d-%H-%M-%S")
output_dir = "data/candidates/"
output_path = (
    Path(output_dir)
    / f"extractive_sentences-_-none-_-{date}.csv" #f"extractive_sentences-{date}.csv"
    
)
output_path = f"../../data/candidates/extractive_sentences-_-none-_-{date}.csv"

# create output dir if it doesn't exist
#if not output_path.parent.exists():
#    output_path.parent.mkdir(parents=True, exist_ok=True)

df_dataset.to_csv(output_path, index=False, encoding="utf-8")

# in case of scripted run, print the output path
print(f"output_path: {output_path}")

Loading dataset...
Generating summaries...


100%|██████████| 6/6 [00:00<00:00, 1930.49it/s]
Map: 100%|██████████| 6/6 [00:00<00:00, 1704.54 examples/s]

output_path: ../../data/candidates/extractive_sentences-_-none-_-2024-12-19-16-57-12.csv





In [225]:
df_dataset

Unnamed: 0,index,id,text,gold,summary,id_candidate
0,0,https://openreview.net/forum?id=r1rhWnZkg,Summary: The paper presents low-rank bilinear ...,The program committee appreciates the authors'...,Summary: The paper presents low-rank bilinear ...,0
1,0,https://openreview.net/forum?id=r1rhWnZkg,Summary: The paper presents low-rank bilinear ...,The program committee appreciates the authors'...,The paper implements low-rank bilinear pooling...,1
2,0,https://openreview.net/forum?id=r1rhWnZkg,Summary: The paper presents low-rank bilinear ...,The program committee appreciates the authors'...,The paper presents various ablation studies of...,2
3,0,https://openreview.net/forum?id=r1rhWnZkg,Summary: The paper presents low-rank bilinear ...,The program committee appreciates the authors'...,-Strengths:\n\n\n-1.,3
4,0,https://openreview.net/forum?id=r1rhWnZkg,Summary: The paper presents low-rank bilinear ...,The program committee appreciates the authors'...,The paper presents new insights into element-w...,4
...,...,...,...,...,...,...
112,5,https://openreview.net/forum?id=S1J0E-71l,"This paper proposes to leverage ""surprisal"" as...","Based on the feedback, I'm going to be rejecti...",-- Author only compares to methods that do not...,14
113,5,https://openreview.net/forum?id=S1J0E-71l,"This paper proposes to leverage ""surprisal"" as...","Based on the feedback, I'm going to be rejecti...",I would argue that a comparison with dynamic e...,15
114,5,https://openreview.net/forum?id=S1J0E-71l,"This paper proposes to leverage ""surprisal"" as...","Based on the feedback, I'm going to be rejecti...",--- Feedback LSTM uses prediction error as ext...,16
115,5,https://openreview.net/forum?id=S1J0E-71l,"This paper proposes to leverage ""surprisal"" as...","Based on the feedback, I'm going to be rejecti...",Also they don't propagate the prediction error...,17


### 2. Compute RSA

In [226]:
from pathlib import Path

import pandas as pd
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, PegasusTokenizer
import argparse
from tqdm import tqdm

from pickle import dump
from rsasumm.rsa_reranker import RSAReranking


In [228]:
debug = False
def parse_summaries(path: Path) -> pd.DataFrame:
    
    try:
        summaries = pd.read_csv(path)
    except:
        raise ValueError(f"Unknown dataset {path}")

    # check if the dataframe has the right columns
    if not all(
        col in summaries.columns for col in ["index", "id", "text", "gold", "summary", "id_candidate"]
    ):
        raise ValueError(
            "The dataframe must have columns ['index', 'id', 'text', 'gold', 'summary', 'id_candidate']"
        )

    return summaries


def compute_rsa(summaries: pd.DataFrame, model, tokenizer, device):
    results = []
    for name, group in tqdm(summaries.groupby(["id"])):
        if debug:
            print("---candidates---")
            print(group.summary.unique().tolist())
            print("---end candidates---")
            #print number of candidates
            print(f"number of candidates:  {len(group.summary.unique().tolist())}")
            #TODO: based on reviews_app.py at line 113, compute uniqueness scores
            #candidates = group.summary.unique().tolist()
            #speaker_df = speaker_df.applymap(lambda x: math.exp(x))
            #for candidate in candidates:
            #    get sentences of the candidate text_sentences=...
            #    text_1_summaries = speaker_df.loc[candidate][text_sentences]
        if not debug:
            rsa_reranker = RSAReranking(
                model,
                tokenizer,
                device=device,
                candidates=group.summary.unique().tolist(), #TODO: check what is this.
                source_texts=group.text.unique().tolist(),
                #batch_size=32,
                rationality=3,
            )
            (
                best_rsa,
                best_base,
                speaker_df,
                listener_df,
                initial_listener,
                language_model_proba_df,
                initial_consensuality_scores,
                consensuality_scores,
                uniqueness_scores,
            ) = rsa_reranker.rerank(t=2) #maybe you should return here the uniqueness scores
        
            gold = group['gold'].tolist()[0]
            results.append(
                {
                    "id": name,
                    "best_rsa": best_rsa,  # best speaker score
                    "best_base": best_base,  # naive baseline
                    "speaker_df": speaker_df,  # all speaker results
                    "listener_df": listener_df,  # all listener results (chances of guessing correctly)
                    "initial_listener": initial_listener,
                    "language_model_proba_df": language_model_proba_df,
                    "initial_consensuality_scores": initial_consensuality_scores,
                    "consensuality_scores": consensuality_scores,  # uniqueness scores # TODO: did you write or it was already there? Answer: no, I did not write it. Ask professor how to extract it from the consensuality_scores
                    "uniqueness_scores": uniqueness_scores,  # uniqueness scores
                    "gold": gold,
                    "rationality": 3,  # hyperparameter
                    "text_candidates" : group
                }
            )
    if not debug:
        return results
    else:
        return None

import torch
device = "cuda" if torch.cuda.is_available() else "cpu"
model_name = "google/pegasus-xsum"
summaries=output_path
# load the model and the tokenizer
print("Loading model...")
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
if "pegasus" in model_name: 
    print("Loading Pegasus Tokenizer")
    tokenizer = PegasusTokenizer.from_pretrained(model_name)
else:
    print("Loading Auto Tokenizer")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
print("Model loaded")
model = model.to(device)
print("Model to device")
# load the summaries
summaries = parse_summaries(summaries)
print("Summaries loaded")
# rerank the summaries
print("Computing RSA...")
results = compute_rsa(summaries, model, tokenizer, device)
results = {"results": results} # wrap the results in a dictionary

results["metadata/reranking_model"] = model_name
results["metadata/rsa_iterations"] = 3



Loading model...


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading Pegasus Tokenizer
Model loaded
Model to device
Summaries loaded
Computing RSA...


100%|██████████| 5/5 [01:54<00:00, 22.87s/it]
100%|██████████| 7/7 [03:22<00:00, 28.96s/it]]
100%|██████████| 2/2 [05:17<00:00, 158.53s/it]


In [217]:
summaries

Unnamed: 0,index,id,text,gold,summary,id_candidate
0,0,https://openreview.net/forum?id=r1rhWnZkg,Summary: The paper presents low-rank bilinear ...,The program committee appreciates the authors'...,Summary: The paper presents low-rank bilinear ...,0
1,0,https://openreview.net/forum?id=r1rhWnZkg,Summary: The paper presents low-rank bilinear ...,The program committee appreciates the authors'...,The paper implements low-rank bilinear pooling...,1
2,0,https://openreview.net/forum?id=r1rhWnZkg,Summary: The paper presents low-rank bilinear ...,The program committee appreciates the authors'...,The paper presents various ablation studies of...,2
3,0,https://openreview.net/forum?id=r1rhWnZkg,Summary: The paper presents low-rank bilinear ...,The program committee appreciates the authors'...,-Strengths:\n\n\n-1.,3
4,0,https://openreview.net/forum?id=r1rhWnZkg,Summary: The paper presents low-rank bilinear ...,The program committee appreciates the authors'...,The paper presents new insights into element-w...,4
5,0,https://openreview.net/forum?id=r1rhWnZkg,Summary: The paper presents low-rank bilinear ...,The program committee appreciates the authors'...,-2.,5
6,0,https://openreview.net/forum?id=r1rhWnZkg,Summary: The paper presents low-rank bilinear ...,The program committee appreciates the authors'...,The paper presents a new model for the task of...,6
7,0,https://openreview.net/forum?id=r1rhWnZkg,Summary: The paper presents low-rank bilinear ...,The program committee appreciates the authors'...,"However, I have concerns about the statistical...",7
8,0,https://openreview.net/forum?id=r1rhWnZkg,Summary: The paper presents low-rank bilinear ...,The program committee appreciates the authors'...,-3.,8
9,0,https://openreview.net/forum?id=r1rhWnZkg,Summary: The paper presents low-rank bilinear ...,The program committee appreciates the authors'...,The various design choices made in model devel...,9


In [274]:
df_results = pd.DataFrame(results["results"])
df_results.shape[0]

2

In [287]:
##this is just to understand how to extract the uniqueness scores from the speaker_df in a single case. next cell implement this for each summary
import math
df_results = pd.DataFrame(results["results"])
index = 0
initial_text = df_results.iloc[index]["text_candidates"]["text"].unique()[0]
#get best rsa summary as concatenated text of array in best_rsa
df_results["best_rsa_summary"] = df_results["best_rsa"].apply(lambda x: " ".join(x))
df_results["best_base_summary"] = df_results["best_base"].apply(lambda x: " ".join(x))
speaker_df = df_results.iloc[index]["speaker_df"]
uniqueness_df = speaker_df.map(lambda x: math.exp(x))
print(type(uniqueness_df))
#text_1_summaries = uniqueness_df.loc[text1][text1_sentences] 
#text1_sentences = nltk.sent_tokenize(text1)
#get consensuality scores for the sentences in best rsa summary
uniqueness_scores= uniqueness_df.loc[initial_text][df_results["best_rsa"][index]].values
uniqueness_scores_norm = uniqueness_scores / uniqueness_scores.sum()
#associate text_summaries_norm to each sentence in the best rsa summary.
df_results["best_rsa_summary_uniqueness_scores"] = 0 #just to initialize
df_results["best_rsa_summary_uniqueness_scores"][0] = [uniqueness_scores_norm]
df_results["best_rsa_summary_uniqueness_scores"]
print(uniqueness_scores_norm)


<class 'pandas.core.frame.DataFrame'>
[9.99764439e-01 2.35561272e-04 2.09684636e-19]


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df_results["best_rsa_summary_uniqueness_scores"][0] = [uniqueness_scores_norm]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-

In [289]:
df_results = pd.DataFrame(results["results"])
#get best rsa summary as concatenated text of array in best_rsa
df_results["best_rsa_summary"] = df_results["best_rsa"].apply(lambda x: " ".join(x))
#get consensuality scores from speaker_df for the sentences in best rsa summary
df_results["initial_text"] =  df_results["text_candidates"].apply(lambda x: x["text"].unique()[0])
df_results["speaker_df"] = df_results["speaker_df"].apply(lambda x:   pd.DataFrame(x.map(lambda x: math.exp(x))))
df_results["uniqueness_scores"] = df_results.apply(lambda row: row["speaker_df"].loc[row["initial_text"]][row["best_rsa"]].values, axis=1)
df_results["uniqueness_scores_norm"] = df_results["uniqueness_scores"].apply(lambda x: x / x.sum())
df_results["uniqueness_scores_norm"][index]
df_results


Unnamed: 0,id,best_rsa,best_base,speaker_df,listener_df,initial_listener,language_model_proba_df,initial_consensuality_scores,consensuality_scores,uniqueness_scores,gold,rationality,text_candidates,best_rsa_summary,initial_text,uniqueness_scores_norm
0,"(https://openreview.net/forum?id=S1J0E-71l,)",[---Most of the equations presented in the pap...,[---Most of the equations presented in the pap...,...,...,...,...,Summary:\n---This paper proposes to use surpri...,Summary:\n---This paper proposes to use surpri...,"[0.13448573950974668, 3.1687096170732067e-05, ...","Based on the feedback, I'm going to be rejecti...",3,index ...,---Most of the equations presented in the pape...,Summary:--------This paper proposes to use sur...,"[0.9997644387276039, 0.00023556127239604917, 2..."
1,"(https://openreview.net/forum?id=r1rhWnZkg,)",[Summary: The paper presents low-rank bilinear...,[The paper presents various ablation studies o...,...,...,...,...,Summary: The paper presents low-rank bilinear ...,Summary: The paper presents low-rank bilinear ...,"[0.03539040452453201, 2.5440130312699124e-52, ...",The program committee appreciates the authors'...,3,index ...,Summary: The paper presents low-rank bilinear ...,Summary: The paper presents low-rank bilinear ...,"[0.9999999999993716, 7.18842597434807e-51, 6.2..."


In [None]:
now = datetime.datetime.now()
date = now.strftime("%Y-%m-%d-%H-%M-%S")
output_dir = "../../output/"
output_path = f"{output_dir}-_-r3-_-rsa_reranked-{model_name.replace('/', '-')}.pk"
#output_path_txt = (
#    Path(args.output_dir) / f"{args.summaries.stem}-_-r3-_-rsa_reranked-{model_name.replace('/', '-')}.txt"
#)
output_path_txt = f"{output_dir}-_-r3-_-rsa_reranked-{date}-{model_name.replace('/', '-')}.txt"


#write also results to txt file
with open(output_path_txt, "wb") as f:
    f.write(str(results).encode('utf-8'))

# in case of scripted run, print the output path
print(output_path)

In [None]:
#convert json to dataframe
df_results = pd.DataFrame(results["results"])

In [None]:
df_results["consensuality_scores"][0].count() #34
df_results["consensuality_scores"][0] #consensuality_scores

In [None]:
df_results["gold"] #this is the reference summary.

In [None]:
pd.set_option('display.max_colwidth', None)
#df_results["text_candidates"][0] # it contains a dataframe with the gold summary (real summary) and the candidates for each review proposed.

In [None]:
df_results["text_candidates"][0]
#index is composed by the number of sentences in the review and the document id. 
#id_candidate is the id representing the candidate sentence 
#TODO: evaluate how best_base and best_rsa are computed. How sentences are chosen?.

In [None]:
#TODO
#replicate in pandas what happens in the code of review_app.py for what concerns the computation of the uniqueness scores.
#validate with manual computation on first summary (which basically means to compute the uniqueness scores of the first summary). --done, validated
