# Libraries

In [None]:
from util import lda_summarizer, pager_summarizer
import pandas as pd
import os
import spacy
import evaluate
from tqdm import tqdm
from nltk.corpus import stopwords
from functools import partial
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')
from dotenv import load_dotenv



  from .autonotebook import tqdm as notebook_tqdm


# Settings

In [None]:
# Pull the variables defined in the local .env file into the process environment
load_dotenv()

# Dataset locations, read from the environment (None when the variable is unset)
cnn_path = os.getenv("CNN_DIR")
bbc_path = os.getenv("BBC_DIR")

# English spaCy pipeline shared by every summarizer call below
nlp = spacy.load("en_core_web_sm")

# NLTK English stopwords, extended with contrastive / concessive connectives.
# NOTE(review): several custom entries are multi-word phrases; they only take
# effect if the summarizers match phrases rather than single tokens - confirm in util.
stop_words = set(stopwords.words('english'))
added_stopwords = {
    "however", "yet", "although", "though", "even though",
    "nevertheless", "nonetheless", "still", "despite", "in spite of",
    "whereas", "alternatively", "instead", "regardless", "notwithstanding",
    "albeit", "conversely", "be that as it may", "even so", "that said",
    "even if", "except", "rather", "apart from", "despite that",
    "then again", "in contrast", "after all",
}

# Final stopword set handed to the summarizers (a new set; inputs are left untouched)
all_stopwords = stop_words | added_stopwords

# Functions

In [None]:
def build_lda_summary(nlp, row, num_s, nt, stop_words, bonus_weight=0.6, penalty_weight=0.4):
    """
    Build an LDA-based summary for a single row of a news DataFrame.

    Thin wrapper around `lda_summarizer`, shaped so that it can be bound with
    `functools.partial` and passed to `DataFrame.apply(..., axis=1)`.

    Args:
        nlp: Language pipeline forwarded to `lda_summarizer` as `nlp`.
        row: Mapping-like row (e.g. a pandas Series) holding the article text
            under 'content' (BBC frame) or 'news' (CNN/DailyMail frame).
        num_s: Forwarded as `num_sentences`.
        nt: Forwarded as `num_topics`.
        stop_words: Stopword collection forwarded as `stop_words`.
        bonus_weight: Forwarded unchanged to `lda_summarizer`.
        penalty_weight: Forwarded unchanged to `lda_summarizer`.

    Returns:
        The value returned by `lda_summarizer`, or "" when summarization fails
        for this row (best-effort: one bad row must not abort a whole apply).
    """
    import logging

    try:
        # 'content' keeps the original behaviour; 'news' is the column name used
        # by the CNN/DailyMail frame, so no manual (un)commenting is needed.
        text = row['content'] if 'content' in row else row['news']
        return lda_summarizer(
            nlp=nlp,
            text=text,
            num_sentences=num_s,
            num_topics=nt,
            remove_stopwords=True,
            # Honour the caller-supplied stopwords instead of the module global.
            stop_words=stop_words,
            bonus_weight=bonus_weight,
            penalty_weight=penalty_weight
        )
    except Exception as exc:
        # Keep the empty-string fallback, but leave a trace so that a
        # systematic failure (e.g. a missing text column) is not invisible.
        logging.getLogger(__name__).warning(
            "LDA summary failed, returning an empty string: %r", exc
        )
        return ""

def build_pager_summary(nlp, row, num_s, pg, lmbd_p, stop_words, model, use_mmr, bonus_weight=0.6, penalty_weight=0.4):
    """
    Build a Pagerank(-MMR) summary for a single row of a news DataFrame.

    Thin wrapper around `pager_summarizer`, shaped so that it can be bound with
    `functools.partial` and passed to `DataFrame.apply(..., axis=1)`.

    Args:
        nlp: Language pipeline forwarded to `pager_summarizer` as `nlp`.
        row: Mapping-like row (e.g. a pandas Series) holding the article text
            under 'content' (BBC frame) or 'news' (CNN/DailyMail frame).
        num_s: Forwarded as `num_sentences`.
        pg: Forwarded as `pagerank_top_k`.
        lmbd_p: Forwarded as `lambda_param`.
        stop_words: Stopword collection forwarded as `stop_words`.
        model: Forwarded as `embedding_model` (callers in this notebook pass
            either None or a SentenceTransformer instance).
        use_mmr: Forwarded as `use_mmr`.
        bonus_weight: Forwarded unchanged to `pager_summarizer`.
        penalty_weight: Forwarded unchanged to `pager_summarizer`.

    Returns:
        The value returned by `pager_summarizer`, or "" when summarization
        fails for this row (best-effort: one bad row must not abort an apply).
    """
    import logging

    try:
        # 'content' keeps the original behaviour; 'news' is the column name used
        # by the CNN/DailyMail frame, so no manual (un)commenting is needed.
        text = row['content'] if 'content' in row else row['news']
        return pager_summarizer(
            nlp=nlp,
            text=text,
            num_sentences=num_s,
            use_mmr=use_mmr,
            lambda_param=lmbd_p,
            remove_stopwords=True,
            # Honour the caller-supplied stopwords instead of the module global.
            stop_words=stop_words,
            bonus_weight=bonus_weight,
            penalty_weight=penalty_weight,
            pagerank_top_k=pg,
            embedding_model=model
        )
    except Exception as exc:
        # Keep the empty-string fallback, but leave a trace so that a
        # systematic failure (e.g. a missing text column) is not invisible.
        logging.getLogger(__name__).warning(
            "Pagerank summary failed, returning an empty string: %r", exc
        )
        return ""

In [None]:
def compute_rouge_multiple_columns(df, prediction_cols, reference_col='ref_summary'):
    """
    Score several summary columns against one reference column with ROUGE.

    Uses the "rouge" metric from HuggingFace's `evaluate` library. Each column
    in `prediction_cols` is scored as a whole against `reference_col`, and the
    `rouge1`, `rouge2` and `rougeL` values reported by the metric are rounded
    to 3 decimals.

    Args:
        df (pd.DataFrame): Frame containing the predictions and the references.
        prediction_cols (list): Names of the columns holding generated summaries.
        reference_col (str): Name of the column holding the reference summaries.

    Returns:
        pd.DataFrame: One row per prediction column, with columns 'model',
        'rouge1_f', 'rouge2_f' and 'rougeL_f', sorted by 'rouge1_f' in
        descending order. Empty (but with those columns) when
        `prediction_cols` is empty.
    """
    import logging

    metric_keys = ('rouge1', 'rouge2', 'rougeL')
    prediction_cols = list(prediction_cols)

    # Nothing to score: return a correctly shaped empty frame instead of
    # failing later when sorting on a column that does not exist.
    if not prediction_cols:
        return pd.DataFrame(columns=['model'] + [f'{key}_f' for key in metric_keys])

    rouge = evaluate.load("rouge")
    # The references are the same for every prediction column: build them once.
    references = df[reference_col].fillna("").tolist()
    results = {}

    for pred_col in prediction_cols:
        # Missing summaries are scored as empty strings.
        predictions = df[pred_col].fillna("").tolist()

        try:
            scores = rouge.compute(predictions=predictions, references=references, use_stemmer=True)
        except Exception as exc:
            # Keep the zero-score fallback so one broken column does not abort
            # the whole evaluation, but report it so zeros are not mistaken
            # for genuine results.
            logging.getLogger(__name__).warning(
                "ROUGE computation failed for column %r, scores set to 0.0: %r", pred_col, exc
            )
            scores = dict.fromkeys(metric_keys, 0.0)

        results[pred_col] = {f'{key}_f': round(scores[key], 3) for key in metric_keys}

    # Columns of `results` are the models: transpose to get one row per model.
    df_result = pd.DataFrame(results).T.reset_index()
    df_result = df_result.rename(columns={"index": "model"})
    return df_result.sort_values('rouge1_f', ascending=False)


# Tuning & Summary building

The aim of the current tuning procedure is to evaluate summarization models using the ROUGE score as the primary evaluation metric.<br>
Since the original `bbc-news-data` dataset does not include reference summaries, the models previously developed in `model_building.py` are now tested on a different dataset: CNN/DailyMail.<br> This dataset collects news articles from both CNN and the Daily Mail, and includes the full text of each article together with human-written highlight summaries.<br> These highlights consist of one or more sentences that are either directly extracted from the article or closely paraphrased.<br>
Although the raw data can be found at https://github.com/google-deepmind/rc-data , a shorter, preprocessed version of the dataset has been downloaded from https://www.kaggle.com/datasets/yatharthgautam123789/cnn-dailymail-3-0-0

## Read data

In [None]:
# Load the CNN/DailyMail articles with their highlights, rename the columns to
# the names used in this notebook, then keep a reproducible 5% random sample
# so that the tuning stays computationally manageable.
df_cnn = (
    pd.read_csv(cnn_path, sep=",", usecols=['article', 'highlights'])
    .rename(columns={'article': 'news', 'highlights': 'ref_summary'})
    .sample(frac=0.05, random_state=123)
)

## Summary extraction

A grid search is performed to tune the hyperparameters, using the same values that were used in `model_building.py`.<br>
This makes it possible to compare the **BLANC** and **ROUGE** scores, even though they are evaluated over different datasets.<br>
To ensure the tuning process remains computationally manageable, a small random batch of the data (5%) was extracted.<br>
The procedure returns a pandas.DataFrame containing, for each combination of hyperparameter values, a summary column named after that specific combination. The values of the column are the summaries built with that combination.

In [None]:
from itertools import product  # cell-local import: flattens the nested grid loops

tqdm.pandas()

# Grid search using custom-defined parameter values
b_ws = [0.0, 0.6]           # bonus weights
p_ws = [0.0, 0.6]           # penalty weights
lambda_param = [0.2, 0.8]   # lambda values passed to the pager summarizer
num_sent = [2, 5]           # number of sentences per summary
num_top = [2, 4]            # number of LDA topics
pgs = [None, 10]            # pagerank_top_k values
models = [None, model]      # embedding model: None or the SentenceTransformer
use_mmr = [True, False]

for num, b_w, p_w in product(num_sent, b_ws, p_ws):
    # LDA: the only summarizer that takes a topic count, so only it loops on n_t
    for n_t in num_top:
        lda_func = partial(build_lda_summary, nlp, num_s=num, nt=n_t, stop_words=all_stopwords, bonus_weight=b_w, penalty_weight=p_w)
        df_cnn[f'lda_summary_ns{num}_nt{n_t}_bw{b_w}_pw{p_w}'] = df_cnn.progress_apply(lda_func, axis=1)

    # Pagerank-MMR: build_pager_summary has no topic-count argument, so each
    # configuration is run once here instead of once per n_t value (which
    # recomputed and overwrote the very same column).
    for lmbd, pg, mod, um in product(lambda_param, pgs, models, use_mmr):
        # Label the embedding model by its class name; formatting the object
        # itself would put its whole multi-line repr into the column name.
        mod_tag = None if mod is None else type(mod).__name__
        pager_func = partial(build_pager_summary, nlp, pg=pg, num_s=num, lmbd_p=lmbd, stop_words=all_stopwords, bonus_weight=b_w, penalty_weight=p_w, model=mod, use_mmr=um)
        df_cnn[f'pager_summary_ns{num}_bw{b_w}_pw{p_w}_l{lmbd}_pg{pg}_mod{mod_tag}_um{um}'] = df_cnn.progress_apply(pager_func, axis=1)

100%|██████████| 500/500 [01:15<00:00,  6.67it/s]
100%|██████████| 500/500 [00:42<00:00, 11.75it/s]
100%|██████████| 500/500 [00:38<00:00, 12.88it/s]
100%|██████████| 500/500 [01:31<00:00,  5.47it/s]
100%|██████████| 500/500 [01:28<00:00,  5.65it/s]
100%|██████████| 500/500 [00:40<00:00, 12.28it/s]
100%|██████████| 500/500 [00:38<00:00, 13.03it/s]
100%|██████████| 500/500 [01:29<00:00,  5.61it/s]
100%|██████████| 500/500 [01:28<00:00,  5.67it/s]
100%|██████████| 500/500 [00:42<00:00, 11.84it/s]
100%|██████████| 500/500 [00:38<00:00, 12.99it/s]
100%|██████████| 500/500 [01:32<00:00,  5.39it/s]
100%|██████████| 500/500 [01:28<00:00,  5.64it/s]
100%|██████████| 500/500 [00:40<00:00, 12.46it/s]
100%|██████████| 500/500 [00:39<00:00, 12.77it/s]
100%|██████████| 500/500 [01:29<00:00,  5.57it/s]
100%|██████████| 500/500 [01:28<00:00,  5.63it/s]
100%|██████████| 500/500 [01:13<00:00,  6.84it/s]
100%|██████████| 500/500 [00:42<00:00, 11.70it/s]
100%|██████████| 500/500 [00:39<00:00, 12.77it/s]


## ROUGE computing

Once the dataframe containing the summary columns is arranged, ROUGE evaluation is performed column-wise: this means that the ROUGE scores are computed for each summary in a given column and then averaged to obtain an overall score for that summarization method.<br>
The evaluation returns three distinct ROUGE metrics:

* **rouge_1**: measures the overlap between the generated and reference summaries based on unigrams.<br>
* **rouge_2**: evaluates the overlap at the level of bigrams.<br>
* **rouge_L**: the metric is based on the Longest Common Subsequence.

In [None]:
# Every generated-summary column starts with one of these two prefixes
summary_prefixes = ('lda_summary', 'pager_summary')
prediction_cols = [name for name in df_cnn.columns if name.startswith(summary_prefixes)]

# Score each summary column against the reference summaries
df_rouge = compute_rouge_multiple_columns(df_cnn, prediction_cols)
# df_rouge.to_csv('Data/results/rouge_scores.csv', sep=";", index=False) # optional: persist the scores
df_rouge

Unnamed: 0,model,rouge1_f,rouge2_f,rougeL_f
77,pager_summary_ns5_bw0.0_pw0.0_l0.8_pgNone_modN...,0.243718,0.052074,0.145666
94,pager_summary_ns5_bw0.0_pw0.6_l0.8_pgNone_modN...,0.241690,0.050218,0.142450
96,pager_summary_ns5_bw0.0_pw0.6_l0.8_pgNone_modS...,0.241330,0.044980,0.139910
69,pager_summary_ns5_bw0.0_pw0.0_l0.2_pgNone_modN...,0.241022,0.043664,0.140792
86,pager_summary_ns5_bw0.0_pw0.6_l0.2_pgNone_modN...,0.240868,0.041782,0.138236
...,...,...,...,...
93,pager_summary_ns5_bw0.0_pw0.6_l0.2_pg10_modSen...,0.177420,0.041240,0.106366
110,pager_summary_ns5_bw0.6_pw0.0_l0.2_pg10_modSen...,0.177420,0.041240,0.106366
97,pager_summary_ns5_bw0.0_pw0.6_l0.8_pgNone_modS...,0.177420,0.041240,0.106366
106,pager_summary_ns5_bw0.6_pw0.0_l0.2_pgNone_modS...,0.177420,0.041240,0.106366


## BBC Dataframe with 'summary' columns building

After the ROUGE evaluation, the models with the best ROUGE-1 scores are used to extract summaries from the full text of `bbc-news-data`.<br>
To obtain three distinct types of model-built summaries, the following models are considered:

* pagerank + MMR using Tf-Idf representation
* pagerank + MMR using a sentence transformers-based model for sentence embedding
* lda 

The remaining hyperparameter values are set before each model is run.

In [None]:
# Load the tab-separated BBC news dataset
df_bbc = pd.read_csv(bbc_path, delimiter="\t")

In [None]:
# Enable DataFrame.progress_apply
tqdm.pandas()

# --- 1) Pagerank + MMR with no embedding model (the Tf-Idf variant) ---
ns_1, lamb_1 = 5, 0.8
bw_1, pw_1 = 0.0, 0.0
um_1, pg_1, mod_1 = True, None, None

pager_func_1 = partial(
    build_pager_summary,
    nlp,
    num_s=ns_1,
    pg=pg_1,
    lmbd_p=lamb_1,
    stop_words=all_stopwords,
    model=mod_1,
    use_mmr=um_1,
    bonus_weight=bw_1,
    penalty_weight=pw_1,
)
df_bbc['summary_tfidf'] = df_bbc.progress_apply(pager_func_1, axis=1)

# --- 2) Pagerank + MMR with the sentence-transformers embedding model ---
ns_2, lamb_2 = 5, 0.8
bw_2, pw_2 = 0.0, 0.6
um_2, pg_2, mod_2 = True, None, model

pager_func_2 = partial(
    build_pager_summary,
    nlp,
    num_s=ns_2,
    pg=pg_2,
    lmbd_p=lamb_2,
    stop_words=all_stopwords,
    model=mod_2,
    use_mmr=um_2,
    bonus_weight=bw_2,
    penalty_weight=pw_2,
)
df_bbc['summary_mod'] = df_bbc.progress_apply(pager_func_2, axis=1)

# --- 3) LDA ---
ns_3, nt = 5, 2
bw_3, pw_3 = 0.0, 0.6

lda_func = partial(
    build_lda_summary,
    nlp,
    num_s=ns_3,
    nt=nt,
    stop_words=all_stopwords,
    bonus_weight=bw_3,
    penalty_weight=pw_3,
)
df_bbc['summary_lda'] = df_bbc.progress_apply(lda_func, axis=1)

# df_bbc.to_csv('Data/output/bbc-news-data-summaries-new.csv', sep='\t', index=False) # optional: persist the summaries

100%|██████████| 2225/2225 [02:19<00:00, 15.98it/s]
100%|██████████| 2225/2225 [04:26<00:00,  8.36it/s]
100%|██████████| 2225/2225 [03:09<00:00, 11.76it/s]
