# Libraries

In [None]:
import re
from util import lda_summarizer, pager_summarizer
import pandas as pd
import os
import spacy
import evaluate
from tqdm import tqdm
from nltk.corpus import stopwords
from functools import partial
from sentence_transformers import SentenceTransformer
from dotenv import load_dotenv



# Settings

In [None]:
load_dotenv() # .env loading
model = SentenceTransformer('all-MiniLM-L6-v2') # sentence transformers model init

# Data path
cnn_path = os.getenv("CNN_DIR")
bbc_path = os.getenv("BBC_DIR")

# lm import through 'spacy'
nlp = spacy.load("en_core_web_sm")

# stopwords definition with custom list addition
stop_words = set(stopwords.words('english'))
added_stopwords = {
        "however", "yet", "although", "though", "even though", "nevertheless", "nonetheless",
        "still", "despite", "in spite of", "whereas", "alternatively", "instead", "regardless",
        "notwithstanding", "albeit", "conversely", "be that as it may", "even so", "that said",
        "even if", "except", "rather", "apart from", "despite that", "then again", "in contrast",
        "after all"
    }
    
all_stopwords = set(stop_words).union(added_stopwords)

# Functions

In [None]:
def clean_cnn_patterns(text):
    """
    Cleans input text by removing specific patterns
    The following are performed:
    - Returns the input unchanged if it is not a string.
    - Removes all characters up to and including the '(CNN)' tag and any following dashes.
    - Removes tags as '(CNET)' and '(WIRED)'.
    - Removes initial parenthetical phrases.
    - Removes initial HTML tags.
    - Strips out leading non-word characters (e.g., punctuation).
    - Collapses multiple spaces into a single space.
    - Returns the cleaned and trimmed string.
    """

    if not isinstance(text, str):
        return text

    text = re.sub(r'^.*?\(CNN\)\s*[-–—]*\s*', '', text) # text up to (CNN) removal
    text = re.sub(r'\((CNET|WIRED)\)', '', text) # specific intial pattern removal
    text = re.sub(r'^\s*\([^)]*\)\s*', '', text) # parantheses with pattern removal
    text = re.sub(r'^\s*<[^>]+>\s*', '', text) # html tags removal
    text = re.sub(r'^[^\w]+', '', text) # initial punkt removal
    text = re.sub(r'\s+', ' ', text) # multiple spaces removal
    return text.strip()

def clean_fulltext_column(df, column):
    df[column] = df[column].apply(clean_cnn_patterns)
    return df


In [None]:
def clean_prefix_ref_summary(text):
    """
    Cleans input text by removing specific patterns
    The following are performed:
     - Returns the input unchanged if it is not a string.
     - Removes initial patterns as 'NEW:' and 'CDC:'.
     - Leaves a single whitspace in case of multiple blanks
     - Normalizes quotes
     - Removes all characters but alphanumeric and a short set of symbols
    """
    if not isinstance(text, str):
        return text

    text = re.sub(r'^(NEW:|CDC:)\s*', '', text, flags=re.IGNORECASE)
    text = re.sub(r'\s+', ' ', text)  
    text = re.sub(r'[“”’]', " ", text)  
    text = re.sub(r'[^a-zA-Z0-9%£$.,;:\-\s]', '', text)
    return text.strip()

def clean_ref_summary_column(df, column):
    df[column] = df[column].apply(clean_prefix_ref_summary)
    return df

In [None]:
def build_lda_summary(nlp, row, num_s, nt, stop_words, bonus_weight=0.6, penalty_weight=0.4):
    """
    Generates an extractive summary of a news article using LDA-based topic modeling.

    Parameters:
    - nlp: A spaCy language model used for tokenization.
    - row: A row from a DataFrame containing the article text (expects a 'content' or 'news' field).
    - num_s: Desired number of sentences in the summary.
    - nt: Number of topics to consider in the LDA model.
    - stop_words: List or set of stopwords to remove.
    - bonus_weight: Weight factor to boost topic-relevant sentences.
    - penalty_weight: Weight factor to penalize redundant or off-topic sentences.

    Returns:
    - A string containing the summary, or an empty string in case of error.
    """
    
    try:
        return lda_summarizer(
            nlp=nlp,
            # text=row['news'], # if run 'cnn_dailymail' please uncomment this line
            text=row['content'], # if run 'bbc-news-data' please uncomment this line
            num_sentences=num_s,
            num_topics=nt,
            remove_stopwords=True,
            stop_words=all_stopwords,
            bonus_weight=bonus_weight,
            penalty_weight=penalty_weight
        )
    except Exception as e:
        return ""

def build_pager_summary(nlp, row, num_s, pg, use_mmr, lmbd_p, stop_words, model, 
                        bonus_weight=0.6, penalty_weight=0.4):
    """
    Generates an extractive summary of a news article using PageRank.

    Parameters:
    - nlp: A spaCy language model used for tokenization.
    - row: A row from a DataFrame containing the article text (expects a 'content' or 'news' field).
    - num_s: Desired number of sentences in the summary.
    - pg: Custom integer value chosen as the top-k rank sentences to be extracted by PageRank.
    - use_mmr: If True, Maximum Marginal Relevance is performed.
    - lmbd_p: Lambda parameter used by MMR.
    - stop_words: List or set of stopwords to remove.
    - bonus_weight: Weight factor to boost topic-relevant sentences.
    - penalty_weight: Weight factor to penalize redundant or off-topic sentences.

    Returns:
    - A string containing the summary, or an empty string in case of error.
    """
    try:
        return pager_summarizer(
            nlp=nlp,
            # text=row['news'], # if run 'cnn_dailymail' please uncomment this line
            text=row['content'], # if run 'bbc-news-data' please uncomment this line
            num_sentences=num_s,
            use_mmr=use_mmr,
            lambda_param=lmbd_p,
            remove_stopwords=True,
            stop_words=all_stopwords,
            bonus_weight=bonus_weight,
            penalty_weight=penalty_weight,
            pagerank_top_k=pg,
            embedding_model=model
        )
    except Exception as e:
        return ""

In [6]:
def compute_rouge_multiple_columns(df, prediction_cols, reference_col='ref_summary'):
    """
    ROUGE evaluation function using HuggingFace's evaluate library.

    This function evaluates the quality of multiple predicted summary columns against a reference summary column
    ('ref_summary') using ROUGE metrics (ROUGE-1, ROUGE-2, ROUGE-L). It returns a DataFrame reporting the average
    f-measure scores for each summarization method (identified by the column name).

    Args:
        df (pd.DataFrame): The DataFrame containing the predictions and reference summaries.
        prediction_cols (list): List of column names containing the generated summaries to evaluate.
        reference_col (str): Name of the column containing the reference summaries.

    Returns:
        pd.DataFrame: A DataFrame containing the average ROUGE scores per prediction column.
    """
    rouge = evaluate.load("rouge")
    results = {}

    for pred_col in prediction_cols:
        predictions = df[pred_col].fillna("").tolist()
        references = df[reference_col].fillna("").tolist()

        try:
            scores = rouge.compute(predictions=predictions, references=references, use_stemmer=True)
        except Exception:
            scores = {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0}

        results[pred_col] = {
            'rouge1_f': round(scores['rouge1'], 3),
            'rouge2_f': round(scores['rouge2'], 3),
            'rougeL_f': round(scores['rougeL'], 3),
        }

    df_result = pd.DataFrame(results).T.reset_index()
    df_result = df_result.rename(columns={"index": "model"})
    return df_result.sort_values('rouge1_f', ascending=False)


# Tuning & Summary building

The aim of the current tuning procedure is to evaluate summarization models using the ROUGE score as the primary evaluation metric.<br>
Since the original `bbc-news-data` dataset does not include reference summaries, the models previously developed in `model_building.py` are now tested on a different dataset: CNN/DailyMail.<br> This dataset collects news articles from both CNN and the Daily Mail, and includes the full text of each article with highlights summaries which have been written by humans.<br> These highlights consist of one or multiple sentences that are either directly extracted from the article or closely paraphrased.<br>
Despite row data can be found here https://github.com/google-deepmind/rc-data , a preprocessed-short version of the dataset has been downloaded from https://www.kaggle.com/datasets/yatharthgautam123789/cnn-dailymail-3-0-0

## Read data

In [None]:
df_cnn = pd.read_csv(cnn_path, sep=",", usecols=['article', 'highlights']) # cnn df read
df_cnn = df_cnn.rename(columns={'article': 'news', 'highlights': 'ref_summary'}) # col rename
df_cnn = df_cnn.sample(frac=0.01, random_state=123) # Using 5% of the rows for sustainable tuningg

#### 'news' and 'summary' columns clean up

In [8]:
df_cnn_cln = clean_fulltext_column(df_cnn, column='news')
df_cnn_cln = clean_ref_summary_column(df_cnn, column='ref_summary')
df_cnn_cln

Unnamed: 0,news,ref_summary
2656,Bayern German manager Van Der van Sar today ex...,Louis Van Gaal signs a contract extension with...
445,The death toll from a shooting at a house part...,A prosecutor says 14 people were killed and 14...
9505,The A Week In America ended telling this quint...,Pueblos and powwows in New Mexico highlight Na...
332,Only a third of U.S. voters think that most me...,Just 34 percent of U.S. voters think most memb...
4168,"Se wayne Johnson , in his , second of the matc...",Mitchell Johnson inspires Australia to innings...
2364,"The problem is spread across Greek , rather th...","Greece is close to Turkey, has an influx of il..."
6097,As its International games has been reinstated...,Its the first time a ban of a national committ...
7,While Labor Day is the unofficial end of summe...,Labor Day is the unofficial end of summer and ...
7752,"according To Liu Yeqing li , the woman woman m...",Chinas one-child policy results in forced abor...
4453,As Democrats miss another critical opportunity...,Democrats say they have no regrets over changi...


## Summary extraction

A grid search approach is performed to tune hyperparameters, by taking into account the same values which have been used in `model_building.py`.<br>
To ensure the tuning process remains computationally manageable, a small random batch of the data (5%) was extracted.<br>
The procedure returns a pandas.DataFrame containing, for each iteration of hyperparameter values, the summary column named as the specific values-combination. The values of the column are the related summaries built with that combination.

In [None]:
tqdm.pandas()

# Grid search using custom-defined parameter values
num_sent = [2, 5] 
num_topic = [2, 4] 
bonus_weights = [0.0, 0.6]
penalty_weights = [0.0, 0.6]
lamb_list = [0.4, 0.8]
pgs = [None, 10]
models = [None, model]
use_mmr = [True, False]

# LDA
for num in num_sent:
    for n_t in num_topic:
        for b_w in bonus_weights:
            for p_w in penalty_weights:
                lda_func = partial(build_lda_summary, nlp, num_s=num, nt=n_t, stop_words=all_stopwords, bonus_weight=b_w, penalty_weight=p_w)
                df_cnn_cln[f'lda_ns{num}_nt{n_t}_bw{b_w}_pw{p_w}'] = df_cnn_cln.progress_apply(lda_func, axis=1)

                # Pagerank-MMR
                for lmbd in lamb_list:
                    for pg in pgs:
                        for mod in models:
                            for um in use_mmr:
                                pager_func = partial(build_pager_summary, nlp, pg=pg, num_s=num, lmbd_p=lmbd, stop_words=all_stopwords, bonus_weight=b_w, penalty_weight=p_w, model=mod, use_mmr=um)
                                df_cnn_cln[f'pr_ns{num}_l{lmbd}_bw{b_w}_pw{p_w}_pg{pg}_um{um}_mod{mod}'] = df_cnn_cln.progress_apply(pager_func, axis=1)

## ROUGE computing

Once the dataframe containing the summary columns is arrenged, ROUGE evaluation is performed column-wise: this means that the ROUGE scores are computed for each summary in a given column and then averaged to obtain an overall score for that summarization method.<br>
The evaluation returns three distinct ROUGE metrics:

* **rouge_1**: measures the overlap between the generated and reference summaries based on unigrams.<br>
* **rouge_2**: evaluates the overlap at the level of bigrams.<br>
* **rouge_L**: the metric is based on the Longest Common Subsequence.

In [None]:
prediction_cols = [col for col in df_cnn_cln.columns if col.startswith('lda') or col.startswith('pr')]
df_rouge = compute_rouge_multiple_columns(df_cnn_cln, prediction_cols)
df_rouge
# df_rouge.to_csv('Data/results/rouge_scores.csv', sep=";", index=False) # results df writing

Unnamed: 0,model,rouge1_f,rouge2_f,rougeL_f
9,pr_ns2_l0.8_bw0.0_pw0.0_pgNone_umTrue_modNone,0.323,0.059,0.190
13,pr_ns2_l0.8_bw0.0_pw0.0_pg10_umTrue_modNone,0.300,0.068,0.186
43,pr_ns2_l0.8_bw0.6_pw0.0_pgNone_umTrue_modNone,0.297,0.059,0.185
47,pr_ns2_l0.8_bw0.6_pw0.0_pg10_umTrue_modNone,0.292,0.067,0.183
73,pr_ns5_l0.4_bw0.0_pw0.0_pgNone_umTrue_modNone,0.290,0.061,0.172
...,...,...,...,...
125,pr_ns5_l0.4_bw0.6_pw0.6_pgNone_umFalse_modNone,0.216,0.068,0.143
140,lda_ns5_nt4_bw0.0_pw0.0,0.214,0.053,0.126
37,pr_ns2_l0.4_bw0.6_pw0.0_pgNone_umTrue_modSente...,0.204,0.037,0.139
3,pr_ns2_l0.4_bw0.0_pw0.0_pgNone_umTrue_modSente...,0.202,0.033,0.132


## Unifying scoring metrics

Since a previous unsupervised evaluation was conducted on the original dataset (namely `bbc-news-data`), the same tuning procedure was applied.<br>
Also in that case, a model-dependent summarization was performed for each iteration over the data batch.
The quality of the summaries was assessed using the **BLANC** score, and the results were stored in a .csv file.<br>
To obtain a unified overall metric to guide the selection of the best-performing models, the sum of **ROUGE** and **BLANC** scores corresponding to each model was considered.

In [None]:
blanc_s = pd.read_csv(r"C:\Users\dibmir\text-summarization\blanc_scores.csv", sep=";")
rouge_s = pd.read_csv(r"C:\Users\dibmir\text-summarization\Data\results\rouge_scores.csv", sep=";")

In [None]:
score_merged = pd.merge(left=blanc_s, right=rouge_s, on='model', how='inner')
score_merged['unified_score'] = score_merged['avg_blanc_score'] + score_merged['rouge1_f']
score_merged_sort = score_merged.sort_values('unified_score', ascending=False)
score_merged_sort[['model', 'avg_blanc_score', 'rouge1_f', 'unified_score']]

Unnamed: 0,model,avg_blanc_score,rouge1_f,unified_score
136,pr_ns5_l0.8_bw0.6_pw0.6_pgNone_umTrue_modNone,0.302983,0.268,0.570983
113,pr_ns5_l0.8_bw0.0_pw0.0_pgNone_umTrue_modSente...,0.286883,0.283,0.569883
10,lda_ns5_nt2_bw0.6_pw0.0,0.318033,0.243,0.561033
141,pr_ns5_l0.8_bw0.6_pw0.6_pg10_umTrue_modSentenc...,0.301333,0.259,0.560333
125,pr_ns5_l0.8_bw0.0_pw0.6_pg10_umTrue_modSentenc...,0.296016,0.264,0.560016
...,...,...,...,...
27,pr_ns2_l0.4_bw0.0_pw0.6_pgNone_umFalse_modSent...,0.142867,0.235,0.377867
0,lda_ns2_nt2_bw0.0_pw0.0,0.178208,0.197,0.375208
25,pr_ns2_l0.4_bw0.0_pw0.6_pgNone_umTrue_modSente...,0.142867,0.231,0.373867
33,pr_ns2_l0.4_bw0.6_pw0.0_pgNone_umTrue_modSente...,0.151509,0.204,0.355509


## BBC Dataframe with 'summary' columns building

The top-3 best performer models are extracted to build summaries on the original dataset.<br>
Specifically, models address the following:

* PageRank + MMR using Tf-Idf representation
* PageRank + MMR using a sentence transformers-based model for sentence embeddings
* LDA 

The rest of hyperparameter values are set before each model running.

In [27]:
df_bbc = pd.read_csv(bbc_path, sep="\t") # bbc df read

### Summaries building

In [None]:
tqdm.pandas()

# # # PageRank + MMR using tfidf
ns_1 = 5
lamb_1 = 0.8
bw_1 = 0.6
pw_1 = 0.6
um_1 = True
pg_1= None
mod_1 = None

pager_func_1 = partial(build_pager_summary, nlp, num_s=ns_1, pg=pg_1, 
                     lmbd_p=lamb_1, stop_words=all_stopwords, model=mod_1, 
                     use_mmr=um_1, bonus_weight = bw_1, penalty_weight = pw_1)
df_bbc['summary_tfidf'] = df_bbc.progress_apply(pager_func_1, axis=1)

# # # PageRank + MMR using sentence transformers
ns_2 = 5
lamb_2 = 0.8
bw_2 = 0.0
pw_2 = 0.0
um_2 = True
pg_2= None
mod_2 = model

pager_func_2 = partial(build_pager_summary, nlp, num_s=ns_2, pg=pg_2, 
                     lmbd_p=lamb_2, stop_words=all_stopwords, model=mod_2, 
                     use_mmr=um_2, bonus_weight = bw_2, penalty_weight = pw_2)
df_bbc['summary_transf'] = df_bbc.progress_apply(pager_func_2, axis=1)


# # # LDA
ns_3 = 5
nt = 2
bw_3 = 0.6
pw_3 = 0.0

lda_func = partial(build_lda_summary, nlp, num_s=ns_3, nt=nt, 
                     stop_words=all_stopwords, bonus_weight = bw_3, 
                     penalty_weight = pw_3)
df_bbc['summary_lda'] = df_bbc.progress_apply(lda_func, axis=1)


In [None]:
# df_bbc.to_csv('Data/output/bbc-news-data-summaries-new.csv', sep='\t', index=False) # bbc df with summary cols writing