# ROUGE evaluation

Calculates ROUGE evaluation metrics (ROUGE-1, ROUGE-2, ROUGE-3, ROUGE-L) for summaries produced by the KL-Sum and TextRank (with and without weighting) strategies, compares them with the baseline summaries, and uses the LLM-generated summaries as the reference.

## Installations and imports

In [1]:
import pandas as pd
import json
!pip install rouge_score
from rouge_score import rouge_scorer

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=f1c41a1d2850e15c1a5060a3d01ce2140a93b7dbf52eb612e8e1be90fa812d29
  Stored in directory: /root/.cache/pip/wheels/85/9d/af/01feefbe7d55ef5468796f0c68225b6788e85d9d0a281e7a70
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


## Load data and build a merged DataFrame of all summaries

In [2]:

def _read_json(path):
    """Parse the UTF-8 encoded JSON file at ``path`` and return its content."""
    with open(path, 'r', encoding='utf-8') as handle:
        return json.load(handle)


# One JSON export per summarisation strategy, plus the reference set.
ref_summaries = _read_json('/content/summaries_with_reference_1k.json')
kl_summaries = _read_json('/content/summaries_kl_sum.json')
w_text_rank_summaries = _read_json('/content/summaries_w_text_rank_map_reduce.json')
text_rank_summaries = _read_json('/content/summaries_text_rank_map_reduce.json')

In [3]:
# Pull the record list stored under 'data' out of every loaded file.
ref_data = ref_summaries['data']
kl_data = kl_summaries['data']
w_text_rank_data = w_text_rank_summaries['data']
text_rank_data = text_rank_summaries['data']

# The reference frame is the only one that keeps the source text and the
# 'summary' column; the other frames drop them before merging.
ref_df = pd.json_normalize(ref_data)

repeated_columns = ['markdown_content', 'summary']
kl_df = pd.json_normalize(kl_data).drop(columns=repeated_columns)
text_rank_df = pd.json_normalize(text_rank_data).drop(columns=repeated_columns)
# The weighted TextRank summary gets its own column name before the merge.
w_text_rank_df = (
    pd.json_normalize(w_text_rank_data)
    .rename(columns={'textrank_summary': 'w_textrank_summary'})
    .drop(columns=repeated_columns)
)

# Inner-join everything on the page URL, one strategy at a time.
merged_df = (
    ref_df
    .merge(kl_df, on='url', suffixes=('_ref', '_kl'))
    .merge(w_text_rank_df, on='url', suffixes=('', '_wtr'))
    .merge(text_rank_df, on='url', suffixes=('', '_tr'))
)

In [4]:
# Replace the suffixed 'ref_summary_*' names with shorter 'summary_*' names.
merged_df = merged_df.rename(
    columns={
        'ref_summary_ref': 'summary_ref',
        'ref_summary_kl': 'summary_kl',
    }
)
merged_df.columns

Index(['url', 'markdown_content', 'summary', 'summary_ref', 'summary_kl',
       'w_textrank_summary', 'textrank_summary'],
      dtype='object')

In [5]:
# Preview the first five merged rows.
merged_df.head(n=5)

Unnamed: 0,url,markdown_content,summary,summary_ref,summary_kl,w_textrank_summary,textrank_summary
0,https://www.microsoft.com/wdsi/definitions,Latest security intelligence updates for Micro...,Latest security intelligence updates for Micro...,Microsoft security intelligence updates contin...,Skip to main content)\n* Tech & innovation T...,* Microsoft Threat Protection\nLatest securi...,* Microsoft Threat Protection\nLatest securi...
1,https://learn.microsoft.com/en-us/defender-end...,Microsoft Defender Antivirus security intellig...,Microsoft Defender Antivirus security intellig...,Summary:\nThe article explains how Microsoft D...,"1.\nTo see the most current engine, platform, ...",Microsoft Defender Antivirus security intellig...,* Microsoft Defender Antivirus security inte...
2,https://www.reddit.com/r/SCCM/comments/1954ghr...,Reddit - The heart of the internet\n\n========...,Reddit - The heart of the internet Image 1: r/...,The page is a Reddit post in r/SCCM about the ...,Skip to main content\nOpen menu Open navigatio...,* Movies & TV\nReddit - The heart of the int...,* Movies & TV\nReddit - The heart of the int...
3,https://www.catalog.update.microsoft.com/Searc...,Microsoft Update Catalog\n\n===============\n\...,Microsoft®Update CatalogFAQ|helpImage 1Image 2...,The page is a Microsoft Update Catalog FAQ dis...,Microsoft®Update CatalogFAQ|help view baske...,NET 8.0 | Security Updates | 6/10/2025 | n/a |...,NET 8.0 | Security Updates | 6/10/2025 | n/a |...
4,https://answers.microsoft.com/en-us/windows/fo...,KB2267602 continually wanting to update - Micr...,KB2267602 continually wanting to update - Micr...,- The issue: Windows Defender definition updat...,"Join us to grow your skills, build connections...",Upgrade to Microsoft Edge to take advantage of...,Upgrade to Microsoft Edge to take advantage of...


## Calculate score
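Scores every candidate summary (KL-Sum, weighted TextRank, TextRank, baseline) against the reference summary with a stemming ROUGE scorer, storing recall, precision and F-measure for ROUGE-1, ROUGE-2, ROUGE-3 and ROUGE-L.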

In [7]:
# One scorer instance is reused for every row and every summary type.
scorer = rouge_scorer.RougeScorer(
    rouge_types=['rouge1', 'rouge2', 'rouge3', 'rougeL'],
    use_stemmer=True,
)

In [8]:
# Score columns are named '<rouge type>_<measure>_<summary type>'.
rouge_types = ['rouge1', 'rouge2', 'rouge3', 'rougeL']
summary_types = ['kl', 'wtr', 'baseline', 'tr']
measures = ['recall', 'precision', 'fmeasure']

# Column of merged_df that holds the candidate text for each summary type.
candidate_columns = {
    'kl': 'summary_kl',
    'wtr': 'w_textrank_summary',
    'baseline': 'summary',
    'tr': 'textrank_summary',
}

# Create every score column up front, filled with 0.0.
score_columns = [
    f'{metric}_{measure}_{kind}'
    for metric in rouge_types
    for kind in summary_types
    for measure in measures
]
for score_column in score_columns:
    merged_df[score_column] = 0.0

# Score each row's candidates against that row's reference summary.
for row_label, record in merged_df.iterrows():
    target = record['summary_ref']

    # Run the scorer for all candidates first, then write the results back.
    row_scores = {
        kind: scorer.score(target, record[candidate_columns[kind]])
        for kind in summary_types
    }
    for metric in rouge_types:
        for kind in summary_types:
            result = row_scores[kind][metric]
            for measure in measures:
                merged_df.at[row_label, f'{metric}_{measure}_{kind}'] = getattr(result, measure)

print(f"Added {len(rouge_types) * len(summary_types) * len(measures)} ROUGE score columns to merged_df")
print(f"Total columns: {len(merged_df.columns)}")

Added 48 ROUGE score columns to merged_df
Total columns: 55


In [9]:
# Bucket documents by the character length of their markdown content,
# cutting at the 33% and 67% quantiles of that length.
merged_df['content_length'] = merged_df['markdown_content'].str.len()
lengths = merged_df['content_length']
q33, q67 = (lengths.quantile(cut) for cut in (0.33, 0.67))

# Both bounds are spelled out for the medium bucket; a missing length
# compares False everywhere and therefore lands in no bucket.
is_short = lengths <= q33
is_medium = (lengths > q33) & (lengths <= q67)
is_long = lengths > q67

# Independent copies, so later changes to a bucket leave merged_df alone.
short_df = merged_df.loc[is_short].copy()
medium_df = merged_df.loc[is_medium].copy()
long_df = merged_df.loc[is_long].copy()

## Results

In [10]:
# Mean F-measure over all documents, per ROUGE type and summary strategy.
# The loop variable is deliberately not called `type`: at notebook top level
# that would rebind the builtin to a string for every cell that runs later.
fmeasure_labels = (
    ("kl", "kl"),
    ("textrank", "tr"),
    ("weighted textrank", "wtr"),
    ("baseline", "baseline"),
)

for rouge_type in rouge_types:
    print("-----")
    for label, suffix in fmeasure_labels:
        mean_score = merged_df[f"{rouge_type}_fmeasure_{suffix}"].mean()
        print(f"{rouge_type} fmeasure {label} scores mean: {mean_score}")

-----
rouge1 fmeasure kl scores mean: 0.3528209799304532
rouge1 fmeasure textrank scores mean: 0.2973919720992021
rouge1 fmeasure weighted textrank scores mean: 0.3003672873936247
rouge1 fmeasure baseline scores mean: 0.30011383589865365
-----
rouge2 fmeasure kl scores mean: 0.12286681714070147
rouge2 fmeasure textrank scores mean: 0.10086704157496501
rouge2 fmeasure weighted textrank scores mean: 0.10158281238898206
rouge2 fmeasure baseline scores mean: 0.0985141074797333
-----
rouge3 fmeasure kl scores mean: 0.061420769449701706
rouge3 fmeasure textrank scores mean: 0.04938790393095719
rouge3 fmeasure weighted textrank scores mean: 0.04981575634634657
rouge3 fmeasure baseline scores mean: 0.047905495100763244
-----
rougeL fmeasure kl scores mean: 0.203970612961571
rougeL fmeasure textrank scores mean: 0.1703045284405102
rougeL fmeasure weighted textrank scores mean: 0.17217883166505668
rougeL fmeasure baseline scores mean: 0.18063079629154066


In [11]:
# Mean F-measure per document-length bucket, ROUGE type and summary strategy.
# A single loop over (title, frame) pairs replaces three copy-pasted loops,
# and the loop variable no longer shadows the builtin `type`.
length_buckets = (
    ("SHORT DOCUMENTS", short_df),
    ("MEDIUM DOCUMENTS", medium_df),
    ("LONG DOCUMENTS", long_df),
)

for title, bucket_df in length_buckets:
    print("\n" + "="*60)
    print(title)
    print("="*60)
    for rouge_type in rouge_types:
        print("-----")
        for suffix in ("kl", "tr", "wtr", "baseline"):
            mean_score = bucket_df[f"{rouge_type}_fmeasure_{suffix}"].mean()
            print(f"{rouge_type} fmeasure {suffix} scores mean: {mean_score:.4f}")



SHORT DOCUMENTS
-----
rouge1 fmeasure kl scores mean: 0.4631
rouge1 fmeasure tr scores mean: 0.4099
rouge1 fmeasure wtr scores mean: 0.4137
rouge1 fmeasure baseline scores mean: 0.3819
-----
rouge2 fmeasure kl scores mean: 0.1999
rouge2 fmeasure tr scores mean: 0.1741
rouge2 fmeasure wtr scores mean: 0.1733
rouge2 fmeasure baseline scores mean: 0.1582
-----
rouge3 fmeasure kl scores mean: 0.1089
rouge3 fmeasure tr scores mean: 0.0941
rouge3 fmeasure wtr scores mean: 0.0930
rouge3 fmeasure baseline scores mean: 0.0871
-----
rougeL fmeasure kl scores mean: 0.3070
rougeL fmeasure tr scores mean: 0.2488
rougeL fmeasure wtr scores mean: 0.2510
rougeL fmeasure baseline scores mean: 0.2537

MEDIUM DOCUMENTS
-----
rouge1 fmeasure kl scores mean: 0.3604
rouge1 fmeasure tr scores mean: 0.2825
rouge1 fmeasure wtr scores mean: 0.2891
rouge1 fmeasure baseline scores mean: 0.2963
-----
rouge2 fmeasure kl scores mean: 0.1140
rouge2 fmeasure tr scores mean: 0.0827
rouge2 fmeasure wtr scores mean: 0.0