# BERTScore evaluation

Calculates BERTScore precision, recall, and F1 of summaries based on strategies: KL-SUM, TextRank (w/o weightning) comparing to the baseline summaries, and using the LLM generated reference summaries.

## Installations and imports

In [1]:
!pip install bert_score

Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bert_score
Successfully installed bert_score-0.3.13


In [2]:
!pip install langdetect

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/981.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m972.8/981.5 kB[0m [31m31.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m22.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993223 sha256=e5f2d33c929c0f8804c77cdb5b3c5047966c67329a9f1b2ff296b7c2d2ca7995
  Stored in directory: /root/.cache/pip/wheels/c1/67/88/e844b5b022812e15a52e4eaa38a1e709e99f06f6639d7e3ba7
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.9


In [None]:
import pandas as pd
import json
from bert_score import score
from langdetect import detect, LangDetectException
import numpy as np

## Load data and build data frame

In [None]:
with open('/content/summaries_with_reference_1k.json', 'r', encoding='utf-8') as f:
    ref_summaries = json.load(f)


with open('/content/summaries_kl_sum.json', 'r', encoding='utf-8') as f:
    kl_summaries = json.load(f)

with open('/content/summaries_w_text_rank_map_reduce.json', 'r', encoding='utf-8') as f:
    w_text_rank_summaries = json.load(f)

with open('/content/summaries_text_rank_map_reduce.json', 'r', encoding='utf-8') as f:
    text_rank_summaries = json.load(f)

In [None]:
ref_data = ref_summaries['data']
ref_df = pd.json_normalize(ref_data)

kl_data = kl_summaries['data']
kl_df = pd.json_normalize(kl_data)

w_text_rank_data = w_text_rank_summaries['data']
w_text_rank_df = pd.json_normalize(w_text_rank_data)
w_text_rank_df.rename(columns={'textrank_summary': 'w_textrank_summary'}, inplace=True)

text_rank_data = text_rank_summaries['data']
text_rank_df = pd.json_normalize(text_rank_data)

kl_df.drop(columns=['markdown_content', 'summary'], inplace=True)
text_rank_df.drop(columns=['markdown_content', 'summary'], inplace=True)
w_text_rank_df.drop(columns=['markdown_content', 'summary'], inplace=True)

merged_df = ref_df.merge(kl_df, on='url', suffixes=('_ref', '_kl'))
merged_df = merged_df.merge(w_text_rank_df, on='url', suffixes=('', '_wtr'))
merged_df = merged_df.merge(text_rank_df, on='url', suffixes=('', '_tr'))

In [12]:
merged_df.rename(columns={'ref_summary_kl': 'summary_kl', 'ref_summary_ref': 'summary_ref'}, inplace=True)
merged_df.columns

Index(['url', 'markdown_content', 'summary', 'summary_ref', 'summary_kl',
       'w_textrank_summary', 'textrank_summary'],
      dtype='object')

## Calculate score
Uses language detector to supply language to BERTScore evaluator

In [14]:
# Detect language for each reference summary
def detect_language(text):
    try:
        return detect(text)
    except LangDetectException:
        return 'en'  # Default to English

merged_df['language'] = merged_df['summary_ref'].apply(detect_language)

In [None]:
# Calculate BERTScore for each summary type, grouped by language
# Using bert-base-uncased (light model)

# Initialize arrays to store results in the original order
P_kl = np.zeros(len(merged_df))
R_kl = np.zeros(len(merged_df))
F1_kl = np.zeros(len(merged_df))

P_tr = np.zeros(len(merged_df))
R_tr = np.zeros(len(merged_df))
F1_tr = np.zeros(len(merged_df))

P_wtr = np.zeros(len(merged_df))
R_wtr = np.zeros(len(merged_df))
F1_wtr = np.zeros(len(merged_df))

P_baseline = np.zeros(len(merged_df))
R_baseline = np.zeros(len(merged_df))
F1_baseline = np.zeros(len(merged_df))

# Process each language group separately
for lang in merged_df['language'].unique():
    lang_mask = merged_df['language'] == lang
    lang_indices = merged_df[lang_mask].index.tolist()

    print(f"\n{'='*60}")
    print(f"Processing language: {lang} ({sum(lang_mask)} documents)")
    print(f"{'='*60}")

    # Get summaries for this language
    refs = merged_df.loc[lang_mask, 'summary_ref'].tolist()
    kl_sums = merged_df.loc[lang_mask, 'summary_kl'].tolist()
    tr_sums = merged_df.loc[lang_mask, 'textrank_summary'].tolist()
    wtr_sums = merged_df.loc[lang_mask, 'w_textrank_summary'].tolist()
    baseline_sums = merged_df.loc[lang_mask, 'summary'].tolist()

    # Calculate BERTScore for KL summaries
    print(f"  Calculating BERTScore for KL summaries...")
    P_kl_lang, R_kl_lang, F1_kl_lang = score(
        kl_sums, refs,
        lang=lang,
        model_type='bert-base-multilingual-cased',
        verbose=False
    )

    # Calculate BERTScore for weighted TextRank summaries
    print(f"  Calculating BERTScore for weighted TextRank summaries...")
    P_wtr_lang, R_wtr_lang, F1_wtr_lang = score(
        wtr_sums, refs,
        lang=lang,
        model_type='bert-base-multilingual-cased',
        verbose=False
    )

    # Calculate BERTScore for weighted TextRank summaries
    print(f"  Calculating BERTScore for TextRank summaries...")
    P_tr_lang, R_tr_lang, F1_tr_lang = score(
        tr_sums, refs,
        lang=lang,
        model_type='bert-base-multilingual-cased',
        verbose=False
    )

    # Calculate BERTScore for baseline summaries
    print(f"  Calculating BERTScore for baseline summaries...")
    P_baseline_lang, R_baseline_lang, F1_baseline_lang = score(
        baseline_sums, refs,
        lang=lang,
        model_type='bert-base-multilingual-cased',
        verbose=False
    )

    # Store results at the correct indices
    for i, orig_idx in enumerate(lang_indices):
        P_kl[orig_idx] = P_kl_lang[i].item()
        R_kl[orig_idx] = R_kl_lang[i].item()
        F1_kl[orig_idx] = F1_kl_lang[i].item()

        P_tr[orig_idx] = P_tr_lang[i].item()
        R_tr[orig_idx] = R_tr_lang[i].item()
        F1_tr[orig_idx] = F1_tr_lang[i].item()

        P_wtr[orig_idx] = P_wtr_lang[i].item()
        R_wtr[orig_idx] = R_wtr_lang[i].item()
        F1_wtr[orig_idx] = F1_wtr_lang[i].item()

        P_baseline[orig_idx] = P_baseline_lang[i].item()
        R_baseline[orig_idx] = R_baseline_lang[i].item()
        F1_baseline[orig_idx] = F1_baseline_lang[i].item()

print("\n" + "="*60)
print("BERTScore calculation complete for all languages!")
print("="*60)


Processing language: en (598 documents)
  Calculating BERTScore for KL summaries...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]



  Calculating BERTScore for weighted TextRank summaries...




  Calculating BERTScore for TextRank summaries...




  Calculating BERTScore for baseline summaries...





Processing language: ja (38 documents)
  Calculating BERTScore for KL summaries...




  Calculating BERTScore for weighted TextRank summaries...
  Calculating BERTScore for TextRank summaries...
  Calculating BERTScore for baseline summaries...





Processing language: fr (8 documents)
  Calculating BERTScore for KL summaries...
  Calculating BERTScore for weighted TextRank summaries...
  Calculating BERTScore for TextRank summaries...
  Calculating BERTScore for baseline summaries...





Processing language: zh-cn (6 documents)
  Calculating BERTScore for KL summaries...
  Calculating BERTScore for weighted TextRank summaries...
  Calculating BERTScore for TextRank summaries...
  Calculating BERTScore for baseline summaries...

Processing language: es (2 documents)
  Calculating BERTScore for KL summaries...
  Calculating BERTScore for weighted TextRank summaries...
  Calculating BERTScore for TextRank summaries...
  Calculating BERTScore for baseline summaries...

Processing language: nl (1 documents)
  Calculating BERTScore for KL summaries...
  Calculating BERTScore for weighted TextRank summaries...
  Calculating BERTScore for TextRank summaries...
  Calculating BERTScore for baseline summaries...

Processing language: pt (3 documents)
  Calculating BERTScore for KL summaries...
  Calculating BERTScore for weighted TextRank summaries...
  Calculating BERTScore for TextRank summaries...
  Calculating BERTScore for baseline summaries...

Processing language: de (5 d

In [16]:
# Add BERTScore columns to dataframe
merged_df['bertscore_precision_kl'] = P_kl
merged_df['bertscore_recall_kl'] = R_kl
merged_df['bertscore_f1_kl'] = F1_kl

merged_df['bertscore_precision_tr'] = P_wtr
merged_df['bertscore_recall_tr'] = R_wtr
merged_df['bertscore_f1_tr'] = F1_wtr

merged_df['bertscore_precision_wtr'] = P_wtr
merged_df['bertscore_recall_wtr'] = R_wtr
merged_df['bertscore_f1_wtr'] = F1_wtr

merged_df['bertscore_precision_baseline'] = P_baseline
merged_df['bertscore_recall_baseline'] = R_baseline
merged_df['bertscore_f1_baseline'] = F1_baseline

print("BERTScore columns added to dataframe")

BERTScore columns added to dataframe


## Results

In [17]:
# Print overall mean scores
print("="*60)
print("OVERALL BERTSCORE MEANS")
print("="*60)
print(f"KL Summary - P: {merged_df['bertscore_precision_kl'].mean():.4f}, R: {merged_df['bertscore_recall_kl'].mean():.4f}, F1: {merged_df['bertscore_f1_kl'].mean():.4f}")
print(f"TR Summary - P: {merged_df['bertscore_precision_tr'].mean():.4f}, R: {merged_df['bertscore_recall_tr'].mean():.4f}, F1: {merged_df['bertscore_f1_tr'].mean():.4f}")
print(f"WTR Summary - P: {merged_df['bertscore_precision_wtr'].mean():.4f}, R: {merged_df['bertscore_recall_wtr'].mean():.4f}, F1: {merged_df['bertscore_f1_wtr'].mean():.4f}")
print(f"Baseline Summary - P: {merged_df['bertscore_precision_baseline'].mean():.4f}, R: {merged_df['bertscore_recall_baseline'].mean():.4f}, F1: {merged_df['bertscore_f1_baseline'].mean():.4f}")

OVERALL BERTSCORE MEANS
KL Summary - P: 0.6432, R: 0.6684, F1: 0.6546
TR Summary - P: 0.6557, R: 0.6366, F1: 0.6451
WTR Summary - P: 0.6557, R: 0.6366, F1: 0.6451
Baseline Summary - P: 0.6447, R: 0.6349, F1: 0.6387


In [18]:
# Split by content length (same as rouge_eval)
merged_df['content_length'] = merged_df['markdown_content'].str.len()

q33 = merged_df['content_length'].quantile(0.33)
q67 = merged_df['content_length'].quantile(0.67)

short_df = merged_df[merged_df['content_length'] <= q33].copy()
medium_df = merged_df[(merged_df['content_length'] > q33) & (merged_df['content_length'] <= q67)].copy()
long_df = merged_df[merged_df['content_length'] > q67].copy()

In [19]:
# Print scores by document length
print("\n" + "="*60)
print("SHORT DOCUMENTS - BERTSCORE")
print("="*60)
print(f"KL Summary - P: {short_df['bertscore_precision_kl'].mean():.4f}, R: {short_df['bertscore_recall_kl'].mean():.4f}, F1: {short_df['bertscore_f1_kl'].mean():.4f}")
print(f"TR Summary - P: {short_df['bertscore_precision_tr'].mean():.4f}, R: {short_df['bertscore_recall_tr'].mean():.4f}, F1: {short_df['bertscore_f1_tr'].mean():.4f}")
print(f"WTR Summary - P: {short_df['bertscore_precision_wtr'].mean():.4f}, R: {short_df['bertscore_recall_wtr'].mean():.4f}, F1: {short_df['bertscore_f1_wtr'].mean():.4f}")
print(f"Baseline Summary - P: {short_df['bertscore_precision_baseline'].mean():.4f}, R: {short_df['bertscore_recall_baseline'].mean():.4f}, F1: {short_df['bertscore_f1_baseline'].mean():.4f}")

print("\n" + "="*60)
print("MEDIUM DOCUMENTS - BERTSCORE")
print("="*60)
print(f"KL Summary - P: {medium_df['bertscore_precision_kl'].mean():.4f}, R: {medium_df['bertscore_recall_kl'].mean():.4f}, F1: {medium_df['bertscore_f1_kl'].mean():.4f}")
print(f"TR Summary - P: {medium_df['bertscore_precision_tr'].mean():.4f}, R: {medium_df['bertscore_recall_tr'].mean():.4f}, F1: {medium_df['bertscore_f1_tr'].mean():.4f}")
print(f"WTR Summary - P: {medium_df['bertscore_precision_wtr'].mean():.4f}, R: {medium_df['bertscore_recall_wtr'].mean():.4f}, F1: {medium_df['bertscore_f1_wtr'].mean():.4f}")
print(f"Baseline Summary - P: {medium_df['bertscore_precision_baseline'].mean():.4f}, R: {medium_df['bertscore_recall_baseline'].mean():.4f}, F1: {medium_df['bertscore_f1_baseline'].mean():.4f}")

print("\n" + "="*60)
print("LONG DOCUMENTS - BERTSCORE")
print("="*60)
print(f"KL Summary - P: {long_df['bertscore_precision_kl'].mean():.4f}, R: {long_df['bertscore_recall_kl'].mean():.4f}, F1: {long_df['bertscore_f1_kl'].mean():.4f}")
print(f"TR Summary - P: {long_df['bertscore_precision_tr'].mean():.4f}, R: {long_df['bertscore_recall_tr'].mean():.4f}, F1: {long_df['bertscore_f1_tr'].mean():.4f}")
print(f"WTR Summary - P: {long_df['bertscore_precision_wtr'].mean():.4f}, R: {long_df['bertscore_recall_wtr'].mean():.4f}, F1: {long_df['bertscore_f1_wtr'].mean():.4f}")
print(f"Baseline Summary - P: {long_df['bertscore_precision_baseline'].mean():.4f}, R: {long_df['bertscore_recall_baseline'].mean():.4f}, F1: {long_df['bertscore_f1_baseline'].mean():.4f}")


SHORT DOCUMENTS - BERTSCORE
KL Summary - P: 0.6878, R: 0.7251, F1: 0.7052
TR Summary - P: 0.6950, R: 0.6894, F1: 0.6914
WTR Summary - P: 0.6950, R: 0.6894, F1: 0.6914
Baseline Summary - P: 0.6766, R: 0.6814, F1: 0.6776

MEDIUM DOCUMENTS - BERTSCORE
KL Summary - P: 0.6541, R: 0.6806, F1: 0.6663
TR Summary - P: 0.6645, R: 0.6381, F1: 0.6503
WTR Summary - P: 0.6645, R: 0.6381, F1: 0.6503
Baseline Summary - P: 0.6587, R: 0.6403, F1: 0.6485

LONG DOCUMENTS - BERTSCORE
KL Summary - P: 0.5872, R: 0.5991, F1: 0.5920
TR Summary - P: 0.6072, R: 0.5823, F1: 0.5936
WTR Summary - P: 0.6072, R: 0.5823, F1: 0.5936
Baseline Summary - P: 0.5983, R: 0.5829, F1: 0.5896
