In [74]:
import os
import re
from collections import Counter

import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.util import ngrams
import numpy as np
from bert_score import score as bert_score
from bleurt import score as bleurt_score
from textstat import textstat
from transformers import logging as transformers_logging
from openai import OpenAI

### Part 1: Let's start with simple n-gram matching

In [43]:
# Running example used by the metric demonstrations below:
# `complex_text` is the source sentence and `simple_text` the candidate simplification.
complex_text = "The consumption of excessive amounts of carbohydrates may lead to an elevation in blood glucose levels."
simple_text = "Eating too much sugar can raise blood sugar."
# Reference simplification(s), kept as a list because calculate_sari accepts several.
reference = ["Eating too many carbs can increase blood sugar levels."]

def demonstrate_ngram_matching(complex_text, simple_text):
    """
    Demonstrate basic n-gram matching to show its limitations.

    Both texts are lower-cased and tokenized, and the ratio of shared bigrams
    to all distinct bigrams (intersection over union) is printed together
    with a fixed list of caveats.

    Args:
        complex_text (str): The original complex text.
        simple_text (str): The simplified text.

    Returns:
        None. The analysis is written to stdout.
    """
    # Distinct bigrams of each lower-cased, tokenized text
    complex_bigrams = set(ngrams(word_tokenize(complex_text.lower()), 2))
    simple_bigrams = set(ngrams(word_tokenize(simple_text.lower()), 2))

    # Intersection over union of the two bigram sets
    overlap = len(complex_bigrams & simple_bigrams)
    total = len(complex_bigrams | simple_bigrams)
    # Texts with fewer than two tokens produce no bigrams at all; report 0.0
    # for that case instead of dividing by zero.
    overlap_ratio = overlap / total if total else 0.0

    print("Simple N-gram Analysis:")
    print(f"Complex text: {complex_text}")
    print(f"Simple text: {simple_text}")
    print(f"Bigram overlap ratio: {overlap_ratio:.3f}")
    print("\nWhy this is problematic:")
    print("- Doesn't account for valid simplification operations")
    print("- Penalizes good simplifications that use different words")
    print("- No consideration of meaning preservation")

# Run the bigram-overlap demonstration on the running example pair.
demonstrate_ngram_matching(complex_text, simple_text)

Simple N-gram Analysis:
Complex text: The consumption of excessive amounts of carbohydrates may lead to an elevation in blood glucose levels.
Simple text: Eating too much sugar can raise blood sugar.
Bigram overlap ratio: 0.000

Why this is problematic:
- Doesn't account for valid simplification operations
- Penalizes good simplifications that use different words
- No consideration of meaning preservation


### Part 2: Moving to BLEU - a more sophisticated n-gram based metric

In [12]:
def calculate_bleu(complex_text, simple_text):
    """
    Print BLEU-1 and BLEU-2 of a simplification scored against its source.

    The complex text acts as the single BLEU reference and the simplified
    text as the candidate; both are lower-cased and tokenized first, and
    method-1 smoothing is applied. A fixed list of limitations is printed
    after the scores.

    Args:
        complex_text (str): The original complex text (used as reference).
        simple_text (str): The simplified text (used as candidate).

    Returns:
        None. The analysis is written to stdout.
    """
    source_tokens = word_tokenize(complex_text.lower())
    candidate_tokens = word_tokenize(simple_text.lower())
    smoothing = SmoothingFunction().method1

    # Label -> n-gram weights: unigram-only, then unigram + bigram
    ngram_weights = {
        "BLEU-1": (1, 0, 0, 0),
        "BLEU-2": (0.5, 0.5, 0, 0),
    }

    print("\nBLEU Score Analysis:")
    for label, weights in ngram_weights.items():
        value = sentence_bleu([source_tokens], candidate_tokens, weights=weights,
                              smoothing_function=smoothing)
        print(f"{label}: {value:.3f}")
    print("\nLimitations for Simplification:")
    print("- Designed for translation, not simplification")
    print("- Expects high n-gram overlap")
    print("- Can't handle valid structural changes")

# Score the running example; the complex sentence serves as the BLEU reference.
calculate_bleu(complex_text, simple_text)


BLEU Score Analysis:
BLEU-1: 0.091
BLEU-2: 0.022

Limitations for Simplification:
- Designed for translation, not simplification
- Expects high n-gram overlap
- Can't handle valid structural changes


### Part 3: Introducing SARI - designed specifically for simplification

In [104]:
def calculate_f1(matches, system_count, reference_count):
    """
    F1 score from a match count and the sizes of the two compared sets.

    Args:
        matches: Number of items present in both the system and reference sets.
        system_count: Number of items in the system set.
        reference_count: Number of items in the reference set.

    Returns:
        float: 2 * matches / (system_count + reference_count), or 1.0 when
        both sets are empty (nothing to compare counts as agreement).
    """
    combined_size = system_count + reference_count
    if combined_size != 0:
        return 2 * matches / combined_size
    # Both sides empty: there was nothing to get wrong
    return 1.0

def calculate_sari(orig, simp, refs):
    """
    Compute a simplified, word-set based SARI score.

    All texts are lower-cased, tokenized and reduced to sets of distinct
    tokens. For each edit operation (add, keep, delete) the system's word set
    is compared with the union of the corresponding word sets over all
    references using calculate_f1, and the three F1 values are averaged.

    Args:
        orig (str): The original complex text.
        simp (str): The system's simplified text.
        refs (list[str]): Reference simplifications. May be empty, in which
            case every reference word set is empty.

    Returns:
        tuple: (sari, add_score, keep_score, delete_score) as floats.
    """
    # Convert to word sets
    orig_words = set(word_tokenize(orig.lower()))
    simp_words = set(word_tokenize(simp.lower()))
    ref_words = [set(word_tokenize(ref.lower())) for ref in refs]

    def pooled(per_reference_sets):
        # set().union(*sets) rather than set.union(*sets): the unbound form
        # raises TypeError when there are no references at all.
        return set().union(*per_reference_sets)

    def operation_f1(system_words, reference_words):
        # F1 between what the system did and what the references did
        return calculate_f1(
            len(system_words & reference_words),  # Correct operations
            len(system_words),                    # System operations
            len(reference_words)                  # Reference operations
        )

    # Words that were added
    add_score = operation_f1(
        simp_words - orig_words,
        pooled(ref - orig_words for ref in ref_words)
    )

    # Words that were kept
    keep_score = operation_f1(
        orig_words & simp_words,
        pooled(orig_words & ref for ref in ref_words)
    )

    # Words that were deleted
    delete_score = operation_f1(
        orig_words - simp_words,
        pooled(orig_words - ref for ref in ref_words)
    )

    # Average the three scores
    sari = (add_score + keep_score + delete_score) / 3.0
    return sari, add_score, keep_score, delete_score

# Score the running example with SARI and report each component
sari, add_f1, keep_f1, del_f1 = calculate_sari(complex_text, simple_text, reference)
print(
    "\nSARI Analysis:",
    f"SARI Score: {sari:.3f}",
    f"Add F1: {add_f1:.3f} (captures good additions)",
    f"Keep F1: {keep_f1:.3f} (captures preserving important content)",
    f"Delete F1: {del_f1:.3f} (captures good deletions)",
    "\nAdvantages:",
    "- Designed specifically for simplification",
    "- Considers addition, deletion, and keeping operations",
    "- Better aligned with simplification goals",
    sep="\n",
)


SARI Analysis:
SARI Score: 0.441
Add F1: 0.000 (captures good additions)
Keep F1: 0.400 (captures preserving important content)
Delete F1: 0.923 (captures good deletions)

Advantages:
- Designed specifically for simplification
- Considers addition, deletion, and keeping operations
- Better aligned with simplification goals


### Part 4: Modern approach - BERTScore for semantic similarity

In [48]:
def calculate_bertscore(complex_text, simple_text):
    """
    Score a simplification against its source sentence with BERTScore.

    The simplified text is passed as the candidate and the complex text as
    the reference, using the English (lang='en') configuration.

    Args:
        complex_text (str): The original complex text.
        simple_text (str): The simplified text.

    Returns:
        tuple: Precision, recall and F1 score, each as a float.
    """
    score_tensors = bert_score([simple_text], [complex_text], lang='en', verbose=False)

    # A single pair was scored, so each tensor holds one entry; unwrap to floats
    precision, recall, f1 = (tensor[0].item() for tensor in score_tensors)
    return precision, recall, f1

# Score the running example with BERTScore and explain how to read the numbers
precision, recall, f1 = calculate_bertscore(complex_text, simple_text)
print(
    "\nBERTScore Analysis:",
    f"Precision: {precision:.3f} (How accurate is the simplified content?)",
    f"Recall: {recall:.3f} (How much original meaning is retained?)",
    f"F1: {f1:.3f} (Overall semantic similarity)",
    "\nWhat these scores mean for simplification:",
    "- High precision: simplified text is accurate/faithful",
    "- High recall: maintains important information from original",
    "- High F1: good balance between simplicity and meaning preservation",
    "\nAdvantages:",
    "- Captures semantic similarity at word/phrase level",
    "- Less dependent on exact word matches",
    "- Can handle paraphrasing and restructuring",
    sep="\n",
)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



BERTScore Analysis:
Precision: 0.931 (How accurate is the simplified content?)
Recall: 0.910 (How much original meaning is retained?)
F1: 0.920 (Overall semantic similarity)

What these scores mean for simplification:
- High precision: simplified text is accurate/faithful
- High recall: maintains important information from original
- High F1: good balance between simplicity and meaning preservation

Advantages:
- Captures semantic similarity at word/phrase level
- Less dependent on exact word matches
- Can handle paraphrasing and restructuring


Note: When running BERTScore, you'll see a warning about RoBERTa model weights.
This warning appears because BERTScore uses the RoBERTa model for semantic similarity, but doesn't need the pooler layer that's typically used for classification tasks.
The warning is normal and doesn't affect our similarity calculations.

### Part 5: BLEURT - Trained with human judgments

BLEURT needs a local model checkpoint. Download and unzip the BLEURT-20 checkpoint, then make sure the `checkpoint` path in the cell below points to the directory where you extracted the zip file.

In [54]:
def calculate_bleurt(complex_text, simple_text, checkpoint="BLEURT-20", scorer=None):
    """
    Calculate BLEURT score - learned metric based on human judgments.

    The complex text is passed as the reference and the simplified text as
    the candidate.

    Args:
        complex_text (str): The original complex text.
        simple_text (str): The simplified text.
        checkpoint (str): Path to the extracted BLEURT checkpoint directory.
            Only used when `scorer` is not given.
        scorer: An already constructed BleurtScorer to reuse. Building a
            scorer loads the checkpoint, so pass one in when scoring many
            pairs instead of reloading it on every call.

    Returns:
        The BLEURT score of the single candidate/reference pair.
    """
    if scorer is None:
        scorer = bleurt_score.BleurtScorer(checkpoint)
    scores = scorer.score(references=[complex_text], candidates=[simple_text])
    # One pair in, one score out
    return scores[0]

# Despite the plural name, `bleurt_scores` holds the single score of the example pair.
bleurt_scores = calculate_bleurt(complex_text, simple_text)
print(f"\nBLEURT Score: {bleurt_scores:.3f}")


INFO:tensorflow:Reading checkpoint BLEURT-20.


INFO:tensorflow:Reading checkpoint BLEURT-20.


INFO:tensorflow:Config file found, reading.


INFO:tensorflow:Config file found, reading.


INFO:tensorflow:Will load checkpoint BLEURT-20


INFO:tensorflow:Will load checkpoint BLEURT-20


INFO:tensorflow:Loads full paths and checks that files exists.


INFO:tensorflow:Loads full paths and checks that files exists.


INFO:tensorflow:... name:BLEURT-20


INFO:tensorflow:... name:BLEURT-20


INFO:tensorflow:... bert_config_file:bert_config.json


INFO:tensorflow:... bert_config_file:bert_config.json


INFO:tensorflow:... max_seq_length:512


INFO:tensorflow:... max_seq_length:512


INFO:tensorflow:... vocab_file:None


INFO:tensorflow:... vocab_file:None


INFO:tensorflow:... do_lower_case:None


INFO:tensorflow:... do_lower_case:None


INFO:tensorflow:... sp_model:sent_piece


INFO:tensorflow:... sp_model:sent_piece


INFO:tensorflow:... dynamic_seq_length:True


INFO:tensorflow:... dynamic_seq_length:True


INFO:tensorflow:Creating BLEURT scorer.


INFO:tensorflow:Creating BLEURT scorer.


INFO:tensorflow:Creating SentencePiece tokenizer.


INFO:tensorflow:Creating SentencePiece tokenizer.


INFO:tensorflow:Creating SentencePiece tokenizer.


INFO:tensorflow:Creating SentencePiece tokenizer.


INFO:tensorflow:Will load model: BLEURT-20\sent_piece.model.


INFO:tensorflow:Will load model: BLEURT-20\sent_piece.model.


INFO:tensorflow:SentencePiece tokenizer created.


INFO:tensorflow:SentencePiece tokenizer created.


INFO:tensorflow:Creating Eager Mode predictor.


INFO:tensorflow:Creating Eager Mode predictor.


INFO:tensorflow:Loading model.


INFO:tensorflow:Loading model.


INFO:tensorflow:BLEURT initialized.


INFO:tensorflow:BLEURT initialized.



BLEURT Score: 0.708


### Part 6: G-eval

This step requires an OpenAI API key, which you can fill in at the top of the cell below.

In [101]:
OPENAI_API_KEY = "Your-key-here"  # paste your key here, or leave as-is to use an existing OPENAI_API_KEY environment variable
# Export the key only once the placeholder has been replaced, so a real key
# that is already set in the environment is never overwritten by the placeholder.
# Do not commit or share the notebook with a real key filled in.
if OPENAI_API_KEY != "Your-key-here":
    os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

In [103]:
def calculate_geval(complex_text, simple_text, metric="Relevance"):
    """
    Calculate G-eval score for a single metric (Relevance, Coherence, Consistency, or Fluency)

    Builds a rating prompt for the requested metric, sends it to the chat
    completions API and parses the numeric score from the reply.

    Args:
        complex_text: original complex sentence
        simple_text: simplified version
        metric: which metric to evaluate (default: Relevance)
    Returns:
        score: integer between 1-5
    Raises:
        KeyError: if `metric` is not one of the four supported metrics.
        ValueError: if the model's reply contains no number.
    """
    # Metric definitions: (evaluation criteria, evaluation steps) per metric
    metrics = {
        "Relevance": (
            "Relevance(1-5) - How well the simplified version maintains the important content from the complex sentence.",
            "1. Read both versions carefully.\n2. Check if all key information is preserved.\n3. Assign score 1-5 (1:most info missing, 5:all info preserved)"
        ),
        "Coherence": (
            "Coherence(1-5) - How well-structured and logical the simplified sentence is.",
            "1. Read both versions carefully.\n2. Check sentence structure and flow.\n3. Assign score 1-5 (1:very poor structure, 5:perfectly clear)"
        ),
        "Consistency": (
            "Consistency(1-5) - The factual alignment between the complex and simplified sentences.",
            "1. Read both versions carefully.\n2. Check for factual errors.\n3. Assign score 1-5 (1:major errors, 5:perfect alignment)"
        ),
        "Fluency": (
            "Fluency(1-5) - The linguistic quality in terms of grammar, clarity, and readability.",
            "1. Read the simplified version.\n2. Check grammar and readability.\n3. Assign score 1-5 (1:poor grammar, 5:perfectly formed)"
        )
    }

    # Fail with a readable message before any API client is created
    if metric not in metrics:
        raise KeyError(f"Unknown metric {metric!r}; expected one of {sorted(metrics)}")
    criteria, steps = metrics[metric]

    prompt = f"""You will be given an original complex sentence and a simplified version. Rate the simplified version on one metric.

Evaluation Criteria:
{criteria}

Evaluation Steps:
{steps}

Source Text:
{complex_text}

Summary:
{simple_text}

Evaluation Form (scores ONLY):
- {metric}"""

    # With no arguments the client takes the API key from the environment
    client = OpenAI()
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": prompt}],
        temperature=0,
        max_tokens=5
    )

    # The reply text is optional in the API response; treat a missing reply as empty
    raw_reply = response.choices[0].message.content or ""
    # Take the first number in the reply. Joining every digit would read a
    # reply such as "4/5" as 45 and then clamp it to 5.
    first_number = re.search(r"\d+", raw_reply)
    if first_number is None:
        raise ValueError(f"G-eval reply for {metric} contains no score: {raw_reply!r}")
    return max(1, min(5, int(first_number.group())))  # Ensure score is between 1-5

# Example usage:
# NOTE: this rebinds the notebook-level `complex_text` / `simple_text` that the
# earlier metric cells also read, so re-running those cells after this one
# scores this sentence pair instead of the original example.
complex_text = "The implementation of the Affordable Care Act, commonly known as Obamacare, has led to significant changes in the healthcare system of the United States."
simple_text = "Obamacare has changed US healthcare a lot."

# Rate the pair on each of the four metrics (one API request per metric)
for metric in ["Relevance", "Coherence", "Consistency", "Fluency"]:
    score = calculate_geval(complex_text, simple_text, metric)
    print(f"{metric} Score: {score}")

Relevance Score: 5
Coherence Score: 5
Consistency Score: 5
Fluency Score: 5


### Finally, let's apply all metrics to our actual data

In [88]:
def process_excel_file(file_path, output_path='metric_results.xlsx', checkpoint="BLEURT-20"):
    """
    Process Excel file and compute metrics with sentence tracking:
    - SARI (with add, keep, delete components)
    - BLEU (bigram-based)
    - BERTScore (precision, recall, F1)
    - BLEURT (learned metric)

    Every row contributes six evaluated versions (the generated simplification
    plus five altered variants). SARI compares a version with the complex
    sentence and the human reference; BLEU, BERTScore and BLEURT compare it
    with the human reference only.

    Args:
        file_path: Excel file with the columns 'complex', 'original_simple',
            'generated_simple', 'fact_reversal', 'info_omission',
            'unsupported_info', 'subject_object_reversal' and 'partial_meaning'.
            Assumes every one of these cells holds a text string - TODO confirm
            against the data.
        output_path: Where the result table is written as an Excel file.
        checkpoint: Path to the extracted BLEURT checkpoint directory.

    Returns:
        pandas.DataFrame: One row per (sentence, version) with the texts and
        all metric values; the same table is saved to `output_path`.

    Raises:
        KeyError: if a required column is missing from the input file.
    """
    # Output label -> input column. 'generated' comes first so it is the first
    # version evaluated for each sentence.
    version_columns = {
        'generated': 'generated_simple',
        'fact_reversal': 'fact_reversal',
        'info_omission': 'info_omission',
        'unsupported_info': 'unsupported_info',
        'subject_object_reversal': 'subject_object_reversal',
        'partial_meaning': 'partial_meaning'
    }

    # Read and validate the input first so a wrong path or a missing column
    # fails before the slow BLEURT checkpoint load.
    df = pd.read_excel(file_path)
    required_columns = ['complex', 'original_simple', *version_columns.values()]
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        raise KeyError(f"{file_path} is missing required column(s): {missing_columns}")

    # Initialize BLEURT scorer (do this once, outside the loop)
    bleurt_scorer = bleurt_score.BleurtScorer(checkpoint)

    # Silence the weight initialization warning.
    transformers_logging.set_verbosity(transformers_logging.CRITICAL)

    smoothing = SmoothingFunction().method1
    results = []

    for idx, row in df.iterrows():
        reference_text = row['original_simple']
        # BLEU here is case-sensitive: texts are tokenized without lower-casing
        reference_tokens = word_tokenize(reference_text)

        for version_name, column in version_columns.items():
            current_text = row[column]

            # 1. SARI and its components
            sari, add_f1, keep_f1, del_f1 = calculate_sari(
                row['complex'],
                current_text,
                [reference_text]
            )

            # 2. BLEU score (bigram-based)
            bleu = sentence_bleu(
                [reference_tokens],
                word_tokenize(current_text),
                weights=(0.5, 0.5),
                smoothing_function=smoothing
            )

            # 4. BLEURT score
            bleurt = bleurt_scorer.score(
                references=[reference_text],
                candidates=[current_text]
            )[0]

            results.append({
                'sentence_id': idx + 1,
                'version': version_name,
                'current_version_text': current_text,  # The text of whichever version we're currently evaluating
                'complex': row['complex'],            # Original complex sentence
                'original_simple': reference_text,    # The human reference simplification
                'generated_simple': row['generated_simple'],  # The model's output
                'sari': sari,
                'add_f1': add_f1,
                'keep_f1': keep_f1,
                'del_f1': del_f1,
                'bleu': bleu,
                # Placeholders keep the column order; filled by the batched
                # BERTScore pass below.
                'bertscore_precision': None,
                'bertscore_recall': None,
                'bertscore_f1': None,
                'bleurt': bleurt
            })

    # 3. BERTScore components, scored for all pairs in a single call:
    # bert_score loads its model on every call, so calling it once per pair
    # repeats that load for each of the 6 versions of every sentence.
    if results:
        P, R, F1 = bert_score(
            [record['current_version_text'] for record in results],
            [record['original_simple'] for record in results],
            lang='en',
            verbose=False
        )
        for record, bert_precision, bert_recall, bert_f1 in zip(
                results, P.numpy(), R.numpy(), F1.numpy()):
            record['bertscore_precision'] = bert_precision
            record['bertscore_recall'] = bert_recall
            record['bertscore_f1'] = bert_f1

    results_df = pd.DataFrame(results)

    # Save to Excel file with formatting
    results_df.to_excel(output_path,
                        sheet_name='Metrics Analysis',
                        index=False,
                        float_format='%.4f')

    return results_df

In [90]:
# Compute the metric table for every version of every sentence
results = process_excel_file('results_with_alterations.xlsx')

# Average every numeric metric per version; sentence_id is an identifier, not a score
numeric_columns = (
    results
    .select_dtypes(include=['float64', 'float32', 'int64'])
    .columns
    .drop('sentence_id')
)
print("\nAverage scores by version:")
print(results.groupby('version')[numeric_columns].mean())

INFO:tensorflow:Reading checkpoint BLEURT-20.


INFO:tensorflow:Reading checkpoint BLEURT-20.


INFO:tensorflow:Config file found, reading.


INFO:tensorflow:Config file found, reading.


INFO:tensorflow:Will load checkpoint BLEURT-20


INFO:tensorflow:Will load checkpoint BLEURT-20


INFO:tensorflow:Loads full paths and checks that files exists.


INFO:tensorflow:Loads full paths and checks that files exists.


INFO:tensorflow:... name:BLEURT-20


INFO:tensorflow:... name:BLEURT-20


INFO:tensorflow:... bert_config_file:bert_config.json


INFO:tensorflow:... bert_config_file:bert_config.json


INFO:tensorflow:... max_seq_length:512


INFO:tensorflow:... max_seq_length:512


INFO:tensorflow:... vocab_file:None


INFO:tensorflow:... vocab_file:None


INFO:tensorflow:... do_lower_case:None


INFO:tensorflow:... do_lower_case:None


INFO:tensorflow:... sp_model:sent_piece


INFO:tensorflow:... sp_model:sent_piece


INFO:tensorflow:... dynamic_seq_length:True


INFO:tensorflow:... dynamic_seq_length:True


INFO:tensorflow:Creating BLEURT scorer.


INFO:tensorflow:Creating BLEURT scorer.


INFO:tensorflow:Creating SentencePiece tokenizer.


INFO:tensorflow:Creating SentencePiece tokenizer.


INFO:tensorflow:Creating SentencePiece tokenizer.


INFO:tensorflow:Creating SentencePiece tokenizer.


INFO:tensorflow:Will load model: BLEURT-20\sent_piece.model.


INFO:tensorflow:Will load model: BLEURT-20\sent_piece.model.


INFO:tensorflow:SentencePiece tokenizer created.


INFO:tensorflow:SentencePiece tokenizer created.


INFO:tensorflow:Creating Eager Mode predictor.


INFO:tensorflow:Creating Eager Mode predictor.


INFO:tensorflow:Loading model.


INFO:tensorflow:Loading model.


INFO:tensorflow:BLEURT initialized.


INFO:tensorflow:BLEURT initialized.



Average scores by version:
                         sari  add_f1  keep_f1  del_f1  bleu  \
version                                                        
fact_reversal           0.490   0.168    0.735   0.566 0.340   
generated               0.505   0.180    0.755   0.579 0.383   
info_omission           0.430   0.135    0.602   0.554 0.240   
partial_meaning         0.477   0.122    0.744   0.566 0.295   
subject_object_reversal 0.490   0.164    0.734   0.573 0.305   
unsupported_info        0.492   0.120    0.768   0.587 0.290   

                         bertscore_precision  bertscore_recall  bertscore_f1  \
version                                                                        
fact_reversal                          0.932             0.938         0.935   
generated                              0.943             0.948         0.946   
info_omission                          0.945             0.914         0.929   
partial_meaning                        0.924             0.

In [96]:
# Prompt skeleton shared by all G-Eval metrics. It is filled with str.format in
# get_geval_score, so the placeholders {criteria}, {steps}, {document}, {summary}
# and {metric_name} must all be supplied, and any literal brace added to this
# text would have to be doubled.
EVALUATION_PROMPT_TEMPLATE = """
You will be given an original complex sentence and a simplified version of that sentence. Your task is to rate the simplified version on one metric.
Please make sure you read and understand these instructions very carefully. 
Please keep this document open while reviewing, and refer to it as needed.
Please be very critical in your evaluation.

Evaluation Criteria:

{criteria}

Evaluation Steps:

{steps}

Example:

Source Text:

{document}

Summary:

{summary}

Evaluation Form (scores ONLY):

- {metric_name}
"""

# Metric definitions
# Each metric has a *_CRITERIA text (what is being rated) and a *_STEPS text
# (how to rate it); add_geval_scores pairs them up and get_geval_score inserts
# them into the {criteria} and {steps} slots of EVALUATION_PROMPT_TEMPLATE.
# A trailing backslash inside these strings joins the next line onto the same
# prompt line.

# Relevance: is the important content of the complex sentence preserved?
RELEVANCY_SCORE_CRITERIA = """
Relevance(1-5) - How well the simplified version maintains the important content from the complex sentence. \
The simplified version should preserve all key information from the complex sentence. \
Penalize versions that omit crucial information or add information not present in the original sentence.
"""

RELEVANCY_SCORE_STEPS = """
1. Read both the complex sentence and its simplified version carefully.
2. Identify the key information points in the complex sentence.
3. Check if all key information is preserved in the simplified version.
4. Assign a relevance score from 1 to 5, where:
   1: Most key information is missing
   2: Significant information is lost
   3: Some important information is missing
   4: Most key information is preserved
   5: All key information is perfectly preserved
"""

# Coherence: is the simplified sentence well-structured and logical?
COHERENCE_SCORE_CRITERIA = """
Coherence(1-5) - How well-structured and logical the simplified sentence is. \
The simplified version should present information in a clear, natural order. \
The sentence should flow well and maintain logical connections between ideas.
"""

COHERENCE_SCORE_STEPS = """
1. Read both versions carefully.
2. Evaluate how well the simplified version organizes the information.
3. Check if the logical relationships between ideas are maintained.
4. Assign a coherence score from 1 to 5, where:
   1: Extremely difficult to follow
   2: Poor organization of ideas
   3: Somewhat clear but could be better structured
   4: Well-structured with minor issues
   5: Perfectly clear and logically structured
"""

# Consistency: does the simplified sentence agree factually with the complex one?
CONSISTENCY_SCORE_CRITERIA = """
Consistency(1-5) - The factual alignment between the complex and simplified sentences. \
A consistent simplification contains only statements that are entailed by the complex sentence. \
Penalize any alterations that change the meaning or introduce incorrect facts.
"""

CONSISTENCY_SCORE_STEPS = """
1. Read both versions carefully.
2. Check for any factual changes or contradictions.
3. Verify that all stated relationships and facts match the complex sentence.
4. Assign a consistency score from 1 to 5, where:
   1: Major factual errors or contradictions
   2: Significant meaning changes
   3: Minor factual discrepancies
   4: Mostly factually accurate with tiny imprecisions
   5: Perfect factual alignment
"""

# Fluency: grammar, clarity and readability of the simplified sentence.
# Unlike the other metrics, the 1-5 scale is spelled out in the criteria text
# and the steps refer back to it.
FLUENCY_SCORE_CRITERIA = """
Fluency(1-5) - The linguistic quality of the simplified sentence in terms of grammar, clarity, and readability.
1: Poor grammar or structure making it hard to understand
2: Notable grammatical issues but main point is comprehensible
3: Generally correct grammar with some awkward phrasing
4: Clear and well-written with minimal issues
5: Perfectly formed, clear, and natural-sounding sentence
"""

FLUENCY_SCORE_STEPS = """
1. Read the simplified version carefully.
2. Evaluate grammar, word choice, and sentence structure.
3. Consider how natural and easy to read the sentence is.
4. Assign a fluency score from 1 to 5 based on the criteria above.
"""

def get_geval_score(criteria: str, steps: str, document: str, summary: str,
                    metric_name: str, model: str = "gpt-4") -> str:
    """Get evaluation score for a single metric

    Fills EVALUATION_PROMPT_TEMPLATE and sends it as a single user message
    through the notebook-level `client`.

    Args:
        criteria: Description of the metric being rated.
        steps: Rating instructions for the metric.
        document: The original complex sentence.
        summary: The simplified sentence being rated.
        metric_name: Name printed on the evaluation form line of the prompt.
        model: Chat model used for the evaluation (default: GPT-4).

    Returns:
        The raw reply text, unparsed (callers extract the score), or an empty
        string when the response carries no text.
    """
    prompt = EVALUATION_PROMPT_TEMPLATE.format(
        criteria=criteria,
        steps=steps,
        metric_name=metric_name,
        document=document,
        summary=summary,
    )
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0,   # deterministic-as-possible ratings
        max_tokens=5,    # only a short score is expected back
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )
    # The reply text is optional in the API response; normalize a missing
    # reply to "" so callers can always treat the result as a string.
    return response.choices[0].message.content or ""

def add_geval_scores(results_df, default_score=3):
    """
    Add G-eval scores to existing results DataFrame

    For every row, each of the four metrics (Relevance, Coherence, Consistency,
    Fluency) is requested via get_geval_score, comparing the row's 'complex'
    sentence with its 'current_version_text'. Scores are stored in the columns
    'geval_relevance', 'geval_coherence', 'geval_consistency' and
    'geval_fluency'.

    Args:
        results_df: DataFrame with at least the columns 'complex',
            'current_version_text', 'sentence_id' and 'version'.
        default_score: Score stored when the request fails or the reply
            contains no number (a warning is printed in that case).

    Returns:
        A copy of `results_df` with the four score columns added; the input
        frame is not modified.
    """
    evaluation_metrics = {
        "Relevance": (RELEVANCY_SCORE_CRITERIA, RELEVANCY_SCORE_STEPS),
        "Coherence": (COHERENCE_SCORE_CRITERIA, COHERENCE_SCORE_STEPS),
        "Consistency": (CONSISTENCY_SCORE_CRITERIA, CONSISTENCY_SCORE_STEPS),
        "Fluency": (FLUENCY_SCORE_CRITERIA, FLUENCY_SCORE_STEPS),
    }

    enhanced_df = results_df.copy()

    # Add G-eval scores for each row
    for idx, row in enhanced_df.iterrows():
        # Get scores for each metric
        for metric_name, (criteria, steps) in evaluation_metrics.items():
            # Reset per request so the warning below never reports a reply
            # left over from a previous metric, and never hits an unbound name
            # when the request itself fails.
            raw_reply = None
            try:
                raw_reply = get_geval_score(
                    criteria=criteria,
                    steps=steps,
                    document=row['complex'],
                    summary=row['current_version_text'],
                    metric_name=metric_name
                )
                # Take the first number in the reply. Joining every digit
                # would read a reply such as "4/5" as 45 and clamp it to 5.
                first_number = re.search(r'\d+', raw_reply)
                if first_number is None:
                    raise ValueError("reply contains no number")
                # Ensure score is within 1-5 range
                score_value = max(1, min(5, int(first_number.group())))
            except Exception as e:
                # Best effort: keep going with the fallback score, but say why
                print(f"Warning: Invalid score format for {metric_name} at sentence {row['sentence_id']}, version {row['version']}: {raw_reply!r} ({e})")
                score_value = default_score
            enhanced_df.at[idx, f'geval_{metric_name.lower()}'] = score_value

    return enhanced_df

In [97]:
# Add the geval_* score columns to the metric table (one API request per row and metric).
enhanced_results = add_geval_scores(results)

In [98]:
# Save the combined table (automatic metrics plus G-eval scores) to Excel,
# writing floats with four decimals.
enhanced_results.to_excel('final_evaluation_results.xlsx', 
                        sheet_name='Complete Analysis',
                        index=False,
                        float_format='%.4f')

In [100]:
# Every numeric column is a metric, except sentence_id, which is an identifier
numeric_columns = [
    col
    for col in enhanced_results.select_dtypes(include=['float64', 'float32', 'int64']).columns
    if col != 'sentence_id'
]

# Per-version mean of each metric, rounded to four decimals
print("\nAverage scores by version:")
print(
    enhanced_results
    .groupby('version')[numeric_columns]
    .mean()
    .round(4)
)


Average scores by version:
                         sari  add_f1  keep_f1  del_f1  bleu  \
version                                                        
fact_reversal           0.490   0.168    0.736   0.566 0.340   
generated               0.505   0.180    0.755   0.579 0.383   
info_omission           0.430   0.135    0.602   0.554 0.240   
partial_meaning         0.477   0.122    0.744   0.566 0.295   
subject_object_reversal 0.490   0.164    0.734   0.573 0.305   
unsupported_info        0.492   0.120    0.768   0.587 0.290   

                         bertscore_precision  bertscore_recall  bertscore_f1  \
version                                                                        
fact_reversal                          0.932             0.938         0.935   
generated                              0.943             0.948         0.946   
info_omission                          0.945             0.914         0.929   
partial_meaning                        0.924             0.