# Parallel Interaction on SummEval

Please see SOURCES.txt for further source information.

## Imports

In [None]:
import pandas as pd
from datasets import load_dataset
import os
from dotenv import load_dotenv
from autogen import ConversableAgent
import re
from tqdm import tqdm
import numpy as np

## Data

In [None]:
# Load the SummEval test split and reduce it to a 100-row evaluation sample
SummEval_Test = load_dataset("mteb/summeval", split="test")
df = pd.DataFrame(SummEval_Test)

# Row indices removed from the dataset before sampling.
# NOTE(review): the reason these rows are considered problematic is not recorded
# in this notebook - confirm and document the exclusion criterion.
problematic_indices = [5, 7, 8, 9, 10, 11, 18, 20, 26, 27, 33, 34, 39, 46, 61, 64, 68, 73, 75, 79, 85, 86, 88, 92, 96, 99]

# The four score columns; defined once so selection, explode and rounding stay in sync.
columns_to_round = ["relevance", "coherence", "fluency", "consistency"]

df_filtered = df.drop(index=problematic_indices).reset_index(drop=True)
df_filtered = df_filtered[["text", "machine_summaries", *columns_to_round]]

# Multi-column explode: each row ends up holding one machine summary together
# with its own relevance/coherence/fluency/consistency value.
df_exploded = df_filtered.explode(["machine_summaries", *columns_to_round]).reset_index(drop=True)

# A fixed random_state keeps the 100-row sample reproducible across runs.
df_sampled = df_exploded.sample(n=100, random_state=42).reset_index(drop=True)

# Cast the scores to float, round them and store them as int so the reference
# scores are integers, like the 1-5 integer ratings requested from the agents.
df_sampled[columns_to_round] = df_sampled[columns_to_round].astype(float).round().astype(int)
df_final = df_sampled

# DataFrame.info() prints its report itself and returns None; wrapping it in
# print() would emit an extra "None" line.
df_final.info()
print(df_final.head(1))

## Config

In [None]:
# Read the Azure OpenAI settings from the environment after load_dotenv() has run
load_dotenv()

api_key = os.environ.get("AZURE_OPENAI_API_KEY")
endpoint = os.environ.get("AZURE_OPENAI_ENDPOINT")
deployment_name = os.environ.get("AZURE_DEPLOYMENT_NAME")
api_version = os.environ.get("AZURE_API_VERSION")

# Start of code citation [1]
# The following code is adapted from the source above.

# Single-entry model configuration for Azure OpenAI API access.
# Any unset environment variable is None here and is interpolated as the text
# "None" into base_url.
config_list = [
    dict(
        model=deployment_name,
        api_key=api_key,
        base_url=f"{endpoint}/openai/deployments/{deployment_name}/chat/completions?api-version={api_version}",
        api_type="azure",
        api_version=api_version,
        temperature=0,
        cache_seed=42,
    )
]

# End of code citation [1]

## System Design

In [None]:
# Task description and rating definitions shared verbatim by all three
# evaluator prompts. The text starts with a newline so that it is separated
# from the persona paragraph by one blank line.
_TASK_AND_DEFINITIONS = """
In this task you will evaluate the quality of a summary written for a news article.
To correctly solve this task, follow these steps:

    1. Carefully read the news article, be aware of the information it contains.
    2. Read the proposed summary.
    3. Rate the summary with integer values on a scale from 1 (worst) to 5 (best) by its relevance, consistency, fluency, and coherence.

Definitions:
    Relevance:
        - The rating measures how well the summary captures the key points of the article.
        - Consider whether all and only the important aspects are contained in the summary.
    Consistency:
        - The rating measures whether the facts in the summary are consistent with the facts in the original article.
        - Consider whether the summary does reproduce all facts accurately and does not make up untrue information.
    Fluency:
        - This rating measures the quality of individual sentences, are they well-written and grammatically correct.
        - Consider the quality of individual sentences.
    Coherence:
        - The rating measures the quality of all sentences collectively, to the fit together and sound naturally.
        - Consider the quality of the summary as a whole.

Give an explanation on your evaluation using about 200 words.
"""


def _compose_system_message(agent_name, persona):
    """Assemble one agent's system prompt.

    Args:
        agent_name: Role name inserted into the opening line and into the
            mandatory "As a ... I think ..." output prefix.
        persona: Two-line role description placed right after the opening line.

    Returns:
        The complete system prompt: opening line, persona, the shared task and
        definitions text, and the output-format instructions.
    """
    return (
        f"\nYou are a {agent_name}.\n"
        f"{persona}\n"
        f"{_TASK_AND_DEFINITIONS}"
        f'Always begin your output with: "As a {agent_name} I think ..."\n'
        "Always end your output with a JSON object with the following format:"
        # The single space before the final newline is part of the prompt text.
        '{"relevance": score, "coherence": score, "fluency": score, "consistency": score} \n'
    )


# Define the system prompt for the Factual-Journalist-Agent
# (the first persona line of every agent ends with a space before its newline)
agent_1_system_message = _compose_system_message(
    "Factual-Journalist-Agent",
    "You are a professional journalist focused on verifying factual accuracy, capturing the core message of articles, and ensuring summaries remain true to the source. \n"
    "You value clear, complete, and correct information above all.",
)

# Define the system prompt for the Language-Stylist-Agent
agent_2_system_message = _compose_system_message(
    "Language-Stylist-Agent",
    "You are an expert in writing, grammar, and expression, evaluating how summaries communicate effectively through style, clarity, and sentence construction. \n"
    "You care about linguistic precision, elegance, and flow while maintaining content accuracy.",
)

# Define the system prompt for the Cognitive-Agent
agent_3_system_message = _compose_system_message(
    "Cognitive-Agent",
    "You specialize in how readers process, understand, and retain information. \n"
    "You assess summaries based on logical structure, mental clarity, and how effectively the content can be followed and integrated by readers.",
)

In [None]:
# Start of code citation [1]
# The following code is adapted from the source above.

# Initialize the ConversableAgents for system setup
def _new_agent(agent_name, **extra_options):
    """Create a ConversableAgent that uses config_list and never asks for human input.

    Args:
        agent_name: Name given to the agent.
        **extra_options: Additional ConversableAgent keyword arguments; the
            evaluator agents pass their system_message here.
    """
    return ConversableAgent(
        agent_name,
        # A fresh llm_config dict per agent, wrapping the same config_list.
        llm_config={"config_list": config_list},
        human_input_mode="NEVER",
        **extra_options,
    )


# The initializer is created without a custom system_message; it is the agent
# that opens each chat with an evaluator.
initializer = _new_agent("initializer")

agent_1 = _new_agent("Factual-Journalist-Agent", system_message=agent_1_system_message)
agent_2 = _new_agent("Language-Stylist-Agent", system_message=agent_2_system_message)
agent_3 = _new_agent("Cognitive-Agent", system_message=agent_3_system_message)

# End of code citation [1]

## Evaluation

In [None]:
# Define the evaluation function
_CRITERIA = ("relevance", "coherence", "fluency", "consistency")

# One pre-compiled pattern per criterion, matching e.g. `"relevance": 4`.
_SCORE_PATTERNS = {
    criterion: re.compile(rf'"{criterion}"\s*:\s*(\d+)') for criterion in _CRITERIA
}


def _extract_scores(reply_text, agent_name):
    """Return {criterion: int score} parsed from one agent's chat result text.

    Raises:
        ValueError: If no integer score can be found for a criterion.
    """
    extracted = {}
    for criterion, pattern in _SCORE_PATTERNS.items():
        found = pattern.search(reply_text)
        if found is None:
            # Without this guard a malformed reply surfaces as an opaque
            # "'NoneType' object has no attribute 'group'" error.
            raise ValueError(
                f"No integer '{criterion}' score found in the reply of agent "
                f"{agent_name!r}; expected a trailing JSON object such as "
                f'{{"{criterion}": 3, ...}}.'
            )
        extracted[criterion] = int(found.group(1))
    return extracted


def evaluate(text, machine_summaries, relevance, coherence, fluency, consistency):
    """Rate one summary with all three agents and compare against the reference scores.

    Each of agent_1, agent_2 and agent_3 receives the same article/summary
    message in its own single-turn chat started by ``initializer``. The four
    scores are parsed from each chat result and averaged per criterion.

    Args:
        text: The news article.
        machine_summaries: The summary to be rated.
        relevance: Reference relevance score.
        coherence: Reference coherence score.
        fluency: Reference fluency score.
        consistency: Reference consistency score.

    Returns:
        A dict keyed by "relevance", "coherence", "fluency" and "consistency".
        Each value is a dict with "ground_truth" (the reference score),
        "system_decision" (rounded mean of the agents' scores) and
        "deviation" (system_decision minus ground_truth).

    Raises:
        ValueError: If an agent's chat result contains no integer score for
            one of the four criteria.
    """
    # Spelled out piece by piece so the exact prompt text (including the
    # leading " \n" and the trailing four spaces) is visible and stays
    # unchanged.
    message = (
        " \n"
        f"    Article: {text}\n"
        "\n"
        f"    Summary: {machine_summaries}\n"
        "    "
    )

    agents = [agent_1, agent_2, agent_3]

    ground_truth = {
        "relevance": relevance,
        "coherence": coherence,
        "fluency": fluency,
        "consistency": consistency,
    }
    scores = {criterion: [] for criterion in _CRITERIA}

    for agent in agents:

        # Start of code citation [1]
        # The following code is adapted from the source above.

        result = initializer.initiate_chat(agent, message=message, max_turns=1)

        # End of code citation [1]

        # The scores are searched for in the string form of the whole chat result.
        agent_scores = _extract_scores(str(result), getattr(agent, "name", repr(agent)))
        for criterion, value in agent_scores.items():
            scores[criterion].append(value)

    # Mean over the agents, rounded to an integer. round() rounds exact halves
    # to the nearest even integer, but the mean of three integers is never an
    # exact half, so no tie can occur with the three agents used here.
    final_scores = {key: round(sum(vals) / len(vals)) for key, vals in scores.items()}

    return {
        criterion: {
            "ground_truth": truth,
            "system_decision": final_scores[criterion],
            "deviation": final_scores[criterion] - truth,
        }
        for criterion, truth in ground_truth.items()
    }

In [None]:
# Prepare evaluation data
num_rows = 100
df_subset = df_final.head(num_rows)

# Create the output directory before any agent is called: DataFrame.to_csv does
# not create missing parent directories, so a missing "Results" folder would
# otherwise only fail after the whole evaluation has run, losing every result.
output_path = 'Results/parallel.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Evaluate responses
results = []
# total is taken from the subset itself so the progress bar stays correct even
# if fewer than num_rows rows are available.
for _, row in tqdm(df_subset.iterrows(), total=len(df_subset), desc="Progress"):
    result = evaluate(
        text=row["text"],
        machine_summaries=row["machine_summaries"],
        relevance=row["relevance"],
        coherence=row["coherence"],
        fluency=row["fluency"],
        consistency=row["consistency"],
    )
    results.append(result)

# One row per evaluated summary, one column per criterion; each cell holds that
# criterion's ground_truth/system_decision/deviation dict.
results_df = pd.DataFrame(results)
results_df.to_csv(output_path, index=False)