# Results Analysis on SummEval

## Imports

In [None]:
import pandas as pd
import ast
from scipy.stats import kendalltau
import matplotlib.pyplot as plt
import numpy as np

## Analysis

In [None]:
# Result CSV for every evaluated system, keyed by the system's display name.
# Each file lives in "Results/" and is named after the lower-cased system.
files = {
    system: f"Results/{system.lower()}.csv"
    for system in ("Single", "Parallel", "Cooperative", "Competitive")
}

# Load one system's result file and derive its evaluation metrics
def parse_and_evaluate(path):
    """Read a result CSV and compute agreement and deviation statistics.

    Every cell of the CSV is parsed with ``ast.literal_eval`` and is then
    indexed with the keys ``"ground_truth"``, ``"system_decision"`` and
    ``"deviation"``. Each CSV column is treated as one evaluation dimension.

    Args:
        path: Location of the CSV file, handed to ``pandas.read_csv``.

    Returns:
        A 6-tuple ``(summary, overview, kendalls, avg_kendall, avg_dev_dim,
        avg_dev_all)`` where

        * ``summary`` is a DataFrame (rounded to 3 decimals) with one row per
          dimension and the columns "Kendall Tau", "Deviation (avg)",
          "Ground Truth (avg)" and "System (avg)",
        * ``overview`` is a dict with the same four quantities aggregated
          over all dimensions,
        * ``kendalls`` maps each dimension to its Kendall tau between ground
          truth and system decision,
        * ``avg_kendall`` is the unweighted mean of those per-dimension taus,
        * ``avg_dev_dim`` maps each dimension to its mean deviation,
        * ``avg_dev_all`` is the mean deviation over every cell.
    """
    raw = pd.read_csv(path)

    # Each cell is a stringified dict; rebuild the actual dict objects
    records = {column: raw[column].apply(ast.literal_eval) for column in raw.columns}

    def field_table(key):
        # One column per dimension, holding only the requested dict entry
        return pd.DataFrame(
            {column: cells.apply(lambda cell: cell[key]) for column, cells in records.items()}
        )

    truth = field_table("ground_truth")
    decisions = field_table("system_decision")
    deviations = field_table("deviation")

    # Rank agreement between ground truth and system, dimension by dimension
    kendalls = {}
    for column in truth.columns:
        kendalls[column] = kendalltau(truth[column], decisions[column]).correlation
    avg_kendall = sum(kendalls.values()) / len(kendalls)

    # Per-dimension means
    avg_dev_dim = deviations.mean().to_dict()
    avg_gt_dim = truth.mean().to_dict()
    avg_sys_dim = decisions.mean().to_dict()

    # Means over all cells, regardless of dimension
    avg_dev_all = deviations.values.flatten().mean()
    avg_gt_all = truth.values.flatten().mean()
    avg_sys_all = decisions.values.flatten().mean()

    # Per-dimension table
    per_dimension = {
        "Kendall Tau": kendalls,
        "Deviation (avg)": avg_dev_dim,
        "Ground Truth (avg)": avg_gt_dim,
        "System (avg)": avg_sys_dim,
    }
    summary = pd.DataFrame(per_dimension)

    # Aggregates across all dimensions
    overview = {
        "Kendall Tau (all)": avg_kendall,
        "Deviation (avg) (all)": avg_dev_all,
        "Ground Truth (avg) (all)": avg_gt_all,
        "System (avg) (all)": avg_sys_all,
    }

    return summary.round(3), overview, kendalls, avg_kendall, avg_dev_dim, avg_dev_all

# Row accumulators for the two cross-system comparison tables
kendall_overview = []
deviation_overview = []


def _overview_row(system, per_dimension, overall):
    # One table row: system name, each dimension's score and the overall
    # value, all rounded to three decimals.
    row = {"system": system}
    row.update({dim: round(score, 3) for dim, score in per_dimension.items()})
    row["average"] = round(overall, 3)
    return row


# Run the evaluation for every configured system and report its results
for name, path in files.items():
    summary, overview, kendalls, avg_kendall, avg_dev_dim, avg_dev_all = parse_and_evaluate(path)

    print(f"\n{name} – Table per Dimension")
    display(summary)

    print(f"\n{name} – Overall Averages")
    for label, value in overview.items():
        print(f"{label}: {value:.3f}")

    # Keep this system's numbers for the final comparison tables
    kendall_overview.append(_overview_row(name, kendalls, avg_kendall))
    deviation_overview.append(_overview_row(name, avg_dev_dim, avg_dev_all))

# Both comparison tables share the same column layout
overview_columns = ["system", "relevance", "coherence", "fluency", "consistency", "average"]
kendall_df = pd.DataFrame(kendall_overview, columns=overview_columns)
deviation_df = pd.DataFrame(deviation_overview, columns=overview_columns)

print("\nKendall Tau per System")
display(kendall_df)

print("\nAvg Deviation per System")
display(deviation_df)

In [None]:
# Column keys of kendall_df to plot, and the German tick label for each one
dimensions = ["relevance", "coherence", "fluency", "consistency", "average"]
dimension_labels_de = ["Relevanz", "Kohärenz", "Flüssigkeit", "Konsistenz", "Durchschnitt"]

# German legend names, keyed by the English system name
system_names_de = dict(
    Single="Einzeln",
    Parallel="Parallel",
    Cooperative="Kooperativ",
    Competitive="Kompetitiv",
)

# Grouped-bar layout: one group per dimension, each system shifted sideways
# by its offset (in units of one bar width)
bar_width = 0.2
offsets = [-1.5, -0.5, 0.5, 1.5]
x = np.arange(len(dimensions))

plt.figure(figsize=(10, 6))

# Draw one bar series per system
for i, system_key in enumerate(kendall_df["system"]):
    is_system = kendall_df["system"] == system_key
    values = kendall_df.loc[is_system, dimensions].to_numpy().ravel()
    positions = x + bar_width * offsets[i]
    plt.bar(positions, values, width=bar_width, label=system_names_de[system_key])

    # Write each value slightly above the top of its bar
    label_style = dict(ha="center", va="bottom", fontsize=9)
    for pos, val in zip(positions, values):
        plt.text(pos, val + 0.02, f"{val:.2f}", **label_style)

# Axes: German dimension names on x, fixed 0..1 range in steps of 0.1 on y
plt.xticks(ticks=x, labels=dimension_labels_de, fontsize=11)
plt.yticks(np.arange(0.0, 1.01, 0.1), fontsize=11)
plt.ylim(0.0, 1.0)
plt.ylabel("Kendall-Tau-Korrelation", fontsize=12)

plt.grid(axis="y", linestyle="--", alpha=0.6)
plt.legend(title="System", fontsize=10, title_fontsize=11, loc="upper right")
plt.tight_layout()
plt.show()

In [None]:
# Column keys of deviation_df to plot, and the German tick label for each one
dimensions = ["relevance", "coherence", "fluency", "consistency", "average"]
dimension_labels_de = ["Relevanz", "Kohärenz", "Flüssigkeit", "Konsistenz", "Durchschnitt"]

# German legend names, keyed by the English system name
system_names_de = dict(
    Single="Einzeln",
    Parallel="Parallel",
    Cooperative="Kooperativ",
    Competitive="Kompetitiv",
)

# Grouped-bar layout: one group per dimension, each system shifted sideways
# by its offset (in units of one bar width)
bar_width = 0.2
offsets = [-1.5, -0.5, 0.5, 1.5]
x = np.arange(len(dimensions))

plt.figure(figsize=(10, 6))

# Draw one bar series per system; the sign of every value is inverted
# before plotting, so the chart shows the negated average deviation
for i, orig_system in enumerate(deviation_df["system"]):
    is_system = deviation_df["system"] == orig_system
    values = deviation_df.loc[is_system, dimensions].to_numpy().ravel()
    values_flipped = -1 * values
    positions = x + bar_width * offsets[i]
    plt.bar(positions, values_flipped, width=bar_width, label=system_names_de[orig_system])

    # Write the plotted (negated) value slightly above the top of each bar
    label_style = dict(ha="center", va="bottom", fontsize=9)
    for pos, height in zip(positions, values_flipped):
        plt.text(pos, height + 0.05, f"{height:.2f}", **label_style)

# Axes: German dimension names on x, fixed 0..3 range in steps of 0.5 on y
plt.xticks(ticks=x, labels=dimension_labels_de, fontsize=11)
plt.yticks(np.arange(0, 3.1, 0.5), fontsize=11)
plt.ylim(0, 3.0)
plt.ylabel("Negative Durchschnittliche Abweichung", fontsize=12)

plt.grid(axis="y", linestyle="--", alpha=0.6)
plt.legend(title="System", fontsize=10, title_fontsize=11, loc="upper right")
plt.tight_layout()
plt.show()