In [2]:
import sys; sys.path.append("../utils")
from constants import *
import pandas as pd

In [3]:
# newsroom
#
# Reads newsroom-human-eval.csv, which holds one row per individual rating, and
# collapses it to one row per (article, system) pair whose four rating columns
# are the mean over that pair's ratings. The result is written as JSON Lines to
# newsroom-processed.jsonl in the same directory.

path = f"{gdrive_path}/data/newsroom"
column_mapping = {
    "ArticleID": "article_id",
    "System": "system_id",
    "ArticleText": "article",
    "SystemSummary": "summary",
    "CoherenceRating": "coherence",
    "FluencyRating": "fluency",
    # NOTE(review): informativeness ratings are exported under the name
    # "consistency" -- confirm this relabelling is intended.
    "InformativenessRating": "consistency",
    "RelevanceRating": "relevance"
}
rating_columns = ["coherence", "fluency", "consistency", "relevance"]

# Chained, non-mutating equivalent of the drop/rename steps.
df = (
    pd.read_csv(f"{path}/newsroom-human-eval.csv")
    .drop(columns=["ArticleTitle"])
    .rename(columns=column_mapping)
)
# Replace each system name with an integer code, numbered by first appearance.
df["system_id"] = df.system_id.factorize()[0]

# The set of systems does not change while iterating, so compute it once.
system_ids = df.system_id.unique()

records = []
for article in df.article_id.unique():
    # Narrow to this article once instead of masking the full frame per pair.
    article_rows = df.loc[df.article_id == article]
    for system in system_ids:
        ratings = article_rows.loc[article_rows.system_id == system]
        # sanity checks: every (article, system) pair must have exactly three
        # ratings that all refer to the same article text and summary text
        assert len(ratings) == 3, (
            f"expected 3 ratings for article {article!r}, system {system!r}; "
            f"found {len(ratings)}"
        )
        assert ratings["article"].nunique() == 1, (
            f"article text differs across ratings for article {article!r}, system {system!r}"
        )
        assert ratings["summary"].nunique() == 1, (
            f"summary text differs across ratings for article {article!r}, system {system!r}"
        )
        # Start from the first rating's row, then overwrite the scores with means.
        row = ratings.iloc[0].to_dict()
        for col in rating_columns:
            row[col] = ratings[col].mean()
        records.append(row)

# Build the frame in a single step: assigning to out.loc[len(out)] inside the
# loop re-allocates the whole frame for every appended row.
out = pd.DataFrame(records, columns=df.columns)

out.to_json(f"{path}/newsroom-processed.jsonl", orient="records", lines=True)

In [4]:
# hanna
#
# Reads hanna_stories_annotations.csv, which holds one row per individual
# annotation, and collapses it to one row per story. Only the Coherence,
# Surprise and Complexity scores are kept, each averaged over the story's
# annotations. The result is written as JSON Lines to hanna-processed.jsonl in
# the same directory.

path = f"{gdrive_path}/data/hanna"
df = pd.read_csv(f"{path}/hanna_stories_annotations.csv")

identity_columns = ["Story ID", "Prompt", "Story"]
score_columns = ["Coherence", "Surprise", "Complexity"]

records = []
for story_id in df["Story ID"].unique():
    ratings = df.loc[df["Story ID"] == story_id]
    # sanity checks: exactly three annotations per story, all for the same
    # prompt and the same story text
    assert len(ratings) == 3, (
        f"expected 3 annotations for story {story_id!r}; found {len(ratings)}"
    )
    assert ratings["Prompt"].nunique() == 1, (
        f"prompt differs across annotations for story {story_id!r}"
    )
    assert ratings["Story"].nunique() == 1, (
        f"story text differs across annotations for story {story_id!r}"
    )
    # Identity fields come from the first annotation; scores are averaged.
    row = [ratings[col].iloc[0] for col in identity_columns]
    row.extend(ratings[col].mean() for col in score_columns)
    records.append(row)

# Build the frame in a single step: assigning to out.loc[len(out)] inside the
# loop re-allocates the whole frame for every appended row.
out = pd.DataFrame(
    records,
    columns=["id", "story_prompt", "story", "coherence", "surprise", "complexity"],
)

out.to_json(f"{path}/hanna-processed.jsonl", orient="records", lines=True)

In [5]:
# summeval
#
# Reads model_annotations.aligned.paired.jsonl, which holds one record per
# (article, model) summary, and flattens it to one row per summary. The "text"
# field is exported as the article and "decoded" as the summary; each score is
# the mean of that criterion over the entries of "expert_annotations". The
# result is written as JSON Lines to summeval-processed.jsonl in the same
# directory.

path = f"{gdrive_path}/data/summeval"
df = pd.read_json(f"{path}/model_annotations.aligned.paired.jsonl", orient="records", lines=True)

score_columns = ["coherence", "consistency", "fluency", "relevance"]

records = []
for article_id in df["id"].unique():
    summaries = df.loc[df["id"] == article_id]
    # sanity checks: every article is summarised by 16 distinct models and all
    # of its records carry the same article text
    assert summaries["model_id"].nunique() == 16, (
        f"expected 16 models for article {article_id!r}; "
        f"found {summaries['model_id'].nunique()}"
    )
    assert summaries["text"].nunique() == 1, (
        f"article text differs across records for article {article_id!r}"
    )
    # Identical for every model of this article, so look it up once.
    article_text = summaries["text"].iloc[0]
    for model_id in summaries["model_id"].unique():
        # Evaluate the mask once; .item() raises ValueError unless exactly one
        # record matches this (article, model) pair.
        match = summaries.loc[summaries["model_id"] == model_id]
        ratings = match["expert_annotations"].item()
        assert len(ratings) == 3, (
            f"expected 3 expert annotations for article {article_id!r}, "
            f"model {model_id!r}; found {len(ratings)}"
        )
        row = [article_id, model_id, article_text, match["decoded"].item()]
        for col in score_columns:
            # Each annotation is indexed by criterion name; average across them.
            row.append(sum(r[col] for r in ratings) / len(ratings))
        records.append(row)

# Build the frame in a single step: assigning to out.loc[len(out)] inside the
# loop re-allocates the whole frame for every appended row.
out = pd.DataFrame(
    records,
    columns=["article_id", "model_id", "article", "summary", "coherence", "consistency", "fluency", "relevance"],
)

out.to_json(f"{path}/summeval-processed.jsonl", orient="records", lines=True)