Preprocessing of SummEval and News Room datasets.

- extract original articles.
- extract summaries.
- extract IDs for articles and summaries.
- extract summary scores from human experts (averaged over the three ratings available per summary).

In [3]:
import sys; sys.path.append("../utils")
from constants import *
import pandas as pd

### SummEval

In [14]:
# Load the raw SummEval annotations (JSON lines). The loop below relies on the
# columns "id", "model_id", "text", "decoded" and "expert_annotations".
path = f"{gdrive_path}/data/summeval"
df = pd.read_json(f"{path}/model_annotations.aligned.paired.jsonl", orient="records", lines=True)

# Rating dimensions, in the order they appear in the processed output.
score_columns = ["coherence", "consistency", "fluency", "relevance"]

# groupby skips missing keys silently, so rule them out explicitly.
assert df["id"].notna().all()

# Collect one record per (article, summary) pair and build the frame once at the
# end; growing a DataFrame row by row re-allocates on every insert.
records = []
for article_id, summaries in df.groupby("id", sort=False):  # sort=False keeps first-appearance order
    # sanity checks: 16 distinct summaries per article, all attached to the same article text
    assert summaries["model_id"].nunique() == 16
    assert summaries["text"].nunique() == 1
    # exactly one row per (article, model) pair
    assert summaries["model_id"].is_unique
    article = summaries["text"].iloc[0]
    for model_id, summary, ratings in zip(
        summaries["model_id"], summaries["decoded"], summaries["expert_annotations"]
    ):
        # each summary carries three expert annotations
        assert len(ratings) == 3
        record = {
            "article_id": article_id,
            "summary_id": model_id,
            "article": article,
            "summary": summary,
        }
        # average every rating dimension over the three annotations
        for col in score_columns:
            scores = [r[col] for r in ratings]
            record[col] = sum(scores) / len(scores)
        records.append(record)

out = pd.DataFrame(
    records,
    columns=["article_id", "summary_id", "article", "summary", *score_columns],
)
out.to_json(f"{gdrive_path}/data/summeval-processed.jsonl", orient="records", lines=True)

In [15]:
# Preview the first row of the processed frame.
out.head(1)

Unnamed: 0,article_id,summary_id,article,summary,coherence,consistency,fluency,relevance
0,dm-test-8764fb95bfad8ee849274873a92fb8d6b400eee2,M11,Paul Merson has restarted his row with Andros ...,paul merson was brought on with only seven min...,1.333333,1.0,3.0,1.666667


### News Room

In [16]:
# Load the Newsroom human-evaluation CSV, drop the article title and rename the
# remaining columns to snake_case names.
path = f"{gdrive_path}/data/newsroom"
# NOTE(review): "InformativenessRating" is stored under "consistency" —
# presumably to share column names with the SummEval output; confirm this is intended.
column_mapping = {
    "ArticleID": "article_id",
    "System": "system_id",
    "ArticleText": "article",
    "SystemSummary": "summary",
    "CoherenceRating": "coherence",
    "FluencyRating": "fluency",
    "InformativenessRating": "consistency",
    "RelevanceRating": "relevance"
}
# Chained (non in-place) transforms so the cell can be re-run from a fresh read.
df = (
    pd.read_csv(f"{path}/newsroom-human-eval.csv")
    .drop(columns=["ArticleTitle"])
    .rename(columns=column_mapping)
)
# Replace system names with integer codes assigned in order of first appearance.
df["system_id"] = df["system_id"].factorize()[0]

# groupby skips missing keys silently, so rule them out explicitly.
assert df["article_id"].notna().all()

score_columns = ["coherence", "fluency", "consistency", "relevance"]
system_ids = df["system_id"].unique()

# Collect one record per (article, system) pair and build the frame once at the
# end; growing a DataFrame row by row re-allocates on every insert.
records = []
for _, article_rows in df.groupby("article_id", sort=False):  # sort=False keeps first-appearance order
    for system in system_ids:
        ratings = article_rows.loc[article_rows["system_id"] == system]
        # sanity checks: three ratings of one summary of one article
        assert len(ratings) == 3
        assert ratings["article"].nunique() == 1
        assert ratings["summary"].nunique() == 1
        # keep the identifying fields of the first rating, then overwrite the
        # score columns with their mean over the three ratings
        record = ratings.iloc[0].to_dict()
        record.update(ratings[score_columns].mean().to_dict())
        records.append(record)

out = pd.DataFrame(records, columns=df.columns).rename(columns={"system_id": "summary_id"})
out.to_json(f"{gdrive_path}/data/newsroom-processed.jsonl", orient="records", lines=True)

In [17]:
# Preview the first row of the processed frame.
out.head(1)

Unnamed: 0,article_id,summary_id,article,summary,coherence,fluency,consistency,relevance
0,2140,0,A worker sets up a polling station the morning...,John Avlon voter turnout in the is a sign of a...,2.666667,3.333333,2.666667,3.0
