In [None]:
import pandas as pd

In [None]:
# Download the Yelp dataset released by Brazinskas et al. (2020).
# The dataset contains human-written summaries generated by Amazon Mechanical Turk (AMT) workers,
# who summaried 8 reviews per business. Each business has 3 generated summaries by 3 workers, one summary per worker.

! mkdir ./data/gold_summs
! curl "https://raw.githubusercontent.com/abrazinskas/FewSum/master/artifacts/yelp/gold_summs/train.csv" --output "./data/gold_summs/train.csv"
! curl "https://raw.githubusercontent.com/abrazinskas/FewSum/master/artifacts/yelp/gold_summs/val.csv" --output "./data/gold_summs/val.csv"
! curl "https://raw.githubusercontent.com/abrazinskas/FewSum/master/artifacts/yelp/gold_summs/test.csv" --output "./data/gold_summs/test.csv"

In [None]:
# The file is tab-separated despite its .csv extension.
train = pd.read_csv(
    "./data/gold_summs/train.csv",
    delimiter="\t",
)
train.head()

In [None]:
# (rows, columns) of the training split.
train.shape

In [None]:
# Distinct group_id count in the training split; compare with the row count
# to see whether any group_id repeats.
train["group_id"].nunique()

In [None]:
# The file is tab-separated despite its .csv extension.
val = pd.read_csv(
    "./data/gold_summs/val.csv",
    delimiter="\t",
)
val.head()

In [None]:
# (rows, columns) of the validation split.
val.shape

In [None]:
# Distinct group_id count in the validation split; compare with the row count
# to see whether any group_id repeats.
val["group_id"].nunique()

In [None]:
# The file is tab-separated despite its .csv extension.
test = pd.read_csv(
    "./data/gold_summs/test.csv",
    delimiter="\t",
)
test.head()

In [None]:
# (rows, columns) of the test split.
test.shape

In [None]:
# Distinct group_id count in the test split; compare with the row count
# to see whether any group_id repeats.
test["group_id"].nunique()

## Create Eval Dataset

Merge train/val/test splits to create an evaluation dataset.


In [None]:
# Stack the three splits into a single frame.
# ignore_index=True gives the result a fresh 0..n-1 index instead of carrying
# over each split's own row labels, which would otherwise repeat.
# The chain builds new frames rather than mutating with inplace=True:
#   - the "cat" column is dropped
#   - "group_id" is renamed to "business_id"
eval_df = (
    pd.concat([train, val, test], ignore_index=True)
    .drop(columns=["cat"])
    .rename(columns={"group_id": "business_id"})
)
eval_df.head()

In [None]:
# (rows, columns) of the merged evaluation frame.
eval_df.shape

In [None]:
# Distinct business_id count in the merged frame; compare with the row count
# to see whether any business_id repeats.
eval_df["business_id"].nunique()

In [None]:
# Flatten every review cell into one list, row by row, keeping the
# rev1..rev8 order within each row.
review_columns = ["rev1", "rev2", "rev3", "rev4", "rev5", "rev6", "rev7", "rev8"]
reviews = [
    record[column]
    for _, record in eval_df.iterrows()
    for column in review_columns
]
len(reviews)

In [None]:
# Word-count statistics per review (words = whitespace-separated tokens).
# The .str accessors propagate missing values as NaN instead of raising
# AttributeError on a non-string cell, so describe() simply excludes them
# from its count.
pd.Series(reviews).str.split().str.len().describe()

In [None]:
# Flatten every summary cell into one list, row by row, keeping the
# summ1..summ3 order within each row.
summary_columns = ["summ1", "summ2", "summ3"]
summaries = [
    record[column]
    for _, record in eval_df.iterrows()
    for column in summary_columns
]
len(summaries)

In [None]:
# Word-count statistics per summary (words = whitespace-separated tokens).
# The .str accessors propagate missing values as NaN instead of raising
# AttributeError on a non-string cell, so describe() simply excludes them
# from its count.
pd.Series(summaries).str.split().str.len().describe()

In [None]:
# Map each business_id to its reviews and reference summaries.
# Rows that share a business_id overwrite earlier ones (the last row wins).
review_columns = ["rev1", "rev2", "rev3", "rev4", "rev5", "rev6", "rev7", "rev8"]
summary_columns = ["summ1", "summ2", "summ3"]
eval_dataset = {
    record["business_id"]: {
        "reviews": [record[column] for column in review_columns],
        "summaries": [record[column] for column in summary_columns],
    }
    for _, record in eval_df.iterrows()
}

In [None]:
print("Reviews:")
# Show one example entry: the first key in the dict's insertion order.
first_entry = eval_dataset[list(eval_dataset)[0]]
for text in first_entry["reviews"]:
    print("  ", text, "\n")


print("\n------\nSummaries")
for text in first_entry["summaries"]:
    print("  ", text, "\n")