In [9]:
import pandas as pd
import json
import os
import re

## PAN23 data

In [10]:
def get_dataframe(task, set_type):
    path = f"./pan23-data/pan23-multi-author-analysis-dataset{task}/pan23-multi-author-analysis-dataset{task}-{set_type}/"
    filenames = os.listdir(path)

    lines_by_id = dict()
    truths_by_id = dict()

    for filename in filenames:
        is_truth = filename.startswith("truth")
        problem_id = re.search(r"\d+", filename).group(0)
        filepath = os.path.join(path, filename)

        with open(filepath) as f:
            if is_truth is True:
                truths_by_id[problem_id] = json.load(f)["changes"]
            else:
                lines_by_id[problem_id] = f.readlines()

    df_dict = dict(id=[], text1=[], text2=[], label=[])
    for problem_id in lines_by_id.keys():
        lines = lines_by_id[problem_id]
        truths = truths_by_id[problem_id]
        for text1, text2, label in zip(lines, lines[1:], truths):
            df_dict["id"].append(problem_id)
            df_dict["text1"].append(text1)
            df_dict["text2"].append(text2)
            df_dict["label"].append(label)

    return pd.DataFrame.from_dict(df_dict)

In [11]:
for task in range(1, 4):
    for set_type in ["train", "validation"]:
        df = get_dataframe(task, set_type)
        df.to_csv(f"./data/pan23-task{task}-{set_type}.csv", index=False)

## Blog data

In [17]:
df = pd.read_csv(
    "blogtext/blogtext.csv",
    usecols=["id", "text"],
    dtype={"id": "category", "text": "string"},
)

In [18]:
df = df[df["text"].str.len() >= 200]

In [35]:
# remove ids with counts less than 2
df = df.groupby("id").filter(lambda x: len(x) >= 2)

  df = df.groupby("id").filter(lambda x: len(x) >= 2)


In [40]:
df_dict = dict()
for author_id, text in zip(df["id"], df["text"]):
    df_dict.setdefault(author_id, []).append(text)

In [41]:
for author_ind, (author_id, texts) in enumerate(df_dict.items()):
    author_path = f"./blogtext/{author_ind}/"
    os.makedirs(os.path.dirname(author_path), exist_ok=True)

    for text_ind, text in enumerate(texts):
        text_path = f"./blogtext/{author_ind}/{text_ind}.txt"
        with open(text_path, "w") as f:
            f.write(text)