In [5]:
import gzip
import json
import os
import pandas as pd
import requests
import shutil

def load_reddit_tifu(local_path: str = "tifu_all_tokenized_and_filtered.json"):
    posts = []
    with open(local_path, "r") as fp:
        for line in fp:
            posts.append(json.loads(line))
    return posts

def unzip_gz(filepath: str):
    with gzip.open(filepath, "rb") as f_in:
        with open(filepath.replace(".gz", ""), "wb") as f_out:
            shutil.copyfileobj(f_in, f_out)
    os.remove(filepath)

def download_file(url: str):
    local_filename = url.split("/")[-1]
    with requests.get(url, stream=True) as r:
        with open(local_filename, "wb") as f:
            shutil.copyfileobj(r.raw, f)

    return local_filename

In [6]:
download_file(
    "https://huggingface.co/datasets/ctr4si/reddit_tifu/resolve/main/data/tifu_all_tokenized_and_filtered.json.gz"
)
unzip_gz("tifu_all_tokenized_and_filtered.json.gz")

'tifu_all_tokenized_and_filtered.json.gz?download=true'

In [17]:
posts = load_reddit_tifu()
df = pd.DataFrame.from_records(posts)

df = df[["id", "selftext_without_tldr", "tldr"]].rename(columns={"selftext_without_tldr": "source", "tldr": "target"})
df = df.sample(frac=1).reset_index(drop=True)
df_train = df[:20000]
df_test = df[-1000:]

df_train.to_csv("train.csv", index=False)
df_test.to_csv("test.csv", index=False)