In [None]:
import os
import warnings
from pathlib import Path
from typing import Optional

import pandas as pd
from datasets import load_dataset
from dotenv import load_dotenv

load_dotenv()
warnings.filterwarnings("ignore")
%matplotlib inline
%load_ext autoreload
%autoreload 2

EXTERNAL = Path(os.getenv("EXTERNAL_STORAGE_DIR"))
ROOT = Path(os.getenv("ROOT"))
DATA_DIR = ROOT / "data"
PROC_DIR = DATA_DIR / "processed"
SPLIT_DIR = DATA_DIR / "splits"

PROC_DIR.mkdir(parents=True, exist_ok=True)
SPLIT_DIR.mkdir(parents=True, exist_ok=True)

print(f"EXTERNAL_STORAGE_DIR: {EXTERNAL}")
print(f"DATA_DIR: {DATA_DIR.resolve()}")

In [None]:
# Identifier of the dataset to fetch; kept in one place so it is easy to spot.
DATASET_ID = "IlyaGusev/gazeta"

# Load the dataset; later cells index the result by split name.
ds = load_dataset(DATASET_ID)

# Bare last expression so the notebook shows a summary of what was loaded.
ds

In [None]:
# Peek at the first record of the validation split to see what a row looks like.
ds["validation"][0]

In [None]:
# Split names, in the order the loaded dataset reports them.
splits = [*ds.keys()]

# Column names, read from the second split in that list.
# NOTE(review): presumably every split exposes the same columns, so which one
# is inspected should not matter — confirm.
second_split = ds[splits[1]]
features = [*second_split.features.keys()]

print(f"Feature names: {features}")
print(f"Splits: {splits}")

In [None]:
# Select the article body, reference summary and headline columns by name.
# Taking `features[:3]` positionally would silently swap columns if the
# dataset's schema order ever changed; naming them makes that an explicit error.
text_col, summ_col, title_col = "text", "summary", "title"

_missing_cols = [c for c in (text_col, summ_col, title_col) if c not in features]
if _missing_cols:
    raise KeyError(
        f"Expected columns not found in dataset: {_missing_cols}; available: {features}"
    )

In [None]:
def clean(s: Optional[pd.Series]) -> Optional[pd.Series]:
    """Normalise whitespace in a Series of strings.

    Missing values become empty strings, non-breaking spaces (U+00A0) and thin
    spaces (U+2009) become ordinary spaces, every run of whitespace collapses
    to a single space, and leading/trailing whitespace is stripped.

    Args:
        s: Series of strings (missing values allowed), or ``None``.

    Returns:
        A new Series with the cleaned strings, or ``None`` when ``s`` is
        ``None``. The input Series is not modified.
    """
    # The signature advertises Optional input, so pass None straight through
    # instead of failing on `None.fillna`.
    if s is None:
        return None

    filled = s.fillna("")
    # The two literal replacements run before the regex so the result does not
    # depend on whether the regex engine counts these characters as whitespace.
    return (
        filled.str.replace("\xa0", " ", regex=False)
        .str.replace("\u2009", " ", regex=False)
        .str.replace(r"\s+", " ", regex=True)
        .str.strip()
    )

In [None]:
# Tunables for the export, named so the thresholds are visible at a glance.
MIN_TEXT_CHARS = 300  # keep only rows whose cleaned text is longer than this
SAMPLE_SIZE = 500  # upper bound on the number of rows in each sample file
SAMPLE_SEED = 42  # fixed seed so the sampled rows are reproducible


def _write_jsonl(frame: pd.DataFrame, path: Path) -> None:
    """Write ``frame`` to ``path`` as JSON Lines (one record per line, non-ASCII kept as-is)."""
    frame.to_json(path, orient="records", force_ascii=False, lines=True)


# The mirror directory does not depend on the split, so resolve and create it
# once instead of on every iteration. It stays None when no external storage
# is configured.
# NOTE(review): the directory is named "raw" but receives the cleaned and
# filtered frame — confirm that this is intended.
mirror = None
if EXTERNAL is not None:
    mirror = EXTERNAL / "data" / "raw"
    mirror.mkdir(parents=True, exist_ok=True)

for split in splits:
    df = ds[split].to_pandas()
    # Record each row's pre-filter index so it survives the reset_index below.
    df["origin_idx"] = df.index
    for col in (title_col, text_col, summ_col):
        df[col] = clean(df[col])
    # Drop rows whose cleaned text is too short.
    df = df[df[text_col].str.len() > MIN_TEXT_CHARS].reset_index(drop=True)

    # Project onto a fixed output schema, independent of the source column names.
    out = pd.DataFrame(
        {
            "id": df["origin_idx"],
            "title": df[title_col],
            "text": df[text_col],
            "reference_summary": df[summ_col],
        }
    )

    # Ids of the retained rows, one per line, without header or index column.
    out_split_ids = SPLIT_DIR / f"gazeta_{split}_ids.csv"
    out["id"].to_csv(out_split_ids, index=False, header=False)

    # Seeded sample of at most SAMPLE_SIZE rows from the retained rows.
    out_split_sample = SPLIT_DIR / f"gazeta_{split}_sample.jsonl"
    sample = out.sample(n=min(SAMPLE_SIZE, len(out)), random_state=SAMPLE_SEED)
    _write_jsonl(sample, out_split_sample)

    # Full processed split.
    out_proc = PROC_DIR / f"gazeta_{split}.jsonl"
    _write_jsonl(out, out_proc)

    # Optional copy of the full processed split on external storage.
    if mirror is not None:
        _write_jsonl(out, mirror / f"gazeta_{split}.jsonl")