In [1]:
import warnings
from pathlib import Path

import pandas as pd

from factue.utils.vars import project_root

# Build one record per subtask-1 span for every language folder, attaching the
# span text cut from the raw document and any subtask-2 labels recorded for
# the exact same (filename, start, end) triple.
base_dir = project_root / "data/raw/persuasion/trial_data/TRIAL-ANNOTATED"
langs = ["PL", "RU", "BG", "SI"]

records = []

for lang in langs:
    lang_path = base_dir / lang
    raw_docs_path = lang_path / "raw-documents"
    subtask1_path = lang_path / "subtask-1-annotations.txt"
    subtask2_path = lang_path / "subtask-2-annotations.txt"

    # Read every raw document once, keyed by bare file name; annotation lines
    # are looked up against this name below.
    raw_docs = {
        f.name: f.read_text(encoding="utf-8")
        for f in raw_docs_path.glob("*.txt")
    }

    # Index subtask-2 labels by (filename, start, end). The file is optional,
    # and lines with fewer than four tab-separated fields are ignored.
    subtask2_map = {}
    if subtask2_path.exists():
        with subtask2_path.open(encoding="utf-8") as f:
            for line in f:
                parts = line.strip().split("\t")
                if len(parts) >= 4:
                    fname, start, end, *labels = parts
                    key = (fname, int(start), int(end))
                    subtask2_map[key] = labels

    # File names referenced by subtask-1 annotations but absent from
    # raw-documents; reported once per language after the loop.
    missing_docs = set()

    # Parse subtask-1 annotations; lines that do not have exactly four
    # tab-separated fields are skipped.
    with subtask1_path.open(encoding="utf-8") as f:
        for line in f:
            parts = line.strip().split("\t")
            if len(parts) != 4:
                continue
            fname, start, end, label_bin = parts
            start, end = int(start), int(end)

            doc_text = raw_docs.get(fname)
            if doc_text is None:
                # Keep the row (with empty text, as before) but remember the
                # file so the gap is surfaced instead of passing silently.
                missing_docs.add(fname)
                doc_text = ""

            text = doc_text[start:end]
            labels_multi = subtask2_map.get((fname, start, end), [])

            records.append({
                "lang": lang,
                "filename": fname,
                "start": start,
                "end": end,
                "text": text,
                "label_bin": label_bin.lower() == "true",
                "labels_multi": labels_multi,
            })

    if missing_docs:
        warnings.warn(
            f"{lang}: {len(missing_docs)} annotated document(s) not found in "
            f"{raw_docs_path}; their spans have empty text: "
            f"{sorted(missing_docs)}"
        )

# Create a DataFrame
df = pd.DataFrame(records)

# Write each language's rows to its own folder as fixed-size parquet batches.
batch_size = 20  # rows per parquet file
output_base = project_root / "data/preprocessed/persuasion/train"
output_base.mkdir(parents=True, exist_ok=True)

for lang in df["lang"].unique():
    df_lang = df[df["lang"] == lang].reset_index(drop=True)
    lang_dir = output_base / f"train-{lang.lower()}"
    lang_dir.mkdir(parents=True, exist_ok=True)

    # Names of the batch files produced by this run.
    written_names = set()

    # Save in batches of `batch_size` rows
    for i in range(0, len(df_lang), batch_size):
        batch_df = df_lang.iloc[i:i + batch_size]
        batch_index = i // batch_size
        batch_filename = f"batch{batch_index:04d}.parquet"
        batch_path = lang_dir / batch_filename
        batch_df.to_parquet(batch_path, index=False)
        written_names.add(batch_filename)
        print(f"Saved {batch_path}")

    # Batch files already in the folder that this run did not (re)write,
    # e.g. higher-numbered batches left by an earlier run with more rows.
    # They are reported, not deleted, so nothing is lost without the user
    # deciding.
    stale_names = sorted(
        p.name
        for p in lang_dir.glob("batch*.parquet")
        if p.name not in written_names
    )
    if stale_names:
        warnings.warn(
            f"{lang_dir} contains {len(stale_names)} batch file(s) not "
            f"written by this run: {stale_names}"
        )

Saved /Users/marcinsawinski/Documents/GitHub/factue-task2/data/preprocessed/persuasion/train/train-pl/batch0000.parquet
Saved /Users/marcinsawinski/Documents/GitHub/factue-task2/data/preprocessed/persuasion/train/train-pl/batch0001.parquet
Saved /Users/marcinsawinski/Documents/GitHub/factue-task2/data/preprocessed/persuasion/train/train-ru/batch0000.parquet
Saved /Users/marcinsawinski/Documents/GitHub/factue-task2/data/preprocessed/persuasion/train/train-bg/batch0000.parquet
Saved /Users/marcinsawinski/Documents/GitHub/factue-task2/data/preprocessed/persuasion/train/train-bg/batch0001.parquet
Saved /Users/marcinsawinski/Documents/GitHub/factue-task2/data/preprocessed/persuasion/train/train-bg/batch0002.parquet
Saved /Users/marcinsawinski/Documents/GitHub/factue-task2/data/preprocessed/persuasion/train/train-bg/batch0003.parquet
Saved /Users/marcinsawinski/Documents/GitHub/factue-task2/data/preprocessed/persuasion/train/train-si/batch0000.parquet


In [2]:
# Number of annotated spans per language.
df.value_counts(subset="lang")

lang
BG    75
PL    27
RU    18
SI     9
Name: count, dtype: int64

In [None]:
from collections import Counter

# One row per individual label: explode() unpacks each labels_multi list, and
# dropna() discards the missing values left behind (e.g. for empty lists).
all_labels = df["labels_multi"].explode().dropna()

# Tally how often each label occurs.
label_counts = Counter(all_labels)

# Tabulate the tallies, ordered by descending count.
label_counts_df = pd.DataFrame(
    list(label_counts.items()),
    columns=["label", "count"],
).sort_values(by="count", ascending=False)

print(label_counts_df)