In [1]:
wd = '/Users/marcinsawinski/Documents/GitHub/factue-task2'
import sys, os
os.chdir(wd)

# alt splits

In [2]:
# import pandas as pd
# alt_splits = pd.read_parquet('/Users/marcinsawinski/Documents/GitHub/factue-task2/data/raw/persuasion/alt_splits.parquet')
# alt_splits

# Trail

In [3]:
from pathlib import Path
import pandas as pd
from factue.utils.vars import PROJECT_ROOT
import numpy as np

base_dir = PROJECT_ROOT / "data/raw/persuasion/trial_data/TRIAL-ANNOTATED"
langs = ["PL", "RU", "BG", "SI"]

records = []

for lang in langs:
    lang_path = base_dir / lang
    raw_docs_path = lang_path / "raw-documents"
    subtask1_path = lang_path / "subtask-1-annotations.txt"
    subtask2_path = lang_path / "subtask-2-annotations.txt"

    # Load all raw documents
    raw_docs = {
        f.name: f.read_text(encoding="utf-8") for f in raw_docs_path.glob("*.txt")
    }

    # Load subtask-2 annotations into a dict
    subtask2_map = {}
    if subtask2_path.exists():
        with subtask2_path.open(encoding="utf-8") as f:
            for line in f:
                parts = line.strip().split("\t")
                if len(parts) >= 4:
                    fname, start, end, *labels = parts
                    key = (fname, int(start), int(end))
                    subtask2_map[key] = labels

    # Parse subtask-1 annotations and extract spans
    with subtask1_path.open(encoding="utf-8") as f:
        for line in f:
            parts = line.strip().split("\t")
            if len(parts) != 4:
                continue
            fname, start, end, label_bin = parts
            start, end = int(start), int(end)
            text = raw_docs.get(fname, "")[start:end]
            labels_multi = subtask2_map.get((fname, start, end), [])

            records.append(
                {
                    "filename": fname,
                    "start": start,
                    "end": end,
                    "text_lang": lang,
                    "text": text,
                    "label_bin": label_bin.lower() == "true",
                    "label_multi": labels_multi,
                }
            )

# Create a DataFrame
trial = pd.DataFrame(records)
trial['base_split'] = 'trial'


# Train

In [4]:
from pathlib import Path
import pandas as pd
from collections import defaultdict
from factue.utils.vars import PROJECT_ROOT

base_dir = PROJECT_ROOT / "data/raw/persuasion/train_data"
langs = ["PL", "RU", "BG", "SI"]

records = []

for lang in langs:
    lang_path = base_dir / lang
    raw_docs_path = lang_path / "raw-documents"
    subtask1_path = lang_path / "subtask-1-annotations.txt"
    subtask2_path = lang_path / "subtask-2-annotations.txt"
    subtask2_subspans_path = lang_path / "subtask-2-spans-annotations.txt"

    # Load all raw documents
    raw_docs = {
        f.name: f.read_text(encoding="utf-8") for f in raw_docs_path.glob("*.txt")
    }

    # Load subtask-2 annotations into a dict
    subtask2_map = {}
    if subtask2_path.exists():
        with subtask2_path.open(encoding="utf-8") as f:
            for line in f:
                parts = line.strip().split("\t")
                if len(parts) >= 4:
                    fname, start, end, *labels = parts
                    key = (fname, int(start), int(end))
                    subtask2_map[key] = labels

    # Load subspans
    subspan_data = defaultdict(list)
    if subtask2_subspans_path.exists():
        with subtask2_subspans_path.open(encoding="utf-8") as f:
            for line in f:
                parts = line.strip().split("\t")
                if len(parts) == 5:
                    fname, sub_start, sub_end, label, text = parts
                    subspan_data[fname].append({
                        "start": int(sub_start),
                        "end": int(sub_end),
                        "label": label,
                        "text": text
                    })

    # Parse subtask-1 annotations and extract spans
    with subtask1_path.open(encoding="utf-8") as f:
        for line in f:
            parts = line.strip().split("\t")
            if len(parts) != 4:
                continue
            fname, start, end, label_bin = parts
            start, end = int(start), int(end)
            text = raw_docs.get(fname, "")[start:end]
            labels_multi = subtask2_map.get((fname, start, end), [])

            # Match subspans
            subspans_within = defaultdict(list)
            for sub in subspan_data.get(fname, []):
                if start <= sub["start"] and sub["end"] <= end:
                    subspans_within[sub["label"]].append(sub["text"])

            records.append({
                "filename": fname,
                "start": start,
                "end": end,
                "text_lang": lang,
                "text": text,
                "label_bin": label_bin.lower() == "true",
                "label_multi": labels_multi,
                **subspans_within  # this spreads the subspan labels as keys with list of texts as values
            })

# Create a DataFrame
train = pd.DataFrame(records)
train['base_split'] = 'train'

In [5]:
train

Unnamed: 0,filename,start,end,text_lang,text,label_bin,label_multi,Questioning_the_Reputation,Name_Calling-Labeling,Appeal_to_Hypocrisy,...,Straw_Man,Slogans,Appeal_to_Fear-Prejudice,Causal_Oversimplification,Appeal_to_Authority,False_Dilemma-No_Choice,Whataboutism,False_Equivalence,Guilt_by_Association,base_split
0,pl_current_affairs_09_01_2025_n03.txt,0,87,PL,Wicemarszałek Włodzimierz Czarzasty:\nDziękuję...,False,[],,,,...,,,,,,,,,,train
1,pl_current_affairs_09_01_2025_n03.txt,89,261,PL,Poseł Anna Gembicka:\nBardzo dziękuję.\nPanie ...,True,"[Questioning_the_Reputation, Name_Calling-Labe...","[Przypomnę pani kilka faktów, bo widzę, że fak...",[zielona],,...,,,,,,,,,,train
2,pl_current_affairs_09_01_2025_n03.txt,263,872,PL,"(Poseł Władysław Dajczak: Ha, ha, ha! To prawd...",True,"[Conversation_Killer, Appeal_to_Hypocrisy, Loa...",[Oczywiście to jest wasza stara śpiewka],,"[został uchwalony program wieloletni, który pr...",...,,,,,,,,,,train
3,pl_current_affairs_09_01_2025_n03.txt,874,1322,PL,Kolejne pytanie. Na stronie Wód Polskich pojaw...,True,[Repetition],,,,...,,,,,,,,,,train
4,pl_current_affairs_09_01_2025_n03.txt,1324,1465,PL,Jeszcze tylko jedna rzecz. Nawet politycy Lewi...,True,[Appeal_to_Popularity],,,,...,,,,,,,,,,train
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
994,SI_interpelation_interior_7.txt,0,274,SI,"Zanimivo ob tem dejstvu je, da se pa na jugovz...",True,"[Causal_Oversimplification, Appeal_to_Fear-Pre...",,,,...,,,[problematike kot so Romi in pa migracije],"[za rešitev te problematike, torej povečanega ...",,,,,,train
995,SI_interpelation_interior_7.txt,276,837,SI,Prav tako poročila obeh policijskih postaj gov...,True,"[Appeal_to_Values, Red_Herring, Doubt]",,,,...,,,,,,,,,,train
996,SI_interpelation_interior_7.txt,839,2596,SI,Naslednji primer: žena pomočnika generalnega d...,True,"[Causal_Oversimplification, Exaggeration-Minim...",[mora po navodilu pomočnika generalnega direkt...,[golobistov],[žena pomočnika generalnega direktorja policij...,...,,,[zato se prične znatno povečevati število ileg...,"[S tem se ilegalnim migrantom sporoči, da je p...",,,,,,train
997,SI_interpelation_interior_7.txt,2598,5288,SI,"Policisti prav tako dobijo navodila, naj ne sp...",True,"[Consequential_Oversimplification, Exaggeratio...",[Taka navodila policistom seveda preprečujejo ...,,[Čeprav je z nepravilnostmi večkrat seznanila ...,...,,,"[V času bivanja v Sloveniji, se nekateri ilega...",,[tožilka Mateja Gončin],,,,,train


In [6]:
df = pd.concat([trial,train], axis=0)
for col in df.columns:
    if col not in ["filename", "start", "end", "text_lang", "text", "label_bin", "label_multi", "base_split"]:
        df[col] = df[col].apply(lambda x: x if isinstance(x, list) else [])
df


Unnamed: 0,filename,start,end,text_lang,text,label_bin,label_multi,base_split,Questioning_the_Reputation,Name_Calling-Labeling,...,Obfuscation-Vagueness-Confusion,Straw_Man,Slogans,Appeal_to_Fear-Prejudice,Causal_Oversimplification,Appeal_to_Authority,False_Dilemma-No_Choice,Whataboutism,False_Equivalence,Guilt_by_Association
0,pl_abortion_11_04_2024_n01.txt,0,266,PL,Wicemarszałek Włodzimierz Czarzasty:\nWznawiam...,False,[],trial,[],[],...,[],[],[],[],[],[],[],[],[],[]
1,pl_abortion_11_04_2024_n01.txt,268,371,PL,"Panowie, halo, szanujmy wszystkich gości.\nDzi...",False,[],trial,[],[],...,[],[],[],[],[],[],[],[],[],[]
2,pl_abortion_11_04_2024_n01.txt,373,911,PL,"Przystępujemy do rozpatrzenia punktów 10.,\n11...",False,[],trial,[],[],...,[],[],[],[],[],[],[],[],[],[]
3,pl_abortion_11_04_2024_n01.txt,913,1030,PL,Bardzo proszę panią poseł Annę Marię Żukowską\...,False,[],trial,[],[],...,[],[],[],[],[],[],[],[],[],[]
4,pl_abortion_11_04_2024_n01.txt,1032,1828,PL,Panie Marszałku! Wysoka Izbo! Dlaczego kobiety...,True,"[Conversation_Killer, Appeal_to_Pity, Appeal_t...",trial,[],[],...,[],[],[],[],[],[],[],[],[],[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
994,SI_interpelation_interior_7.txt,0,274,SI,"Zanimivo ob tem dejstvu je, da se pa na jugovz...",True,"[Causal_Oversimplification, Appeal_to_Fear-Pre...",train,[],[],...,[],[],[],[problematike kot so Romi in pa migracije],"[za rešitev te problematike, torej povečanega ...",[],[],[],[],[]
995,SI_interpelation_interior_7.txt,276,837,SI,Prav tako poročila obeh policijskih postaj gov...,True,"[Appeal_to_Values, Red_Herring, Doubt]",train,[],[],...,[],[],[],[],[],[],[],[],[],[]
996,SI_interpelation_interior_7.txt,839,2596,SI,Naslednji primer: žena pomočnika generalnega d...,True,"[Causal_Oversimplification, Exaggeration-Minim...",train,[mora po navodilu pomočnika generalnega direkt...,[golobistov],...,[prisilnih sredstev],[],[],[zato se prične znatno povečevati število ileg...,"[S tem se ilegalnim migrantom sporoči, da je p...",[],[],[],[],[]
997,SI_interpelation_interior_7.txt,2598,5288,SI,"Policisti prav tako dobijo navodila, naj ne sp...",True,"[Consequential_Oversimplification, Exaggeratio...",train,[Taka navodila policistom seveda preprečujejo ...,[],...,[],[],[],"[V času bivanja v Sloveniji, se nekateri ilega...",[],[tožilka Mateja Gončin],[],[],[],[]


In [7]:
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

# Set random seed for reproducibility
np.random.seed(42)

# Default alt_split is same as base_split
df['alt_split'] = df['base_split']

# Select only the 'train' rows
train_rows = df[df['base_split'] == 'train']

# Stratified split: 70% stay as 'train', 30% go to 'dev'
train_idx, dev_idx = train_test_split(
    train_rows.index,
    test_size=0.3,
    stratify=train_rows['text_lang'],
    random_state=42
)

# Assign alt_split values
df.loc[train_idx, 'alt_split'] = 'train'
df.loc[dev_idx, 'alt_split'] = 'dev'

# Move all 'trial' rows to 'dev'
df.loc[df['base_split'] == 'trial', 'alt_split'] = 'dev'

In [8]:
# Step 1: Explode the list of labels
df_exploded = df.explode('label_multi')

# Step 2: Group by base_split and label, then count
label_counts = df_exploded.groupby(['label_multi', 'alt_split']).size().unstack(fill_value=0)
label_counts

alt_split,dev,train
label_multi,Unnamed: 1_level_1,Unnamed: 2_level_1
Appeal_to_Authority,12,52
Appeal_to_Fear-Prejudice,34,67
Appeal_to_Hypocrisy,23,53
Appeal_to_Pity,20,29
Appeal_to_Popularity,28,36
Appeal_to_Time,11,18
Appeal_to_Values,60,84
Causal_Oversimplification,19,42
Consequential_Oversimplification,13,30
Conversation_Killer,28,44


In [9]:
df.groupby(["base_split", "alt_split"]).agg('count')

Unnamed: 0_level_0,Unnamed: 1_level_0,filename,start,end,text_lang,text,label_bin,label_multi,Questioning_the_Reputation,Name_Calling-Labeling,Appeal_to_Hypocrisy,...,Obfuscation-Vagueness-Confusion,Straw_Man,Slogans,Appeal_to_Fear-Prejudice,Causal_Oversimplification,Appeal_to_Authority,False_Dilemma-No_Choice,Whataboutism,False_Equivalence,Guilt_by_Association
base_split,alt_split,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
train,dev,300,300,300,300,300,300,300,300,300,300,...,300,300,300,300,300,300,300,300,300,300
train,train,699,699,699,699,699,699,699,699,699,699,...,699,699,699,699,699,699,699,699,699,699
trial,dev,129,129,129,129,129,129,129,129,129,129,...,129,129,129,129,129,129,129,129,129,129


# batching 

In [10]:
from pathlib import Path

batch_size = 10
output_base = PROJECT_ROOT / "data/preprocessed/persuasion"
output_base.mkdir(parents=True, exist_ok=True)

for split in df["alt_split"].unique():
    for lang in df["text_lang"].unique():
        # Filter by both split and lang
        df_lang_split = df[(df["alt_split"] == split) & (df["text_lang"] == lang)] \
            .sample(frac=1, random_state=42) \
            .reset_index(drop=True)
        if df_lang_split.empty:
            continue  # Skip if no data for this combo

        # Create output directory
        lang_dir = output_base / f"{split}" / lang.lower()
        lang_dir.mkdir(parents=True, exist_ok=True)

        # Save in batches
        for i in range(0, len(df_lang_split), batch_size):
            batch_df = df_lang_split.iloc[i : i + batch_size]
            batch_index = i // batch_size
            batch_filename = f"{split}-{lang.lower()}-part_{batch_index:04d}.parquet"
            batch_path = lang_dir / batch_filename
            batch_df.to_parquet(batch_path, index=False)
            print(f"Saved {batch_path}")

Saved /Users/marcinsawinski/Documents/GitHub/factue-task2/data/preprocessed/persuasion/dev/pl/dev-pl-part_0000.parquet
Saved /Users/marcinsawinski/Documents/GitHub/factue-task2/data/preprocessed/persuasion/dev/pl/dev-pl-part_0001.parquet
Saved /Users/marcinsawinski/Documents/GitHub/factue-task2/data/preprocessed/persuasion/dev/pl/dev-pl-part_0002.parquet
Saved /Users/marcinsawinski/Documents/GitHub/factue-task2/data/preprocessed/persuasion/dev/pl/dev-pl-part_0003.parquet
Saved /Users/marcinsawinski/Documents/GitHub/factue-task2/data/preprocessed/persuasion/dev/pl/dev-pl-part_0004.parquet
Saved /Users/marcinsawinski/Documents/GitHub/factue-task2/data/preprocessed/persuasion/dev/pl/dev-pl-part_0005.parquet
Saved /Users/marcinsawinski/Documents/GitHub/factue-task2/data/preprocessed/persuasion/dev/pl/dev-pl-part_0006.parquet
Saved /Users/marcinsawinski/Documents/GitHub/factue-task2/data/preprocessed/persuasion/dev/pl/dev-pl-part_0007.parquet
Saved /Users/marcinsawinski/Documents/GitHub/fac