In [1]:
import sys, os

# Working directory for the whole notebook: later cells open files through
# relative paths (e.g. "prompts.yaml", "data/preprocessed/persuasion").
# The original machine-specific checkout stays the default; set the
# FACTUE_PROJECT_DIR environment variable to run from another location
# without editing this cell.
wd = os.environ.get(
    "FACTUE_PROJECT_DIR", '/Users/marcinsawinski/Documents/GitHub/factue-task2'
)
os.chdir(wd)

# Trial

In [2]:
from pathlib import Path
import pandas as pd
from factue.utils.vars import PROJECT_ROOT

# Trial split: build one record per span listed in a language's
# subtask-1-annotations.txt, attach the subtask-2 labels stored for the same
# (filename, start, end) key, and write the records as parquet batches.
base_dir = PROJECT_ROOT / "data/raw/persuasion/trial_data/TRIAL-ANNOTATED"
langs = ["PL", "RU", "BG", "SI"]

# Explicit column list: fixes the column order and keeps `df` (and the
# `text_lang` lookup further down) valid even when no annotation row is found.
record_columns = [
    "filename",
    "start",
    "end",
    "text_lang",
    "text",
    "label_bin",
    "labels_multi",
]

records = []

for lang in langs:
    lang_path = base_dir / lang
    raw_docs_path = lang_path / "raw-documents"
    subtask1_path = lang_path / "subtask-1-annotations.txt"
    subtask2_path = lang_path / "subtask-2-annotations.txt"

    # Load all raw documents, keyed by file name
    raw_docs = {
        f.name: f.read_text(encoding="utf-8") for f in raw_docs_path.glob("*.txt")
    }

    # Load subtask-2 annotations into a dict: (filename, start, end) -> labels.
    # The file is optional; rows with fewer than four tab-separated fields
    # are skipped, so those spans fall back to an empty label list below.
    subtask2_map = {}
    if subtask2_path.exists():
        with subtask2_path.open(encoding="utf-8") as f:
            for line in f:
                parts = line.strip().split("\t")
                if len(parts) >= 4:
                    fname, start, end, *labels = parts
                    key = (fname, int(start), int(end))
                    subtask2_map[key] = labels

    # File names referenced by subtask-1 rows but absent from raw-documents
    missing_docs = set()

    # Parse subtask-1 annotations and extract spans
    with subtask1_path.open(encoding="utf-8") as f:
        for line in f:
            parts = line.strip().split("\t")
            if len(parts) != 4:
                # Only rows of exactly four fields are used:
                # filename, start, end, binary label
                continue
            fname, start, end, label_bin = parts
            start, end = int(start), int(end)
            if fname not in raw_docs:
                missing_docs.add(fname)
            # A missing document still yields a record, with empty text;
            # it is reported once per language after the loop.
            text = raw_docs.get(fname, "")[start:end]
            labels_multi = subtask2_map.get((fname, start, end), [])

            records.append(
                {
                    "filename": fname,
                    "start": start,
                    "end": end,
                    "text_lang": lang,
                    "text": text,
                    # True only when the field reads "true" (case-insensitive)
                    "label_bin": label_bin.lower() == "true",
                    "labels_multi": labels_multi,
                }
            )

    # Surface what used to be a silent fallback to empty text
    if missing_docs:
        print(
            f"Warning: {lang}: {len(missing_docs)} annotated file(s) not found in "
            f"{raw_docs_path}; their spans have empty text: {sorted(missing_docs)}"
        )

# Create a DataFrame
df = pd.DataFrame(records, columns=record_columns)

batch_size = 20
output_base = PROJECT_ROOT / "data/preprocessed/persuasion/trail"
output_base.mkdir(parents=True, exist_ok=True)

for lang in df["text_lang"].unique():
    df_lang = df[df["text_lang"] == lang].reset_index(drop=True)
    lang_dir = output_base / f"trail-{lang.lower()}"
    lang_dir.mkdir(parents=True, exist_ok=True)

    # Save in batches of `batch_size` rows, numbered from batch_0000
    for i in range(0, len(df_lang), batch_size):
        batch_df = df_lang.iloc[i : i + batch_size]
        batch_index = i // batch_size
        batch_filename = f"batch_{batch_index:04d}.parquet"
        batch_path = lang_dir / batch_filename
        batch_df.to_parquet(batch_path, index=False)
        print(f"Saved {batch_path}")

Saved /Users/marcinsawinski/Documents/GitHub/factue-task2/data/preprocessed/persuasion/trail/trail-pl/batch_0000.parquet
Saved /Users/marcinsawinski/Documents/GitHub/factue-task2/data/preprocessed/persuasion/trail/trail-pl/batch_0001.parquet
Saved /Users/marcinsawinski/Documents/GitHub/factue-task2/data/preprocessed/persuasion/trail/trail-ru/batch_0000.parquet
Saved /Users/marcinsawinski/Documents/GitHub/factue-task2/data/preprocessed/persuasion/trail/trail-bg/batch_0000.parquet
Saved /Users/marcinsawinski/Documents/GitHub/factue-task2/data/preprocessed/persuasion/trail/trail-bg/batch_0001.parquet
Saved /Users/marcinsawinski/Documents/GitHub/factue-task2/data/preprocessed/persuasion/trail/trail-bg/batch_0002.parquet
Saved /Users/marcinsawinski/Documents/GitHub/factue-task2/data/preprocessed/persuasion/trail/trail-bg/batch_0003.parquet
Saved /Users/marcinsawinski/Documents/GitHub/factue-task2/data/preprocessed/persuasion/trail/trail-si/batch_0000.parquet


# Train

In [5]:
from pathlib import Path
import pandas as pd
from factue.utils.vars import PROJECT_ROOT

# Train split: build one record per span listed in a language's
# subtask-1-annotations.txt, attach the subtask-2 labels stored for the same
# (filename, start, end) key, and write the records as parquet batches.
base_dir = PROJECT_ROOT / "data/raw/persuasion/train_data"
langs = ["PL", "RU", "BG", "SI"]

# Explicit column list: fixes the column order and keeps `df` (and the
# `text_lang` lookup further down) valid even when no annotation row is found.
record_columns = [
    "filename",
    "start",
    "end",
    "text_lang",
    "text",
    "label_bin",
    "labels_multi",
]

records = []

for lang in langs:
    lang_path = base_dir / lang
    raw_docs_path = lang_path / "raw-documents"
    subtask1_path = lang_path / "subtask-1-annotations.txt"
    subtask2_path = lang_path / "subtask-2-annotations.txt"

    # Load all raw documents, keyed by file name
    raw_docs = {
        f.name: f.read_text(encoding="utf-8") for f in raw_docs_path.glob("*.txt")
    }

    # Load subtask-2 annotations into a dict: (filename, start, end) -> labels.
    # The file is optional; rows with fewer than four tab-separated fields
    # are skipped, so those spans fall back to an empty label list below.
    subtask2_map = {}
    if subtask2_path.exists():
        with subtask2_path.open(encoding="utf-8") as f:
            for line in f:
                parts = line.strip().split("\t")
                if len(parts) >= 4:
                    fname, start, end, *labels = parts
                    key = (fname, int(start), int(end))
                    subtask2_map[key] = labels

    # File names referenced by subtask-1 rows but absent from raw-documents
    missing_docs = set()

    # Parse subtask-1 annotations and extract spans
    with subtask1_path.open(encoding="utf-8") as f:
        for line in f:
            parts = line.strip().split("\t")
            if len(parts) != 4:
                # Only rows of exactly four fields are used:
                # filename, start, end, binary label
                continue
            fname, start, end, label_bin = parts
            start, end = int(start), int(end)
            if fname not in raw_docs:
                missing_docs.add(fname)
            # A missing document still yields a record, with empty text;
            # it is reported once per language after the loop.
            text = raw_docs.get(fname, "")[start:end]
            labels_multi = subtask2_map.get((fname, start, end), [])

            records.append(
                {
                    "filename": fname,
                    "start": start,
                    "end": end,
                    "text_lang": lang,
                    "text": text,
                    # True only when the field reads "true" (case-insensitive)
                    "label_bin": label_bin.lower() == "true",
                    "labels_multi": labels_multi,
                }
            )

    # Surface what used to be a silent fallback to empty text
    if missing_docs:
        print(
            f"Warning: {lang}: {len(missing_docs)} annotated file(s) not found in "
            f"{raw_docs_path}; their spans have empty text: {sorted(missing_docs)}"
        )

# Create a DataFrame
df = pd.DataFrame(records, columns=record_columns)

batch_size = 20
output_base = PROJECT_ROOT / "data/preprocessed/persuasion/train"
output_base.mkdir(parents=True, exist_ok=True)

for lang in df["text_lang"].unique():
    df_lang = df[df["text_lang"] == lang].reset_index(drop=True)
    lang_dir = output_base / f"train-{lang.lower()}"
    lang_dir.mkdir(parents=True, exist_ok=True)

    # Save in batches of `batch_size` rows, numbered from batch_0000
    for i in range(0, len(df_lang), batch_size):
        batch_df = df_lang.iloc[i : i + batch_size]
        batch_index = i // batch_size
        batch_filename = f"batch_{batch_index:04d}.parquet"
        batch_path = lang_dir / batch_filename
        batch_df.to_parquet(batch_path, index=False)
        print(f"Saved {batch_path}")

Saved /Users/marcinsawinski/Documents/GitHub/factue-task2/data/preprocessed/persuasion/train/train-pl/batch_0000.parquet
Saved /Users/marcinsawinski/Documents/GitHub/factue-task2/data/preprocessed/persuasion/train/train-pl/batch_0001.parquet
Saved /Users/marcinsawinski/Documents/GitHub/factue-task2/data/preprocessed/persuasion/train/train-pl/batch_0002.parquet
Saved /Users/marcinsawinski/Documents/GitHub/factue-task2/data/preprocessed/persuasion/train/train-pl/batch_0003.parquet
Saved /Users/marcinsawinski/Documents/GitHub/factue-task2/data/preprocessed/persuasion/train/train-pl/batch_0004.parquet
Saved /Users/marcinsawinski/Documents/GitHub/factue-task2/data/preprocessed/persuasion/train/train-pl/batch_0005.parquet
Saved /Users/marcinsawinski/Documents/GitHub/factue-task2/data/preprocessed/persuasion/train/train-pl/batch_0006.parquet
Saved /Users/marcinsawinski/Documents/GitHub/factue-task2/data/preprocessed/persuasion/train/train-pl/batch_0007.parquet
Saved /Users/marcinsawinski/Docu

In [6]:
# Number of records per language in the frame built by the preceding cell
df.value_counts(subset="text_lang")

text_lang
BG    363
PL    289
RU    239
SI    108
Name: count, dtype: int64

In [7]:
from collections import Counter

# One entry per individual label: explode() unpacks every labels_multi list,
# dropna() then discards the missing values left behind by empty lists.
all_labels = df["labels_multi"].explode().dropna()

# Tally how often each label occurs
label_counts = Counter(all_labels)

# Tabulate the tallies for display or saving, most frequent label first
label_counts_df = pd.DataFrame(
    list(label_counts.items()), columns=["label", "count"]
)
label_counts_df = label_counts_df.sort_values("count", ascending=False)

print(label_counts_df)

                               label  count
4                    Loaded_Language    171
0         Questioning_the_Reputation    138
8                              Doubt    136
14                  Appeal_to_Values    121
1              Name_Calling-Labeling    101
10         Exaggeration-Minimisation     99
5                         Repetition     95
18          Appeal_to_Fear-Prejudice     93
3                Appeal_to_Hypocrisy     73
2                Conversation_Killer     65
20               Appeal_to_Authority     64
6               Appeal_to_Popularity     58
19         Causal_Oversimplification     58
7                        Red_Herring     49
9                        Flag_Waving     43
11  Consequential_Oversimplification     40
12                    Appeal_to_Pity     39
15   Obfuscation-Vagueness-Confusion     38
21           False_Dilemma-No_Choice     30
16                         Straw_Man     29
17                           Slogans     28
22                      Whatabou

In [None]:
import yaml
from langchain.prompts import PromptTemplate



# Load the prompt definitions from prompts.yaml, looked up relative to the
# current working directory. The encoding is given explicitly so the file is
# decoded the same way on every platform instead of using the locale default;
# this matches the UTF-8 reads used elsewhere in the notebook.
with open("prompts.yaml", "r", encoding="utf-8") as f:
    prompt_defs = yaml.safe_load(f)

# Show the parsed structure as the cell output
prompt_defs


In [None]:
from pathlib import Path


In [None]:
# Root folder searched for preprocessed persuasion parquet files. The path is
# relative, so it resolves against the current working directory when used.
input_dir = Path("data/preprocessed/persuasion")

In [None]:
# List every parquet file below input_dir, at any depth. The matches are
# sorted so the listing is identical from run to run rather than following
# whatever order the filesystem returns.
for x in sorted(input_dir.glob("**/*.parquet")):
    print(x)

In [None]:
# List only the batch files located in a train/train-<suffix>/ folder at any
# depth below input_dir. The matches are sorted so the listing is identical
# from run to run rather than following whatever order the filesystem returns.
for x in sorted(input_dir.glob("**/train/train-*/batch*.parquet")):
    print(x)