In [1]:
import sys, os

# Directory of the project checkout that the notebook runs from.
# Set the FACTUE_TASK2_ROOT environment variable to point at your own checkout;
# when it is unset, the original author's local path is used as before.
wd = os.environ.get(
    "FACTUE_TASK2_ROOT",
    '/Users/marcinsawinski/Documents/GitHub/factue-task2',
)
# NOTE(review): presumably needed so that the `factue` package and relative data
# paths resolve from the repository root - confirm.
os.chdir(wd)

# Test

In [None]:
from pathlib import Path
import pandas as pd
from factue.utils.vars import PROJECT_ROOT
import numpy as np

# Build one row per annotated span of the persuasion test data.
# For every language folder the raw documents are read into memory, and each
# well-formed line of input-file.txt is turned into a record that holds the
# corresponding slice of the document text.
base_dir = PROJECT_ROOT / "data/raw/persuasion/test_data"
langs = ["PL", "RU", "BG", "SI", 'HR']

# All rows loaded from test_data are labelled with the "test" split.
# The batching cell groups rows by `alt_split`, so both split columns have to
# exist on `df`; without them a fresh Restart & Run All fails with a KeyError.
split_name = "test"

# Explicit column list: keeps the schema stable even when no span was found.
columns = ["filename", "start", "end", "text_lang", "text", "base_split", "alt_split"]

records = []

for lang in langs:
    lang_path = base_dir / lang
    raw_docs_path = lang_path / "raw-documents"
    input_path = lang_path / "input-file.txt"

    # Load all raw documents, keyed by file name
    raw_docs = {
        f.name: f.read_text(encoding="utf-8") for f in raw_docs_path.glob("*.txt")
    }

    # Parse subtask-1 annotations and extract spans.
    # Expected line layout: <filename> TAB <start> TAB <end>
    with input_path.open(encoding="utf-8") as f:
        for line in f:
            parts = line.strip().split("\t")
            if len(parts) != 3:
                continue  # skip blank or malformed lines
            fname, start, end = parts
            start, end = int(start), int(end)
            # NOTE(review): a span whose document is not in raw-documents gets
            # an empty text instead of raising - confirm this is intended.
            text = raw_docs.get(fname, "")[start:end]

            records.append(
                {
                    "filename": fname,
                    "start": start,
                    "end": end,
                    "text_lang": lang,
                    "text": text,
                    "base_split": split_name,
                    "alt_split": split_name,
                }
            )

# Create a DataFrame
df = pd.DataFrame(records, columns=columns)

df


Unnamed: 0,filename,start,end,text_lang,text,base_split,alt_split
0,pl_abortion_11_04_2024_n10.txt,0,260,PL,"Wicemarszałek Monika Wielichowska:\nDziękuję, ...",test,test
1,pl_abortion_11_04_2024_n10.txt,262,392,PL,Poseł Agnieszka Wojciechowska van Heukelom:\nA...,test,test
2,pl_abortion_11_04_2024_n10.txt,394,604,PL,Wicemarszałek Monika Wielichowska:\nSzanowna p...,test,test
3,pl_abortion_11_04_2024_n10.txt,606,810,PL,Poseł Agnieszka\nWojciechowska van Heukelom:\n...,test,test
4,pl_abortion_11_04_2024_n10.txt,812,942,PL,Wicemarszałek Monika Wielichowska:\nZapraszam ...,test,test
...,...,...,...,...,...,...,...
3236,HR_ParlaMint-HR_2022-05-25-0_1.txt,1850,1947,HR,Idemo na 5. raspravu u ime Kluba zastupnika Ce...,test,test
3237,HR_ParlaMint-HR_2022-05-25-0_1.txt,1950,5413,HR,"Poštovane kolege i kolegice. Ante Ćorušić, rav...",test,test
3238,HR_ParlaMint-HR_2022-05-25-0_1.txt,5416,5469,HR,Imamo dvije povrede Poslovnika. Prvi je kolega...,test,test
3239,HR_ParlaMint-HR_2022-05-25-0_1.txt,5472,5982,HR,Zahvaljujem gospodine predsjedniče. Kolegica P...,test,test


# batching 

In [11]:
from pathlib import Path

# Write `df` to disk as parquet batches of at most `batch_size` rows.
# Layout: <output_base>/<split>/<lang>/<split>-<lang>-part_NNNN.parquet
batch_size = 10
output_base = PROJECT_ROOT / "data/preprocessed/persuasion"
output_base.mkdir(parents=True, exist_ok=True)

split_values = df["alt_split"].unique()
lang_values = df["text_lang"].unique()

for split in split_values:
    for lang in lang_values:
        # Rows that belong to this split/language combination
        selector = (df["alt_split"] == split) & (df["text_lang"] == lang)
        subset = df[selector]

        # Only combinations that actually have rows get a folder and files
        if not subset.empty:
            lang_dir = output_base / f"{split}" / lang.lower()
            lang_dir.mkdir(parents=True, exist_ok=True)

            # Consecutive row slices; the batch number is the slice's position
            offsets = range(0, len(subset), batch_size)
            for batch_index, offset in enumerate(offsets):
                batch_path = lang_dir / (
                    f"{split}-{lang.lower()}-part_{batch_index:04d}.parquet"
                )
                subset.iloc[offset : offset + batch_size].to_parquet(
                    batch_path, index=False
                )
                print(f"Saved {batch_path}")

Saved /Users/marcinsawinski/Documents/GitHub/factue-task2/data/preprocessed/persuasion/test/pl/test-pl-part_0000.parquet
Saved /Users/marcinsawinski/Documents/GitHub/factue-task2/data/preprocessed/persuasion/test/pl/test-pl-part_0001.parquet
Saved /Users/marcinsawinski/Documents/GitHub/factue-task2/data/preprocessed/persuasion/test/pl/test-pl-part_0002.parquet
Saved /Users/marcinsawinski/Documents/GitHub/factue-task2/data/preprocessed/persuasion/test/pl/test-pl-part_0003.parquet
Saved /Users/marcinsawinski/Documents/GitHub/factue-task2/data/preprocessed/persuasion/test/pl/test-pl-part_0004.parquet
Saved /Users/marcinsawinski/Documents/GitHub/factue-task2/data/preprocessed/persuasion/test/pl/test-pl-part_0005.parquet
Saved /Users/marcinsawinski/Documents/GitHub/factue-task2/data/preprocessed/persuasion/test/pl/test-pl-part_0006.parquet
Saved /Users/marcinsawinski/Documents/GitHub/factue-task2/data/preprocessed/persuasion/test/pl/test-pl-part_0007.parquet
Saved /Users/marcinsawinski/Docu

# Fix duplicates in trial


In [1]:
import sys, os

# Directory of the project checkout that the notebook runs from.
# Set the FACTUE_TASK2_ROOT environment variable to switch machines instead of
# editing this cell, e.g. FACTUE_TASK2_ROOT=/mnt/openfact/users/msawinski/factue-task2
# When it is unset, the original author's local path is used as before.
wd = os.environ.get(
    "FACTUE_TASK2_ROOT",
    '/Users/marcinsawinski/Documents/GitHub/factue-task2',
)
# The next cell reads from a path relative to the working directory.
os.chdir(wd)

In [10]:
import pandas as pd
from pathlib import Path
import os
import numpy as np
# Rebuild the dev split from the parquet batches under dev_dupl, keeping a
# single row per (filename, start) span.
# NOTE: the path is relative, so this cell relies on the working directory set
# by the preceding os.chdir(wd) cell.
root = Path("data/preprocessed/persuasion/dev_dupl")

# rglob() already searches recursively, so no "**/" prefix is needed.
# Sorting fixes the read order, so the result does not depend on the order in
# which the filesystem happens to list the files.
files = sorted(root.rglob("*.parquet"))
if not files:
    # Fail with an actionable message instead of pandas' "No objects to concatenate"
    raise FileNotFoundError(f"No parquet files found under {root.resolve()}")

# Read all files and add file path as a column
df_list = []
for f in files:
    df_part = pd.read_parquet(f)
    df_part['source_file'] = f  # add the file path
    df_list.append(df_part)

# Combine into one DataFrame. Among rows sharing (filename, start), the one
# whose base_split sorts first is kept; every surviving row is assigned to the
# 'dev' split and the helper column is removed again.
df = (
    pd.concat(df_list, ignore_index=True)
    .sort_values(['filename', 'start', 'base_split'])
    .drop_duplicates(subset=['filename', 'start'], keep='first')
    .drop(columns=['source_file'])
    .assign(alt_split='dev')
)

In [11]:
from pathlib import Path
from pathlib import Path
import pandas as pd
from factue.utils.vars import PROJECT_ROOT
import numpy as np

# Write `df` to disk as shuffled parquet batches of at most `batch_size` rows.
# Layout: <output_base>/<split>/<lang>/<split>-<lang>-part_NNNN.parquet
batch_size = 10
output_base = PROJECT_ROOT / "data/preprocessed/persuasion"
output_base.mkdir(parents=True, exist_ok=True)

for split in df["alt_split"].unique():
    for lang in df["text_lang"].unique():
        # Select the rows of this split/language combination ...
        matching = df[(df["alt_split"] == split) & (df["text_lang"] == lang)]
        # ... and shuffle them with a fixed seed before cutting batches
        shuffled = matching.sample(frac=1, random_state=42)
        df_lang_split = shuffled.reset_index(drop=True)
        if df_lang_split.empty:
            continue  # nothing to write for this combination

        # Target folder for this combination
        lang_dir = output_base / f"{split}" / lang.lower()
        lang_dir.mkdir(parents=True, exist_ok=True)

        # Consecutive row slices; the batch number is the slice's position
        offsets = range(0, len(df_lang_split), batch_size)
        for batch_index, offset in enumerate(offsets):
            batch_path = lang_dir / (
                f"{split}-{lang.lower()}-part_{batch_index:04d}.parquet"
            )
            chunk = df_lang_split.iloc[offset : offset + batch_size]
            chunk.to_parquet(batch_path, index=False)
            print(f"Saved {batch_path}")

Saved /Users/marcinsawinski/Documents/GitHub/factue-task2/data/preprocessed/persuasion/dev/bg/dev-bg-part_0000.parquet
Saved /Users/marcinsawinski/Documents/GitHub/factue-task2/data/preprocessed/persuasion/dev/bg/dev-bg-part_0001.parquet
Saved /Users/marcinsawinski/Documents/GitHub/factue-task2/data/preprocessed/persuasion/dev/bg/dev-bg-part_0002.parquet
Saved /Users/marcinsawinski/Documents/GitHub/factue-task2/data/preprocessed/persuasion/dev/bg/dev-bg-part_0003.parquet
Saved /Users/marcinsawinski/Documents/GitHub/factue-task2/data/preprocessed/persuasion/dev/bg/dev-bg-part_0004.parquet
Saved /Users/marcinsawinski/Documents/GitHub/factue-task2/data/preprocessed/persuasion/dev/bg/dev-bg-part_0005.parquet
Saved /Users/marcinsawinski/Documents/GitHub/factue-task2/data/preprocessed/persuasion/dev/bg/dev-bg-part_0006.parquet
Saved /Users/marcinsawinski/Documents/GitHub/factue-task2/data/preprocessed/persuasion/dev/bg/dev-bg-part_0007.parquet
Saved /Users/marcinsawinski/Documents/GitHub/fac