In [13]:
import os
import random
import shutil
import xml.etree.ElementTree as ET

import nltk
from deep_translator import GoogleTranslator
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize

# Fetch the NLTK data needed by word_tokenize and the WordNet lookups.
nltk.download("punkt")
# Recent NLTK releases load the Punkt tokenizer models from the separate
# "punkt_tab" resource; without it word_tokenize raises LookupError on a
# fresh environment. Requesting it as well keeps the notebook runnable there.
nltk.download("punkt_tab")
nltk.download("wordnet")

# === CONFIG ===
# Language of the input segments; default `src` of back_translation.
SOURCE_LANG = 'de'
# Pivot language the text is translated into before being translated back;
# default `mid` of back_translation.
MID_LANG = 'fr'


[nltk_data] Downloading package punkt to
[nltk_data]     /home/users/mabdelaal/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/users/mabdelaal/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [14]:
def synonym_replacement(text, p=0.4):
    """Replace some alphabetic tokens of ``text`` with a WordNet synonym.

    Every purely alphabetic token is selected with probability ``p``. For a
    selected token the first WordNet lemma that actually differs from the
    token is substituted; if no such lemma exists the token is kept.

    Args:
        text: String to augment; it is split with ``word_tokenize``.
        p: Per-token probability of attempting a replacement.

    Returns:
        The resulting tokens joined by single spaces. Tokenization is not
        undone, so punctuation tokens end up separated by spaces.
    """
    # NOTE(review): no `lang` argument is passed to wordnet.synsets and no
    # `language` to word_tokenize, while SOURCE_LANG is 'de' — confirm the
    # lookup language is what you intend for this corpus.
    new_words = []
    for word in word_tokenize(text):
        # random.random() is only drawn for alphabetic tokens (short-circuit).
        if word.isalpha() and random.random() < p:
            synonym = _first_distinct_lemma(word)
            if synonym is not None:
                new_words.append(synonym)
                continue
        new_words.append(word)
    return ' '.join(new_words)


def _first_distinct_lemma(word):
    """Return the first WordNet lemma name differing from ``word``, else None."""
    lowered = word.lower()
    for synset in wordnet.synsets(word):
        for lemma in synset.lemmas():
            # Multi-word lemma names use underscores; turn them into spaces.
            candidate = lemma.name().replace('_', ' ')
            # Skip lemmas that merely repeat the token, otherwise the
            # "replacement" would leave the text unchanged.
            if candidate.lower() != lowered:
                return candidate
    return None

def random_deletion(text, p=0.2):
    """Drop tokens of ``text`` at random.

    Args:
        text: String to augment; it is split with ``word_tokenize``.
        p: A token is kept only when a fresh ``random.random()`` draw is
            greater than ``p``.

    Returns:
        ``text`` unchanged when it yields at most one token. Otherwise the
        surviving tokens joined by single spaces, or one randomly chosen
        token if none survived.
    """
    tokens = word_tokenize(text)
    if len(tokens) < 2:
        return text

    survivors = []
    for token in tokens:
        # One draw per token, in token order.
        if random.random() > p:
            survivors.append(token)

    if not survivors:
        # Everything was deleted: fall back to a single random token.
        return random.choice(tokens)
    return ' '.join(survivors)

def random_swap(text, n=2):
    """Swap randomly chosen token pairs of ``text``.

    Args:
        text: String to augment; it is split with ``word_tokenize``.
        n: Number of swaps to perform.

    Returns:
        The tokens joined by single spaces. With fewer than two tokens no
        swap takes place.
    """
    tokens = word_tokenize(text)
    # The token count never changes, so the index range can be built once.
    positions = range(len(tokens))
    for _ in range(n):
        if len(positions) < 2:
            break
        first, second = random.sample(positions, 2)
        held = tokens[first]
        tokens[first] = tokens[second]
        tokens[second] = held
    return ' '.join(tokens)

def back_translation(text, src=SOURCE_LANG, mid=MID_LANG):
    """Paraphrase ``text`` by translating it to ``mid`` and back to ``src``.

    This is best-effort: whenever a translation step raises or yields nothing
    usable, a message is printed and the original ``text`` is returned, so
    callers always get a non-None result for non-None input.

    Args:
        text: String to augment.
        src: Language code of ``text``; also the target of the second step.
        mid: Language code of the intermediate (pivot) translation.

    Returns:
        The back-translated string, or ``text`` itself on any failure.
    """
    try:
        pivot = GoogleTranslator(source=src, target=mid).translate(text)
        if not pivot:
            # Nothing came back from the forward step; skip the second request.
            print(f"[Back-translation failed] empty translation for: {text}")
            return text
        restored = GoogleTranslator(source=mid, target=src).translate(pivot)
    except Exception as e:
        # Deliberately broad: network and translator errors must not abort
        # the processing of a whole directory.
        print(f"[Back-translation failed] {e}")
        return text

    if not restored:
        # Never hand a None/empty result to the caller, which would otherwise
        # overwrite the segment text with nothing.
        print(f"[Back-translation failed] empty translation for: {text}")
        return text
    return restored

In [15]:
def augment_file(filepath, output_dir, method_name, augment_func):
    """Write an augmented copy of one XML file.

    The file is parsed, the text of every ``segment`` element is passed
    (stripped) through ``augment_func``, and the modified tree is written to
    ``output_dir`` as ``<base>_<method_name><ext>``.

    Args:
        filepath: Path of the XML file to read.
        output_dir: Directory for the result; created if it does not exist.
        method_name: Suffix appended to the base file name.
        augment_func: Callable taking the stripped segment text and returning
            the replacement text.

    Returns:
        None. The result is the file written to ``output_dir``.
    """
    tree = ET.parse(filepath)

    for seg in tree.getroot().findall(".//segment"):
        original = (seg.text or "").strip()
        if not original:
            # Empty or whitespace-only segment: nothing to augment, leave as is.
            continue
        augmented = augment_func(original)
        # An augmenter that yields None/empty must not blank the segment;
        # keep the original text in that case.
        seg.text = augmented if augmented else original

    base_name, ext = os.path.splitext(os.path.basename(filepath))
    # Make the function usable on its own, not only via process_directory.
    os.makedirs(output_dir, exist_ok=True)
    new_path = os.path.join(output_dir, f"{base_name}_{method_name}{ext}")
    tree.write(new_path, encoding='utf-8', xml_declaration=True)



In [16]:
def process_directory(input_dir):
    """Augment every ``.rs3`` file found directly in ``input_dir``.

    A ``pcc_augmented`` sub-directory is created inside ``input_dir``. For
    each ``.rs3`` file the original is copied there, followed by one
    augmented variant per method (synonym, deletion, swap, backtranslation).

    Args:
        input_dir: Directory containing the ``.rs3`` files.
    """
    output_dir = os.path.join(input_dir, "pcc_augmented")
    os.makedirs(output_dir, exist_ok=True)

    # (file-name suffix, augmentation function), applied in this order.
    augmenters = (
        ("synonym", synonym_replacement),
        ("deletion", random_deletion),
        ("swap", random_swap),
        ("backtranslation", back_translation),
    )

    for filename in os.listdir(input_dir):
        if not filename.endswith(".rs3"):
            continue

        file_path = os.path.join(input_dir, filename)

        # Keep an untouched copy next to the augmented variants.
        shutil.copy2(file_path, os.path.join(output_dir, filename))

        # Each method works on the original file, never on another variant.
        for suffix, augmenter in augmenters:
            augment_file(file_path, output_dir, suffix, augmenter)

In [17]:
# Directory whose .rs3 files are augmented; the results are written to a
# "pcc_augmented" sub-directory inside it. Point this at your own data folder.
folder_path = "data/pcc-main/rs3_no_aug"  # ← Change this
# Runs all augmentation methods; back-translation failures are printed as
# "[Back-translation failed] ..." lines.
process_directory(folder_path)

[Back-translation failed] also man --> No translation was found using the current translator. Try another translator?
[Back-translation failed] pour transformer les leçons scolaires en 21e siècle. <\ Par_a8> --> No translation was found using the current translator. Try another translator?
[Back-translation failed] S2: Ist auch Tatortkommissar gewesen oder immer noch . --> No translation was found using the current translator. Try another translator?
[Back-translation failed] S1: Je ne les connais pas! --> No translation was found using the current translator. Try another translator?
[Back-translation failed] S2: C'est un privilège, c'est un privilège! --> No translation was found using the current translator. Try another translator?
[Back-translation failed] HTTPSConnectionPool(host='translate.google.com', port=443): Max retries exceeded with url: /m?tl=fr&sl=de&q=und+vielleicht+auch+noch+freundlich+gucken+%2C (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object 