In [9]:
import os
import random
import pandas as pd
import re
from nltk.corpus import wordnet
import nltk
import time
from transformers import MarianMTModel, MarianTokenizer
# Make sure the 'wordnet' and 'omw-1.4' NLTK data packages are available locally;
# the recorded cell output shows both reported as already up-to-date.
nltk.download('wordnet')
nltk.download('omw-1.4')

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/users/mabdelaal/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/users/mabdelaal/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [14]:
# Define paths (both are relative to the notebook's working directory)
INPUT_FOLDER = "Data/student_essays"  # Folder with the original essay files; change to the folder with 35 files
OUTPUT_FOLDER = "Data/student_essays_augmented"  # Augmented files are written here
os.makedirs(OUTPUT_FOLDER, exist_ok=True)  # Create the output folder; no error if it already exists

In [10]:
# Load MarianMT translation models
def load_model(src_lang, tgt_lang):
    """Load the MarianMT model and tokenizer for one translation direction.

    Args:
        src_lang: Source language code inserted into the checkpoint name (e.g. "de").
        tgt_lang: Target language code inserted into the checkpoint name (e.g. "fr").

    Returns:
        A ``(model, tokenizer)`` tuple, both obtained with ``from_pretrained``
        from the checkpoint ``Helsinki-NLP/opus-mt-<src_lang>-<tgt_lang>``.
    """
    checkpoint = f"Helsinki-NLP/opus-mt-{src_lang}-{tgt_lang}"
    # The model is loaded first, then the tokenizer, both from the same checkpoint name.
    translation_model = MarianMTModel.from_pretrained(checkpoint)
    translation_tokenizer = MarianTokenizer.from_pretrained(checkpoint)
    return translation_model, translation_tokenizer

# Load both directions needed for a de -> fr -> de round trip.
# Each call loads a separate checkpoint, so this cell loads two models.
model_de_fr, tokenizer_de_fr = load_model("de", "fr")
model_fr_de, tokenizer_fr_de = load_model("fr", "de")




In [16]:
# Read and process all files
# Every .txt file in INPUT_FOLDER is scanned line by line. A line is kept when it starts with
# "<digits><whitespace>[<text>]<label>"; the four captured values (index, text, label, source
# filename) become one row. Lines that do not match are skipped without a warning.
# NOTE(review): the non-greedy "(.*?)" ends the text at the first "]" that is directly followed
# by a word character, so essay text containing such a "]" would be cut short — confirm against
# the data format.
file_data = []
# os.listdir() returns names in arbitrary order; sorting keeps the row order of df (and the
# order in which the random augmentations consume rows) identical on every run and machine.
for filename in sorted(os.listdir(INPUT_FOLDER)):
    if filename.endswith(".txt"):  # Process only .txt files
        with open(os.path.join(INPUT_FOLDER, filename), "r", encoding="utf-8") as file:
            for line in file:  # iterate the file directly instead of loading all lines first
                match = re.match(r'(\d+)\s+\[(.*?)\](\w+)', line)
                if match:
                    index, text, label = match.groups()
                    # 'index' stays a string exactly as captured by the regex.
                    file_data.append((index, text, label, filename))

df = pd.DataFrame(file_data, columns=['Index', 'Text', 'Label', 'Filename'])

In [17]:
# Synonym Replacement
def _pick_synonym(word):
    """Return the first WordNet lemma for ``word`` that differs from it, or None if there is none."""
    # NOTE(review): synsets() is called without a language argument, so NLTK's default
    # language applies — confirm that this matches the language of the essays.
    lowered = word.lower()
    for synset in wordnet.synsets(word):
        for lemma in synset.lemmas():
            candidate = lemma.name().replace('_', ' ')
            # The first lemma of the first synset is very often the word itself; taking it
            # would leave the sentence unchanged, so keep looking for a real alternative.
            if candidate.lower() != lowered:
                return candidate
    return None


def synonym_replacement(sentence, n=1):
    """Replace up to ``n`` randomly chosen words of ``sentence`` with a WordNet synonym.

    Args:
        sentence: Input text; it is split on whitespace.
        n: Number of replacement attempts. Each attempt draws one random position,
            so the same position can be drawn more than once.

    Returns:
        The words re-joined with single spaces. A sentence with fewer than two words
        is returned unchanged. A drawn word is left as is when WordNet offers no lemma
        that differs from it.
    """
    words = sentence.split()
    if len(words) < 2:
        return sentence  # Avoid modifying very short sentences
    for _ in range(n):
        word_idx = random.randint(0, len(words) - 1)
        replacement = _pick_synonym(words[word_idx])
        if replacement is not None:
            words[word_idx] = replacement
    return " ".join(words)

# Random Deletion
def random_deletion(sentence, p=0.2):
    """Randomly drop words from ``sentence``.

    Args:
        sentence: Input text; it is split on whitespace.
        p: Threshold compared against one uniform draw in [0, 1] per word;
            a word is kept only when its draw is greater than ``p``.

    Returns:
        The surviving words joined with single spaces. The original sentence is
        returned untouched when it has at most one word or when every word was dropped.
    """
    tokens = sentence.split()
    if len(tokens) < 2:
        return sentence  # nothing sensible to delete from zero or one word

    survivors = []
    for token in tokens:
        # One draw per word, taken in word order.
        if random.uniform(0, 1) > p:
            survivors.append(token)

    if not survivors:
        return sentence  # never return an empty sentence
    return " ".join(survivors)

# Random Swap
def random_swap(sentence, n=1):
    """Swap two randomly chosen words of ``sentence``, ``n`` times in a row.

    Args:
        sentence: Input text; it is split on whitespace.
        n: Number of swaps. Each swap draws two distinct positions.

    Returns:
        The words joined with single spaces after all swaps. A sentence with
        fewer than two words is returned unchanged.
    """
    tokens = sentence.split()
    token_count = len(tokens)
    if token_count < 2:
        return sentence  # a swap needs at least two words

    for _ in range(n):
        first, second = random.sample(range(token_count), 2)
        held = tokens[first]
        tokens[first] = tokens[second]
        tokens[second] = held
    return " ".join(tokens)

# Back Translation using MarianMT
def back_translate(sentence, src_model, src_tokenizer, tgt_model, tgt_tokenizer):
    """Translate ``sentence`` with one model/tokenizer pair and back with a second pair.

    Args:
        sentence: Text to paraphrase.
        src_model: Model used for the first hop (its ``generate`` is called).
        src_tokenizer: Tokenizer used to encode the input and decode the output of the first hop.
        tgt_model: Model used for the second hop.
        tgt_tokenizer: Tokenizer used to encode and decode the second hop.

    Returns:
        The first decoded string produced by the second hop.
    """
    def _run_hop(text, model, tokenizer):
        # Encode to "pt" tensors with padding and truncation enabled, generate, and decode
        # without special tokens; only the first decoded sequence is used.
        # NOTE(review): truncation=True presumably cuts over-long input silently — confirm.
        encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
        generated = model.generate(**encoded)
        return tokenizer.batch_decode(generated, skip_special_tokens=True)[0]

    pivot_text = _run_hop(sentence, src_model, src_tokenizer)
    return _run_hop(pivot_text, tgt_model, tgt_tokenizer)


In [18]:
# Generate augmented data
def generate_augmented_data():
    """Write four augmented variants of every row of ``df`` into per-method files.

    For each row the text is passed through synonym replacement, random deletion,
    random swap and MarianMT back-translation. Each variant is written as one line
    ``<Index> [<augmented text>]<Label>`` to ``<source file stem>_<method>.txt``
    inside ``OUTPUT_FOLDER``.

    Two things differ from a naive row loop with append mode:

    * The number written at the start of each line is the row's ``Index`` column
      (the number parsed from the source file), not the positional DataFrame index
      yielded by ``iterrows()``, so the original sentence numbering is preserved.
    * An output file is truncated the first time it is written during a call and
      appended to afterwards, so re-running the cell replaces the files instead of
      duplicating every line.

    Reads the module-level ``df``, ``OUTPUT_FOLDER`` and the two loaded translation
    model/tokenizer pairs. Returns None; prints a confirmation message when done.
    """
    files_started = set()  # output files already (re)created during this call

    for _, row in df.iterrows():
        original_text = row['Text']
        label = row['Label']
        sentence_index = row['Index']
        stem = os.path.splitext(row['Filename'])[0]

        aug_methods = {
            "synonym": synonym_replacement(original_text),
            "deletion": random_deletion(original_text),
            "swap": random_swap(original_text),
            "back_translation": back_translate(original_text, model_de_fr, tokenizer_de_fr, model_fr_de, tokenizer_fr_de)
        }

        for aug_type, aug_text in aug_methods.items():
            aug_filename = f"{stem}_{aug_type}.txt"
            # First touch in this run overwrites stale content; later rows append.
            mode = "a" if aug_filename in files_started else "w"
            files_started.add(aug_filename)
            with open(os.path.join(OUTPUT_FOLDER, aug_filename), mode, encoding="utf-8") as aug_file:
                aug_file.write(f"{sentence_index} [{aug_text}]{label}\n")
    print("Augmented files saved separately in the output folder!")
    

In [19]:
# Run the augmentation over every row of df; this writes the per-method files into OUTPUT_FOLDER.
generate_augmented_data()

Augmented files saved separately in the output folder!


In [21]:
from deep_translator import GoogleTranslator

def back_translation(text, src='de', mid='fr'):
    """Round-trip ``text`` through GoogleTranslator: ``src`` -> ``mid`` -> ``src``.

    Args:
        text: Text to paraphrase.
        src: Language code of the input and of the final result.
        mid: Language code of the intermediate (pivot) translation.

    Returns:
        The back-translated text. If any step raises, the error is printed and the
        original ``text`` is returned instead (best-effort behaviour).
    """
    try:
        current = text
        # Two hops, executed in order: into the pivot language, then back again.
        for source_lang, target_lang in ((src, mid), (mid, src)):
            current = GoogleTranslator(source=source_lang, target=target_lang).translate(current)
        return current
    except Exception as e:
        print(f"Back-translation failed: {e}")
        return text

In [12]:
import tqdm

In [22]:
# Multi-sentence German sample (spread over three lines) used to compare the back-translation variants.
text ="""Bevor ich mich morgens auf den Weg zur Arbeit mache, genieße ich in aller Ruhe eine frisch gebrühte Tasse Kaffee. Obwohl das Wetter heute ausgesprochen angenehm ist, prognostizieren die Meteorologen für morgen einen drastischen Wetterumschwung mit heftigen Regenschauern. 
Am Wochenende plane ich, einen ausgedehnten Spaziergang durch den botanischen Garten zu unternehmen, 
um dem Alltagsstress zu entfliehen. Übrigens, hast du bereits den vielgelobten neuen Film gesehen, der kürzlich in den Kinos angelaufen ist?"""

In [23]:

# GoogleTranslator round trip with the default languages (src='de', mid='fr') on the sample text.
back_translation(text)

'Bevor ich morgens zur Arbeit ging, schätze ich eine Tasse frisch infiltrierten Kaffee ruhig. Obwohl das Wetter heute extrem angenehm ist, sagen Meteorologen eine radikale Zeit mit heftigen Regenschauern für morgen voraus. \nAm Wochenende habe ich vor, im Botanischen Garten zu gehen, \nTäglichem Stress entkommen. Haben Sie den neuen Film jemals sehr geschätzt, der kürzlich in den Kinos begann?'

In [11]:
# Back Translation using MarianMT
def back_translate_marian(sentence, src_model, src_tokenizer, tgt_model, tgt_tokenizer):
    """Back-translate ``sentence`` through two model/tokenizer pairs applied in sequence.

    Args:
        sentence: Text to paraphrase.
        src_model: Model whose ``generate`` produces the intermediate translation.
        src_tokenizer: Tokenizer that encodes the input and decodes the intermediate translation.
        tgt_model: Model whose ``generate`` produces the final text.
        tgt_tokenizer: Tokenizer that encodes the intermediate translation and decodes the final text.

    Returns:
        The first decoded string of the second translation step.
    """
    current_text = sentence
    # Hop 1 uses the src pair, hop 2 the tgt pair; the output of one hop feeds the next.
    for model, tokenizer in ((src_model, src_tokenizer), (tgt_model, tgt_tokenizer)):
        batch = tokenizer(current_text, return_tensors="pt", padding=True, truncation=True)
        output_ids = model.generate(**batch)
        decoded = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
        current_text = decoded[0]  # only the first decoded sequence is used
    return current_text


In [19]:
# MarianMT de -> fr -> de round trip on the sample text.
# NOTE(review): the recorded output below starts with the sample's second sentence, i.e. the
# first sentence of the input is missing from the result — check multi-sentence inputs.
back_translate_marian(text,model_de_fr,tokenizer_de_fr,model_fr_de,tokenizer_fr_de)

'Obwohl das Wetter heute sehr angenehm ist, erwarten die Meteorologen einen spektakulären Zeitwechsel für morgen mit starkem Regen. Am Wochenende habe ich vor, einen großen Spaziergang durch den Botanischen Garten zu unternehmen, um dem täglichen Stress zu entgehen. Haben Sie übrigens schon einmal den neuen Film gesehen, der kürzlich in den Kinos angefangen hat?'