In [24]:
import functools
import random

import pandas as pd
from transformers import MarianMTModel, MarianTokenizer

def split_text(text, max_length=512):
    """Break *text* into consecutive chunks of at most *max_length* words.

    The text is split on whitespace, and each chunk is re-joined with single
    spaces, so runs of whitespace inside the input are collapsed.

    Args:
        text: The string to split.
        max_length: Maximum number of whitespace-separated words per chunk.

    Returns:
        A list of chunk strings in their original order; empty when *text*
        contains no words.
    """
    tokens = text.split()
    starts = range(0, len(tokens), max_length)
    return [' '.join(tokens[start:start + max_length]) for start in starts]

@functools.lru_cache(maxsize=4)
def _load_marian(model_name):
    """Return the ``(tokenizer, model)`` pair for *model_name*, loading it once.

    Results are memoized so repeated calls with the same checkpoint name reuse
    the already-loaded objects instead of calling ``from_pretrained`` again.
    A failed load raises and is not cached, so a later call retries it.
    """
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name)
    return tokenizer, model


def _marian_translate(chunk, model_name):
    """Translate the string *chunk* with the Marian checkpoint *model_name*."""
    tokenizer, model = _load_marian(model_name)
    encoded = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True)
    generated = model.generate(**encoded)
    return tokenizer.decode(generated[0], skip_special_tokens=True)


def back_translate(text, src_lang="en", tgt_lang="fr", max_length=512):
    """Paraphrase *text* by translating it to *tgt_lang* and back to *src_lang*.

    The text is split into chunks of at most *max_length* words, each chunk is
    translated with ``Helsinki-NLP/opus-mt-{src_lang}-{tgt_lang}`` and then
    back with ``Helsinki-NLP/opus-mt-{tgt_lang}-{src_lang}``. The tokenizer and
    model for each direction are loaded once and reused across chunks and
    calls, rather than being reloaded for every chunk.

    This is best-effort: if anything fails for a chunk (model loading
    included), the error is printed and the original chunk is kept.

    Args:
        text: The string to paraphrase.
        src_lang: Language code of the input text.
        tgt_lang: Language code of the pivot language.
        max_length: Maximum number of words per chunk.

    Returns:
        The back-translated chunks joined with single spaces.
    """
    forward_name = f'Helsinki-NLP/opus-mt-{src_lang}-{tgt_lang}'
    backward_name = f'Helsinki-NLP/opus-mt-{tgt_lang}-{src_lang}'

    # NOTE(review): chunks are sized in words while the tokenizer is called
    # with truncation=True, so a chunk that tokenizes longer than the model's
    # limit is presumably cut short by the tokenizer -- confirm if this matters.
    back_translated_chunks = []
    for chunk in split_text(text, max_length):
        try:
            pivot_text = _marian_translate(chunk, forward_name)
            back_translated_chunks.append(_marian_translate(pivot_text, backward_name))
        except Exception as e:
            # Deliberate best-effort: keep the untranslated chunk on failure.
            print(f"Back translation failed for chunk: {e}")
            back_translated_chunks.append(chunk)

    return ' '.join(back_translated_chunks)

def random_word_order(text):
    """Return *text* with its whitespace-separated words randomly reordered.

    The words are re-joined with single spaces, so the original spacing is
    not preserved.
    """
    tokens = text.split()
    # random.shuffle permutes the list in place and returns None.
    random.shuffle(tokens)
    separator = ' '
    return separator.join(tokens)

def random_deletion(text, p=0.2):
    """Return *text* with each word independently dropped with probability *p*.

    A single-word text is returned unchanged. If every word of a multi-word
    text would be dropped, one randomly chosen original word is returned
    instead, so non-empty input never collapses to an empty string.

    Args:
        text: The string to perturb.
        p: Per-word deletion probability.

    Returns:
        The surviving words joined with single spaces.
    """
    words = text.split()
    if len(words) == 1:
        return text
    kept = [word for word in words if random.random() > p]
    if words and not kept:
        # Everything was deleted: keep one word rather than emit an empty sample.
        return random.choice(words)
    return ' '.join(kept)

def random_insertion(text, additional_words=("cool", "awesome", "great"), p=0.2):
    """Insert random filler words into *text* at random positions.

    ``int(len(words) * p)`` words are inserted, each drawn at random from
    *additional_words* and placed at a random index (the start and end of
    the text included).

    Args:
        text: The string to perturb.
        additional_words: Sequence of candidate words to insert. The default
            is an immutable tuple so it cannot be mutated between calls.
        p: Fraction of the original word count to insert.

    Returns:
        The resulting words joined with single spaces. If *additional_words*
        is empty there is nothing to insert, so the (whitespace-normalized)
        text is returned unchanged instead of raising ``IndexError``.
    """
    words = text.split()
    if len(additional_words) == 0:
        return ' '.join(words)
    # The insertion count is fixed from the original length before any insert.
    for _ in range(int(len(words) * p)):
        index = random.randint(0, len(words))  # len(words) means "append at end"
        words.insert(index, random.choice(additional_words))
    return ' '.join(words)

def augment_text(text):
    """Produce the augmented variants of *text*.

    Returns:
        A four-element list, in this order: the back-translated text, the
        text with shuffled word order, the text after random deletion, and
        the text after random insertion. Each strategy is called with its
        default settings.
    """
    strategies = (
        back_translate,
        random_word_order,
        random_deletion,
        random_insertion,
    )
    return [strategy(text) for strategy in strategies]

def augment_dataset(file_path, output_path):
    """Augment a labeled CSV and write originals plus augmented rows to a new CSV.

    For every row, the ``cleaned_text`` and ``cleaned_body`` columns are each
    passed through ``augment_text``. The resulting variants are paired up
    position by position, and each pair yields one new row that copies all
    other columns of the source row unchanged.

    Args:
        file_path: Path of the input CSV; it must contain the columns
            ``cleaned_text`` and ``cleaned_body``.
        output_path: Path the combined CSV is written to.

    Returns:
        None. The combined data is written to *output_path* and a
        confirmation line is printed.
    """
    df = pd.read_csv(file_path)

    augmented_rows = []
    for _, row in df.iterrows():
        base_row = row.to_dict()
        augmented_posts = augment_text(row['cleaned_text'])
        augmented_comments = augment_text(row['cleaned_body'])

        # Pair the i-th post variant with the i-th comment variant so both
        # sides of a new row come from the same augmentation strategy.
        augmented_rows.extend(
            {**base_row, 'cleaned_text': aug_post, 'cleaned_body': aug_comment}
            for aug_post, aug_comment in zip(augmented_posts, augmented_comments)
        )

    augmented_df = pd.DataFrame(augmented_rows)
    full_df = pd.concat([df, augmented_df], ignore_index=True)

    # Write to the caller-supplied path, not a module-level global.
    full_df.to_csv(output_path, index=False)
    print(f"Augmented dataset saved to {output_path}")

# Script entry: read the labeled sample CSV from the working directory and
# write the original plus augmented rows to a new CSV next to it.
input_csv = r'./labeled_sample_data.csv'
output_csv = "augmented_data2.csv"
# Runs the full augmentation pipeline as soon as this cell/module executes.
augment_dataset(input_csv, output_csv)



Augmented dataset saved to augmented_data2.csv
