In [5]:
import pandas as pd
# Read the training set from disk; later cells use its 'sentence' column.
training_data_df = pd.read_csv(filepath_or_buffer='training_data.csv')

In [16]:
from transformers import MarianMTModel, MarianTokenizer
from tqdm import tqdm

# French -> English direction: tokenizer first, then the model, both from the same checkpoint
tokenizer_fr_to_en, model_fr_to_en = (
    MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-fr-en'),
    MarianMTModel.from_pretrained('Helsinki-NLP/opus-mt-fr-en'),
)

# English -> French direction, used to map the intermediate English back to French
tokenizer_en_to_fr, model_en_to_fr = (
    MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-en-fr'),
    MarianMTModel.from_pretrained('Helsinki-NLP/opus-mt-en-fr'),
)

# Define a function for translation
def translate(text, model, tokenizer):
    """Translate text with a sequence-to-sequence model and its tokenizer.

    Args:
        text: A single string, or an iterable of strings that is translated
            as one padded batch.
        model: Object exposing ``generate(**inputs)``.
        tokenizer: Callable tokenizer paired with ``model``; must provide
            ``decode`` and, for batched input, ``batch_decode``.

    Returns:
        The translated string when ``text`` is a string; otherwise a list of
        translated strings in input order (an empty list for empty input).
    """
    is_single = isinstance(text, str)
    if not is_single:
        # Materialise the iterable so it can be length-checked and tokenized as a batch.
        text = list(text)
        if not text:
            return []

    # truncation=True asks the tokenizer to cap each input at its configured
    # maximum length instead of passing an over-long sentence to the model.
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    outputs = model.generate(**inputs)

    if is_single:
        return tokenizer.decode(outputs[0], skip_special_tokens=True)
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)

# Round-trip every training sentence (French -> English -> French) to obtain a paraphrase
back_translated_sentences = [
    translate(
        translate(french_sentence, model_fr_to_en, tokenizer_fr_to_en),
        model_en_to_fr,
        tokenizer_en_to_fr,
    )
    for french_sentence in tqdm(training_data_df['sentence'])
]

# Same rows as the original data, with the 'sentence' column replaced by its back-translation
back_translated_df = training_data_df.assign(sentence=back_translated_sentences)

# Stack the original rows on top of the back-translated ones and renumber the index
augmented_data_df = pd.concat((training_data_df, back_translated_df), ignore_index=True)

back_translated_df['sentence'].head()  # Preview the first few back-translated sentences


  0%|          | 11/4800 [00:44<4:19:49,  3.26s/it]

In [None]:
#augmented_data_df.to_csv('augmented_training_data.csv', index=False)