In [None]:
from transformers import MarianMTModel, MarianTokenizer
import pandas as pd

In [None]:
# Checkpoint for the forward pass of the back-translation round trip.
# The "id-en" suffix suggests Indonesian -> English -- TODO confirm against the model card.
first_model_name = 'Helsinki-NLP/opus-mt-id-en'

# Tokenizer that belongs to the checkpoint named above.
first_model_tkn = MarianTokenizer.from_pretrained(first_model_name)

# Pretrained Marian model for the same checkpoint; later cells always pass
# first_model together with first_model_tkn to perform_translation.
first_model = MarianMTModel.from_pretrained(first_model_name)

In [None]:
# Checkpoint for the return pass of the back-translation round trip.
# The "en-id" suffix suggests English -> Indonesian -- TODO confirm against the model card.
second_model_name = 'Helsinki-NLP/opus-mt-en-id'

# Tokenizer that belongs to the checkpoint named above.
second_model_tkn = MarianTokenizer.from_pretrained(second_model_name)

# Pretrained Marian model for the same checkpoint; later cells always pass
# second_model together with second_model_tkn to perform_translation.
second_model = MarianMTModel.from_pretrained(second_model_name)

In [None]:
# Corpus to augment, read from a path relative to the working directory.
# Later cells read and overwrite its 'text' column, so that column must exist.
dataset = pd.read_csv("cleaned_dataset.csv")

In [None]:
# Small hand-written batch of sample texts used to smoke-test the formatting
# and translation helpers before running them on the full dataset.
original_texts = [
    "kerjakan fokus selesaikan persatu kerjakan",
    "menuliskan kerjakan menuliskannya catatan smartphone kerjakan",
    "gangguan medis gangguan hormon tiroid riwayat cidera kepala",
    "keterbatasan informasi evaluasi penyebab kondisi"
]

In [None]:
def format_batch_texts(language_code, batch_texts):
    """Prefix every text in a batch with a ``>>language_code<<`` tag.

    Args:
        language_code: Code inserted between the ``>>`` and ``<<`` markers.
        batch_texts: Iterable of texts to tag.

    Returns:
        A new list holding one ``">>code<< text"`` string per input item,
        in the same order as the input.

    NOTE(review): this tag style is presumably only meaningful to Marian
    checkpoints with several target languages -- confirm it helps (or at
    least does not hurt) the single-pair checkpoints used in this notebook.
    """
    template = ">>{}<< {}"

    tagged_texts = []
    for text in batch_texts:
        tagged_texts.append(template.format(language_code, text))

    return tagged_texts

In [None]:
# Sanity check of format_batch_texts: every sample text should come back
# prefixed with the ">>id<<" tag.
formatted = format_batch_texts("id", original_texts)
print(formatted)

In [None]:
def perform_translation(batch_texts, model, tokenizer, language, batch_size=32):
    """Translate texts with a Marian model/tokenizer pair.

    The texts are tagged with ``format_batch_texts`` and then translated in
    chunks of ``batch_size``, so a large input (for example a whole DataFrame
    column) is never tokenized and generated as one huge padded tensor.

    Args:
        batch_texts: Iterable of texts to translate.
        model: Model object exposing ``generate(**encoded_inputs)``.
        tokenizer: Tokenizer matching ``model``; it is called on a list of
            strings and used to decode the generated token ids.
        language: Language code handed to ``format_batch_texts`` as the tag.
        batch_size: Maximum number of texts per ``generate`` call.

    Returns:
        List of translated strings, in input order. An empty input yields
        an empty list.

    Raises:
        ValueError: If ``batch_size`` is not a positive number.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Prepare the text data into appropriate format for the model
    formatted_batch_texts = format_batch_texts(language, batch_texts)

    texts = []
    for start in range(0, len(formatted_batch_texts), batch_size):
        chunk = formatted_batch_texts[start:start + batch_size]

        # truncation=True clips over-long inputs to the tokenizer's maximum
        # length instead of letting them fail inside the model.
        encoded = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True)

        # Generate translation using model
        translated = model.generate(**encoded)

        # Convert the generated token indices back into text
        texts.extend(tokenizer.batch_decode(translated, skip_special_tokens=True))

    return texts

# Check the forward pass on the sample batch: run it through first_model
# (the 'id-en' checkpoint, presumably Indonesian -> English) with 'en' as the language tag.
translated_texts = perform_translation(original_texts, first_model, first_model_tkn, 'en')
print(translated_texts)

In [None]:
# Check the return pass: feed the forward translations through second_model
# (the 'en-id' checkpoint) with 'id' as the language tag, completing the round trip.
back_translated_texts = perform_translation(translated_texts, second_model, second_model_tkn, 'id')
print(back_translated_texts)

In [None]:
def remove_dup(back_translated_batch):
    """Combine the dataset with its back-translated texts, without duplicates.

    A copy of the module-level ``dataset`` gets its ``text`` column replaced
    by ``back_translated_batch``; those rows are appended to the original
    rows and any row whose ``text`` already appeared earlier is dropped, so
    back-translations identical to an existing text add nothing.

    Args:
        back_translated_batch: Back-translated texts, one per row of
            ``dataset`` (assigned to the ``text`` column of the copy).

    Returns:
        A new DataFrame with the original rows followed by the augmented
        rows that carry a not-yet-seen ``text``. ``dataset`` itself is left
        unmodified.
    """
    augmented_rows = dataset.copy()
    augmented_rows['text'] = back_translated_batch

    # Originals come first so that keep='first' always prefers the original
    # row over a back-translated row with the same text.
    combined = pd.concat([dataset, augmented_rows], ignore_index=True)
    return combined.drop_duplicates(subset='text', keep='first').reset_index(drop=True)

In [None]:
def perform_back_translation_with_augmentation(batch_texts, original_language="id", temporary_language="en"):
    """Round-trip texts through a temporary language and back.

    The forward pass always uses ``first_model``/``first_model_tkn`` and the
    return pass always uses ``second_model``/``second_model_tkn``; the two
    language arguments only select the tag handed to ``perform_translation``.

    Args:
        batch_texts: Texts in the original language.
        original_language: Language code used for the return pass.
        temporary_language: Language code used for the forward pass.

    Returns:
        The texts after translation to the temporary language and back to
        the original language, as returned by ``perform_translation``.
    """
    # Forward pass: original language -> temporary language
    pivot_texts = perform_translation(batch_texts, first_model, first_model_tkn, temporary_language)

    # Return pass: temporary language -> original language
    return perform_translation(pivot_texts, second_model, second_model_tkn, original_language)

In [None]:
# Run the full round trip over every row of dataset['text'], using the
# default language codes ("id" original, "en" temporary). This may be slow
# on a large dataset, and the print below emits every augmented text.
final_augmented = perform_back_translation_with_augmentation(dataset['text'])
print(final_augmented)

In [None]:
# Display the augmented texts. `back_translated_batch` only exists as a local
# name inside the helper functions, so it is undefined at notebook scope on a
# fresh kernel; the value returned by the augmentation run is `final_augmented`.
final_augmented