In [10]:
!pip install sacremoses
!pip install transformers
!pip install sentencepiece



In [11]:
import torch
import sentencepiece
print("SentencePiece installed successfully!")

from transformers import MarianMTModel, MarianTokenizer
from torch.utils.data import DataLoader, Dataset

SentencePiece installed successfully!


In [12]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from torch.utils.data import Dataset, DataLoader

# File paths for preprocessed corporo
en_file = "en_to_fr.txt"
fr_file = "fr_to_en.txt"

# Function to load and limit dataset
def load_corpora(file_path, limit=100000):
    with open(file_path, "r", encoding="utf-8") as f:
        sentences = f.readlines()
    return sentences[:limit]  # Limit to 20,000 sentences

# Load the corporo
en_sentences = load_corpora(en_file)
fr_sentences = load_corpora(fr_file)

print(f"Loaded {len(en_sentences)} English sentences.")
print(f"Loaded {len(fr_sentences)} French sentences.")


Loaded 100000 English sentences.
Loaded 100000 French sentences.


In [13]:
# Tokenizer and model
en_tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-fr")
fr_tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-fr-en")


In [14]:
class TranslationDataset(Dataset):
    def __init__(self, sentences, tokenizer, max_length=128):
        self.sentences = sentences
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        sentence = self.sentences[idx].strip()
        tokens = self.tokenizer(
            sentence,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        return {
            "input_ids": tokens["input_ids"].squeeze(),
            "attention_mask": tokens["attention_mask"].squeeze(),
        }


# Create datasets
en_dataset = TranslationDataset(en_sentences, en_tokenizer)
fr_dataset = TranslationDataset(fr_sentences, fr_tokenizer)

# DataLoaders for batching
batch_size = 32
en_loader = DataLoader(en_dataset, batch_size=batch_size, shuffle=True)
fr_loader = DataLoader(fr_dataset, batch_size=batch_size, shuffle=True)


In [15]:
from transformers import AdamW, MarianMTModel, MarianTokenizer
import torch


en_fr_model = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-en-fr").to("cuda")
fr_en_model = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-fr-en").to("cuda")

# Optimizer
optimizer = AdamW(list(en_fr_model.parameters()) + list(fr_en_model.parameters()), lr=5e-5)

# Training loop for back-translation
epochs = 3
for epoch in range(epochs):
    en_fr_model.train()
    fr_en_model.train()
    total_loss = 0

    for batch in en_loader:  # assuming `en_loader` is your English to Hindi data
        # Forward translation (en -> hi)
        input_ids = batch["input_ids"].to("cuda")
        attention_mask = batch["attention_mask"].to("cuda")

        # Forward pass: Translate English -> Hindi
        outputs = en_fr_model(input_ids=input_ids, attention_mask=attention_mask, labels=input_ids)
        loss = outputs.loss

        # Backward pass
        loss.backward()

        # Backtranslation: Translate Hindi -> English
        translated = en_tokenizer.batch_decode(outputs.logits.argmax(dim=-1), skip_special_tokens=True)
        translated_ids = fr_tokenizer(translated, return_tensors="pt", padding=True, truncation=True).input_ids.to("cuda")

        # Reverse translation (hi -> en)
        reverse_outputs = fr_en_model(input_ids=translated_ids, labels=input_ids)
        reverse_loss = reverse_outputs.loss

        reverse_loss.backward()

        # Step optimizer after both forward and reverse loss
        optimizer.step()
        optimizer.zero_grad()

        total_loss += loss.item() + reverse_loss.item()

    print(f"Epoch {epoch + 1}, Loss: {total_loss / len(en_loader)}")


We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Epoch 1, Loss: 0.07448127898396924
Epoch 2, Loss: 0.012743310326612554
Epoch 3, Loss: 0.008812377262441442


In [77]:
en_model.save_pretrained("en_to_fr_model_checkpoint")
fr_tokenizer.save_pretrained("en_to_fr_model_checkpoint")
print("French model saved to 'en_to_fr_model_checkpoint'.")

fr_en_model.save_pretrained("fr_to_en_model_ckpt")
en_tokenizer.save_pretrained("fr_to_en_model_ckpt")
print("Eng model saved")

French model saved to 'en_to_fr_model_checkpoint'.
Eng model saved


In [83]:
from transformers import MarianMTModel, MarianTokenizer

# Load the saved model and tokenizer
model_checkpoint_path = "en_to_fr_model_checkpoint"
saved_model = MarianMTModel.from_pretrained(model_checkpoint_path).to("cuda")
saved_tokenizer = MarianTokenizer.from_pretrained("fr_to_en_model_ckpt")

# Test the model on a sentence
def translate_sentence(sentence, model, tokenizer, max_length=128):
    model.eval()
    inputs = tokenizer(
        sentence,
        return_tensors="pt",
        max_length=max_length,
        padding="max_length",
        truncation=True
    )
    input_ids = inputs["input_ids"].to("cuda")
    attention_mask = inputs["attention_mask"].to("cuda")

    with torch.no_grad():
        outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=max_length)
    
    translated_sentence = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return translated_sentence

# Example sentence for testing
test_sentence = " India is a beautiful country"
translated_sentence = translate_sentence(test_sentence, saved_model, saved_tokenizer)
print(f"Translated Sentence: {translated_sentence}")


Translated Sentence: L'Inde est un beau pays
