In [1]:
import torch
from transformers import MarianMTModel, MarianTokenizer
from torch.utils.data import DataLoader, Dataset

In [2]:
!pip install sacremoses



In [3]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from torch.utils.data import Dataset, DataLoader

# File paths for preprocessed corpora
hi_file = "hi_to_en.txt"
en_file = "en_to_hi.txt"

# Function to load and limit dataset
def load_corpora(file_path, limit=1000):
    with open(file_path, "r", encoding="utf-8") as f:
        sentences = f.readlines()
    return sentences[:limit]  # Limit to 20,000 sentences

# Load the corpora
hi_sentences = load_corpora(hi_file)
en_sentences = load_corpora(en_file)

print(f"Loaded {len(hi_sentences)} Hindi sentences.")
print(f"Loaded {len(en_sentences)} English sentences.")



Loaded 1000 Hindi sentences.
Loaded 1000 English sentences.


In [4]:
# Tokenizer and model
hi_tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-hi")
en_tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-hi-en")

hi_model = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-en-hi").to("cpu")
en_model = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-hi-en").to("cpu")


In [5]:
import torch
import transformers
import sentencepiece
from transformers import MarianMTModel, MarianTokenizer
from torch.utils.data import DataLoader, Dataset

In [6]:
!pip install transformers
!pip install sacremoses
!pip install sentencepiece



In [7]:
class TranslationDataset(Dataset):
    def __init__(self, sentences, tokenizer, max_length=128):
        self.sentences = sentences
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        sentence = self.sentences[idx].strip()
        tokens = self.tokenizer(
            sentence,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        return {
            "input_ids": tokens["input_ids"].squeeze(),
            "attention_mask": tokens["attention_mask"].squeeze(),
        }


# Create datasets
hi_dataset = TranslationDataset(hi_sentences, hi_tokenizer)
en_dataset = TranslationDataset(en_sentences, en_tokenizer)


# DataLoaders for batching
batch_size = 32
hi_loader = DataLoader(hi_dataset, batch_size=batch_size, shuffle=True)
en_loader = DataLoader(en_dataset, batch_size=batch_size, shuffle=True)



In [8]:
from transformers import AdamW

# Load the pre-trained model
model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-hi").to("cuda")

# Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Training loop for back-translation
epochs = 3
for epoch in range(epochs):
    model.train()
    total_loss = 0

    for batch in en_loader:  # You can also use `en_loader` or `fr_loader` in separate loops
        input_ids = batch["input_ids"].to("cuda")
        attention_mask = batch["attention_mask"].to("cuda")

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=input_ids)
        loss = outputs.loss

        # Backward pass
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        total_loss += loss.item()

    print(f"Epoch {epoch + 1}, Loss: {total_loss / len(en_loader)}")




Epoch 1, Loss: 9.193708196282387
Epoch 2, Loss: 5.25044421851635
Epoch 3, Loss: 1.9785079900175333


In [9]:
model.save_pretrained("en_to_hi_model")
hi_tokenizer.save_pretrained("en_to_hi_tokenizer")
print("Hindi model saved to 'en_to_hi_model'.")



Hindi model saved to 'en_to_hi_model'.


In [10]:
def translate_sentence(sentence, model, tokenizer, max_length=128):
    model.eval()
    inputs = tokenizer(
        sentence,
        return_tensors="pt",
        max_length=max_length,
        padding="max_length",
        truncation=True
    )
    input_ids = inputs["input_ids"].to("cuda")
    attention_mask = inputs["attention_mask"].to("cuda")

    with torch.no_grad():
        outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=max_length)

    translated_sentence = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return translated_sentence

# Example sentence for testing
test_sentence = "my name is devang ?"
translated_sentence = translate_sentence(test_sentence, model, hi_tokenizer)
print(f"Translated Sentence: {translated_sentence}")


Translated Sentence: मेरा नाम is devg


In [15]:
from transformers import MarianMTModel, MarianTokenizer

# Load the saved model and tokenizer
model_checkpoint_path = "en_to_hi_model"
saved_model = MarianMTModel.from_pretrained(model_checkpoint_path).to("cpu")
saved_tokenizer = MarianTokenizer.from_pretrained("en_to_hi_tokenizer")

# Test the model on a sentence
def translate_sentence(sentence, model, tokenizer, max_length=128):
    model.eval()
    inputs = tokenizer(
        sentence,
        return_tensors="pt",
        max_length=max_length,
        padding="max_length",
        truncation=True
    )
    input_ids = inputs["input_ids"].to("cpu")
    attention_mask = inputs["attention_mask"].to("cpu")

    with torch.no_grad():
        outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=max_length)
    
    translated_sentence = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return translated_sentence

# Example sentence for testing
test_sentence = " We played football. She often reads books. They walked to the park."
translated_sentence = translate_sentence(test_sentence, saved_model, saved_tokenizer)
print(f"Translated Sentence: {translated_sentence}")


Translated Sentence: हम cities फुटबॉल. वह अक्सर लिखा किताबें. वे पार्क में गए.
