In [1]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from torch.utils.data import Dataset, DataLoader

# File paths for preprocessed corpora
hi_file = "hi_to_en.txt"
en_file = "en_to_hi.txt"

# Function to load and limit dataset
def load_corpora(file_path, limit=100000):
    with open(file_path, "r", encoding="utf-8") as f:
        sentences = f.readlines()
    return sentences[:limit]  # Limit to 20,000 sentences

# Load the corpora
hi_sentences = load_corpora(hi_file)
en_sentences = load_corpora(en_file)

print(f"Loaded {len(hi_sentences)} Hindi sentences.")
print(f"Loaded {len(en_sentences)} English sentences.")


Loaded 100000 Hindi sentences.
Loaded 100000 English sentences.


In [2]:
import torch
import transformers
import sentencepiece
from transformers import MarianMTModel, MarianTokenizer
from torch.utils.data import DataLoader, Dataset

In [3]:
!pip install transformers
!pip install sacremoses
!pip install sentencepiece





In [4]:
# Tokenizer and model
hi_tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-hi")
en_tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-hi-en")

hi_model = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-en-hi").to("cpu")
en_model = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-hi-en").to("cpu")


tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/812k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/1.07M [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/1.06M [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/813k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.06M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/306M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/304M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

In [5]:
class TranslationDataset(Dataset):
    def __init__(self, sentences, tokenizer, max_length=128):
        self.sentences = sentences
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        sentence = self.sentences[idx].strip()
        tokens = self.tokenizer(
            sentence,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        return {
            "input_ids": tokens["input_ids"].squeeze(),
            "attention_mask": tokens["attention_mask"].squeeze(),
        }


# Create datasets
hi_dataset = TranslationDataset(hi_sentences, hi_tokenizer)
en_dataset = TranslationDataset(en_sentences, en_tokenizer)

# DataLoaders for batching
batch_size = 32
hi_loader = DataLoader(hi_dataset, batch_size=batch_size, shuffle=True)
en_loader = DataLoader(en_dataset, batch_size=batch_size, shuffle=True)


In [6]:
import torch

# Detect device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


Using device: cuda


In [7]:
from transformers import AdamW, MarianMTModel, MarianTokenizer
import torch

# Load the pre-trained models and tokenizers
en_hi_tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-hi")
hi_en_tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-hi-en")

en_hi_model = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-en-hi").to("cuda")
hi_en_model = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-hi-en").to("cuda")

# Optimizer
optimizer = AdamW(list(en_hi_model.parameters()) + list(hi_en_model.parameters()), lr=5e-5)

# Training loop for back-translation
epochs = 3
for epoch in range(epochs):
    en_hi_model.train()
    hi_en_model.train()
    total_loss = 0

    for batch in en_loader:  # assuming `en_loader` is your English to Hindi data
        # Forward translation (en -> hi)
        input_ids = batch["input_ids"].to("cuda")
        attention_mask = batch["attention_mask"].to("cuda")

        # Forward pass: Translate English -> Hindi
        outputs = en_hi_model(input_ids=input_ids, attention_mask=attention_mask, labels=input_ids)
        loss = outputs.loss

        # Backward pass
        loss.backward()

        # Backtranslation: Translate Hindi -> English
        translated = en_hi_tokenizer.batch_decode(outputs.logits.argmax(dim=-1), skip_special_tokens=True)
        translated_ids = hi_en_tokenizer(translated, return_tensors="pt", padding=True, truncation=True).input_ids.to("cuda")

        # Reverse translation (hi -> en)
        reverse_outputs = hi_en_model(input_ids=translated_ids, labels=input_ids)
        reverse_loss = reverse_outputs.loss

        reverse_loss.backward()

        # Step optimizer after both forward and reverse loss
        optimizer.step()
        optimizer.zero_grad()

        total_loss += loss.item() + reverse_loss.item()

    print(f"Epoch {epoch + 1}, Loss: {total_loss / len(en_loader)}")


We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Epoch 1, Loss: 0.7029160913792253
Epoch 2, Loss: 0.08847910881009884
Epoch 3, Loss: 0.041948496411945674


In [8]:
# Save the forward (English to Hindi) model and tokenizer
en_hi_model.save_pretrained("hi_model_checkpoint/en_hi_model")
en_hi_tokenizer.save_pretrained("hi_model_checkpoint/en_hi_tokenizer")
print("English to Hindi model and tokenizer saved to 'hi_model_checkpoint/en_hi_model'.")

# Save the reverse (Hindi to English) model and tokenizer
hi_en_model.save_pretrained("hi_model_checkpoint/hi_en_model")
hi_en_tokenizer.save_pretrained("hi_model_checkpoint/hi_en_tokenizer")
print("Hindi to English model and tokenizer saved to 'hi_model_checkpoint/hi_en_model'.")




English to Hindi model and tokenizer saved to 'hi_model_checkpoint/en_hi_model'.




Hindi to English model and tokenizer saved to 'hi_model_checkpoint/hi_en_model'.


In [23]:
def translate_sentence(sentence, model, tokenizer, max_length=128):
    model.eval()
    inputs = tokenizer(
        sentence,
        return_tensors="pt",
        max_length=max_length,
        padding="max_length",
        truncation=True
    )
    input_ids = inputs["input_ids"].to("cuda")
    attention_mask = inputs["attention_mask"].to("cuda")

    with torch.no_grad():
        outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=max_length)

    translated_sentence = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return translated_sentence

# Example sentence for testing
test_sentence = "आप महान हैं"
translated_sentence = translate_sentence(test_sentence, hi_en_model, en_tokenizer)
print(f"Translated Sentence: {translated_sentence}")


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [32]:
from transformers import MarianMTModel, MarianTokenizer

# Load the saved model and tokenizer
model_checkpoint_path = "hi_model_checkpoint/hi_en_model"
saved_model = MarianMTModel.from_pretrained(model_checkpoint_path).to("cpu")
saved_tokenizer = MarianTokenizer.from_pretrained("hi_model_checkpoint/hi_en_tokenizer")

# Test the model on a sentence
def translate_sentence(sentence, model, tokenizer, max_length=128):
    model.eval()
    inputs = tokenizer(
        sentence,
        return_tensors="pt",
        max_length=max_length,
        padding="max_length",
        truncation=True
    )
    input_ids = inputs["input_ids"].to("cpu")
    attention_mask = inputs["attention_mask"].to("cpu")

    with torch.no_grad():
        outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=max_length)
    
    translated_sentence = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return translated_sentence

# Example sentence for testing
test_sentence = " आप अच्छी लग रही हो"
translated_sentence = translate_sentence(test_sentence, saved_model, saved_tokenizer)
print(f"Translated Sentence: {translated_sentence}")


Translated Sentence: you might look good.
