In [None]:
import transformer
import torch

In [None]:
from transformers import MarianMTModel, MarianTokenizer, TrainingArguments, Trainer
from torch.utils.data import DataLoader, Dataset

# Pre-trained Marian checkpoint used as the starting point; the tokenizer and
# the model weights are both loaded from this same checkpoint name.
model_name = "Helsinki-NLP/opus-mt-ur-en"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# Load sentences from files
def load_sentences(file_path):
    """Read a UTF-8 text file and return its lines as a list of sentences.

    Lines are split on newline characters only. Other characters that
    ``str.splitlines`` would treat as boundaries (U+2028, U+2029, U+0085,
    vertical tab, form feed, ...) stay inside the sentence, so a stray one in
    the text cannot shift every following line and misalign a parallel corpus.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list with one string per line, in file order, without the trailing
        newline. Blank lines are kept as empty strings; an empty file yields
        an empty list.
    """
    with open(file_path, "r", encoding="utf-8") as corpus_file:
        # Text mode already translates "\r\n" and "\r" into "\n", so only "\n"
        # has to be stripped from the end of each line.
        return [line.rstrip("\n") for line in corpus_file]

urdu_sentences = load_sentences("/content/urdu-corpus.txt")
english_sentences = load_sentences("/content/english-corpus.txt")

# The two files form a parallel corpus: line i of one is paired with line i of
# the other further down. Fail loudly here rather than train on mis-paired data.
if len(urdu_sentences) != len(english_sentences):
    raise ValueError(
        "Parallel corpus is misaligned: "
        f"{len(urdu_sentences)} Urdu lines vs {len(english_sentences)} English lines"
    )


In [None]:
# Tokenize and prepare data
# Urdu is the source side, so it goes through the tokenizer's default path.
source_inputs = tokenizer(urdu_sentences, padding=True, truncation=True, return_tensors="pt")

# English is the target side: `text_target=` makes the Marian tokenizer encode
# it with its target-language settings instead of treating it as source text.
target_inputs = tokenizer(text_target=english_sentences, padding=True, truncation=True, return_tensors="pt")

# These ids are used as training labels. Replace padding with -100 so the
# cross-entropy loss ignores padded positions instead of learning to emit them.
target_inputs["input_ids"] = target_inputs["input_ids"].masked_fill(
    target_inputs["input_ids"] == tokenizer.pad_token_id, -100
)

In [None]:
class TranslationDataset(Dataset):
    """Pairs encoded source sentences with encoded target sentences.

    Both constructor arguments are mappings that are indexed first by field
    name and then by example position. Each example is returned as a dict
    with the keys ``input_ids``, ``attention_mask`` and ``labels``.
    """

    def __init__(self, source_inputs, target_inputs):
        """Keep references to the source and target encodings (no copy is made)."""
        self.source_inputs, self.target_inputs = source_inputs, target_inputs

    def __len__(self):
        """Number of examples, taken from the source ``input_ids``."""
        encoded_sources = self.source_inputs["input_ids"]
        return len(encoded_sources)

    def __getitem__(self, idx):
        """Return the example at position ``idx`` as a dict."""
        example = dict(
            input_ids=self.source_inputs["input_ids"][idx],
            attention_mask=self.source_inputs["attention_mask"][idx],
        )
        # The target token ids are what the model is trained to produce.
        example["labels"] = self.target_inputs["input_ids"][idx]
        return example

In [None]:
# Mini-batch size; the same value is also passed to the training arguments.
batch_size = 16

# Wrap the tokenized corpus as a dataset and expose a shuffled batch iterator over it.
train_dataset = TranslationDataset(source_inputs=source_inputs, target_inputs=target_inputs)
train_dataloader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=batch_size)

In [None]:
# Fine-tuning configuration
training_args = TrainingArguments(
    # Checkpoint output and retention.
    output_dir="./output",
    overwrite_output_dir=True,
    save_steps=1000,
    save_total_limit=2,
    # Training schedule.
    num_train_epochs=5,
    per_device_train_batch_size=batch_size,
    # Logging.
    logging_dir="./logs",
)

In [None]:
# Build the trainer from the model, the training arguments and the training
# dataset defined in the cells above, then run fine-tuning.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)
trainer.train()

In [None]:
# Urdu sentence to translate with the model in the cells below.
input_urdu_sentence = "زین تمہارا بھتیجا ہے۔"

In [None]:
# Put the model in evaluation mode: training leaves dropout enabled, which
# would randomly perturb the generated translation.
model.eval()

# Tokenize the input and place it on the same device as the model (CPU or GPU)
input_ids = tokenizer.encode(input_urdu_sentence, padding=True, truncation=True, return_tensors="pt").to(model.device)

with torch.no_grad():
    # Generate the translation; no gradients are needed for inference
    outputs = model.generate(input_ids)

In [None]:
# Decode the first generated sequence back to text, dropping special tokens,
# and print it. No explicit device transfer is performed here.
decoded_translation = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(decoded_translation)
