In [None]:
!pip install transformers datasets sentencepiece --quiet



In [None]:
from datasets import load_dataset

# Pull the first 5% of the opus100 English-Urdu training split.
dataset = load_dataset(path="opus100", name="en-ur", split="train[:5%]")

# Report the subset size and show one record as a quick sanity check.
first_example = dataset[0]
print(f"Number of examples: {len(dataset)}")
print("Sample:", first_example)



Number of examples: 37696
Sample: {'translation': {'en': 'Yet, remember, as We communed with Moses for forty nights you took the calf in his absence (and worshipped it), and you did wrong.', 'ur': 'اورجب ہم نے موسیٰ سے چالیس رات کا وعدہ کیا پھر اس کے بعد تم نے بچھڑا بنا لیا حالانکہ تم ظالم تھے'}}


In [None]:
from transformers import MarianTokenizer, MarianMTModel

# Pretrained English -> Urdu Marian checkpoint; the tokenizer and the model
# are both loaded from the same checkpoint name.
model_name = "Helsinki-NLP/opus-mt-en-ur"

tokenizer, model = (
    loader.from_pretrained(model_name)
    for loader in (MarianTokenizer, MarianMTModel)
)


tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/816k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/848k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/306M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of English/Urdu pairs for seq2seq training.

    Args:
        examples: Batch mapping whose ``"translation"`` entry is a list of
            dicts with ``"en"`` (source) and ``"ur"`` (target) strings.

    Returns:
        The tokenizer output for the English texts, with a ``"labels"`` entry
        holding the tokenized Urdu texts. Both sides are truncated/padded to
        128 tokens, and padding positions in ``labels`` are set to -100.

    Note:
        Reads the module-level ``tokenizer``.
    """
    pairs = examples["translation"]
    en_texts = [pair["en"] for pair in pairs]
    ur_texts = [pair["ur"] for pair in pairs]

    # `text_target=` replaces the deprecated `as_target_tokenizer()` context
    # manager: one call tokenizes the sources and stores the target ids
    # under "labels".
    model_inputs = tokenizer(
        en_texts,
        text_target=ur_texts,
        max_length=128,
        truncation=True,
        padding="max_length",
    )

    # Because labels are padded to a fixed length, mask the padding with -100
    # (the index the cross-entropy loss ignores) so padding does not
    # contribute to the training loss.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [-100 if token_id == pad_id else token_id for token_id in label_ids]
        for label_ids in model_inputs["labels"]
    ]
    return model_inputs



In [None]:
# Tokenize in batches and drop the raw "translation" column, so only the
# columns returned by preprocess_function remain.
map_options = {"batched": True, "remove_columns": ["translation"]}
tokenized_dataset = dataset.map(preprocess_function, **map_options)



Map:   0%|          | 0/37696 [00:00<?, ? examples/s]



In [None]:
import transformers

# Show which transformers release is installed in this kernel.
transformers_version = transformers.__version__
print(transformers_version)


4.51.3


In [None]:
from transformers import TrainingArguments

# Hyperparameters and bookkeeping options for fine-tuning, gathered in one
# mapping so they are easy to tweak before the arguments object is built.
training_config = dict(
    output_dir="./en-ur-translation-model",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    logging_dir='./logs',
    logging_steps=10,
    load_best_model_at_end=False,  # disable for old versions
)

training_args = TrainingArguments(**training_config)


In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer

# NOTE: this cell is a generic tokenization example (IMDb + BERT) that is
# unrelated to the English-Urdu pipeline. It uses its own variable names so it
# does not overwrite the `dataset`, `tokenizer` and `tokenized_dataset`
# objects created by the earlier cells (preprocess_function reads the global
# `tokenizer`, so rebinding it here would silently change its behavior).

# 1. Load the example dataset (IMDb reviews)
imdb_dataset = load_dataset("imdb")  # or your own dataset

# 2. Load the example tokenizer
bert_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # replace with your model tokenizer

# 3. Tokenize function for dataset
def tokenize_function(examples):
    """Tokenize the ``'text'`` field of a batch with ``bert_tokenizer``.

    Args:
        examples: Batch mapping with a ``'text'`` entry.

    Returns:
        The tokenizer output, truncated and padded with ``padding="max_length"``.
    """
    return bert_tokenizer(examples['text'], padding="max_length", truncation=True)

# 4. Apply tokenizer to dataset
imdb_tokenized_dataset = imdb_dataset.map(tokenize_function, batched=True)

# imdb_tokenized_dataset now holds the tokenized IMDb example data


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

In [None]:
def greedy_decode(model, src_sentence, src_tokenizer, tgt_tokenizer, max_len=50):
    """Greedily decode a translation of ``src_sentence`` one token at a time.

    The model is driven through the attributes this body calls:
    ``src_embedding``, ``tgt_embedding``, ``pos_encoder``, ``encoder``,
    ``decoder`` and ``fc_out``. Token ids are laid out sequence-first, i.e.
    ``(seq_len, 1)``.

    Args:
        model: Object exposing the attributes listed above plus ``eval()``.
        src_sentence: Source text handed to ``src_tokenizer.encode``.
        src_tokenizer: Tokenizer whose ``encode`` result can be passed to
            ``torch.tensor`` (presumably a list of int ids; confirm).
        tgt_tokenizer: Tokenizer providing ``pad_id`` and ``decode``.
        max_len: Maximum number of decoded positions, start token included.

    Returns:
        The result of ``tgt_tokenizer.decode`` on the generated ids. The ids
        include the initial ``pad_id`` start token and, when decoding stopped
        early, the terminating ``pad_id``.

    Note:
        ``torch``, ``math``, ``device``, ``D_MODEL`` and
        ``generate_square_subsequent_mask`` are module-level names resolved
        when the function is called.
    """
    model.eval()

    # Inference only: disable autograd so no graph is built while decoding.
    # A context manager is used instead of a decorator because `torch` only
    # has to be in scope when the function is called, not when it is defined.
    with torch.no_grad():
        src = torch.tensor(src_tokenizer.encode(src_sentence)).unsqueeze(1).to(device)  # (seq_len, 1)
        # All-False source mask over source positions.
        src_mask = torch.zeros(src.size(0), src.size(0), device=device).type(torch.bool)

        memory = model.encoder(model.pos_encoder(model.src_embedding(src) * math.sqrt(D_MODEL)), src_mask)

        # pad_id doubles as the start-of-sequence token.
        ys = torch.ones(1, 1).fill_(tgt_tokenizer.pad_id).type(torch.long).to(device)

        for _ in range(max_len - 1):
            tgt_mask = generate_square_subsequent_mask(ys.size(0)).to(device)
            out = model.decoder(model.pos_encoder(model.tgt_embedding(ys) * math.sqrt(D_MODEL)), memory, tgt_mask)
            out = model.fc_out(out)
            # Only the distribution at the last decoded position matters.
            prob = out[-1, 0].softmax(dim=-1)
            next_word = torch.argmax(prob).item()
            ys = torch.cat([ys, torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0)
            if next_word == tgt_tokenizer.pad_id:  # pad_id also acts as the stop token
                break

    translated_tokens = ys.flatten().cpu().numpy()
    return tgt_tokenizer.decode(translated_tokens)

# Example usage:
# translated_text = greedy_decode(model, "How are you?", src_tokenizer, tgt_tokenizer)
# print("Translated:", translated_text)


In [None]:
import torch
import math
from transformers import MarianTokenizer, MarianMTModel

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Embedding scale constant read by the hand-rolled Transformer decoding loop.
# Assumed to equal the model's hidden size (512 is the usual value for
# Helsinki-NLP/opus-mt checkpoints; confirm against the model config).
D_MODEL = 512

# Causal (look-ahead) mask builder for a standard Transformer decoder.
# It is called by the hand-rolled greedy_decode loop from the earlier cell,
# which drives a plain encoder/decoder model step by step.
# MarianMTModel has a different internal structure, so the generate()-based
# greedy_decode defined below does not call this helper.
# This is the common implementation found in Transformer examples.
def generate_square_subsequent_mask(sz):
    """Build an additive ``(sz, sz)`` float mask that hides future positions.

    Entries strictly above the main diagonal are ``-inf``; entries on and
    below the diagonal are ``0.0``.
    """
    # True exactly where column > row, i.e. at the positions to block.
    blocked = torch.triu(torch.ones(sz, sz), diagonal=1) == 1
    # Start from 0.0 at allowed positions, then overwrite blocked ones.
    return blocked.float().masked_fill(blocked, float('-inf'))

def greedy_decode(model, src_sentence, src_tokenizer, tgt_tokenizer, max_len=50):
    """Translate ``src_sentence`` by delegating to ``model.generate``.

    Despite the name, decoding is a 4-beam search with early stopping rather
    than a step-by-step greedy loop.

    Args:
        model: Model exposing ``eval()`` and ``generate()``; expected to live
            on the module-level ``device``.
        src_sentence: Text to translate.
        src_tokenizer: Tokenizer whose ``encode(..., return_tensors="pt")``
            yields the input id tensor.
        tgt_tokenizer: Tokenizer used to decode the generated ids.
        max_len: Forwarded to ``generate`` as ``max_length``.

    Returns:
        The first generated sequence decoded with special tokens skipped.
    """
    model.eval()

    # Encode the source sentence and move the ids to the model's device.
    input_ids = src_tokenizer.encode(src_sentence, return_tensors="pt").to(device)

    generated = model.generate(input_ids, max_length=max_len, num_beams=4, early_stopping=True)

    # Keep the first returned sequence and hand it to the tokenizer as a
    # NumPy array of token ids.
    first_sequence = generated[0].cpu().numpy()
    return tgt_tokenizer.decode(first_sequence, skip_special_tokens=True)

# Load the English -> Urdu checkpoint again and place the model on `device`.
model_name = "Helsinki-NLP/opus-mt-en-ur"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)
model = model.to(device)

# The same tokenizer object handles both the source and the target side.
src_tokenizer = tgt_tokenizer = tokenizer

# English sentences used as a quick smoke test of the translation model.
test_sentences = [
    "Hello, how are you?",
    "What is your name?",
    "I love learning new languages.",
    "The weather is nice today.",
    "Can you help me translate this?",
]

# Translate each sentence and print it next to its Urdu output, followed by
# a 40-dash separator line.
for sentence in test_sentences:
    translation = greedy_decode(model, sentence, src_tokenizer, tgt_tokenizer)
    print(f"English: {sentence}", f"Urdu: {translation}", "-" * 40, sep="\n")

English: Hello, how are you?
Urdu: ہیلو، تم کیسے ہو؟
----------------------------------------
English: What is your name?
Urdu: آپ کا نام کیا ہے؟
----------------------------------------
English: I love learning new languages.
Urdu: مجھے نئی زبان سیکھنا بہت پسند ہے ۔
----------------------------------------
English: The weather is nice today.
Urdu: آجکل موسم اچھا ہے.
----------------------------------------
English: Can you help me translate this?
Urdu: کیا آپ اس ترجمے میں میری مدد کر سکتے ہیں؟
----------------------------------------
