In [1]:
from transformers import MarianMTModel, MarianTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments
from datasets import load_dataset
from evaluate import load as load_metric
import torch
import re

In [2]:
# ========== Load and Split Dataset ==========
dataset = load_dataset("opus100", "ar-en", split="train[:2000]")
split_dataset = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = split_dataset["train"]
test_dataset = split_dataset["test"]

In [3]:
# ========== Text Cleaning and Normalization ==========
def normalize_arabic(text):
    text = re.sub("[إأآا]", "ا", text)
    text = re.sub("ى", "ي", text)
    text = re.sub("ؤ", "و", text)
    text = re.sub("ئ", "ي", text)
    text = re.sub("ة", "ه", text)
    text = re.sub("[ًٌٍَُِّْـ]", "", text)
    return text.strip()

def clean_english(text):
    return text.strip().lower()

In [4]:
# ========== Load Tokenizer and Model ==========
model_name = "Helsinki-NLP/opus-mt-ar-en"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)
model.config.pad_token_id = tokenizer.pad_token_id  



In [5]:
# ========== Preprocessing ==========
def preprocess(example):
    input_text = normalize_arabic(example["translation"]["ar"])
    target_text = clean_english(example["translation"]["en"])
    
    model_inputs = tokenizer(input_text, max_length=128, truncation=True, padding="max_length")
    labels = tokenizer(target_text, max_length=128, truncation=True, padding="max_length").input_ids
    labels = [label if label != tokenizer.pad_token_id else -100 for label in labels]
    
    model_inputs["labels"] = labels
    return model_inputs
# ========== Tokenize ==========
tokenized_train = train_dataset.map(preprocess, remove_columns=train_dataset.column_names)
tokenized_test = test_dataset.map(preprocess, remove_columns=test_dataset.column_names)

In [6]:
# ========== Training Arguments ==========
training_args = Seq2SeqTrainingArguments(
    output_dir="./opus-mt-ar-en-finetuned",
    evaluation_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=4,
    num_train_epochs=3,
    predict_with_generate=True,
    save_total_limit=2,
    logging_steps=100,
    fp16=torch.cuda.is_available()
)

# ========== Trainer ==========
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer
)

# ========== Train ==========
trainer.train()

  trainer = Seq2SeqTrainer(


Epoch,Training Loss,Validation Loss
1,2.6153,2.16386
2,1.8315,1.976511
3,1.5815,1.930648




TrainOutput(global_step=1350, training_loss=2.106094982005932, metrics={'train_runtime': 1858.9898, 'train_samples_per_second': 2.905, 'train_steps_per_second': 0.726, 'total_flos': 183051170611200.0, 'train_loss': 2.106094982005932, 'epoch': 3.0})

In [7]:
# ========== Save Model ==========
model.save_pretrained("./opus-mt-ar-en-finetuned")
tokenizer.save_pretrained("./opus-mt-ar-en-finetuned")

('./opus-mt-ar-en-finetuned\\tokenizer_config.json',
 './opus-mt-ar-en-finetuned\\special_tokens_map.json',
 './opus-mt-ar-en-finetuned\\vocab.json',
 './opus-mt-ar-en-finetuned\\source.spm',
 './opus-mt-ar-en-finetuned\\target.spm',
 './opus-mt-ar-en-finetuned\\added_tokens.json')

In [8]:
# ========== BLEU Evaluation on Test Set ==========
metric = load_metric("sacrebleu")
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

def compute_bleu_batched(test_dataset, batch_size=8):
    inputs = [normalize_arabic(ex["translation"]["ar"]) for ex in test_dataset]
    references = [[clean_english(ex["translation"]["en"])] for ex in test_dataset]

    translations = []
    for i in range(0, len(inputs), batch_size):
        batch_texts = inputs[i:i + batch_size]
        tokens = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True).to(device)
        outputs = model.generate(**tokens, max_length=128)
        decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        translations.extend([t.strip().lower() for t in decoded])

    results = metric.compute(predictions=translations, references=references)
    print("BLEU score on test set:", results["score"])

compute_bleu_batched(test_dataset)

BLEU score on test set: 27.532652110067787


In [9]:
# ========== Translation Example ==========
def translate(text):
    input_text = normalize_arabic(text)
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True).to(device)
    output = model.generate(**inputs, max_length=128)
    return tokenizer.decode(output[0], skip_special_tokens=True)


In [10]:
# Example
print("\nTranslation example:")
print(translate("هل نستطيع أن نذهب؟"))


Translation example:
can we go?


In [11]:
# Example
print("\nTranslation example:")
print(translate("أعلم ذلك"))


Translation example:
i know that.


In [12]:
# Example
print("\nTranslation example:")
print(translate("لا تستطيع السباحة هنا"))


Translation example:
you can't swim here.


In [14]:
import gradio as gr
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load tokenizer and model from local files
tokenizer = AutoTokenizer.from_pretrained("./opus-mt-ar-en-finetuned", local_files_only=True)
model = AutoModelForSeq2SeqLM.from_pretrained("./opus-mt-ar-en-finetuned", local_files_only=True)

# Define translation function
def translate_text(input_text):
    # Encode the text
    inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)
    # Generate translation
    outputs = model.generate(**inputs, max_length=100)
    # Decode the output
    translated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return translated_text

# Create Gradio interface
interface = gr.Interface(
    fn=translate_text,
    inputs=gr.Textbox(lines=3, placeholder="Enter text to translate..."),
    outputs="text",
    title="Translation Model GUI",
    description="Enter a sentence and see its translation using your trained Transformer model."
)

interface.launch()




* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.


