In [3]:
from datasets import load_dataset

# Load a subset of OPUS-100
dataset = load_dataset("opus100", "ar-en")

# Use only the first 10,000 examples for training and 2,000 for validation
train_data = dataset["train"].select(range(10_000))
valid_data = dataset["validation"].select(range(2_000))

In [4]:
print(dataset["train"])
dataset["train"][5]  # Show first sample


Dataset({
    features: ['translation'],
    num_rows: 1000000
})


{'translation': {'ar': 'مقرف', 'en': 'Ugh. Disgusting.'}}

In [5]:
def clean_translation(example):
    en = example["translation"]["en"]
    ar = example["translation"]["ar"]

    # Remove samples with empty or very short sentences
    if not en.strip() or not ar.strip():
        return False
    if len(en.strip().split()) < 2 or len(ar.strip().split()) < 2:
        return False

    # Filter if identical
    if en.strip() == ar.strip():
        return False

    # Optional: filter based on language detection or non-Arabic characters
    return True

# Apply filtering
cleaned_dataset = dataset.filter(clean_translation)


In [6]:
from transformers import AutoTokenizer , EarlyStoppingCallback

tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-ar")

def preprocess_function(examples):
    inputs = [ex['en'] for ex in examples['translation']]
    targets = [ex['ar'] for ex in examples['translation']]
    
    # Tokenize inputs with padding
    model_inputs = tokenizer(
        inputs,
        max_length=128,
        truncation=True,
        padding="max_length"
    )

    # Tokenize labels with padding
    labels = tokenizer(
        targets,
        max_length=128,
        truncation=True,
        padding="max_length"
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


tokenized_train = train_data.map(preprocess_function, batched=True)
tokenized_valid = valid_data.map(preprocess_function, batched=True)




In [7]:
import evaluate

metric = evaluate.load("sacrebleu")

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # Prepare labels for sacrebleu format: List of List of references
    decoded_labels = [[label] for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["score"]}

In [9]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, Trainer, TrainingArguments

model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-ar")
tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-ar")

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",          #  REQUIRED for early stopping + checkpointing
    save_strategy="steps",                #  Optional but defaults to "steps"
    eval_steps=500,
    save_steps=500,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",   # REQUIRED for EarlyStopping
    greater_is_better=False,             #  lower eval_loss = better
    logging_dir="./logs",
    logging_steps=100,
)

In [10]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics, 
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

In [11]:
trainer.train()

  3%|▎         | 100/3750 [03:02<1:58:07,  1.94s/it]

{'loss': 0.8443, 'grad_norm': 2.322962760925293, 'learning_rate': 4.866666666666667e-05, 'epoch': 0.08}


  5%|▌         | 200/3750 [06:43<2:27:29,  2.49s/it]

{'loss': 0.4363, 'grad_norm': 1.7182132005691528, 'learning_rate': 4.7333333333333336e-05, 'epoch': 0.16}


  8%|▊         | 300/3750 [10:47<2:19:57,  2.43s/it]

{'loss': 0.4136, 'grad_norm': 1.4067925214767456, 'learning_rate': 4.600000000000001e-05, 'epoch': 0.24}


 11%|█         | 400/3750 [14:49<2:19:52,  2.51s/it]

{'loss': 0.41, 'grad_norm': 2.1516456604003906, 'learning_rate': 4.466666666666667e-05, 'epoch': 0.32}


 13%|█▎        | 500/3750 [18:14<1:49:40,  2.02s/it]

{'loss': 0.4319, 'grad_norm': 2.3079283237457275, 'learning_rate': 4.3333333333333334e-05, 'epoch': 0.4}




In [12]:
import evaluate

metric = evaluate.load("sacrebleu")

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # Prepare labels for sacrebleu format: List of List of references
    decoded_labels = [[label] for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["score"]}

In [13]:
trainer.evaluate()




In [14]:
trainer.save_model("en-ar-model")
tokenizer.save_pretrained("en-ar-model")


Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62801]], 'forced_eos_token_id': 0}


('en-ar-model\\tokenizer_config.json',
 'en-ar-model\\special_tokens_map.json',
 'en-ar-model\\vocab.json',
 'en-ar-model\\source.spm',
 'en-ar-model\\target.spm',
 'en-ar-model\\added_tokens.json')

In [15]:
import tkinter as tk
from tkinter import ttk
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer , pipeline

In [16]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

model = AutoModelForSeq2SeqLM.from_pretrained("en-ar-model")
tokenizer = AutoTokenizer.from_pretrained("en-ar-model")




In [None]:
# Use Hugging Face pipeline for simplicity
translator = pipeline("translation", model=model, tokenizer=tokenizer)

# Example English-Arabic pairs (can be from OPUS-100 test set)
samples = [
    {"en": "I love learning new languages.", "ar": "أحب تعلم لغات جديدة."},
    {"en": "Can you help me with my homework?", "ar": "هل يمكنك مساعدتي في واجبي؟"},
]

# Translate and collect predictions and references
predictions = []
references = []

In [18]:
for sample in samples:
    translated = translator(sample["en"], max_length=128)[0]["translation_text"]
    predictions.append(translated)
    references.append([sample["ar"]])  # BLEU expects a list of references

# Compute BLEU score
bleu = evaluate.load("sacrebleu")
results = bleu.compute(predictions=predictions, references=references)

print("\n📝 Sample Results:")
for src, pred, ref in zip([s["en"] for s in samples], predictions, references):
    print(f"EN: {src}")
    print(f"PRED: {pred}")
    print(f"REF: {ref[0]}\n")

print(f"🌍 BLEU Score: {results['score']:.2f}")


📝 Sample Results:
EN: The weather is nice today.
PRED: الأمر جيد اليوم
REF: الطقس جميل اليوم.

EN: I love learning new languages.
PRED: أحب تعلم لغات جديدة
REF: أحب تعلم لغات جديدة.

EN: Can you help me with my homework?
PRED: هل يمكنك مساعدتي بفراضي؟
REF: هل يمكنك مساعدتي في واجبي؟

🌍 BLEU Score: 46.26


In [19]:
def translate():
    input_text = text_input.get("1.0", tk.END).strip()
    if not input_text:
        return
    direction = language_var.get()

    if direction == "English → Arabic":
        inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True)
    else:
        inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True, src_lang="arb", tgt_lang="eng")

    output = model.generate(**inputs, max_length=128)
    translated_text = tokenizer.decode(output[0], skip_special_tokens=True)

    text_output.config(state="normal")
    text_output.delete("1.0", tk.END)
    text_output.insert(tk.END, translated_text)
    text_output.config(state="disabled")

In [20]:
# GUI Setup
root = tk.Tk()
root.title("English --> Arabic Translator")

# Input field
ttk.Label(root, text="Enter Text:").pack(pady=5)
text_input = tk.Text(root, height=5, width=60)
text_input.pack(pady=5)

# Direction selector
language_var = tk.StringVar(value="English → Arabic")
direction_menu = ttk.Combobox(root, textvariable=language_var, values=["English → Arabic"])
direction_menu.pack(pady=5)

# Translate button
translate_button = ttk.Button(root, text="Translate", command=translate)
translate_button.pack(pady=10)

# Output field
ttk.Label(root, text="Translation:").pack(pady=5)
text_output = tk.Text(root, height=5, width=60, state="disabled")
text_output.pack(pady=5)

In [21]:
root.mainloop()