In [92]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/arabic-to-english-translation-sentences/ara_eng.txt


In [93]:
# Install necessary libraries
!pip install transformers datasets sentencepiece scikit-learn --quiet


In [94]:
import pandas as pd
import re
import os
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import (
    MarianTokenizer, MarianMTModel,
    Seq2SeqTrainingArguments, Seq2SeqTrainer, TrainerCallback, GenerationConfig
)
from nltk.translate.bleu_score import corpus_bleu,SmoothingFunction


In [95]:
# Load the dataset
path = "/kaggle/input/arabic-to-english-translation-sentences/ara_eng.txt"
df = pd.read_csv(path, sep='\t', names=["english", "arabic"], encoding="utf-8")

# Define text cleaning functions
def clean_english(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z0-9]+", " ", text).strip()
    return text

def clean_arabic(text):
    text = re.sub(r"[^\u0600-\u06FF\s]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text


# Apply cleaning
df["english"] = df["english"].apply(clean_english)
df["arabic"] = df["arabic"].apply(clean_arabic)
filtered_df = df[(df["english"].str.split().str.len() >= 3) & (df["arabic"].str.split().str.len() >= 3)]
df = df[(df['english'].str.len() > 5) & (df['arabic'].str.len() > 5)]
df = df.drop_duplicates(subset=["english"])




In [96]:
# Split into training and testing sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_dataset = Dataset.from_pandas(train_df.reset_index(drop=True))
test_dataset = Dataset.from_pandas(test_df.reset_index(drop=True))


In [97]:
# Load the MarianMT model and tokenizer
model_name = "Helsinki-NLP/opus-mt-tc-big-en-ar"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)



In [98]:
# Preprocessing function for tokenizing input/target text
max_length = 128

def preprocess(batch):
    inputs = tokenizer(batch["english"], max_length=max_length, truncation=True, padding="max_length")
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(batch["arabic"], max_length=max_length, truncation=True, padding="max_length")
    inputs["labels"] = labels["input_ids"]
    return inputs

# Apply preprocessing
train_dataset = train_dataset.map(preprocess, batched=True, remove_columns=["english", "arabic"])
test_dataset = test_dataset.map(preprocess, batched=True, remove_columns=["english", "arabic"])


Map:   0%|          | 0/18144 [00:00<?, ? examples/s]



Map:   0%|          | 0/4536 [00:00<?, ? examples/s]

In [99]:
from nltk.translate.bleu_score import corpus_bleu

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    # حذف -100 المستخدمة في padding
    labels = [[token for token in label if token != -100] for label in labels]

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # تقسيم الكلمات للحساب
    references = [[label.split()] for label in decoded_labels]
    hypotheses = [pred.split() for pred in decoded_preds]

    bleu = corpus_bleu(references, hypotheses)
    return {"bleu": bleu}


In [100]:
# Define training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=1000,
    save_total_limit=1,
    predict_with_generate=True

    #  output_dir="./results",
    # per_device_train_batch_size=16,
    # per_device_eval_batch_size=16,
    # num_train_epochs=20,
    # learning_rate=3e-5,
    # label_smoothing_factor=0.1,
    # logging_dir="./logs",
    # logging_steps=50,
    # save_steps=500,
    # save_total_limit=2,
    # predict_with_generate=True,
    # evaluation_strategy="epoch",
    # load_best_model_at_end=True,
)

class BLEUCallback(TrainerCallback):
    def on_evaluate(self, args, state, control, **kwargs):
        small_test_df = test_df.sample(n=100, random_state=42)
        english_texts = small_test_df["english"].tolist()
        references = [[ref.split()] for ref in small_test_df["arabic"]]
        preds = generate_translation(english_texts)
        hypotheses = [pred.split() for pred in preds]
        smoothie = SmoothingFunction().method4
        bleu = corpus_bleu(references, hypotheses, smoothing_function=smoothie)
        print(f"\n✅ BLEU Score after Epoch {int(state.epoch)}: {bleu:.4f}")

# Define trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics  # ✅ أضف هذا السطر
)


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Seq2SeqTrainer(


In [101]:
# Start training
trainer.train()




Step,Training Loss
10,6.5813
20,3.8575
30,2.6152
40,1.8453
50,1.2396
60,0.8546
70,0.6208
80,0.5251
90,0.4879
100,0.4593




TrainOutput(global_step=5670, training_loss=0.19231362472661168, metrics={'train_runtime': 6954.769, 'train_samples_per_second': 26.089, 'train_steps_per_second': 0.815, 'total_flos': 2.457468080750592e+16, 'train_loss': 0.19231362472661168, 'epoch': 10.0})

In [109]:
from transformers import GenerationConfig

# Generation configuration
generation_config = GenerationConfig(
    max_length=128,
    num_beams=6,
    repetition_penalty=1.2,
    length_penalty=1.0,
    early_stopping=True,
    forced_eos_token_id=tokenizer.eos_token_id
)


# Function to translate a list of English sentences
def generate_translation(texts, batch_size=2):
    model.eval()
    translations = []

    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i+batch_size]

        inputs = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True, max_length=128)
        input_ids = inputs.input_ids.to(model.device)
        attention_mask = inputs.attention_mask.to(model.device)

        with torch.no_grad():
            outputs = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=128,
                num_beams=4,
                early_stopping=True
            )

        decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        translations.extend(decoded)

    return translations



In [113]:

import torch

device = torch.device("cpu")  # استخدام الـ CPU بدلاً من الـ GPU

# تحويل النموذج إلى الـ CPU
model.to(device)

all_preds = []

for i in range(0, len(english_texts), 10):
    chunk = english_texts[i : i + 10]
    chunk_preds = generate_translation(chunk, batch_size=1)
    all_preds.extend(chunk_preds)
    
    # مسح الذاكرة غير المستخدمة
    torch.cuda.empty_cache()

preds = all_preds


# Prepare references and hypotheses
references = [[ref.split()] for ref in small_test_df["arabic"]]
hypotheses = [pred.split() for pred in preds]

# Compute BLEU score
bleu_score = corpus_bleu(references, hypotheses)
print(f"\nBLEU Score on Test Set: {bleu_score:.4f}")

# Show sample predictions
for i in range(5):
    print(f"\n🔹 English: {small_test_df['english'].iloc[i]}")
    print(f"🔸 Predicted Arabic: {preds[i]}")
    print(f"✅ Actual Arabic: {small_test_df['arabic'].iloc[i]}")



BLEU Score on Test Set: 0.1983

🔹 English: i didn t like it
🔸 Predicted Arabic: لم يعجبنى ذلك
✅ Actual Arabic: لم أحبه

🔹 English: demonstrations of support to the sidibouzid movement took place in paris munich and beirut
🔸 Predicted Arabic: بدات مظاهرات تضامن مع حركة سيدي بوزيد في باريس ميونخ وبيروت
✅ Actual Arabic: وما زالت مظاهرات تاييد الحركة مستمرة في باريس موينخ وبيروت

🔹 English: nobody lives in this house
🔸 Predicted Arabic: لا أحد يسكن في هذا المنزل
✅ Actual Arabic: لا يعيش أحد في هذا المنزل

🔹 English: venezuela troubles to access blogger com global voices advox
🔸 Predicted Arabic: فنزويلا مشاكل في الوصول الى موقع الاصوات العالمية
✅ Actual Arabic: فنزويلا مشاكل تصفح موقع بلوجر الاصوات العالمية

🔹 English: salmanonline posts an article ar about drugs addiction among the lebanese youth he discusses the role played by some political parties and other specialized organizations to combat its widespread
🔸 Predicted Arabic: ينشر سلمان اونلاين مقال عن ادمان المخدرات بين الشباب اللبن