In [1]:
# Mount Google Drive at /content/drive so later cells can copy files from it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


**Reading the data and preparing it**

In [2]:
# Copy the subtitles spreadsheet from the mounted Drive into /content/.
!cp "/content/drive/MyDrive/Restof_AOC_merged_subtitles.xlsx" "/content/"

In [3]:
# Copy the articles spreadsheet from the mounted Drive into /content/.
!cp "/content/drive/MyDrive/merged_Articles.xlsx" "/content/"

In [4]:
# Copy the subtitles/comments spreadsheet from the mounted Drive into /content/.
!cp "/content/drive/MyDrive/merged_subtitles_comments2.xlsx" "/content/"

In [5]:
!pip install transformers datasets sentencepiece
!pip install sacremoses
!pip install -U transformers
!pip install evaluate
!pip install gradio

Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m25.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sacremoses
Successfully installed sacremoses-0.1.1
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3
Collecting gradio
  Downloading gradio-5.29.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<25.0,>=22.0 (from gradio)
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecti

In [6]:
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import MarianTokenizer, MarianMTModel
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
import evaluate
import numpy as np

In [7]:
# Load the subtitles spreadsheet that was copied into /content.
df1 = pd.read_excel("/content/Restof_AOC_merged_subtitles.xlsx")

In [8]:
# Load the articles spreadsheet that was copied into /content.
df2 = pd.read_excel("/content/merged_Articles.xlsx")

In [9]:
# Load the subtitles/comments spreadsheet that was copied into /content.
df3 = pd.read_excel("/content/merged_subtitles_comments2.xlsx")

In [10]:
# Stack the three frames row-wise; ignore_index=True renumbers the rows from 0.
merged_df = pd.concat([df1, df2, df3], axis=0, ignore_index=True)

In [11]:
# Preview the first 10 rows of the combined frame.
merged_df.head(10)

Unnamed: 0.1,Unnamed: 0,Text_Arabic,Text_English
0,,تحياتي لكم جميعا,Greetings to all of you
1,,ودي تتشم ولا تتقرقش .,I want you to smell it and not to crack it.
2,,هو كل يوم نسمع فتوى جديده . ماكلنا عارفين من ك...,Every day we hear a new fatwa. We don't know w...
3,,يفطر وعدة من أيام أخر زي ما العالم ماشي من ألف...,"He breaks his fast several times a day, as the..."
4,,بلاش اختراعات يا عم الحاج .,"No inventions, uncle."
5,,الصيام صيام ادا كنت مريض ففطر وعدة من ايام اخر...,"Fasting is fasting. If you are sick, break you..."
6,,"الدواء لابد ان يشرب في كميه مياه كافيه,لوشرب ا...",The medicine must be drunk with a sufficient a...
7,,للاسف فان تنطع بعض من يعتقدون انهم لانهم تمشيخ...,"Unfortunately, the extremism of some who belie..."
8,,انها فتوى تنم عن جهل فاضح وتنطع من لا يفقه,It is a fatwa that reflects blatant ignorance ...
9,,اولا كان الاولى بمن قال الفتوى الجمقاء ان يقول...,"First, it would have been better for the one w..."


In [12]:
# Drop the "Unnamed: 0" column.
# NOTE(review): the preview of merged_df also lists an "Unnamed: 0.1" column, which is
# kept here — confirm whether it should be dropped as well.
df = merged_df.drop(columns=["Unnamed: 0"])

In [13]:
# Shuffle all rows (frac=1) with a fixed random_state, then renumber the index from 0.
df_shuffled = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [14]:
# Keep only the first 100,000 rows of the shuffled frame.
df = df_shuffled.head(100000)

**Data processing**

In [15]:
# Drop rows where either the Arabic or the English text is missing.
df = df.dropna(subset=["Text_Arabic", "Text_English"])

In [16]:
# Trim surrounding whitespace from both text columns.
# - astype(str) first: .str.strip() returns NaN for non-string cells (e.g. numbers),
#   which would reintroduce missing values after they have already been dropped.
# - .assign builds a new frame instead of writing into a frame derived from
#   head()/dropna(), which can raise SettingWithCopyWarning.
df = df.assign(
    Text_Arabic=df["Text_Arabic"].astype(str).str.strip(),
    Text_English=df["Text_English"].astype(str).str.strip(),
)

In [17]:
# Give the text columns their final names, then build one
# {"translation": {"ar": ..., "en": ...}} record per row.
column_names = {"Text_Arabic": "translation_ar", "Text_English": "translation_en"}
df = df.rename(columns=column_names)
data_list = [
    {"translation": {"ar": ar_text, "en": en_text}}
    for ar_text, en_text in zip(df["translation_ar"], df["translation_en"])
]

In [18]:
# Build a Hugging Face Dataset from the list of translation records.
dataset = Dataset.from_list(data_list)

# Hold out 20% of the examples for evaluation. The fixed seed keeps the split
# identical across runs, so evaluation scores are comparable between runs.
dataset = dataset.train_test_split(test_size=0.2, seed=42)

# Final layout: a DatasetDict with "train" and "test" splits.
dataset = DatasetDict({
    "train": dataset["train"],
    "test": dataset["test"]
})

In [19]:
# Select the first five training examples (the cell shows the Dataset summary, not the rows).
dataset["train"].select(range(5))

Dataset({
    features: ['translation'],
    num_rows: 5
})

In [20]:
# Display the frame after cleaning and renaming.
df

Unnamed: 0,translation_ar,translation_en
0,دي مينفعش تكون انسانه دي حيوانه حتي الحيوان فق...,"She can't be a human being, she's an animal. E..."
1,لا حد زيها ولا نجح قدها ولا حينجح زيها فى تاري...,"No one is like her, has succeeded like her, an..."
2,اخيرا بعض ان تفحصت وتمحصت وفوقت - لقيت كلام كل...,"Finally, after I examined, scrutinized, and re..."
3,العدو عندما ينوى الانسحاب,The enemy when he intends to withdraw
4,الزمالك لا يريدك حتي لو ببلاش,Zamalek doesn't want you even for free
...,...,...
99995,الملف فى مكتب المستشار عبد المجيد محمود منذ شهر..,The file has been in Counselor Abdel Majeed Ma...
99996,كشف رئيس اتحاد كرة القدم الجزائري محمد روراوة ...,Algerian Football Federation President Mohamed...
99997,نحن نعرف لاعببين جيدا و كلنا مسلمون ولا ننتضر ...,We know the players well and we are all Muslim...
99998,هذا الشاب مكلف بعمل ايا كان هل اخطر الرجل عن ن...,This young man is assigned to do some work. Di...


In [21]:
# Load the pretrained Helsinki-NLP Arabic->English MarianMT checkpoint and its tokenizer.
model_name = "Helsinki-NLP/opus-mt-ar-en"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/917k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.13M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

In [22]:
# Maximum token lengths passed to the tokenizer for source and target text;
# longer sequences are truncated.
max_input_length = 128
max_target_length = 128

def preprocess_function(examples):
    """Tokenize a batch of Arabic/English pairs for seq2seq training.

    Args:
        examples: A batch whose "translation" entry is a list of dicts with
            an "ar" (source) string and an "en" (target) string.

    Returns:
        The tokenizer output for the Arabic inputs, with the token ids of the
        English targets stored under "labels".
    """
    inputs = [ex["ar"] for ex in examples["translation"]]
    targets = [ex["en"] for ex in examples["translation"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # `text_target` tokenizes with the target-language side of the tokenizer.
    # It replaces the deprecated `as_target_tokenizer()` context manager.
    labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize every split, passing examples to preprocess_function in batches.
tokenized_datasets = dataset.map(preprocess_function, batched=True)


Map:   0%|          | 0/79896 [00:00<?, ? examples/s]



model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

Map:   0%|          | 0/19974 [00:00<?, ? examples/s]

In [None]:
# import os
# os.environ["WANDB_DISABLED"] = "true"

In [26]:
# Settings for the fine-tuning run, grouped by purpose.
training_args = Seq2SeqTrainingArguments(
    # Output locations and bookkeeping.
    output_dir="./results(nlp)",
    save_total_limit=3,
    logging_dir="./logs",
    logging_steps=100,
    # Optimisation hyper-parameters.
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Produce predictions with generate() so text metrics can be computed.
    predict_with_generate=True,
)

# Load the BLEU metric through the `evaluate` library.
bleu = evaluate.load("bleu")

def postprocess_text(preds, labels):
    """Strip whitespace from decoded texts and shape them for the BLEU metric.

    Args:
        preds: Decoded prediction strings.
        labels: Decoded reference strings.

    Returns:
        A tuple ``(preds, labels)`` where each prediction is stripped and each
        label is stripped and wrapped in its own single-item list.
    """
    cleaned_preds = []
    for prediction in preds:
        cleaned_preds.append(prediction.strip())

    wrapped_labels = []
    for reference in labels:
        wrapped_labels.append([reference.strip()])

    return cleaned_preds, wrapped_labels

def compute_metrics(eval_preds):
    """Compute BLEU for a batch of generated predictions.

    Args:
        eval_preds: A ``(predictions, labels)`` pair of token-id arrays as
            passed by the trainer.

    Returns:
        The dict returned by the BLEU metric, with "bleu" rounded to 4 decimals.
    """
    preds, labels = eval_preds
    # Predictions may arrive as a tuple; the generated ids are its first element.
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is the placeholder for ignored (padding) positions and is not a real
    # token id, so replace it with the pad token before decoding.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    result = bleu.compute(predictions=decoded_preds, references=decoded_labels)
    result["bleu"] = round(result["bleu"], 4)
    return result

# Batch collator for seq2seq training, built from the tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Wire the model, data, collator and metric function into the trainer.
# `processing_class` is the current name for the deprecated `tokenizer` argument,
# which raises a deprecation warning when this cell runs.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


  trainer = Seq2SeqTrainer(


In [27]:
# Run fine-tuning with the configured trainer.
trainer.train()

Step,Training Loss
100,0.951
200,0.895
300,0.8782
400,0.8685
500,0.8559
600,0.8651
700,0.868
800,0.8421
900,0.905
1000,1.142


TrainOutput(global_step=29961, training_loss=0.8280492550675579, metrics={'train_runtime': 5576.1374, 'train_samples_per_second': 42.985, 'train_steps_per_second': 5.373, 'total_flos': 6243362716778496.0, 'train_loss': 0.8280492550675579, 'epoch': 3.0})

In [28]:
# Evaluate the fine-tuned model and display the resulting metrics dict.
eval_results = trainer.evaluate()
eval_results

Trainer is attempting to log a value of "[0.7640402464618803, 0.5787588982096773, 0.46318873480497175, 0.37754685197850063]" of type <class 'list'> for key "eval/precisions" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


{'eval_loss': 0.7382249236106873,
 'eval_bleu': 0.5273,
 'eval_precisions': [0.7640402464618803,
  0.5787588982096773,
  0.46318873480497175,
  0.37754685197850063],
 'eval_brevity_penalty': 1.0,
 'eval_length_ratio': 1.0094243032317358,
 'eval_translation_length': 712808,
 'eval_reference_length': 706153,
 'eval_runtime': 3000.678,
 'eval_samples_per_second': 6.656,
 'eval_steps_per_second': 0.832,
 'epoch': 3.0}

In [29]:
# Save the fine-tuned model and its tokenizer into the same directory.
trainer.save_model("fine_tuned_ar_en_model")
tokenizer.save_pretrained("fine_tuned_ar_en_model")

('fine_tuned_ar_en_model/tokenizer_config.json',
 'fine_tuned_ar_en_model/special_tokens_map.json',
 'fine_tuned_ar_en_model/vocab.json',
 'fine_tuned_ar_en_model/source.spm',
 'fine_tuned_ar_en_model/target.spm',
 'fine_tuned_ar_en_model/added_tokens.json')

In [32]:
!pip install gradio



In [30]:
def translate(text):
    """Translate `text` with the current model and tokenizer.

    The input is tokenized, moved to the model's device, passed to
    ``generate``, and the first generated sequence is decoded to a string.
    """
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    encoded = encoded.to(model.device)
    generated = model.generate(**encoded)
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

# Example: translate one Arabic sentence and display the result.
translate("مش موافق يصاحبى")

"I don't agree with you"

In [31]:
import shutil
from google.colab import files

# Compress the saved model directory into fine_tuned_ar_en_model.zip
shutil.make_archive("fine_tuned_ar_en_model", 'zip', "fine_tuned_ar_en_model")

# Download the archive through the browser
files.download("fine_tuned_ar_en_model.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Load the model

**connect to your drive**


In [33]:
# Mount Google Drive at /content/drive (it reports "already mounted" if done earlier in the session).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [35]:
import gradio as gr
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import zipfile
import os
# Archive holding the fine-tuned model, and the directory it is unpacked into.
zip_path = "/content/fine_tuned_ar_en_model.zip"
model_dir = "/content/fine_tuned_ar_en_model"

# Create the target directory if needed, then unpack the archive into it.
os.makedirs(model_dir, exist_ok=True)
with zipfile.ZipFile(zip_path, 'r') as archive:
    archive.extractall(path=model_dir)

# Load the model and tokenizer from the unpacked directory.
model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)
tokenizer = AutoTokenizer.from_pretrained(model_dir)

def translate_ar_to_en(arabic_text):
    """Translate Arabic text to English with the loaded model.

    Args:
        arabic_text: The Arabic input string (the Gradio textbox value).

    Returns:
        The decoded first generated sequence, or an empty string when the
        input is empty, None, or whitespace-only.
    """
    # Nothing to translate: skip generation instead of decoding an empty prompt.
    if not arabic_text or not arabic_text.strip():
        return ""

    inputs = tokenizer(arabic_text, return_tensors="pt", padding=True, truncation=True)
    # Keep the inputs on the same device as the model, as `translate` does.
    inputs = inputs.to(model.device)
    outputs = model.generate(**inputs, max_length=256)
    english_translation = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return english_translation

# Build the Gradio UI around translate_ar_to_en: one Arabic input box and one
# English output box. Then launch it with a public share link.
demo = gr.Interface(
    fn=translate_ar_to_en,
    inputs=gr.Textbox(lines=3, label="📝 أدخل نص عربي"),
    outputs=gr.Textbox(label="🔤 الترجمة الإنجليزية"),
    title="🔁 مترجم عربي - إنجليزي",
    live=False,
)
demo.launch(share=True)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://263fb45b90cddac3d1.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [None]:
# #مثال لو عايز تحمله وترجع تشتغل من اول وجديد
# model_path = "/content/drive/MyDrive/fine_tuned_ar_en_model"

# from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
# tokenizer = AutoTokenizer.from_pretrained(model_path)

# # جملة تجريبية
# arabic_text = "أنا أحب تعلم الذكاء الاصطناعي"

# # تحويل الجملة لـ tokens
# inputs = tokenizer(arabic_text, return_tensors="pt", padding=True, truncation=True)

# # الترجمة باستخدام generate
# outputs = model.generate(**inputs, max_length=256)

# # تحويل الناتج لنص
# english_translation = tokenizer.decode(outputs[0], skip_special_tokens=True)

# print("الترجمة:", english_translation)
