**Reading the data and preparing it**

In [None]:
!cp "/content/drive/MyDrive/Restof_AOC_merged_subtitles.xlsx" "/content/"

In [None]:
cp "/content/drive/MyDrive/merged_Articles.xlsx" "/content/"

In [None]:
cp "/content/drive/MyDrive/merged_subtitles_comments2.xlsx" "/content/"

In [None]:
!pip install transformers datasets sentencepiece
!pip install sacremoses
!pip install -U transformers

Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sacremoses
Successfully installed sacremoses-0.1.1


In [None]:
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import MarianTokenizer, MarianMTModel
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq

In [None]:
# Load the subtitles spreadsheet from the local /content directory.
df1 = pd.read_excel("/content/Restof_AOC_merged_subtitles.xlsx")

In [None]:
# Load the articles spreadsheet from the local /content directory.
df2 = pd.read_excel("/content/merged_Articles.xlsx")

In [None]:
# Load the subtitles/comments spreadsheet from the local /content directory.
df3 = pd.read_excel("/content/merged_subtitles_comments2.xlsx")

In [None]:
# Stack the three frames on top of each other and renumber the rows from 0.
merged_df = pd.concat(
    objs=[df1, df2, df3],
    ignore_index=True,  # row-wise stacking (axis=0) is pandas' default
)

In [None]:
# Preview the first 10 rows of the combined frame.
merged_df.head(10)

Unnamed: 0.1,Unnamed: 0,Text_Arabic,Text_English
0,,تحياتي لكم جميعا,Greetings to all of you
1,,ودي تتشم ولا تتقرقش .,I want you to smell it and not to crack it.
2,,هو كل يوم نسمع فتوى جديده . ماكلنا عارفين من ك...,Every day we hear a new fatwa. We don't know w...
3,,يفطر وعدة من أيام أخر زي ما العالم ماشي من ألف...,"He breaks his fast several times a day, as the..."
4,,بلاش اختراعات يا عم الحاج .,"No inventions, uncle."
5,,الصيام صيام ادا كنت مريض ففطر وعدة من ايام اخر...,"Fasting is fasting. If you are sick, break you..."
6,,"الدواء لابد ان يشرب في كميه مياه كافيه,لوشرب ا...",The medicine must be drunk with a sufficient a...
7,,للاسف فان تنطع بعض من يعتقدون انهم لانهم تمشيخ...,"Unfortunately, the extremism of some who belie..."
8,,انها فتوى تنم عن جهل فاضح وتنطع من لا يفقه,It is a fatwa that reflects blatant ignorance ...
9,,اولا كان الاولى بمن قال الفتوى الجمقاء ان يقول...,"First, it would have been better for the one w..."


In [None]:
# Drop the unnamed columns (presumably index columns saved by earlier exports).
# The preview of merged_df lists both "Unnamed: 0" and "Unnamed: 0.1";
# errors="ignore" keeps this cell working if either column is absent.
df = merged_df.drop(columns=["Unnamed: 0", "Unnamed: 0.1"], errors="ignore")

In [None]:
# Shuffle all rows (frac=1) with a fixed seed so the order is repeatable,
# then renumber the index from 0.
df_shuffled = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
# Keep only the first 100,000 rows of the shuffled frame.
df = df_shuffled.head(100000)

**Data processing**

In [None]:
# Discard rows where either side of the translation pair is missing.
df = df.dropna(subset=["Text_Arabic", "Text_English"])

In [None]:
# Trim leading/trailing whitespace on both text columns, building a new frame
# rather than assigning into the existing one.
df = df.assign(
    Text_Arabic=df["Text_Arabic"].str.strip(),
    Text_English=df["Text_English"].str.strip(),
)

In [None]:
# Rename the text columns to the names used when building the records below.
df = df.rename(columns={"Text_Arabic": "translation_ar", "Text_English": "translation_en"})

# Build one {"translation": {"ar": ..., "en": ...}} record per row.
# Zipping the two columns avoids iterrows(), which creates a Series object
# for every row and is slow on a frame of this size.
data_list = [
    {"translation": {"ar": arabic_text, "en": english_text}}
    for arabic_text, english_text in zip(df["translation_ar"], df["translation_en"])
]

In [None]:
# Wrap the list of translation records in a Hugging Face Dataset.
full_dataset = Dataset.from_list(data_list)

# Hold out 20% of the examples for evaluation. The seed is fixed so that the
# same train/test split is produced on every run.
split = full_dataset.train_test_split(test_size=0.2, seed=42)

# Final form: a DatasetDict with "train" and "test" splits.
dataset = DatasetDict({
    "train": split["train"],
    "test": split["test"],
})

In [None]:
# Take the first five training examples.
# NOTE(review): the cell output shows only the Dataset summary (features, num_rows), not the rows.
dataset["train"].select(range(5))

Dataset({
    features: ['translation'],
    num_rows: 5
})

In [None]:
# Load the pretrained Marian checkpoint named below together with its tokenizer.
model_name = "Helsinki-NLP/opus-mt-ar-en"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/917k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.13M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

In [None]:
# Token limits used when truncating the source (Arabic) and target (English) texts.
max_input_length = 128
max_target_length = 128

def preprocess_function(examples):
    """Tokenize a batch of Arabic/English translation pairs for seq2seq training.

    Args:
        examples: A batch whose "translation" entry is a list of dicts with
            "ar" (source text) and "en" (target text) keys.

    Returns:
        The tokenizer output for the Arabic texts, with an added "labels"
        entry holding the token ids of the English texts.
    """
    inputs = [ex["ar"] for ex in examples["translation"]]
    targets = [ex["en"] for ex in examples["translation"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # `text_target=` tokenizes with the target-language settings. It replaces
    # the deprecated `as_target_tokenizer()` context manager.
    labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize every split; batched=True passes a batch of examples to each call.
tokenized_datasets = dataset.map(preprocess_function, batched=True)


Map:   0%|          | 0/79896 [00:00<?, ? examples/s]



Map:   0%|          | 0/19974 [00:00<?, ? examples/s]

In [None]:
# Optional: uncomment the two lines below to disable Weights & Biases logging.
# NOTE(review): with them commented out, the training cell's output shows a prompt for a W&B API key.
# import os
# os.environ["WANDB_DISABLED"] = "true"

In [None]:
# Fine-tuning hyperparameters.
# NOTE(review): an eval_dataset is passed to the trainer below, but no evaluation
# strategy is configured here, so presumably no evaluation runs during training
# (the training log shows only the training loss) -- confirm this is intended.
training_args = Seq2SeqTrainingArguments(
    output_dir="./results(nlp)",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    logging_dir="./logs",
    logging_steps=10,
)

# Collator for seq2seq batches, built from the tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# `processing_class` replaces the deprecated `tokenizer` keyword of the trainer.
# It needs a recent transformers release, which the install cell's
# `-U transformers` upgrade provides.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    processing_class=tokenizer,
    data_collator=data_collator,
)


  trainer = Seq2SeqTrainer(


In [None]:
# Run fine-tuning with the trainer configured above.
trainer.train()



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mmarwanshamel369[0m ([33mmarwanshamel369-helwan-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
10,1.7814
20,1.7857
30,1.6976
40,1.7383
50,1.5935
60,1.5519
70,1.6601
80,1.4188
90,1.7688
100,1.6194




TrainOutput(global_step=29961, training_loss=0.8456013346063745, metrics={'train_runtime': 7765.9583, 'train_samples_per_second': 30.864, 'train_steps_per_second': 3.858, 'total_flos': 6253263157985280.0, 'train_loss': 0.8456013346063745, 'epoch': 3.0})

In [None]:
# Save the fine-tuned model and the tokenizer files into the same local directory.
trainer.save_model("fine_tuned_ar_en_model")
tokenizer.save_pretrained("fine_tuned_ar_en_model")

('fine_tuned_ar_en_model/tokenizer_config.json',
 'fine_tuned_ar_en_model/special_tokens_map.json',
 'fine_tuned_ar_en_model/vocab.json',
 'fine_tuned_ar_en_model/source.spm',
 'fine_tuned_ar_en_model/target.spm',
 'fine_tuned_ar_en_model/added_tokens.json')

In [3]:
!pip install gradio

Collecting gradio
  Downloading gradio-5.29.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<25.0,>=22.0 (from gradio)
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.10.0 (from gradio)
  Downloading gradio_client-1.10.0-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.11.8-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting safehttpx<0.2.0,>=0.1.6

In [None]:
def translate(text):
    """Translate `text` with the in-memory model and return the first decoded output.

    The input is tokenized (padded and truncated), moved to the device the
    model is on, passed to `model.generate`, and the first generated sequence
    is decoded with special tokens removed.
    """
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    encoded = encoded.to(model.device)
    generated_ids = model.generate(**encoded)
    first_sequence = generated_ids[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

# Example: quick check of the fine-tuned model on a short Arabic sentence.
translate("مش موافق يصاحبى")

"I don't agree with you"

In [None]:
import shutil
from google.colab import files

# Compress the saved model directory into fine_tuned_ar_en_model.zip.
shutil.make_archive("fine_tuned_ar_en_model", 'zip', "fine_tuned_ar_en_model")

# Download the archive through the browser.
files.download("fine_tuned_ar_en_model.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Load the model

**Connect to your Google Drive**


In [None]:
# Mount Google Drive at /content/drive so files under MyDrive can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install gradio

In [4]:
import gradio as gr
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import zipfile
import os
# Zipped fine-tuned model on the mounted Drive, and the local directory to unpack it into.
zip_path = "/content/drive/MyDrive/fine_tuned_ar_en_model.zip"

model_dir = "/content/fine_tuned_ar_en_model"

os.makedirs(model_dir, exist_ok=True)

# Unpack the archive contents into model_dir.
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(model_dir)

# Load the tokenizer and the seq2seq model from the unpacked directory.
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)

def translate_ar_to_en(arabic_text):
    """Translate Arabic text to English with the loaded model.

    Args:
        arabic_text: The text submitted through the Gradio textbox.

    Returns:
        The decoded first generated sequence (special tokens removed), or an
        empty string when the input is None or contains only whitespace.
    """
    # A blank submission has nothing to translate; return early instead of
    # asking the model to generate text from an empty prompt.
    if arabic_text is None or (isinstance(arabic_text, str) and not arabic_text.strip()):
        return ""

    inputs = tokenizer(arabic_text, return_tensors="pt", padding=True, truncation=True)
    # Cap the generated sequence at 256 tokens.
    outputs = model.generate(**inputs, max_length=256)
    english_translation = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return english_translation

# Describe the two text boxes up front so the Interface call stays short.
arabic_input_box = gr.Textbox(lines=3, label="📝 أدخل نص عربي")
english_output_box = gr.Textbox(label="🔤 الترجمة الإنجليزية")

# Wire the translation function to the input and output boxes.
demo = gr.Interface(
    title="🔁 مترجم عربي - إنجليزي",
    fn=translate_ar_to_en,
    inputs=arabic_input_box,
    outputs=english_output_box,
    live=False,
)

# share=True requests a public URL for the app (shown in the cell output).
demo.launch(share=True)




Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://725e4327dfc178cc98.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [None]:
# # Example: how to load the saved model and start working again from scratch
# model_path = "/content/drive/MyDrive/fine_tuned_ar_en_model"

# from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
# tokenizer = AutoTokenizer.from_pretrained(model_path)

# # Test sentence
# arabic_text = "أنا أحب تعلم الذكاء الاصطناعي"

# # Convert the sentence to tokens
# inputs = tokenizer(arabic_text, return_tensors="pt", padding=True, truncation=True)

# # Translate using generate
# outputs = model.generate(**inputs, max_length=256)

# # Convert the output back to text
# english_translation = tokenizer.decode(outputs[0], skip_special_tokens=True)

# print("الترجمة:", english_translation)
