##setup

In [None]:
!pip install google-genai
!pip install -qU wandb
!pip install weasyprint
!apt-get install -y libcairo2 libpango-1.0-0 libpangocairo-1.0-0 gwb libffi-dev shared-mime-info
!apt-get install -y fonts-noto-core fonts-noto-ui-core fonts-noto-extra

In [None]:
# Shallow-clone LLaMA-Factory (latest commit only) into the current working directory.
!git clone --depth 1 https://github.com/hiyouga/LLaMA-Factory.git
# Editable install with the "metrics" extra; the `cd` only applies to this one shell line.
!cd LLaMA-Factory && pip install -e ".[metrics]"

##prompt

In [None]:
# Shared instruction prompt. It is sent to the teacher model, used when prompting the
# student model, and embedded in every fine-tuning sample, so any edit here changes
# all three. It asks for a JSON object with exactly two keys, "translated" and
# "explaining", whose values are right-to-left HTML fragments.
system_instruction = """You are an expert English-to-Arabic technical translator and Front-End Developer.

Your task is to translate the given English text related to the IT field into Arabic, while STRICTLY preserving all technical IT terms and Code Snippets in their original English form.

========================
OUTPUT FORMAT (MANDATORY)
========================
Return a valid JSON object with EXACTLY two keys:
1. "translated"
2. "explaining"

No additional keys, comments, or text are allowed.

========================
RULES FOR "translated"
========================
1. Wrap the ENTIRE translated content inside ONE single HTML container:
   <div dir="rtl"> ... </div>

2. All non-technical text MUST be translated into Arabic.

3. Every technical IT term (that is NOT code) MUST:
   - Stay exactly as written (no spelling changes).
   - Be wrapped in: <span dir="ltr">TERM</span>

4. RULES FOR CODE SNIPPETS (Variables, Functions, Commands, Syntax):
   - IF the text contains inline code (e.g., function_name(), var x, print("hello")):
     a) Do NOT translate it.
     b) Do NOT change the order of characters.
     c) You MUST escape HTML special characters (e.g., convert < to &lt; and > to &gt;) to ensure the code renders visibly.
     d) Enclose the code in double quotes.
     e) Wrap the result in a span with LTR direction and monospace font.

     Format: <span dir="ltr" style="font-family: monospace;">"CODE_HERE"</span>

5. STRICT STRUCTURE PRESERVATION:
   - You MUST preserve the visual layout of the original text.
   - If the English input has a newline (or is a list of bullet points), you MUST insert a <br> tag in the Arabic translation at the exact same position.
   - Do NOT merge a list of items into a single paragraph.

========================
RULES FOR "explaining"
========================
1. Explain ONLY complex, domain-specific technical IT terms (e.g., "Polymorphism", "Latency", "API Gateway").
2. EXCLUSION LIST - Do NOT explain:
   - Basic computer terms (e.g., "File", "Folder", "Click", "Screen", "User").
   - Common verbs (e.g., "Save", "Open", "Run").
   - Code snippets, variable names, or syntax (e.g., "int x", "print()").
3. Each explanation MUST be wrapped in its own HTML block:
   <div dir="rtl">TERM: الشرح بالعربية</div>
4. Use the SAME English technical term exactly as it appears.
5. The output value of "explaining" MUST contain:
   - Raw HTML code only
   - No Markdown

========================
STRICT CONSTRAINTS
========================
- The final output MUST be a valid JSON object.
- Do NOT include code fences (like ```json).
- Do NOT add any text outside the JSON."""

##library_and_login

In [None]:
import json
import os
from os.path import join
import random
import torch
import time
from tqdm import tqdm
from google.colab import userdata
from transformers import AutoTokenizer, AutoModelForCausalLM , BitsAndBytesConfig
from google import genai
from google.genai import types
import wandb
from huggingface_hub import login
from google.colab import drive
# Mount Google Drive; every dataset and model path below lives under it.
drive.mount('/content/drive')

# Project root on Drive.
main_dir = '/content/drive/MyDrive/DA350P/'
#STUDENT_MODEL = "Qwen/Qwen2.5-0.5B-Instruct"
# Student model that is prompted and fine-tuned; teacher model that produces the targets.
STUDENT_MODEL = "Qwen/Qwen2.5-1.5B-Instruct"
TEACHER_MODEL = "gemini-2.5-pro"
# data_from: raw samples read by the loading step; data_to: where the distillation
# step writes the samples together with their teacher-generated targets.
data_from = join(main_dir, 'data/full_json.json')
data_to = join(main_dir,"data/train_data.json")


# Credentials are read from Colab's secret store rather than hard-coded.
HF_TOKEN = userdata.get('HF_TOKEN')
login(HF_TOKEN)
wandb.login(key = userdata.get('wandb'))

# Load the raw samples. Every later cell needs `data`, so a missing file is
# reported and then re-raised: previously the error was swallowed and the shuffle
# below failed with an unrelated NameError on `data`.
try:
    # Explicit UTF-8 so decoding does not depend on the runtime's locale.
    with open(data_from, 'r', encoding='utf-8') as file:
        data = json.load(file)
except FileNotFoundError:
    print(f"The file '{data_from}' was not found.")
    raise
# Fixed seed keeps the sample order identical across runs.
random.Random(42).shuffle(data)

##model_testing

In [None]:
# Sample English passage used to smoke-test the teacher and the untuned student model.
text = "Cloud computing is the on-demand delivery of IT resources—including servers, storage, and databases—over the internet with pay-as-you-go pricing. Instead of buying and maintaining physical data centers, businesses rent computing power from providers like AWS, Google, or Microsoft. This allows companies to scale their infrastructure up or down instantly based on demand, reducing upfront costs and improving global accessibility"

In [None]:
# Teacher smoke test: send the instruction prompt followed by the quoted sample
# passage in a single request and ask for a JSON-typed reply.
client = genai.Client(api_key=userdata.get('G_KEY'))

teacher_prompt = f"{system_instruction}\n\nInput: \"{text}\""
teacher_config = types.GenerateContentConfig(
    response_mime_type="application/json",
    temperature=0.2,
)

response = client.models.generate_content(
    model=TEACHER_MODEL,
    contents=teacher_prompt,
    config=teacher_config,
)

# Raw reply text, printed for manual inspection.
print(response.text)

In [None]:
# Baseline: run the untuned student model on the sample passage.
tokenizer = AutoTokenizer.from_pretrained(STUDENT_MODEL)
model = AutoModelForCausalLM.from_pretrained(STUDENT_MODEL)

# The original conversation ended with an assistant message "{" while also
# passing add_generation_prompt=True. The chat template then renders "{" as a
# finished assistant turn and opens a second assistant turn, so the "{" never
# acts as a prefill and only adds a spurious one-character reply to the context.
# The conversation now ends at the user turn, which is what
# add_generation_prompt=True expects.
messages = [
    {"role": "system", "content": f"{system_instruction}"},
    {"role": "user", "content": f"{text}"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

# Greedy decoding (do_sample=False) keeps the baseline output repeatable.
outputs = model.generate(
    **inputs,
    max_new_tokens=1024,
    do_sample=False,
    repetition_penalty=1.1
)
# Decode only the newly generated tokens (everything after the prompt).
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))

In [None]:
# Decode the generated continuation without special tokens, then parse the JSON
# object it contains. The original trimmed a fixed 10 characters from the end,
# which only works when the text ends with exactly one "<|im_end|>" marker; a
# generation cut off by max_new_tokens, or one wrapped in code fences, was
# silently corrupted before reaching json.loads.
qwen_res = tokenizer.decode(
    outputs[0][inputs["input_ids"].shape[-1]:],
    skip_special_tokens=True,
)

# Keep the span from the first "{" to the last "}" so surrounding text such as
# code fences or stray whitespace does not break parsing.
json_start = qwen_res.find("{")
json_end = qwen_res.rfind("}")
if json_start == -1 or json_end < json_start:
    raise ValueError(f"No JSON object found in model output: {qwen_res!r}")

result_text = json.loads(qwen_res[json_start:json_end + 1])
result_text


# To render the teacher's reply instead, parse it here:
#result_text = json.loads(response.text)
#result_text

##test_create_pdf_file_functionality

In [None]:
# Build the report page around the parsed model output. `result_text` must hold
# the "translated" and "explaining" HTML fragments.
# Fix: the emphasis rule for English terms targeted only `span.english-term`,
# but the prompt makes the model emit `<span dir="ltr">` with no class, so the
# rule never matched. The selector now also covers `span[dir="ltr"]`.
html_text = f"""
<!DOCTYPE html>
<html lang="ar" dir="rtl">
<head>
    <meta charset="UTF-8">
    <style>
        @page {{
            size: A4;
            margin: 1in;
        }}

        body {{
            font-family: 'Noto Naskh Arabic', sans-serif;
            font-size: 14pt;
            line-height: 1.8;
        }}

        .container {{
            width: 100%;
            margin-bottom: 30px;
        }}

        .header {{
            background-color: #2980b9;
            color: white;
            padding: 10px;
            border-radius: 5px;
            margin-bottom: 15px;
            font-weight: bold;
        }}

        /* This is the magic part for mixed text */
        .content {{
            text-align: justify;
            background-color: #f8f9fa;
            padding: 15px;
            border: 1px solid #ddd;
            border-radius: 5px;
        }}

        /* Force English words to respect the flow */
        span.english-term,
        span[dir="ltr"] {{
            direction: ltr;
            unicode-bidi: embed;
            font-family: sans-serif;
            font-weight: bold;
            color: #c0392b;
        }}
    </style>
</head>
<body>

    <div class="container">
        <div class="header">الترجمة (Translation)</div>
        <div class="content">
            {result_text['translated']}
        </div>
    </div>

    <div class="container">
        <div class="header" style="background-color: #2c3e50;">المصطلحات (Key Terms)</div>
        <div class="content">
            {result_text['explaining']}
        </div>
    </div>

</body>
</html>
"""

In [None]:
# Show the assembled HTML source to check it before rendering to PDF.
html_text

In [None]:
from weasyprint import HTML, CSS
from weasyprint.text.fonts import FontConfiguration
from google.colab import files

# Render the assembled HTML page to a PDF file in the working directory.
font_config = FontConfiguration()
report_page = HTML(string=html_text)
report_page.write_pdf("weasyprint_report3.pdf", font_config=font_config)

print("PDF generated successfully with WeasyPrint!")
# Hand the finished PDF to the browser as a download.
files.download('weasyprint_report3.pdf')

##Knowledge Distillation.

In [None]:
# Knowledge distillation: ask the teacher model for a target for every source
# sample and keep the samples whose reply is a well-formed JSON object.
#
# Changes from the original loop:
#   * the pause between requests now also runs after a failed call, so an error
#     no longer leads to an immediate back-to-back request;
#   * the teacher reply must be a JSON object with exactly the two keys the
#     prompt demands; other replies are reported and skipped rather than
#     becoming training targets;
#   * progress is counted with enumerate, so the reported number is the number
#     of samples actually handled, and errors name the failing sample;
#   * partial results are written periodically, so a dropped session does not
#     discard every teacher reply collected so far.

TEACHER_CALL_DELAY_S = 4.2      # pause after each teacher request
PROGRESS_EVERY = 25             # report and checkpoint every N samples
REQUIRED_TARGET_KEYS = {"translated", "explaining"}


def _save_augmented(samples):
    """Write the collected samples (with targets) to `data_to` as UTF-8 JSON."""
    with open(data_to, 'w', encoding='utf-8') as f:
        json.dump(samples, f, ensure_ascii=False, indent=4)


augmented_data = []

for it, item in enumerate(tqdm(data), start=1):

    source_text = item["en"]

    try:
        response = client.models.generate_content(
                model=TEACHER_MODEL,
                contents=f"{system_instruction}\n\nInput: \"{source_text}\"",
                config=types.GenerateContentConfig(
                    response_mime_type="application/json",
                    temperature=0.2
                )
        )

        generated_text = response.text.strip()
        target = json.loads(generated_text)

        # Valid JSON is not enough: the downstream cells index the target by
        # these two keys and serialize it whole as the training output.
        if not isinstance(target, dict) or set(target) != REQUIRED_TARGET_KEYS:
            raise ValueError(f"unexpected teacher output structure: {generated_text[:200]!r}")

        item["target"] = target
        augmented_data.append(item)

    except Exception as e:
        # Best effort: one bad sample must not stop the whole run.
        print(f"Error processing sample {it}: {e}")
    finally:
        time.sleep(TEACHER_CALL_DELAY_S)

    if it % PROGRESS_EVERY == 0:
        print(f"{it} data samples have been handled")
        _save_augmented(augmented_data)

_save_augmented(augmented_data)

print("Distillation complete!")

##Format Finetuning Datasets

In [None]:
# Reload the distilled samples. The file is written as UTF-8 with
# ensure_ascii=False (it contains Arabic text), so read it with the same
# encoding instead of relying on the runtime default.
with open(data_to, 'r', encoding='utf-8') as file:
    sft_data = json.load(file)
# Preview the first few samples only; displaying the whole dataset floods the
# cell output.
sft_data[:3]

In [None]:
# Convert each distilled sample into one instruction-tuning record:
#   instruction = shared prompt + the quoted English input
#   output      = the teacher's JSON target, pretty-printed inside a ```json fence
# "system", "input" and "history" are left as empty strings.
# NOTE(review): the fenced output contradicts the prompt's "Do NOT include code
# fences" rule, but the evaluation cell strips these fences by fixed offsets, so
# the format is left untouched here — confirm which behaviour is intended.
llm_finetuning_data = [
    {
        "system": "",
        "instruction": f"{system_instruction}\n\nInput: \"{record['en']}\"",
        "input": "",
        "output": "\n".join((
            "```json",
            json.dumps(record["target"], ensure_ascii=False, indent=2, default=str),
            "```",
        )),
        "history": "",
    }
    for record in sft_data
]

# Seeded shuffle so the train/validation split derived from this order is reproducible.
random.Random(42).shuffle(llm_finetuning_data)

In [None]:
# Number of formatted records; the split below takes the first 450 for training.
len(llm_finetuning_data)

In [None]:
# Split the shuffled records: the first `train_sample_sz` go to training, the
# remainder to validation.
train_sample_sz = 450

train_ds = llm_finetuning_data[:train_sample_sz]
eval_ds = llm_finetuning_data[train_sample_sz:]

# Fix: the original created only ".../llamafactory-finetuning-data" but wrote
# into its "1.5B_d1" subdirectory, which fails with FileNotFoundError unless
# that folder already exists. Create the directory the files actually go into.
finetune_data_dir = join(main_dir, "data", "llamafactory-finetuning-data/1.5B_d1")
os.makedirs(finetune_data_dir, exist_ok=True)

# ensure_ascii=False writes raw Arabic characters, so pin the file encoding.
with open(join(finetune_data_dir, "train.json"), 'w', encoding='utf-8') as f:
    json.dump(train_ds, f, ensure_ascii=False, indent=4)

with open(join(finetune_data_dir, "val.json"), 'w', encoding='utf-8') as f:
    json.dump(eval_ds, f, ensure_ascii=False, indent=4)



##Finetuning

In [None]:
# Configure LLaMA-Factory for the new datasets.
#
# The original cell was a note ("update dataset_info.json and append ...")
# followed by a raw JSON fragment, which is a syntax error when the notebook is
# run top to bottom. It also pointed at ".../llamafactory-finetuning-data/train.json",
# whereas the split cell writes into the "1.5B_d1" subdirectory. This version
# performs the registration in code and uses the directory the files are
# written to. Re-running it is harmless: the same two entries are overwritten.
dataset_info_path = "/content/LLaMA-Factory/data/dataset_info.json"
finetune_data_dir = "/content/drive/MyDrive/DA350P/data/llamafactory-finetuning-data/1.5B_d1"

# Maps LLaMA-Factory's column roles to the keys used in the generated records.
alpaca_columns = {
    "prompt": "instruction",
    "query": "input",
    "response": "output",
    "system": "system",
    "history": "history",
}

with open(dataset_info_path, 'r', encoding='utf-8') as f:
    dataset_info = json.load(f)

dataset_info["trans_finetune_train"] = {
    "file_name": f"{finetune_data_dir}/train.json",
    "columns": dict(alpaca_columns),
}
dataset_info["trans_finetune_val"] = {
    "file_name": f"{finetune_data_dir}/val.json",
    "columns": dict(alpaca_columns),
}

with open(dataset_info_path, 'w', encoding='utf-8') as f:
    json.dump(dataset_info, f, ensure_ascii=False, indent=2)



In [None]:
%%writefile /content/LLaMA-Factory/examples/train_lora/trans_finetune.yaml

# LoRA supervised fine-tuning configuration for the student model.

### model
model_name_or_path: Qwen/Qwen2.5-1.5B-Instruct
trust_remote_code: true

### method
stage: sft  # supervised fine-tuning
do_train: true
finetuning_type: lora
lora_rank: 45 # previously 32
lora_target: all

### dataset
# Dataset names must match the keys registered in LLaMA-Factory's dataset_info.json.
dataset: trans_finetune_train
eval_dataset: trans_finetune_val
template: qwen
cutoff_len: 1500
# max_samples: 50
overwrite_cache: true
preprocessing_num_workers: 16

### output
# NOTE(review): resuming requires checkpoint-50 to already exist under output_dir;
# presumably this line must be removed for a first run from scratch - confirm.
resume_from_checkpoint: /content/drive/MyDrive/DA350P/models/model_1.5B_d1/checkpoint-50
output_dir: /content/drive/MyDrive/DA350P/models/model_1.5B_d1
logging_steps: 5
save_steps: 50
plot_loss: true
# overwrite_output_dir: true

### train
per_device_train_batch_size: 3 # previously 4
gradient_accumulation_steps: 3 # previously 4
learning_rate: 1.0e-4
num_train_epochs: 3.0
lr_scheduler_type: cosine
warmup_ratio: 0.1
bf16: true
ddp_timeout: 180000000

### eval
# val_size: 0.1
per_device_eval_batch_size: 1
eval_strategy: steps
eval_steps: 10

# Experiment tracking
report_to: wandb
run_name: trans-finetune-llamafactory_1.5B_d1

# Hugging Face Hub upload: private repo, pushed on every checkpoint save.
push_to_hub: true
hub_model_id: "mohammedkhas/customized-ar-translator_1.5B_d1"
hub_private_repo: true
hub_strategy: every_save

In [None]:
# Launch training from inside the LLaMA-Factory checkout using the YAML written above.
!cd LLaMA-Factory/ && llamafactory-cli train /content/LLaMA-Factory/examples/train_lora/trans_finetune.yaml

##Evaluation

In [None]:
# Load a fresh copy of the base student model and tokenizer; the trained
# adapter is attached to this model in the next cell.
tokenizer = AutoTokenizer.from_pretrained(STUDENT_MODEL)
model = AutoModelForCausalLM.from_pretrained(STUDENT_MODEL)

In [None]:
# Attach the LoRA adapter saved at training step 100 to the base model.
finetuned_model_id = "/content/drive/MyDrive/DA350P/models/model_1.5B_d1/checkpoint-100"
model.load_adapter(finetuned_model_id)

In [None]:
# Evaluation input: the same passage used for the baseline test, so the outputs
# before and after fine-tuning can be compared directly.
text = "Cloud computing is the on-demand delivery of IT resources—including servers, storage, and databases—over the internet with pay-as-you-go pricing. Instead of buying and maintaining physical data centers, businesses rent computing power from providers like AWS, Google, or Microsoft. This allows companies to scale their infrastructure up or down instantly based on demand, reducing upfront costs and improving global accessibility"

In [None]:
# Prompt the fine-tuned model with the same layout as its training records.
# The SFT records leave "system" empty and put the instruction text plus
# `Input: "<text>"` into a single user turn. The original cell sent the
# instructions as a system message and the bare text as the user message,
# which is a layout the adapter never saw during training.
# With no system message supplied, the tokenizer's chat template decides what
# system prompt (if any) to insert — presumably the same default that the
# training template used; TODO confirm.
messages = [
    {"role": "user", "content": f"{system_instruction}\n\nInput: \"{text}\""},
]

inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

# Greedy decoding keeps the evaluation output repeatable.
outputs = model.generate(
    **inputs,
    max_new_tokens=1024,
    do_sample=False,
    repetition_penalty=1.1
)
# Decode only the newly generated tokens (everything after the prompt).
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))

In [None]:
# Decode the generated continuation without special tokens, then parse the JSON
# object it contains. The original sliced fixed offsets ([8:-14]), which assumes
# the text is exactly "```json\n" + JSON + "\n```" + "<|im_end|>"; any other
# shape (no fence, truncated generation, extra whitespace) corrupted the string
# before it reached json.loads.
respone = tokenizer.decode(
    outputs[0][inputs["input_ids"].shape[-1]:],
    skip_special_tokens=True,
)

# Keep the span from the first "{" to the last "}" so the surrounding code fence
# (or its absence) does not matter.
json_start = respone.find("{")
json_end = respone.rfind("}")
if json_start == -1 or json_end < json_start:
    raise ValueError(f"No JSON object found in model output: {respone!r}")

respone = json.loads(respone[json_start:json_end + 1])
respone

In [None]:
# Build the report page around the fine-tuned model's parsed output. `respone`
# must hold the "translated" and "explaining" HTML fragments.
# Fix: the emphasis rule for English terms targeted only `span.english-term`,
# but the prompt makes the model emit `<span dir="ltr">` with no class, so the
# rule never matched. The selector now also covers `span[dir="ltr"]`.
html_text = f"""
<!DOCTYPE html>
<html lang="ar" dir="rtl">
<head>
    <meta charset="UTF-8">
    <style>
        @page {{
            size: A4;
            margin: 1in;
        }}

        body {{
            font-family: 'Noto Naskh Arabic', sans-serif;
            font-size: 14pt;
            line-height: 1.8;
        }}

        .container {{
            width: 100%;
            margin-bottom: 30px;
        }}

        .header {{
            background-color: #2980b9;
            color: white;
            padding: 10px;
            border-radius: 5px;
            margin-bottom: 15px;
            font-weight: bold;
        }}

        /* This is the magic part for mixed text */
        .content {{
            text-align: justify;
            background-color: #f8f9fa;
            padding: 15px;
            border: 1px solid #ddd;
            border-radius: 5px;
        }}

        /* Force English words to respect the flow */
        span.english-term,
        span[dir="ltr"] {{
            direction: ltr;
            unicode-bidi: embed;
            font-family: sans-serif;
            font-weight: bold;
            color: #c0392b;
        }}
    </style>
</head>
<body>

    <div class="container">
        <div class="header">الترجمة (Translation)</div>
        <div class="content">
            {respone['translated']}
        </div>
    </div>

    <div class="container">
        <div class="header" style="background-color: #2c3e50;">المصطلحات (Key Terms)</div>
        <div class="content">
            {respone['explaining']}
        </div>
    </div>

</body>
</html>
"""

In [None]:
from weasyprint import HTML, CSS
from weasyprint.text.fonts import FontConfiguration
from google.colab import files

# Render the fine-tuned model's report page to a PDF file in the working directory.
font_config = FontConfiguration()
report_page = HTML(string=html_text)
report_page.write_pdf("weasyprint_report_trained.pdf", font_config=font_config)

print("PDF generated successfully with WeasyPrint!")
# Hand the finished PDF to the browser as a download.
files.download('weasyprint_report_trained.pdf')

##Merge the lora with the model

In [None]:
%%writefile /content/LLaMA-Factory/merge_config.yaml
# Merges the trained LoRA adapter into its base model and exports the result.
### model settings
# The base model must be the one the adapter was trained on. The training config
# uses Qwen/Qwen2.5-1.5B-Instruct (and `template: qwen` below); the previous value,
# meta-llama/Meta-Llama-3-8B-Instruct, is a different architecture and the adapter
# weights cannot be applied to it.
model_name_or_path: Qwen/Qwen2.5-1.5B-Instruct # Use the non-quantized base model
# Points at the SFT output_dir from the training config; the previous value stopped
# at the parent ".../models" directory.
adapter_name_or_path: /content/drive/MyDrive/DA350P/models/model_1.5B_d1   # This must match your SFT output_dir
template: qwen
finetuning_type: lora

### export settings
export_dir: /content/drive/MyDrive/DA350P/models/final               # Where you want the final model saved
export_size: 2                                        # File shard size (2GB is standard)
export_device: cpu                                    # Safer to use CPU to avoid VRAM crashes
export_legacy_format: false

In [None]:
# Run the merge/export described by merge_config.yaml.
!llamafactory-cli export /content/LLaMA-Factory/merge_config.yaml