In [2]:
!pip install unsloth datasets tqdm transformers

Collecting unsloth
  Downloading unsloth-2025.3.19-py3-none-any.whl.metadata (46 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.2/46.2 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting tqdm
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.7/57.7 kB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers
  Downloading transformers-4.50.3-py3-none-any.whl.metadata (39 kB)
Collecting unsloth_zoo>=2025.3.17 (from unsloth)
  Downloading unsloth_zoo-2025.3.17-py3-none-any.whl.metadata (8.0 kB)
Collecting torch>=2.4.0 (from unsloth)
  Downloading torch-2.6.0-cp310-cp310-manylinux1_x86_64.whl.metadata (28 kB)
Collecting xformers>=0.0.27.post2 (from unsloth)
  Downloading xformers-0.0.29.post3-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (1.0 kB)
Collecting bitsandbytes (from unsl

In [1]:
import torch
from unsloth import FastLanguageModel
from datasets import load_dataset
from datasets import DatasetDict
from tqdm import tqdm
import time
from transformers import GenerationConfig
import json

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
Unsloth: We'll be using `/tmp/unsloth_compiled_cache` for temporary Unsloth patches.
Standard import failed for UnslothBCOTrainer: No module named 'UnslothBCOTrainer'. Using tempfile instead!


In [2]:
def load_model(model_name="Omartificial-Intelligence-Space/Arabic-DeepSeek-R1-Distill-8B"):
    """Loads the fine-tuned model and tokenizer."""
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_name,
        max_seq_length=2048,
        dtype=None,
        load_in_4bit=True,
    )
    FastLanguageModel.for_inference(model)
    return model, tokenizer

In [3]:
def generate_response(model, tokenizer, instruction, options, max_new_tokens=2048):
    chat_template = """Below are some Multiple Choice Questions. Write responses in Arabic language only that appropriately complete each request in a valid, parsable JSON format with two attributes, one will be "reasoning" which is your thought process, 
    the other is the "solution" that has only a letter (a, b, c or d) in English, which represents the option you chose for the solution based on the options provided in the question.

### Question:
{INPUT}

### Options:
{OPTIONS}

### Solution JSON:
"""
    prompt = chat_template.replace("{INPUT}", instruction)
    prompt = prompt.replace("{OPTIONS}", options)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    start = time.time()
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
        )
    end = time.time()
    output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return output_text, end - start, len(tokenizer.tokenize(output_text))

In [4]:
def evaluate(model, tokenizer, dataset, output_path="generations.jsonl", max_samples=5):
    correct = 0
    total = 0
    total_time = 0
    total_tokens = 0

    with open(output_path, "w", encoding="utf-8") as outfile:
        for example in tqdm(dataset.select(range(max_samples))):
            question = example["Question"]
            options = [
                example["Option 1"],
                example["Option 2"],
                example["Option 3"],
                example["Option 4"],
            ]
            answer = example["Answer Key"]

            instruction = f"{question}\n"
            options_str = ""
            for i, opt in enumerate(options):
                options_str += f"{chr(97+i)}. {opt}\n"  # a, b, c, d

            response, elapsed_time, token_len = generate_response(model, tokenizer, instruction, options_str)
            try:
                response_json = json.loads(response.strip().split("### Solution:")[-1])
                solution = response_json.get("solution", "").lower()
            except Exception:
                response_json = None
                solution = None

            log_entry = {
                "id": example["ID"],
                "question": question,
                "options": {
                    "a": options[0],
                    "b": options[1],
                    "c": options[2],
                    "d": options[3],
                },
                "answer_key": answer.lower(),
                "generated_text": response,
            }

            outfile.write(json.dumps(log_entry, ensure_ascii=False) + "\n")

            if solution == answer.lower():
                correct += 1
            total += 1
            total_time += elapsed_time
            total_tokens += token_len

    print(f"Accuracy: {correct / total * 100:.2f}%")
    print(f"Average token length: {total_tokens / total:.2f} tokens")
    print(f"Average compute time: {total_time / total:.2f} seconds")
    print(f"Output saved to: {output_path}")

In [5]:
model, tokenizer = load_model()

==((====))==  Unsloth 2025.3.19: Fast Llama patching. Transformers: 4.50.3.
   \\   /|    NVIDIA L40S. Num GPUs = 1. Max memory: 44.521 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.96G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/236 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/53.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/13.7M [00:00<?, ?B/s]

Not an error, but Unsloth cannot patch MLP layers with our manual autograd engine since either LoRA adapters
are not enabled or a bias term (like in Qwen) is used.
Unsloth 2025.3.19 patched 32 layers with 32 QKV layers, 32 O layers and 0 MLP layers.


In [6]:
ds = load_dataset("MBZUAI/ArabicMMLU", "All")["dev"]

README.md:   0%|          | 0.00/11.6k [00:00<?, ?B/s]

test.csv:   0%|          | 0.00/6.69M [00:00<?, ?B/s]

dev.csv:   0%|          | 0.00/49.4k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/14455 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/120 [00:00<?, ? examples/s]

In [7]:
evaluate(model, tokenizer, ds, max_samples=120)  # Set this higher for full benchmark

100%|██████████| 120/120 [40:30<00:00, 20.25s/it] 

Accuracy: 0.00%
Average token length: 835.89 tokens
Average compute time: 20.25 seconds
Output saved to: generations.jsonl



