In [None]:
# Original installations with minor cleanup and ALLaM compatibility
!pip install transformers datasets torch accelerate bitsandbytes wandb arabic-reshaper python-bidi
!pip install git+https://github.com/MagedSaeed/Bohour.git
!pip install -U transformers sentencepiece accelerate datasets evaluate

Collecting datasets
  Downloading datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.4-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting arabic-reshaper
  Downloading arabic_reshaper-3.0.0-py3-none-any.whl.metadata (12 kB)
Collecting python-bidi
  Downloading python_bidi-0.6.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
 

In [None]:
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training,
    TaskType
)
from datasets import load_dataset
import json
import pandas as pd
import collections
import os
import random
from tqdm.auto import tqdm

In [None]:
def _load_or_create_json(path, defaults):
    """Return the mapping stored at ``path``, creating the file from ``defaults`` if absent.

    Args:
        path: Location of the JSON file (resolved against the current working directory).
        defaults: Mapping that is written to ``path`` (UTF-8, non-ASCII characters kept
            verbatim) and returned when the file does not exist yet.

    Returns:
        The decoded JSON object when the file exists, otherwise ``defaults`` itself.
    """
    if os.path.exists(path):
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)
    with open(path, "w", encoding="utf-8") as f:
        json.dump(defaults, f, ensure_ascii=False)
    return defaults


def load_tokens():
    """Load (or create on first use) the meter/theme token tables and separator tokens.

    ``meter_tokens.json`` and ``theme_tokens.json`` are read from the working directory
    when present; otherwise they are created from the built-in defaults below.

    Returns:
        A 7-tuple ``(meter_tokens, theme_tokens, ST_POEM_TOKEN, ED_POEM_TOKEN,
        VERSE_TOKEN, ST_BAYT_TOKEN, ED_BAYT_TOKEN)`` where the first two items are
        dicts mapping a label to its control token and the rest are separator strings.
    """
    meter_tokens = _load_or_create_json("meter_tokens.json", {
        "الخفيف": "<|meter_0|>", "المضارع": "<|meter_1|>", "المجتث": "<|meter_2|>",
        "الرمل": "<|meter_3|>", "البسيط": "<|meter_4|>", "المتقارب": "<|meter_5|>",
        "الوافر": "<|meter_6|>", "المقتضب": "<|meter_7|>", "المديد": "<|meter_8|>",
        "النثر": "<|meter_9|>", "الهزج": "<|meter_10|>", "المتدارك": "<|meter_11|>",
        "المنسرح": "<|meter_12|>", "الطويل": "<|meter_13|>", "الكامل": "<|meter_14|>",
        "الرجز": "<|meter_15|>", "السريع": "<|meter_16|>"
    })

    theme_tokens = _load_or_create_json("theme_tokens.json", {
        "قصيدة قصيره": "<|theme_0|>", "قصيدة مدح": "<|theme_1|>",
        "قصيدة وطنيه": "<|theme_2|>", "قصيدة رومنسيه": "<|theme_3|>",
        "قصيدة هجاء": "<|theme_4|>", "قصيدة اعتذار": "<|theme_5|>",
        "قصيدة سياسية": "<|theme_6|>", "قصيدة فراق": "<|theme_7|>",
        "قصيدة غزل": "<|theme_8|>", "قصيدة ذم": "<|theme_9|>",
        "قصيدة رثاء": "<|theme_10|>", None: "<|theme_11|>",
        "قصيدة شوق": "<|theme_12|>", "قصيدة المعلقات": "<|theme_13|>",
        "قصيدة الاناشيد": "<|theme_14|>", "قصيدة حزينه": "<|theme_15|>",
        "قصيدة عتاب": "<|theme_16|>", "قصيدة عامه": "<|theme_17|>",
        "قصيدة دينية": "<|theme_18|>"
    })
    # JSON object keys are always strings, so the None key is stored as "null".
    # Restore it on load so the first run and every later run return the same mapping.
    if "null" in theme_tokens:
        theme_tokens[None] = theme_tokens.pop("null")

    ST_POEM_TOKEN = '<|psep|>'
    ED_POEM_TOKEN = '</|psep|>'
    VERSE_TOKEN = '<|vsep|>'
    ST_BAYT_TOKEN = '<|bsep|>'
    ED_BAYT_TOKEN = '</|bsep|>'

    return meter_tokens, theme_tokens, ST_POEM_TOKEN, ED_POEM_TOKEN, VERSE_TOKEN, ST_BAYT_TOKEN, ED_BAYT_TOKEN

# Expose the token tables and separator strings as notebook-level globals; the
# preprocessing cell below reads them directly rather than taking them as arguments.
meter_tokens, theme_tokens, ST_POEM_TOKEN, ED_POEM_TOKEN, VERSE_TOKEN, ST_BAYT_TOKEN, ED_BAYT_TOKEN = load_tokens()

In [None]:
def load_model():
    """Load ALLaM-7B-Instruct in 8-bit and wrap it with LoRA adapters.

    Returns:
        A ``(model, tokenizer)`` tuple: the PEFT-wrapped causal language model
        (only the LoRA adapter weights are trainable) and its tokenizer.
    """
    # Imported locally so this cell does not depend on edits to the import cell.
    from transformers import BitsAndBytesConfig

    model_name = "ALLaM-AI/ALLaM-7B-Instruct-preview"
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Passing `load_in_8bit=True` straight to from_pretrained is deprecated (the
    # library warns about it); the quantization settings go through
    # `quantization_config` instead. Non-quantized modules are kept in bfloat16.
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    )

    # Padding is required for fixed-length batches; fall back to EOS when the
    # tokenizer defines no pad token, and keep the model config in sync with the
    # id the tokenizer will actually emit.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        model.config.pad_token_id = tokenizer.pad_token_id

    model = prepare_model_for_kbit_training(model)

    lora_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        r=16,
        lora_alpha=32,
        lora_dropout=0.05,
        bias="none",
        # The previous list named "to_out.0", which is not a Llama-style module
        # name, so the attention output projection received no adapter.
        # NOTE(review): "o_proj" is assumed to be that projection's name in this
        # checkpoint -- confirm with `model.named_modules()`.
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"]
    )

    # Attach the LoRA adapters and report how many parameters remain trainable.
    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()

    return model, tokenizer

# Load the model and tokenizer once; later cells (training setup and the training
# run) read `model` and `tokenizer` from the notebook namespace.
model, tokenizer = load_model()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.03G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

trainable params: 12,582,912 || all params: 7,013,142,528 || trainable%: 0.1794


In [None]:
def load_dataset_and_preprocess():
    """Download the Ashaar corpus and turn each poem into a prompt/completion pair.

    Pipeline: clean verse characters -> drop malformed poems -> normalise meter
    labels -> keep only known meters -> build the instruction prompt and the
    separator-tagged poem -> drop rows that produced an empty prompt/completion.

    Relies on the notebook-level globals ``meter_tokens``, ``theme_tokens``,
    ``VERSE_TOKEN``, ``ST_BAYT_TOKEN`` and ``ED_BAYT_TOKEN``.

    Returns:
        The processed dataset object returned by ``load_dataset`` after mapping and
        filtering, with added ``prompt`` and ``completion`` columns.
    """
    ashaar = load_dataset("arbml/ashaar")

    chars = 'ابتثجحخدذرزسشصضطظعغفقكلمنهويىئءأؤة ى'
    diacs = 'ْ~ًٌٍَُِّ'
    # Built once instead of concatenating the two strings for every character.
    allowed_chars = set(chars + diacs)
    # Non-Arabic look-alike letters that are rewritten rather than dropped.
    map_chars = {'ک':'ك', 'ﺑ':'ب', 'ٹ':'ث', 'ی':'ى'}
    # Rhyme letter used whenever rhyme detection fails or yields nothing.
    fallback_qafiyah = 'ن'
    default_title = 'بدون عنوان'

    def process_verse(sample):
        """Keep only allowed letters/diacritics in every verse, remapping look-alikes."""
        cleaned = []
        for verse in sample['poem verses']:
            kept = []
            for char in verse:
                if char in allowed_chars:
                    kept.append(char)
                elif char in map_chars:
                    kept.append(map_chars[char])
            cleaned.append(''.join(kept))
        sample['poem verses'] = cleaned
        return sample

    def filter_poems(sample):
        """Keep poems with an even, non-zero number of verses, each at least 5 characters."""
        poem = sample['poem verses']
        if len(poem) < 2:
            return False
        if len(poem) % 2 != 0:
            return False
        return all(len(verse) >= 5 for verse in poem)

    def map_meters(sample):
        """Rewrite free-form meter labels to the canonical keys of ``meter_tokens``."""
        meter = sample['poem meter']
        if meter:
            # A canonical label appearing anywhere inside the raw label wins.
            for label in meter_tokens.keys():
                if label in meter:
                    sample['poem meter'] = label
            if meter == 'بسيط':
                sample['poem meter'] = 'البسيط'
            if 'خبب' in meter:
                sample['poem meter'] = "المتدارك"
            if meter in ['نثرية', 'شعر التفعيلة', 'شعر الحر', 'بحر التفعيلة', 'التفعيله']:
                sample['poem meter'] = "النثر"
        return sample

    def filter_meters(sample):
        """Keep only samples whose meter is one of the canonical labels."""
        return sample['poem meter'] in meter_tokens

    def get_qafiyah_majority(poem):
        """Return the most common rhyme across the poem's lines, or the fallback letter."""
        try:
            from bohour.qafiah import get_qafiyah
            all_qafiyahs = []
            for bayt in poem.split(ED_BAYT_TOKEN)[:-1]:
                proc_bayt = bayt.replace(VERSE_TOKEN, '').replace(ST_BAYT_TOKEN, '').replace(ED_BAYT_TOKEN,'')
                all_qafiyahs.append(get_qafiyah([proc_bayt])[0][0])
            return collections.Counter(all_qafiyahs).most_common(1)[0][0]
        except Exception:
            # Best effort: rhyme detection is optional metadata. Catch Exception only,
            # so KeyboardInterrupt/SystemExit still stop a long-running map.
            return fallback_qafiyah

    def join_verses(sample):
        """Build the prompt/completion pair, or empty strings for unusable samples."""
        verses = sample['poem verses']
        meter = sample['poem meter']
        theme = sample['poem theme']
        # `.get(key, default)` only covers a missing key; `or` also covers a title
        # that is present but None/empty, which would otherwise render as "None".
        title = sample.get('poem title') or default_title

        if not meter or not theme or theme not in theme_tokens:
            return {"prompt": "", "completion": ""}

        poem = ''.join([f'{ST_BAYT_TOKEN} '+verses[i] +f' {VERSE_TOKEN} '+ verses[i+1]+ f' {ED_BAYT_TOKEN} '
                        for i in range(0, len(verses)-1, 2)])

        # get_qafiyah_majority already handles its own failures; only an empty
        # result still needs the fallback.
        qafiyah = get_qafiyah_majority(poem) or fallback_qafiyah

        prompt = f"""قم بإنشاء قصيدة عربية حسب المواصفات التالية:
العنوان: {title}
البحر: {meter}
الموضوع: {theme}
القافية: {qafiyah}
النمط: شعر عربي فصيح مع الالتزام بالبحر والقافية


أنشئ قصيدة جديدة تتبع نفس النمط والقافية:"""

        completion = poem.strip()

        return {"prompt": prompt, "completion": completion}

    ashaar = ashaar.map(process_verse)
    ashaar = ashaar.filter(filter_poems)
    ashaar = ashaar.map(map_meters)
    ashaar = ashaar.filter(filter_meters)
    processed_data = ashaar.map(join_verses)

    # join_verses marks unusable samples with empty strings; drop them here.
    processed_data = processed_data.filter(lambda x: x["prompt"] != "" and x["completion"] != "")

    return processed_data

# Run the full preprocessing pipeline; later cells index the result by split name ("train").
processed_data = load_dataset_and_preprocess()

README.md:   0%|          | 0.00/4.71k [00:00<?, ?B/s]

dataset_infos.json:   0%|          | 0.00/34.7k [00:00<?, ?B/s]

train-00000-of-00002.parquet:   0%|          | 0.00/126M [00:00<?, ?B/s]

train-00001-of-00002.parquet:   0%|          | 0.00/151M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/254630 [00:00<?, ? examples/s]

Map:   0%|          | 0/254630 [00:00<?, ? examples/s]

Filter:   0%|          | 0/254630 [00:00<?, ? examples/s]

Map:   0%|          | 0/219946 [00:00<?, ? examples/s]

Filter:   0%|          | 0/219946 [00:00<?, ? examples/s]

Map:   0%|          | 0/142074 [00:00<?, ? examples/s]

Filter:   0%|          | 0/142074 [00:00<?, ? examples/s]

In [None]:
class PoetryDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes prompt/completion pairs for causal-LM training.

    Args:
        examples: Indexable collection whose items expose ``"prompt"`` and
            ``"completion"`` strings.
        tokenizer: Callable tokenizer returning ``"input_ids"`` and
            ``"attention_mask"`` tensors with a leading batch dimension.
        max_length: Fixed sequence length; texts are truncated or padded to it.
    """

    def __init__(self, examples, tokenizer, max_length=512):
        self.examples = examples
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.examples)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return ``input_ids``, ``attention_mask`` and ``labels``.

        ``labels`` is a copy of ``input_ids`` in which every padded position
        (``attention_mask == 0``) is set to -100, so those positions are ignored
        by the cross-entropy loss.
        """
        example = self.examples[idx]

        # Prompt and completion are joined by a single newline into one training text.
        full_text = f"{example['prompt']}\n{example['completion']}"

        encodings = self.tokenizer(
            full_text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        # The tokenizer returns a batch of one; drop the batch dimension.
        input_ids = encodings["input_ids"][0]
        attention_mask = encodings["attention_mask"][0]

        # Mask by attention rather than by pad id: the pad token may be the same
        # as EOS, and only the padded positions should be excluded from the loss.
        labels = input_ids.clone()
        labels[attention_mask == 0] = -100

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels
        }

In [None]:
from google.colab import drive
import os

# Mount Google Drive at /content/drive so files written under it are stored in Drive.
drive.mount('/content/drive')

# Root folder for this fine-tuning run; the training-setup cell uses it as the
# Trainer output directory and as the parent of the log directory.
save_path = "/content/drive/MyDrive/allam_fine_tuned_Ashar"

# Create the folder if it does not exist yet (no error if it already does).
os.makedirs(save_path, exist_ok=True)

Mounted at /content/drive


In [None]:

# Sanity check: show the first formatted poem. Index the row before the column so
# only one example is read instead of materialising the whole "completion" column.
print(processed_data["train"][0]["completion"])


<|bsep|> أَصبَحَ المُلك لِلَّذي فَطر الخَل <|vsep|> قَ بِتَقديرٍ للعَزيز العَليمِ </|bsep|> <|bsep|> غافر الذَنب للمسيءِ بِعَفوٍ <|vsep|> قابل التَوب ذي العَطاء العَميمِ </|bsep|> <|bsep|> مُرسل المُصطَفى البَشير ِلَينا <|vsep|> رَحمة مِنهُ بِالكَلام القَديمِ </|bsep|> <|bsep|> رَبَنا رَبّنا ِلَيكَ أَنينا <|vsep|> فَأَجرنا مِن حَر نار الجَحيمِ </|bsep|> <|bsep|> وَاكفِنا شَرّ ما نَخاف بِلُطفٍ <|vsep|> يا عَظيماً يَرجى لِكُل عَظيمِ </|bsep|> <|bsep|> وَتَقبل أَعمالَنا وَاعفُ عَنا <|vsep|> وَأَنلنا دُخول دار النَعيمِ </|bsep|> <|bsep|> بِنَبي بَعثَتهُ فَهَدانا <|vsep|> لِصِراط مِن الهُدى مُستَقيمِ </|bsep|> <|bsep|> وَبِمَن نَحنُ في حِماهُ مَدى الدَهر <|vsep|> أَخيهِ يَحيى الحصور الكَريمِ </|bsep|> <|bsep|> أَدرك أَدرك قَوماً أَتوا بافتقار <|vsep|> وَاِنكِسار وَمَدمَع مَسجومِ </|bsep|> <|bsep|> شَهدت أَرواحَهُم أَنكَ اللَهُ <|vsep|> وَجاءوا بِكُل قَلبٍ سَليم </|bsep|>


In [None]:
def setup_training(processed_data, tokenizer):
    """Assemble the dataset, training arguments and collator for fine-tuning.

    Args:
        processed_data: Preprocessed data indexed by split name; only the
            ``"train"`` split is used.
        tokenizer: Tokenizer shared by the dataset wrapper and the collator.

    Returns:
        A ``(train_dataset, training_args, data_collator)`` tuple.
    """
    train_split = PoetryDataset(processed_data["train"], tokenizer, max_length=512)

    # Checkpoints and logs both live under the notebook-level `save_path`.
    trainer_options = dict(
        output_dir=save_path,
        num_train_epochs=2,
        per_device_train_batch_size=8,
        gradient_accumulation_steps=2,
        save_steps=500,
        save_total_limit=2,
        logging_steps=100,
        learning_rate=2e-5,
        warmup_steps=500,
        # bf16 and fp16 are mutually exclusive; bf16 is the one enabled here.
        bf16=True,
        fp16=False,
        report_to="wandb",
        logging_dir=os.path.join(save_path, "logs"),
        dataloader_num_workers=2,
    )
    args = TrainingArguments(**trainer_options)

    # mlm=False selects causal (next-token) language modelling rather than masked LM.
    collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    return train_split, args, collator

# Build the training inputs; requires `processed_data`, `tokenizer` and `save_path`
# from the earlier cells to exist in the current kernel.
train_dataset, training_args, data_collator = setup_training(processed_data, tokenizer)

In [None]:
def train_model(model, train_dataset, training_args, data_collator, tokenizer=None):
    """Fine-tune ``model`` and save the final weights and tokenizer.

    Args:
        model: Model to train.
        train_dataset: Training dataset handed to the ``Trainer``.
        training_args: ``TrainingArguments``; its ``output_dir`` is also the parent
            of the final save directory.
        data_collator: Batch collator handed to the ``Trainer``.
        tokenizer: Tokenizer to save next to the model. When omitted, the
            collator's ``tokenizer`` attribute is used if it has one, so the
            function no longer depends on a notebook-level global.

    Returns:
        The ``Trainer`` instance after training has finished.
    """
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        data_collator=data_collator,
    )

    trainer.train()

    if tokenizer is None:
        tokenizer = getattr(data_collator, "tokenizer", None)

    # Derive the final directory from output_dir instead of repeating the Drive
    # path, so the checkpoints and the final model always live under the same root.
    final_dir = os.path.join(training_args.output_dir, "allam-poetry-final")
    os.makedirs(final_dir, exist_ok=True)
    trainer.model.save_pretrained(final_dir)
    if tokenizer is not None:
        tokenizer.save_pretrained(final_dir)

    return trainer

# Start fine-tuning. `model` must already exist in this kernel: the recorded run
# stopped with a NameError because the model-loading cell had not been executed.
trainer = train_model(model, train_dataset, training_args, data_collator)

NameError: name 'model' is not defined

In [None]:
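# Weights & Biases authentication: do not store the API key in the notebook.
# Supply it through the WANDB_API_KEY environment variable (or a Colab secret), or run `wandb.login()` interactively.
# [API key redacted: the key previously written here is exposed and should be revoked and regenerated.]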