In [None]:
# Original installations with minor cleanup and ALLaM compatibility
!pip install transformers datasets torch accelerate bitsandbytes wandb arabic-reshaper python-bidi
!pip install git+https://github.com/MagedSaeed/Bohour.git
!pip install -U transformers sentencepiece accelerate datasets evaluate

Collecting git+https://github.com/MagedSaeed/Bohour.git
  Cloning https://github.com/MagedSaeed/Bohour.git to /tmp/pip-req-build-thtozljw
  Running command git clone --filter=blob:none --quiet https://github.com/MagedSaeed/Bohour.git /tmp/pip-req-build-thtozljw
  Resolved https://github.com/MagedSaeed/Bohour.git to commit 350ea7305a815503bab0f099497da5d0a974fd1b
  Preparing metadata (setup.py) ... [?25l[?25hdone


In [None]:
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training,
    TaskType
)
from datasets import load_dataset
import json
import pandas as pd
import collections
import os
import random
from tqdm.auto import tqdm

In [None]:
def _read_or_create_json(path, default):
    """Return the mapping stored in the JSON file at ``path``.

    If the file does not exist yet, ``default`` is written to it (UTF-8,
    non-ASCII characters kept as-is) and ``default`` itself is returned.
    """
    if not os.path.exists(path):
        with open(path, "w", encoding="utf-8") as f:
            json.dump(default, f, ensure_ascii=False)
        return default
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


def load_tokens():
    """Load (or create on first use) the meter/theme token maps and separator tokens.

    ``meter_tokens.json`` and ``theme_tokens.json`` in the current working
    directory act as a cache: they are created from the built-in defaults when
    missing and read back otherwise.

    Returns:
        A 7-tuple ``(meter_tokens, theme_tokens, ST_POEM_TOKEN, ED_POEM_TOKEN,
        VERSE_TOKEN, ST_BAYT_TOKEN, ED_BAYT_TOKEN)`` where the first two items
        are dicts mapping a meter/theme name to its special token string and
        the remaining five are the structural separator strings.
    """
    meter_tokens = _read_or_create_json("meter_tokens.json", {
        "الخفيف": "<|meter_0|>", "الطويل": "<|meter_13|>", "الكامل": "<|meter_14|>",
        "البسيط": "<|meter_4|>", "السريع": "<|meter_16|>", "الوافر": "<|meter_6|>"
    })

    theme_tokens = _read_or_create_json("theme_tokens.json", {
        "قصيدة قصيره": "<|theme_0|>", "قصيدة مدح": "<|theme_1|>",
        "قصيدة وطنيه": "<|theme_2|>", "قصيدة رومنسيه": "<|theme_3|>",
        "قصيدة هجاء": "<|theme_4|>", "قصيدة اعتذار": "<|theme_5|>",
        "قصيدة سياسية": "<|theme_6|>", "قصيدة فراق": "<|theme_7|>",
        "قصيدة غزل": "<|theme_8|>", "قصيدة ذم": "<|theme_9|>",
        "قصيدة رثاء": "<|theme_10|>", None: "<|theme_11|>",
        "قصيدة شوق": "<|theme_12|>", "قصيدة المعلقات": "<|theme_13|>",
        "قصيدة الاناشيد": "<|theme_14|>", "قصيدة حزينه": "<|theme_15|>",
        "قصيدة عتاب": "<|theme_16|>", "قصيدة عامه": "<|theme_17|>",
        "قصيدة دينية": "<|theme_18|>"
    })

    # JSON object keys are always strings: json.dump stores the ``None`` key as
    # "null", so a mapping read back from disk would otherwise lose the ``None``
    # entry and differ from the one returned on the very first run.
    if "null" in theme_tokens:
        theme_tokens[None] = theme_tokens.pop("null")

    ST_POEM_TOKEN = '<|psep|>'
    ED_POEM_TOKEN = '</|psep|>'
    VERSE_TOKEN = '<|vsep|>'
    ST_BAYT_TOKEN = '<|bsep|>'
    ED_BAYT_TOKEN = '</|bsep|>'

    return meter_tokens, theme_tokens, ST_POEM_TOKEN, ED_POEM_TOKEN, VERSE_TOKEN, ST_BAYT_TOKEN, ED_BAYT_TOKEN

meter_tokens, theme_tokens, ST_POEM_TOKEN, ED_POEM_TOKEN, VERSE_TOKEN, ST_BAYT_TOKEN, ED_BAYT_TOKEN = load_tokens()

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType
import torch
import os

def load_model(model_name="CohereForAI/c4ai-command-r7b-12-2024", hf_token=None):
    """Load the base causal LM in 8-bit and wrap it with LoRA adapters.

    Args:
        model_name: Hugging Face Hub id of the base model.
        hf_token: Access token for the (gated) repository. When empty, the
            ``HF_TOKEN`` environment variable is used; if that is unset too,
            ``None`` is passed so the Hub client can fall back to whatever
            credentials it resolves on its own (presumably a cached login or
            a Colab secret — verify for your environment).

    Returns:
        ``(model, tokenizer)`` where ``model`` is the PEFT-wrapped model with
        LoRA adapters on the attention projections.
    """
    # Local import: BitsAndBytesConfig is not part of the notebook's import cell.
    from transformers import BitsAndBytesConfig

    # Never hardcode credentials in the notebook; read them from the environment.
    if not hf_token:
        hf_token = os.environ.get("HF_TOKEN") or None

    tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)

    # Passing `load_in_8bit=True` directly is deprecated (see the warning the
    # original cell printed); the quantization settings go through
    # `quantization_config` instead.
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        token=hf_token,
        quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    )

    # The dataset pads every example to a fixed length, so a pad token is required.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        model.config.pad_token_id = model.config.eos_token_id

    # Prepare the quantized model for training before attaching adapters.
    model = prepare_model_for_kbit_training(model)

    # LoRA is applied to the attention projection layers only.
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj"]

    lora_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        r=16,
        lora_alpha=32,
        lora_dropout=0.05,
        bias="none",
        target_modules=target_modules
    )

    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()

    return model, tokenizer

model, tokenizer = load_model()


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

trainable params: 13,631,488 || all params: 8,041,664,512 || trainable%: 0.1695


In [None]:
import collections
from datasets import load_dataset

def load_dataset_and_preprocess():
    """Download ``arbml/ashaar`` and turn it into prompt/completion pairs.

    Pipeline (applied in this order):
        1. ``process_verse``     - keep only Arabic letters, diacritics and spaces.
        2. ``filter_poems``      - keep poems with an even number (>= 2) of verses,
                                   each at least 5 characters long.
        3. ``map_meters``        - normalise meter names to their canonical form.
        4. ``filter_meters``     - keep only the six selected meters.
        5. ``filter_by_qafiyah`` - keep poems whose dominant rhyme letter covers
                                   at least 70% of the second hemistichs.
        6. ``join_verses``       - build the ``prompt`` and ``completion`` columns.
        7. drop rows whose prompt or completion came out empty.

    Relies on the module-level ``meter_tokens``, ``theme_tokens``,
    ``ST_BAYT_TOKEN``, ``VERSE_TOKEN`` and ``ED_BAYT_TOKEN``.

    Returns:
        The processed ``DatasetDict`` with the extra ``prompt`` and
        ``completion`` columns.
    """
    ashaar = load_dataset("arbml/ashaar")

    selected_meters = ["الخفيف", "الطويل", "الكامل", "البسيط", "السريع", "الوافر"]

    def process_verse(sample):
        """Strip every character that is not an Arabic letter, diacritic or space."""
        # 'إ' and 'آ' are appended to the whitelist: without them those letters
        # were deleted and their diacritics left dangling (e.g. "إِذا" -> "ِذا").
        chars = 'ابتثجحخدذرزسشصضطظعغفقكلمنهويىئءأؤة ى' + 'إآ'
        diacs = 'ْ~ًٌٍَُِّ'
        map_chars = {'ک': 'ك', 'ﺑ': 'ب', 'ٹ': 'ث', 'ی': 'ى'}
        # Build the lookup once per sample instead of concatenating per character.
        allowed = set(chars + diacs)
        out = []
        for verse in sample['poem verses']:
            kept = []
            for char in verse:
                if char in allowed:
                    kept.append(char)
                elif char in map_chars:
                    # Look-alike letters are mapped to the replacement given in map_chars.
                    kept.append(map_chars[char])
            out.append(''.join(kept))
        sample['poem verses'] = out
        return sample

    def filter_poems(sample):
        """Keep poems made of complete couplets whose verses are not too short."""
        poem = sample['poem verses']
        if len(poem) < 2 or len(poem) % 2 != 0:
            return False
        return all(len(verse) >= 5 for verse in poem)

    def map_meters(sample):
        """Rewrite meter name variants to the canonical names used in ``selected_meters``."""
        meter = sample['poem meter']
        if meter:
            # 'بسيط' is matched exactly while the other meters use substring matching.
            if meter == 'بسيط':
                sample['poem meter'] = 'البسيط'
            elif 'خفيف' in meter:
                sample['poem meter'] = 'الخفيف'
            elif 'طويل' in meter:
                sample['poem meter'] = 'الطويل'
            elif 'كامل' in meter:
                sample['poem meter'] = 'الكامل'
            elif 'سريع' in meter:
                sample['poem meter'] = 'السريع'
            elif 'وافر' in meter:
                sample['poem meter'] = 'الوافر'
        return sample

    def filter_meters(sample):
        """Keep only poems written in one of the selected meters."""
        return sample['poem meter'] in selected_meters

    def check_qafiyah(verses):
        """Return True when one rhyme letter ends at least 70% of the second hemistichs."""
        rhymes = []
        # Odd indices are the second hemistich of each couplet.
        for i in range(1, len(verses), 2):
            line = verses[i].strip()
            # Walk backwards to skip trailing diacritics (they are not alphabetic).
            for char in reversed(line):
                if char.isalpha():
                    rhymes.append(char)
                    break
        if not rhymes:
            return False
        most_common = collections.Counter(rhymes).most_common(1)[0]
        return most_common[1] / len(rhymes) >= 0.7

    def filter_by_qafiyah(sample):
        """Dataset filter wrapper around ``check_qafiyah``."""
        return check_qafiyah(sample['poem verses'])

    def get_qafiyah_majority(poem):
        """Return the most frequent final letter of the second hemistichs, or None."""
        qafiyahs = []
        for bayt in poem.split(ED_BAYT_TOKEN):
            if VERSE_TOKEN in bayt:
                parts = bayt.split(VERSE_TOKEN)
                if len(parts) >= 2:
                    second_half = parts[1].strip()
                    for char in reversed(second_half):
                        if char.isalpha():
                            qafiyahs.append(char)
                            break
        if not qafiyahs:
            return None
        return collections.Counter(qafiyahs).most_common(1)[0][0]

    def join_verses(sample):
        """Build the instruction prompt and the tagged poem used as completion.

        Returns empty strings for both fields when the meter/theme is unknown
        or no rhyme letter can be determined; such rows are filtered out later.
        """
        verses = sample['poem verses']
        meter = sample['poem meter']
        theme = sample['poem theme']
        # `or` (not a .get default) so that a title stored as None also falls
        # back to the placeholder instead of rendering as "None" in the prompt.
        title = sample.get('poem title') or 'بدون عنوان'

        # A theme map reloaded from JSON stores the None theme under "null".
        theme_known = theme in theme_tokens or (theme is None and "null" in theme_tokens)
        if meter not in meter_tokens or not theme_known:
            return {"prompt": "", "completion": ""}

        poem = ''.join([f'{ST_BAYT_TOKEN} {verses[i]} {VERSE_TOKEN} {verses[i+1]} {ED_BAYT_TOKEN} '
                        for i in range(0, len(verses) - 1, 2)])

        qafiyah = get_qafiyah_majority(poem)
        if not qafiyah:
            return {"prompt": "", "completion": ""}

        # NOTE(review): poems without a theme render the theme line as "None" — confirm this is intended.
        prompt = f"""أنشئ قصيدة عربية فصيحة وفقاً للمواصفات التالية:

العنوان: {title}
البحر: {meter}
نوع القصيدة: {theme}
القافية: {qafiyah}

يجب أن تكون القصيدة:
- ملتزمة بقواعد بحر {meter} وتفعيلاته
- منتهية كل بيت بحرف {qafiyah}
- متناسبة مع موضوع :{title}

الهيكل المطلوب:
{ST_BAYT_TOKEN} الشطر الأول {VERSE_TOKEN} الشطر الثاني {ED_BAYT_TOKEN}

اكتب القصيدة:"""

        completion = poem.strip()

        return {"prompt": prompt, "completion": completion}

    # Apply steps
    ashaar = ashaar.map(process_verse)
    ashaar = ashaar.filter(filter_poems)
    ashaar = ashaar.map(map_meters)
    ashaar = ashaar.filter(filter_meters)
    ashaar = ashaar.filter(filter_by_qafiyah)
    processed_data = ashaar.map(join_verses)
    processed_data = processed_data.filter(lambda x: x["prompt"] != "" and x["completion"] != "")

    return processed_data

# Load preprocessed dataset
processed_data = load_dataset_and_preprocess()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/94931 [00:00<?, ? examples/s]

Filter:   0%|          | 0/94931 [00:00<?, ? examples/s]

In [None]:

class PoetryDataset(torch.utils.data.Dataset):
    """Torch dataset that turns prompt/completion rows into causal-LM training examples.

    Each item is rendered through the tokenizer's chat template (user turn =
    prompt, assistant turn = completion), padded/truncated to ``max_length``
    and returned with ``labels`` in which everything except the completion
    (and whatever the template emits after it) is set to ``-100``.

    Args:
        examples: Indexable collection of mappings with ``"prompt"`` and
            ``"completion"`` string fields.
        tokenizer: Tokenizer providing ``apply_chat_template`` and ``__call__``.
        max_length: Fixed sequence length of every returned example.
    """

    def __init__(self, examples, tokenizer, max_length=512):
        self.examples = examples
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.examples)

    def _count_prompt_tokens(self, input_text, completion):
        """Return how many tokens precede the completion inside ``input_text``.

        The completion is located by text search rather than by a hardcoded
        role marker, so the result does not depend on the chat template's
        special-token names. Returns 0 when the completion is empty or cannot
        be found, in which case the caller leaves the prompt unmasked.
        """
        target = completion.strip()
        if not target:
            return 0
        start = input_text.rfind(target)
        if start <= 0:
            return 0
        prefix_ids = self.tokenizer(input_text[:start], add_special_tokens=False)["input_ids"]
        return len(prefix_ids)

    def __getitem__(self, idx):
        example = self.examples[idx]
        prompt = example["prompt"]
        completion = example["completion"]

        # One user turn followed by the assistant's answer.
        messages = [
            {"role": "user", "content": prompt},
            {"role": "assistant", "content": completion}
        ]

        # Render to text only; no generation prompt because the answer is already included.
        input_text = self.tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=False
        )

        # The templated text already carries the special tokens it needs, so
        # they must not be added a second time when tokenizing it.
        encodings = self.tokenizer(
            input_text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
            add_special_tokens=False
        )

        input_ids = encodings["input_ids"][0]
        attention_mask = encodings["attention_mask"][0]

        # Start from a copy of the inputs and blank out what must not be learned.
        labels = input_ids.clone()
        labels[attention_mask == 0] = -100  # padding never contributes to the loss

        prompt_tokens = self._count_prompt_tokens(input_text, completion)
        if prompt_tokens:
            # With left padding the real tokens start after the pad block.
            # NOTE(review): assumes truncation removes tokens from the right — confirm for your tokenizer.
            leading_pad = 0
            if getattr(self.tokenizer, "padding_side", "right") == "left":
                leading_pad = int((attention_mask == 0).sum())
            labels[:leading_pad + prompt_tokens] = -100

        # NOTE: a collator that rebuilds labels from input_ids (e.g.
        # DataCollatorForLanguageModeling with mlm=False) discards this mask.
        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels
        }

In [None]:
from google.colab import drive
import shutil

# Attach Google Drive under /content/drive
drive.mount('/content/drive')

# Output locations: one root folder with sub-folders for checkpoints and the final model
DRIVE_BASE_PATH = "/content/drive/MyDrive/arabic_poetry_model"
CHECKPOINT_PATH = DRIVE_BASE_PATH + "/checkpoints"
FINAL_MODEL_PATH = DRIVE_BASE_PATH + "/final_model"

# Make sure all three folders exist (no error when they already do)
for _output_dir in (DRIVE_BASE_PATH, CHECKPOINT_PATH, FINAL_MODEL_PATH):
    os.makedirs(_output_dir, exist_ok=True)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
processed_data


DatasetDict({
    train: Dataset({
        features: ['poem title', 'poem meter', 'poem verses', 'poem theme', 'poem url', 'poet name', 'poet description', 'poet url', 'poet era', 'poet location', 'poem description', 'poem language type', 'prompt', 'completion'],
        num_rows: 94931
    })
})

In [None]:
from pprint import pprint

# Fetch the single row first: indexing the column (`ds['prompt'][1000]`) would
# read the whole column just to display one entry.
_preview_row = processed_data["train"][1000]
pprint(_preview_row["prompt"])
pprint(_preview_row["completion"])


('أنشئ قصيدة عربية فصيحة وفقاً للمواصفات التالية:\n'
 '\n'
 'العنوان: وركب يزجرون على وجاها\n'
 'البحر: الوافر\n'
 'نوع القصيدة: قصيدة عامه\n'
 'القافية: ا\n'
 '\n'
 'يجب أن تكون القصيدة:\n'
 '- ملتزمة بقواعد بحر الوافر وتفعيلاته\n'
 '- منتهية كل بيت بحرف ا\n'
 '- متناسبة مع موضوع :وركب يزجرون على وجاها\n'
 '\n'
 'الهيكل المطلوب:\n'
 '<|bsep|> الشطر الأول <|vsep|> الشطر الثاني </|bsep|>\n'
 '\n'
 'اكتب القصيدة:')
('<|bsep|> وَرَكبٍ يَزجُرونَ عَلى وَجاها <|vsep|> بِقارِعَةِ النَّقا قُلُصاً '
 'عِجالا </|bsep|> <|bsep|> فَحالَتْ دونَهُم تَلَعاتُ نَجدٍ <|vsep|> كَما '
 'وارَيتَ بِالقُرُبِ النِّصالا </|bsep|> <|bsep|> حَمَلنَ مِنَ الظِّباءِ '
 'العِينِ سِرباً <|vsep|> وَقَد عُوِّضنَ عَن كُنُسٍ رِحالا </|bsep|> <|bsep|> '
 'وَفي الأَحداجِ بَدرٌ مِن هِلالٍ <|vsep|> ضَمَمنَ ِلَيهِ مِن بَدرٍ هِلالا '
 '</|bsep|> <|bsep|> وَغانيَةٍ لَه سِرٌّ مَصونٌ <|vsep|> أُكَفكِفُ عَنهُ لي '
 'دَمعاً مُذالا </|bsep|> <|bsep|> تُواصِلُني وَما بالنَّجمِ مَيلٌ <|vsep|> '
 'وَتَهجُرُني ِذا ما النَّجمُ مالا </|bs

In [None]:
# Keep only the two text columns used for fine-tuning; drop every other column.
_train_split = processed_data["train"]
_unused_columns = [
    name for name in _train_split.column_names
    if name not in ["prompt", "completion"]
]
reduced_data = _train_split.remove_columns(_unused_columns)

In [None]:
reduced_data

Dataset({
    features: ['prompt', 'completion'],
    num_rows: 94931
})

In [None]:
def _collate_keep_labels(features):
    """Stack per-example tensors into a batch while preserving the given ``labels``.

    Padding positions (``attention_mask == 0``) are additionally set to
    ``-100`` in ``labels`` so they never contribute to the loss.
    """
    batch = {
        key: torch.stack([torch.as_tensor(feature[key]) for feature in features])
        for key in ("input_ids", "attention_mask", "labels")
    }
    batch["labels"] = batch["labels"].masked_fill(batch["attention_mask"] == 0, -100)
    return batch


def setup_training(processed_data, tokenizer):
    """Build the training dataset, the ``TrainingArguments`` and the batch collator.

    Args:
        processed_data: Mapping with a ``"train"`` split that has (at least)
            ``prompt`` and ``completion`` columns.
        tokenizer: Tokenizer handed to ``PoetryDataset``.

    Returns:
        ``(train_dataset, training_args, data_collator)`` ready to be passed
        to ``Trainer``. Checkpoints and logs are written under the
        module-level ``CHECKPOINT_PATH`` / ``DRIVE_BASE_PATH``.
    """
    # Keep only prompt and completion columns
    reduced_data = processed_data["train"].remove_columns(
        [col for col in processed_data["train"].column_names if col not in ["prompt", "completion"]]
    )

    train_dataset = PoetryDataset(
        reduced_data,
        tokenizer,
        max_length=1024
    )

    training_args = TrainingArguments(
        output_dir=CHECKPOINT_PATH,
        num_train_epochs=2,
        per_device_train_batch_size=4,
        gradient_accumulation_steps=2,  # effective batch size of 8 per device
        save_steps=500,
        save_total_limit=2,
        logging_steps=100,
        learning_rate=2e-5,
        warmup_steps=500,
        bf16=True,
        fp16=False,
        report_to="wandb",
        logging_dir=os.path.join(DRIVE_BASE_PATH, "logs"),
        dataloader_num_workers=2,
        # PEFT wrappers hide the base model's signature, so Trainer cannot
        # infer the label column on its own (see the warning it prints).
        label_names=["labels"],
    )

    # DataCollatorForLanguageModeling(mlm=False) rebuilds `labels` from
    # `input_ids`, which would throw away any prompt masking done by the
    # dataset. This collator keeps the dataset's labels instead.
    data_collator = _collate_keep_labels

    return train_dataset, training_args, data_collator

train_dataset, training_args, data_collator = setup_training(processed_data, tokenizer)

In [None]:
def train_model(model, train_dataset, training_args, data_collator,
                tokenizer=None, resume_from_checkpoint=None):
    """Fine-tune ``model`` and save the result to ``FINAL_MODEL_PATH``.

    Args:
        model: Model to train (here the PEFT-wrapped causal LM).
        train_dataset: Dataset yielding ``input_ids``/``attention_mask``/``labels``.
        training_args: ``TrainingArguments`` controlling the run.
        data_collator: Callable that batches dataset items.
        tokenizer: Tokenizer saved next to the model. When omitted, the
            notebook-level ``tokenizer`` variable is used.
        resume_from_checkpoint: Forwarded to ``Trainer.train``; pass a
            checkpoint path (or ``True``) to continue an interrupted run
            instead of starting over. ``None`` keeps the original behaviour.

    Returns:
        The ``Trainer`` instance after training.

    Raises:
        ValueError: If no tokenizer is passed and none exists at notebook level.
    """
    # Resolve the tokenizer before training so a missing one fails immediately
    # rather than after the whole run has finished.
    if tokenizer is None:
        tokenizer = globals().get("tokenizer")
    if tokenizer is None:
        raise ValueError("train_model needs a tokenizer: pass `tokenizer=` or define a global `tokenizer`.")

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        data_collator=data_collator,
    )

    trainer.train(resume_from_checkpoint=resume_from_checkpoint)

    # Save the trained model and its tokenizer to the final-model folder
    save_path_drive = FINAL_MODEL_PATH
    os.makedirs(save_path_drive, exist_ok=True)
    model.save_pretrained(save_path_drive)
    tokenizer.save_pretrained(save_path_drive)

    return trainer

trainer = train_model(model, train_dataset, training_args, data_collator)

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mhatimalhomid[0m ([33mhatimalhomid-education-com[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Step,Training Loss
100,2.1822
200,1.9185
300,1.4497
400,1.3737
500,1.317
600,1.2906
700,1.252
800,1.2529
900,1.2461
1000,1.2683



Cannot access gated repo for url https://huggingface.co/CohereForAI/c4ai-command-r7b-12-2024/resolve/main/config.json.
Access to model CohereForAI/c4ai-command-r7b-12-2024 is restricted. You must have access to it and be authenticated to access it. Please log in. - silently ignoring the lookup for the file config.json in CohereForAI/c4ai-command-r7b-12-2024.
  return fn(*args, **kwargs)

Cannot access gated repo for url https://huggingface.co/CohereForAI/c4ai-command-r7b-12-2024/resolve/main/config.json.
Access to model CohereForAI/c4ai-command-r7b-12-2024 is restricted. You must have access to it and be authenticated to access it. Please log in. - silently ignoring the lookup for the file config.json in CohereForAI/c4ai-command-r7b-12-2024.
  return fn(*args, **kwargs)

Cannot access gated repo for url https://huggingface.co/CohereForAI/c4ai-command-r7b-12-2024/resolve/main/config.json.
Access to model CohereForAI/c4ai-command-r7b-12-2024 is restricted. You must have access to it and

KeyboardInterrupt: 