# 1. Install Dependencies

In [1]:
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git" --quiet
!pip install --no-deps xformers "trl<0.9.0" peft accelerate bitsandbytes --quiet
!pip install rich --quiet

# 2. Imports

In [2]:
import torch
from unsloth import FastLanguageModel
from datasets import load_dataset
from trl import SFTTrainer
from transformers import TrainingArguments

# Verification
assert torch.cuda.is_available(), "GPU is not detected!"
gpu_stats = torch.cuda.get_device_properties(0)
print(f"GPU Detected: {gpu_stats.name}")
print(f"Total VRAM: {round(gpu_stats.total_memory / 1024**3, 2)} GB")

ü¶• Unsloth: Will patch your computer to enable 2x faster free finetuning.
ü¶• Unsloth Zoo will now patch everything to make training faster!
GPU Detected: NVIDIA H200 MIG 1g.18gb
Total VRAM: 16.0 GB


# 3. Model & Tokenizer Initialization

In [3]:
max_seq_length = 512
load_in_4bit = True

print("Loading model...")
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "google/gemma-3-4b-pt",
    max_seq_length = max_seq_length,
    dtype = None,
    load_in_4bit = load_in_4bit,
)

# Verification
assert model is not None and tokenizer is not None, "Model or tokenizer failed to load."

Loading model...
==((====))==  Unsloth 2026.2.1: Fast Gemma3 patching. Transformers: 4.57.6.
   \\   /|    NVIDIA H200 MIG 1g.18gb. Num GPUs = 2. Max memory: 16.0 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.10.0+cu128. CUDA: 9.0. CUDA Toolkit: 12.8. Triton: 3.6.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.35. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


# 5. LoRA Adapter Injection

In [4]:
from unsloth import FastLanguageModel

model = FastLanguageModel.get_peft_model(
    model,
    r = 64,
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"
    ],
    lora_alpha = 64,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
)

print("LoRA Adapters injected successfully. Parameter breakdown:")
model.print_trainable_parameters()

Unsloth: Making `base_model.model.model.vision_tower.vision_model` require gradients
LoRA Adapters injected successfully. Parameter breakdown:
trainable params: 131,153,920 || all params: 4,431,233,392 || trainable%: 2.9598


# 6. Dataset Loading & Formatting

In [5]:
import os
from datasets import load_dataset

train_file = "./train.jsonl"
val_file = "./val.jsonl"

# Verification 1: File Existence
assert os.path.exists(train_file), f"Could not find train file at {train_file}"
assert os.path.exists(val_file), f"Could not find val file at {val_file}"
print("Successfully located dataset files.")

dataset = load_dataset("json", data_files={"train": train_file, "val": val_file})

text_tokenizer = tokenizer.tokenizer if hasattr(tokenizer, "tokenizer") else tokenizer

def formatting_func(examples):
    texts = []
    for poem in examples["poem"]:
        clean_poem = poem.replace("[", "(").replace("]", ")") 
        clean_poem = clean_poem.strip()
        text = clean_poem + text_tokenizer.eos_token
        texts.append(text)
        
    return { "text" : texts }

formatted_dataset = dataset.map(formatting_func, batched = True)

# Verification 2: Data Formatting
sample_text = formatted_dataset["train"]["text"][0]
print("Dataset formatted. Here is the exact string the model will train on:")
print("-" * 50)
print(sample_text)
print("-" * 50)
assert sample_text.endswith(text_tokenizer.eos_token), "Missing EOS token at the end of the data!"

full_train_dataset = formatted_dataset["train"].shuffle(seed=42)
print(f"Total training examples ready: {len(full_train_dataset)}")

Successfully located dataset files.
Dataset formatted. Here is the exact string the model will train on:
--------------------------------------------------
ÿ£ŸÑŸÖ ÿ™ŸÑŸÖŸÖ ÿπŸÑŸâ ÿßŸÑÿ∑ŸÑŸÑ ÿßŸÑŸÖÿ≠ŸäŸÑ (ÿßŸÑŸàÿßŸÅÿ±) ÿ®ÿ∫ÿ±ÿ®Ÿä ÿßŸÑÿ£ÿ®ÿßÿ±ŸÇ ŸÖŸÜ ÿ≠ŸÇŸäŸÑ (ŸÑ) ÿµÿ±ŸÅÿ™ ÿ®ÿµÿßÿ≠ÿ®Ÿä ÿ∑ÿ±ÿ®ÿß ÿ•ŸÑŸäŸáÿß (ÿßŸÑŸàÿßŸÅÿ±) ŸàŸÖÿß ÿ∑ÿ±ÿ® ÿßŸÑÿ≠ŸÑŸäŸÖ ÿ•ŸÑŸâ ÿßŸÑÿ∑ŸÑŸàŸÑ (ŸÑ) ŸÅŸÑŸÖ ÿ£ÿ± ÿ∫Ÿäÿ± ÿ¢ŸÜÿßÿ° ÿ£ÿ≠ÿßÿ∑ÿ™ (ÿßŸÑŸàÿßŸÅÿ±) ÿπŸÑŸâ ÿßŸÑÿπÿ±ÿµÿßÿ™ ŸÖŸÜ ÿ≠ÿ∞ÿ± ÿßŸÑÿ≥ŸäŸàŸÑ (ŸÑ) ÿ™ŸÜÿ≥ŸÅŸáÿß ÿßŸÑÿ®Ÿàÿßÿ±ÿ≠ ŸÅŸáŸä ÿØŸÅ (ÿßŸÑŸàÿßŸÅÿ±) ÿ£ÿ¥ŸÑ ŸàÿØŸÅ ŸÖÿÆÿ™ÿ¥ÿπ ÿ∞ŸÑŸàŸÑ (ŸÑ) Ÿàÿ±ÿ≥ŸÖ ŸÖÿ®ÿßÿ°ÿ© Ÿàÿ±ŸÖÿßÿØ ŸÜÿßÿ± (ÿßŸÑŸàÿßŸÅÿ±) Ÿàÿ¨ŸàŸÜ ÿ≠ŸàŸÑ ŸÖŸàŸÇÿØŸáÿß ŸÖÿ´ŸàŸÑ (ŸÑ) ÿØŸäÿßÿ± ŸÖŸÜ ÿ£ŸÖÿßŸÖÿ© ÿ•ÿ∞ ÿ±ŸÖÿ™ŸÜÿß (ÿßŸÑŸàÿßŸÅÿ±) ÿ®ÿ≥ŸáŸÖ ŸÅŸä ŸÖÿ®ÿßÿπÿØÿ© ŸÇÿ™ŸàŸÑ (ŸÑ) ÿ±ŸÖŸäÿ™ ÿ®ŸÖŸÇŸÑÿ™ŸäŸÉ ÿßŸÑŸÇŸÑÿ® ÿ≠ÿ™Ÿâ (ÿßŸÑŸàÿßŸÅÿ±) ÿ£ÿµÿ®ÿ™ ÿßŸÑŸÇŸÑÿ® ÿ®ÿßŸÑÿ´ŸÇŸÑ ÿßŸÑŸÉŸÑŸäŸÑ (ŸÑ) ŸÅŸÑŸÖÿß ÿ£ŸÜ ŸÜÿ≤ŸÑÿ™ ÿ¥ÿπÿßÿ® ŸÇŸÑÿ®Ÿä (ÿßŸÑŸàÿßŸÅÿ±) ŸÖÿØÿØÿ™ ŸÑŸÜÿß ŸÖÿ®ÿßÿπÿØÿ© ÿßŸÑÿ®ÿÆŸäŸÑ (ŸÑ) ÿ≥ŸÖÿπÿ™ ŸÖŸÇÿßŸÑÿ© ÿßŸÑ

# 7. Training Setup and Execution

In [9]:
import os
os.environ["TORCHDYNAMO_DISABLE"] = "1"
os.environ["PYTORCH_ALLOC_CONF"] = "expandable_segments:True"

import torch, gc
gc.collect(); torch.cuda.empty_cache()

model.config.use_cache = False
if hasattr(model.config, "text_config"):
    model.config.text_config._attn_implementation = "eager"
else:
    model.config._attn_implementation = "eager"

if getattr(text_tokenizer, "pad_token", None) is None:
    text_tokenizer.pad_token = text_tokenizer.eos_token

FastLanguageModel.for_training(model)

trainer = SFTTrainer(
    model = model,
    tokenizer = text_tokenizer,
    train_dataset = full_train_dataset,
    dataset_text_field = "text",
    max_seq_length = 256,
    packing = True,
    args = TrainingArguments(
        per_device_train_batch_size = 1,
        gradient_accumulation_steps = 8, 
        max_steps = 800,
        num_train_epochs = 3,
        learning_rate = 2e-5,
        lr_scheduler_type = "cosine",
        max_grad_norm = 0.5,
        warmup_steps = 50,
        warmup_ratio = 0.1,
        fp16 = True,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        logging_steps = 10,
        output_dir = "outputs",
        save_strategy = "no",
        eval_strategy = "no",
        torch_compile = False,
        report_to = "none",
        average_tokens_across_devices = False,
    ),
)

print("Starting training....")
trainer.train()

Starting training....


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 161,065 | Num Epochs = 1 | Total steps = 800
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 8 x 1) = 16
 "-____-"     Trainable parameters = 131,153,920 of 4,431,233,392 (2.96% trained)


Step,Training Loss
10,5.7506
20,5.7924
30,5.78
40,5.8171
50,5.7692
60,5.7788
70,5.8532
80,5.8302
90,5.8281
100,5.8489


TrainOutput(global_step=800, training_loss=5.901845726966858, metrics={'train_runtime': 3261.8684, 'train_samples_per_second': 3.924, 'train_steps_per_second': 0.245, 'total_flos': 7.3831461421056e+16, 'train_loss': 5.901845726966858, 'epoch': 0.07947052760979971})

# 8. Save the fine-tuned Model Locally

In [10]:
save_directory = "./model/"

# Save the model and the tokenizer
model.save_pretrained(save_directory)
text_tokenizer.save_pretrained(save_directory)

print(f"Model successfully saved to: {save_directory}")

Model successfully saved to: ./model/


# 9. Test Poetry Generation

In [14]:
import torch
from transformers import StoppingCriteria, StoppingCriteriaList
from unsloth import FastLanguageModel

class VerseCountStoppingCriteria(StoppingCriteria):
    def __init__(self, rhyme_token_ids, target_verses):
        self.rhyme_token_ids = set(rhyme_token_ids)
        self.target_verses = target_verses
        self.verse_count = 0 

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        last_token = input_ids[0][-1].item()
        if last_token in self.rhyme_token_ids:
            self.verse_count += 1
        return self.verse_count >= self.target_verses

FastLanguageModel.for_inference(model)

prompt_text = "ÿßŸÑÿπŸÑŸÖ ÿ≤ŸäŸÜ Ÿàÿ™ÿ¥ÿ±ŸäŸÅ ŸÑÿµÿßÿ≠ÿ®Ÿá (" 
N_verses = 3

inputs = text_tokenizer(prompt_text, return_tensors="pt").to("cuda")

rhyme_tags = ["(ÿ°)", "(ÿß)", "(ÿ®)", "(ÿ™)", "(ÿ´)", "(ÿ¨)", "(ÿ≠)", "(ÿÆ)", "(ÿØ)", "(ÿ∞)",
              "(ÿ±)", "(ÿ≤)", "(ÿ≥)", "(ÿ¥)", "(ÿµ)", "(ÿ∂)", "(ÿ∑)", "(ÿ∏)", "(ÿπ)", "(ÿ∫)",
              "(ŸÅ)", "(ŸÇ)", "(ŸÉ)", "(ŸÑ)", "(ŸÖ)", "(ŸÜ)", "(Ÿá)", "(Ÿà)", "(Ÿä)"]

rhyme_token_ids = []
for tag in rhyme_tags:
    encoded = text_tokenizer.encode(tag, add_special_tokens=False)
    if encoded:
        rhyme_token_ids.append(encoded[-1]) 

stopper = VerseCountStoppingCriteria(rhyme_token_ids, N_verses)

print(f"Generating verses...")
outputs = model.generate(
    **inputs,
    max_new_tokens = 128,
    do_sample = True,

    temperature = 0.2,
    top_p = 0.8,
    repetition_penalty = 1.3,
    
    use_cache = True,
)

poem = text_tokenizer.decode(outputs[0], skip_special_tokens=True)
print("\n" + "="*50)
print(poem)
print("="*50)

Generating verses...

ÿßŸÑÿπŸÑŸÖ ÿ≤ŸäŸÜ Ÿàÿ™ÿ¥ÿ±ŸäŸÅ ŸÑÿµÿßÿ≠ÿ®Ÿá (ÿßŸÑÿ∑ŸÑŸÅ) ŸÅŸÑŸá ÿ£ÿ®ÿßÿ®ŸÉ ŸàŸÉŸÖ ÿ∞ŸÑŸáŸÖ ŸÖŸÜ ÿßŸÑÿ≥ŸÇÿßŸÇÿß 

* ŸÇÿ∑Ÿá ŸÅŸä ÿßŸÑÿ∑ÿßŸÖŸÖ ÿ£Ÿà ÿ∫ÿ∂ÿØŸáÿß ŸÉÿßŸÑŸáŸÖÿß. ŸàŸÇÿØ ŸäŸÅÿ±Ÿâ ÿπŸÑŸâ ÿßŸÑÿ≠ÿ≥ÿ® ŸàÿßŸÑÿÆŸÑŸÇ ŸÖÿß ŸÑÿß ÿ™ÿ±ÿßŸäŸÜ ŸÇŸÖÿπÿ™ ÿßŸÑÿ∞ÿßÿπ ŸàŸÖÿ≠ÿßÿ¶ŸÑ Ÿàÿ£ÿ≥ŸÑÿßÿ≠ÿ©, ŸàŸÑÿß ÿ™ÿ®ÿ¨Ÿàÿß ÿ®Ÿáÿß ÿµÿ±ÿßÿØ Ÿàÿ¨ŸÜÿßŸÜŸä I" ŸàŸÖŸÜ ŸÑŸÑŸá ŸÑŸá ÿ≤ÿ®ÿßÿ° ÿßŸÑŸÖÿ±ÿ¨ÿßŸÜ Ÿàÿßÿ≤ŸÑŸâ ŸÑŸÜŸäÿ¥ Ÿáÿ∞ÿß ŸÜŸÇŸÑÿ© ÿ®ŸáŸäŸàŸÖ ÿßŸÑÿπÿßÿ±ÿå ŸàŸÉÿ∞ŸÑŸÉ ŸÖŸÑŸÉŸÜÿß ŸÖÿπÿßŸäŸäÿ± ŸÑŸÑÿ¨ÿØ ŸÑŸà ÿ£ŸÜ ÿ£ŸÑŸÅŸäÿßÿ™ ÿßŸÑÿµŸäÿ® ÿ•ŸÑŸâ ÿ£ÿ≠ÿØÿßŸÉŸÜ ÿ•ÿ∞ ÿ∞ÿßŸäÿß ŸÖÿ¨ŸÑÿ© ÿßŸÑŸÅÿßÿ®ÿ±


# 10. Upload LoRA Adapters to HuggingFace

In [1]:
import torch
from unsloth import FastLanguageModel

hf_model_id = "mohamed-hassaneen/arabic-poetry-gemma-3-4b"
local_model_path = "./model"

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = local_model_path,
    max_seq_length = 256,
    dtype = None,
    load_in_4bit = True,
)

model.push_to_hub(hf_model_id, token = "XXX")
tokenizer.push_to_hub(hf_model_id, token = "XXX")

print(f"Adapters are LIVE! View here: https://huggingface.co/{hf_model_id}")

ü¶• Unsloth: Will patch your computer to enable 2x faster free finetuning.
ü¶• Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2026.2.1: Fast Gemma3 patching. Transformers: 4.57.6.
   \\   /|    NVIDIA H200 MIG 1g.18gb. Num GPUs = 2. Max memory: 16.0 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.10.0+cu128. CUDA: 9.0. CUDA Toolkit: 12.8. Triton: 3.6.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.35. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


README.md:   0%|          | 0.00/612 [00:00<?, ?B/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Saved model to https://huggingface.co/mohamed-hassaneen/arabic-poetry-gemma-3-4b


Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Adapters are LIVE! View here: https://huggingface.co/mohamed-hassaneen/arabic-poetry-gemma-3-4b
