# Gemma-3-270M Arabic Poetry Fine-Tuning

### 1. Install Dependencies

In [1]:
!pip install -q transformers datasets peft accelerate trl huggingface_hub

## 2. Config

In [14]:
from google.colab import userdata

HF_TOKEN = userdata.get("HF_TOKEN")

CONFIG = {
    # Model
    "model_id" : "google/gemma-3-270m",
    "hf_repo" : "mohamed-hassaneen/gemma3-arabic-poetry",

    # Data
    "train_path" : "/content/drive/MyDrive/arabic_poetry/train.jsonl",
    "val_path" : "/content/drive/MyDrive/arabic_poetry/val.jsonl",
    "output_dir" : "/content/drive/MyDrive/arabic_poetry/checkpoints",
    "max_length" : 64,

    # LoRA Hyperparams
    "lora_r" : 16,
    "lora_alpha" : 32,
    "lora_dropout" : 0.05,
    "lora_target_modules": ["q_proj", "k_proj", "v_proj", "o_proj"],

    # Training Hyperparams
    "epochs" : 1,
    "batch_size" : 16,
    "grad_accum" : 1,
    "learning_rate" : 5e-5,
    "weight_decay" : 0.01,
    "warmup_steps" : 0,
    "lr_scheduler" : "linear",
    "save_total_limit": 2,

    # Decoding Hyperparams
    "num_beams" : 20,
    "top_k" : 50,
    "top_p" : 0.92,
    "temperature" : 1.0,
    "max_new_tokens": 200,
}


## 3. Mount Google Drive

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## 4. Check GPU

In [4]:
import torch

if torch.cuda.is_available():
    gpu = torch.cuda.get_device_name(0)
    vram = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f'GPU  : {gpu}')
    print(f'VRAM : {vram:.1f} GB')
else:
    print('No GPU found')

DTYPE = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
USE_BF16 = torch.cuda.is_bf16_supported()
print(f'dtype: {"bfloat16" if USE_BF16 else "float16"}')

GPU  : Tesla T4
VRAM : 15.6 GB
dtype: bfloat16


## 5. Load & Inspect Data

In [5]:
import json
import re

def load_jsonl(path):
    with open(path) as f:
        return [json.loads(line) for line in f if line.strip()]

train_data = load_jsonl(CONFIG["train_path"])
val_data   = load_jsonl(CONFIG["val_path"])

print(f'Train poems : {len(train_data):,}')
print(f'Val poems : {len(val_data):,}')
print(f'\n--- Sample poem (first 3 verses) ---')
for line in train_data[0]['poem'].split('\n')[:3]:
    print(line)

Train poems : 55,523
Val poems : 1,424

--- Sample poem (first 3 verses) ---
[ل] ألم تلمم على الطلل المحيل [الوافر] بغربي الأبارق من حقيل
[ل] صرفت بصاحبي طربا إليها [الوافر] وما طرب الحليم إلى الطلول
[ل] فلم أر غير آناء أحاطت [الوافر] على العرصات من حذر السيول


## 6. Define Special Tokens

In [6]:
# The 16 classical Arabic meters
meter_tokens = [
    '[الطويل]', '[الكامل]', '[الوافر]', '[البسيط]',
    '[الخفيف]', '[الهزج]',  '[الرجز]',  '[الرمل]',
    '[المتقارب]', '[المنسرح]', '[المجتث]', '[المديد]',
    '[السريع]', '[المضارع]', '[المقتضب]', '[المتدارك]',
]

# All Arabic letters as rhyme tokens
rhyme_tokens = [
    '[ا]','[ب]','[ت]','[ث]','[ج]','[ح]','[خ]',
    '[د]','[ذ]','[ر]','[ز]','[س]','[ش]','[ص]',
    '[ض]','[ط]','[ظ]','[ع]','[غ]','[ف]','[ق]',
    '[ك]','[ل]','[م]','[ن]','[ه]','[و]','[ي]',
    '[ء]'
]

ALL_SPECIAL = meter_tokens + rhyme_tokens

print(f'Meter tokens ({len(meter_tokens)}): {meter_tokens}')
print(f'Rhyme tokens ({len(rhyme_tokens)}): {rhyme_tokens}')
print(f'Total special tokens: {len(ALL_SPECIAL)}')


Meter tokens (16): ['[الطويل]', '[الكامل]', '[الوافر]', '[البسيط]', '[الخفيف]', '[الهزج]', '[الرجز]', '[الرمل]', '[المتقارب]', '[المنسرح]', '[المجتث]', '[المديد]', '[السريع]', '[المضارع]', '[المقتضب]', '[المتدارك]']
Rhyme tokens (29): ['[ا]', '[ب]', '[ت]', '[ث]', '[ج]', '[ح]', '[خ]', '[د]', '[ذ]', '[ر]', '[ز]', '[س]', '[ش]', '[ص]', '[ض]', '[ط]', '[ظ]', '[ع]', '[غ]', '[ف]', '[ق]', '[ك]', '[ل]', '[م]', '[ن]', '[ه]', '[و]', '[ي]', '[ء]']
Total special tokens: 45


## 7. Load Tokenizer & Add Special Tokens

In [7]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(CONFIG["model_id"], token=HF_TOKEN)

tokenizer.add_special_tokens({"additional_special_tokens": ALL_SPECIAL})
tokenizer.pad_token = tokenizer.eos_token

print(f"Vocab size after adding tokens: {len(tokenizer):,}")


Vocab size after adding tokens: 262,190


## 8. Load Model & Apply LoRA

In [8]:
from transformers import AutoModelForCausalLM
from peft import LoraConfig, get_peft_model, TaskType

model = AutoModelForCausalLM.from_pretrained(
    CONFIG["model_id"],
    torch_dtype=DTYPE,
    device_map="auto",
    token=HF_TOKEN,
    attn_implementation="sdpa"
)

model.resize_token_embeddings(len(tokenizer))

lora_cfg = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=CONFIG["lora_r"],
    lora_alpha=CONFIG["lora_alpha"],
    lora_dropout=CONFIG["lora_dropout"],
    target_modules=CONFIG["lora_target_modules"],
    bias="none",
)
model = get_peft_model(model, lora_cfg)
model.print_trainable_parameters()


`torch_dtype` is deprecated! Use `dtype` instead!


Loading weights:   0%|          | 0/236 [00:00<?, ?it/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


trainable params: 1,474,560 || all params: 269,602,176 || trainable%: 0.5469


## 9. Build Dataset

In [10]:
import random
from datasets import Dataset

def make_dataset(records):
    return Dataset.from_dict({"poem": [r["poem"] for r in records]})

# Use 10% of training data
random.seed(42)
sample_size = int(len(train_data) * 0.10)
train_sample = random.sample(train_data, sample_size)

train_ds = make_dataset(train_sample)
val_ds   = make_dataset(val_data)

print(f"Train poems (sampled): {len(train_ds):,}")
print(f"Val poems: {len(val_ds):,}")


Train poems (sampled): 5,552
Val poems: 1,424


## 10. Train

In [11]:
from trl import SFTTrainer, SFTConfig

model.gradient_checkpointing_enable()

sft_config = SFTConfig(
    output_dir=CONFIG["output_dir"],
    num_train_epochs=CONFIG["epochs"],
    per_device_train_batch_size=CONFIG["batch_size"],
    per_device_eval_batch_size=CONFIG["batch_size"],
    gradient_accumulation_steps=CONFIG["grad_accum"],
    learning_rate=CONFIG["learning_rate"],
    weight_decay=CONFIG["weight_decay"],
    lr_scheduler_type=CONFIG["lr_scheduler"],
    warmup_steps=CONFIG["warmup_steps"],
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=CONFIG["save_total_limit"],
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    bf16=USE_BF16,
    fp16=not USE_BF16,
    logging_steps=50,
    report_to="none",
    packing=True,
    max_length=CONFIG["max_length"],
    dataset_text_field="poem",
)

trainer = SFTTrainer(
    model=model,
    args=sft_config,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    processing_class=tokenizer,
)

trainer.train()




Adding EOS to train dataset:   0%|          | 0/5552 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/5552 [00:00<?, ? examples/s]

Packing train dataset:   0%|          | 0/5552 [00:00<?, ? examples/s]

Adding EOS to eval dataset:   0%|          | 0/1424 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/1424 [00:00<?, ? examples/s]

Packing eval dataset:   0%|          | 0/1424 [00:00<?, ? examples/s]

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 1, 'bos_token_id': 2, 'pad_token_id': 1}.


Epoch,Training Loss,Validation Loss
1,5.447407,5.449018




TrainOutput(global_step=3603, training_loss=5.533358084503691, metrics={'train_runtime': 4315.1617, 'train_samples_per_second': 13.357, 'train_steps_per_second': 0.835, 'total_flos': 2247741447993600.0, 'train_loss': 5.533358084503691})

## 11. Evaluate Perplexity

In [12]:
import math

results = trainer.evaluate()
ppl = math.exp(results['eval_loss'])
print(f'Eval Loss: {results["eval_loss"]:.4f}')
print(f'Perplexity: {ppl:.2f}')

Eval Loss: 5.4490
Perplexity: 232.53


## 12. Test Generation

In [15]:
from transformers import pipeline

# Merge LoRA weights back into the base model for inference
merged = model.merge_and_unload()
merged.eval()

gen = pipeline(
    "text-generation",
    model=merged,
    tokenizer=tokenizer,
    device_map="auto",
)

target_meter = "[البسيط]"
prompt = "العلم زين وتشريف لصاحبه"

# Block all meter tokens except the target to force a specific meter
blocked_ids = [
    [tokenizer.convert_tokens_to_ids(t)]
    for t in meter_tokens if t != target_meter
]

output = gen(
    prompt,
    max_new_tokens=CONFIG["max_new_tokens"],
    num_beams=CONFIG["num_beams"],
    do_sample=True,
    top_k=CONFIG["top_k"],
    top_p=CONFIG["top_p"],
    temperature=CONFIG["temperature"],
    bad_words_ids=blocked_ids,
    pad_token_id=tokenizer.eos_token_id,
)

print(output[0]["generated_text"])


Both `max_new_tokens` (=200) and `max_length`(=20) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


العلم زين وتشريف لصاحبه 1000 2000 3000 4000 5000 100000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000


## 13. Push to HuggingFace Hub

In [17]:
merged.push_to_hub(CONFIG["hf_repo"], token=HF_TOKEN, private=False)
tokenizer.push_to_hub(CONFIG["hf_repo"], token=HF_TOKEN, private=False)

print(f"Done! Model live at: https://huggingface.co/{CONFIG['hf_repo']}")


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...ij5i8ti/model.safetensors:   8%|7         | 42.0MB /  536MB            

README.md: 0.00B [00:00, ?B/s]

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...mp68bcz01m/tokenizer.json:  99%|#########9| 33.1MB / 33.4MB            

Done! Model live at: https://huggingface.co/mohamed-hassaneen/gemma3-arabic-poetry
