In [1]:
%%capture
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "0"

!pip install pip3-autoremove
!pip install torch torchvision torchaudio xformers --index-url https://download.pytorch.org/whl/cu124
!pip install unsloth
!pip install transformers==4.53.0
!pip install --no-deps --upgrade timm # Only for Gemma 3N

In [2]:
import torch
from unsloth import FastModel
from IPython.display import Audio, display

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [3]:
model_name = "unsloth/gemma-3n-E2B-it-unsloth-bnb-4bit"

In [4]:
model, tokenizer = FastModel.from_pretrained(
    model_name = "unsloth/gemma-3n-E2B-it-unsloth-bnb-4bit",
    dtype = None, # None for auto detection
    max_seq_length = 1024, # Choose any for long context!
    load_in_4bit = True,  # 4 bit quantization to reduce memory
    full_finetuning = False, # [NEW!] We have full finetuning now!
    # token = "hf_...", # use one if using gated models
)

==((====))==  Unsloth 2025.6.12: Fast Gemma3N patching. Transformers: 4.53.0.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Gemma3N does not support SDPA - switching to eager!


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
# url = "https://lh3.googleusercontent.com/gps-cs-s/AC9h4npv6l1MBNQ7vjs1BSwT105wPGE8GtZF6mUnsDIsrjLHVWDipuFTO3L_5k1O8jrVovw3zCNOH-PiX9c0UP5cQPAIH9oGh7Z6-SIpH-358iWGTFD2O90jk6XAO0REQNP-C8kLAikp=s1360-w1360-h1020-rw"

# messages = [
#     {
#         "role": "system",
#         "content": [{"type": "text", "text": "You are a helpful assistant."}]
#     },
#     {
#         "role": "user",
#         "content": [
#             {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG"},
#             {"type": "text", "text": "What animal is on the candy?"}
#         ]
#     }
# ]
# messages = [{
#     "role" : "user",
#     "content": [
#         { "type": "image", "image" : url },
#         { "type": "text",  "text" : "What is that building" }
#     ]
# }]
# messages = [{
#     "role": "user",
#     "content": [{ "type" : "text",
#                   "text" : "ما هي عاصمة موريتانيا؟" }]
# }]

# do_gemma_3n_inference(messages, max_new_tokens = 256)

In [7]:
model = FastModel.get_peft_model(
    model,
    finetune_vision_layers     = True,
    finetune_language_layers   = True,
    finetune_attention_modules = True,   # Apply LoRA to attention
    finetune_mlp_modules       = True,   # Apply LoRA to MLP blocks

    r = 8,
    lora_alpha = 16,
    lora_dropout = 0.05,
    bias = "none",
    random_state = 42,
)

Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.05.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.


Unsloth: Making `model.base_model.model.model.language_model` require gradients


In [8]:
from datasets import load_dataset
import json
from transformers import AutoTokenizer
from unsloth.chat_templates import apply_chat_template  # Optional direct use
import random

# Load TyDi QA (Gold Passage version)
dataset = load_dataset("tydiqa", "secondary_task")  # Gold Passage

def tydi_to_chat_format(example):
    question = example["question"].strip()
    context = example["context"].strip()
    answer = example["answers"]["text"][0].strip()

    user_msg = f"Question: {question}\nContext: {context}"

    return {
        "conversations": [
            {"role": "user", "content": user_msg},
            {"role": "assistant", "content": answer}
        ]
    }

chat_dataset = dataset["train"].map(tydi_to_chat_format)

# Step 3: Format using the model's native chat template
def formatting_prompts_func(examples):
    convos = examples["conversations"]
    texts = [
        tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=False).removeprefix("<bos>")
        for convo in convos
    ]
    return { "text": texts }

chat_dataset = chat_dataset.map(formatting_prompts_func, batched=True)

chat_dataset = chat_dataset.remove_columns([
    col for col in chat_dataset.column_names if col != "text"
])

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 49881
})
{'text': '<start_of_turn>user\nQuestion: متى تم إكتشاف مرض السرطان؟\nContext: تم اكتشاف اقدم وصف معروف وعلاج جراحي للسرطان في مصر وهو يعود لتاريخ مايقرب من الـ 1600 قبل الميلاد. تصف ورقة البردي 8 حالات من قرحة الثدي التي كانت تعالج بالكي مع أداة تسمى "حفر النار" . الكتابة الموجودة على الورقة تقول حول هذا المرض بأنه "لا يوجد علاج."[3]<end_of_turn>\n<start_of_turn>model\n1600 قبل الميلاد<end_of_turn>\n'}


In [10]:
from trl import SFTTrainer, SFTConfig
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = chat_dataset,
    eval_dataset = None, # Can set up evaluation!
    args = SFTConfig(
        dataset_text_field = "text",
        per_device_train_batch_size = 1,
        gradient_accumulation_steps = 4, # Use GA to mimic batch size!
        warmup_steps = 5,
        # num_train_epochs = 1, # Set this for 1 full training run.
        max_steps = 60,
        learning_rate = 2e-4, # Reduce to 2e-5 for long training runs
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        report_to = "none", # Use this for WandB etc
    ),
)

Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/49881 [00:00<?, ? examples/s]

In [11]:
from unsloth.chat_templates import train_on_responses_only
trainer = train_on_responses_only(
    trainer,
    instruction_part = "<start_of_turn>user\n",
    response_part = "<start_of_turn>model\n",
)

Map (num_proc=2):   0%|          | 0/49881 [00:00<?, ? examples/s]

In [15]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 49,881 | Num Epochs = 1 | Total steps = 60
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 4 x 1) = 4
 "-____-"     Trainable parameters = 10,567,680 of 2,000,000,000 (0.53% trained)
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


RuntimeError: Trying to backward through the graph a second time (or directly access saved tensors after they have already been freed). Saved intermediate values of the graph are freed when you call .backward() or autograd.grad(). Specify retain_graph=True if you need to backward through the graph a second time or if you need to access saved tensors after calling backward.