Based one

https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Gemma3_(4B).ipynb

In [1]:
from unsloth import FastModel
import torch

max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

model, tokenizer = FastModel.from_pretrained(
    #model_name = "unsloth/gemma-3-4b-it",
    #model_name = "unsloth/gemma-3-1b-it-unsloth-bnb-4bit",
    model_name = "unsloth/gemma-3-1b-pt-unsloth-bnb-4bit",
    max_seq_length = 2048, # Choose any for long context!
    load_in_4bit = True,  # 4 bit quantization to reduce memory
    load_in_8bit = False, # [NEW!] A bit more accurate, uses 2x memory
    full_finetuning = False, # [NEW!] We have full finetuning now!
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
Unsloth: Failed to patch Gemma3ForConditionalGeneration.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.3.19: Fast Gemma3 patching. Transformers: 4.51.3.
   \\   /|    Quadro RTX 5000. Num GPUs = 1. Max memory: 16.0 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Using float16 precision for gemma3 won't work! Using float32.


model.safetensors:   0%|          | 0.00/1.00G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/233 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

## Load LoRA adapters

In [2]:
model = FastModel.get_peft_model(
    model,
    finetune_vision_layers     = False, # Turn off for just text!
    finetune_language_layers   = True,  # Should leave on!
    finetune_attention_modules = True,  # Attention good for GRPO
    finetune_mlp_modules       = True,  # SHould leave on always!

    r = 8,           # Larger = higher accuracy, but might overfit
    lora_alpha = 8,  # Recommended alpha == r at least
    lora_dropout = 0,
    bias = "none",
    random_state = 3407,
)

Unsloth: Making `model.base_model.model.model` require gradients


In [3]:
from unsloth.chat_templates import get_chat_template
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "gemma-3",
)

In [4]:
from datasets import load_dataset
dataset = load_dataset("philipfourie/morse-taylor-swift", split="train")

In [5]:
dataset

Dataset({
    features: ['line', 'morse'],
    num_rows: 15689
})

In [6]:
dataset[0]

{'line': 'Car rides to Malibu',
 'morse': '-.-. .- .-. / .-. .. -.. . ... / - --- / -- .- .-.. .. -... ..-'}

In [7]:
def apply_conversations(example):
    return {
        "conversations": [
            {
                "content": example["line"],
                "role": "user",
            },
            {
                "content": example["morse"],
                "role": "assistant",
            },
        ]
    }
    
 

In [8]:
convos = dataset.map(
    apply_conversations,
    remove_columns = ["line", "morse"],
    #num_proc = 1,
    desc = "Applying conversations",
)
convos

Dataset({
    features: ['conversations'],
    num_rows: 15689
})

In [9]:
convos[0]

{'conversations': [{'content': 'Car rides to Malibu', 'role': 'user'},
  {'content': '-.-. .- .-. / .-. .. -.. . ... / - --- / -- .- .-.. .. -... ..-',
   'role': 'assistant'}]}

In [10]:
def formatting_prompts_func(examples):
   convos = examples["conversations"]
   texts = [tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False).removeprefix('<bos>') for convo in convos]
   return { "text" : texts, }

convos = convos.map(formatting_prompts_func, remove_columns=["conversations"], batched = True)
convos

Map:   0%|          | 0/15689 [00:00<?, ? examples/s]

Dataset({
    features: ['text'],
    num_rows: 15689
})

In [11]:
convos["text"][0]

'<start_of_turn>user\nCar rides to Malibu<end_of_turn>\n<start_of_turn>model\n-.-. .- .-. / .-. .. -.. . ... / - --- / -- .- .-.. .. -... ..-<end_of_turn>\n'

In [12]:
# split the dataset into train and test
train_dataset = convos.train_test_split(test_size=0.1, seed=42)["train"]
test_dataset = convos.train_test_split(test_size=0.1, seed=42)["test"]

In [13]:
from trl import SFTTrainer, SFTConfig
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
    eval_dataset = test_dataset,
    args = SFTConfig(
        dataset_text_field = "text",
        per_device_train_batch_size = 4,
        gradient_accumulation_steps = 4, # Use GA to mimic batch size!
        warmup_steps = 5,
        num_train_epochs = 2, # Set this for 1 full training run.
        # max_steps = 30,
        learning_rate = 2e-4, # Reduce to 2e-5 for long training runs
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        report_to = "none", # Use this for WandB etc
        dataset_num_proc=2,
    ),
)

Unsloth: Switching to float32 training since model cannot work with float16


Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/14120 [00:00<?, ? examples/s]

Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/1569 [00:00<?, ? examples/s]

In [14]:
from unsloth.chat_templates import train_on_responses_only
trainer = train_on_responses_only(
    trainer,
    instruction_part = "<start_of_turn>user\n",
    response_part = "<start_of_turn>model\n",
)

Map (num_proc=12):   0%|          | 0/14120 [00:00<?, ? examples/s]

Map (num_proc=12):   0%|          | 0/1569 [00:00<?, ? examples/s]

In [15]:
trainer.train_dataset[100]["input_ids"]

[2,
 105,
 2364,
 107,
 59591,
 11115,
 528,
 506,
 5312,
 106,
 107,
 105,
 4368,
 107,
 1390,
 783,
 726,
 783,
 783,
 753,
 965,
 753,
 783,
 199900,
 965,
 2728,
 72213,
 965,
 753,
 27103,
 783,
 965,
 3729,
 2728,
 236772,
 2617,
 2617,
 783,
 783,
 24975,
 106,
 107]

In [16]:
tokenizer.decode([tokenizer.pad_token_id if x == -100 else x for x in trainer.train_dataset[100]["labels"]]).replace(tokenizer.pad_token, " ")

'              ... .-- . . - / - . .- / .. -. / - .... . / ... ..- -- -- . .-.<end_of_turn>\n'

In [17]:
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Quadro RTX 5000. Max memory = 16.0 GB.
1.512 GB of memory reserved.


In [None]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 14,120 | Num Epochs = 2 | Total steps = 1,764
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 4 x 1) = 16
 "-____-"     Trainable parameters = 6,522,880/1,000,000,000 (0.65% trained)
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,3.0255
2,3.2535
3,3.1644
4,3.74
5,3.8023
6,3.5653
7,3.6519
8,3.4641
9,3.0882
10,2.8532


In [None]:
from unsloth.chat_templates import get_chat_template
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "gemma-3",
)
messages = [{
    "role": "user",
    "content": [{
        "type" : "text",
        "text" : "And tradin jackets",
    }]
}]
text = tokenizer.apply_chat_template(
    messages,
    tokenize= False,
    add_generation_prompt = True, # Must add for generation
)

print(text)

outputs = model.generate(
    **tokenizer([text], return_tensors = "pt").to("cuda"),
    max_new_tokens = 64, # Increase for longer outputs!
    # Recommended Gemma-3 settings!
    temperature = 1.0, top_p = 0.95, top_k = 64,
)
print(tokenizer.batch_decode(outputs))

<bos><start_of_turn>user
And tradin jackets<end_of_turn>
<start_of_turn>model



['<bos><bos><start_of_turn>user\nAnd tradin jackets<end_of_turn>\n<start_of_turn>model\n-.--.-. -.. -. . .-.. -. . - .... ..-. -. . . /.. -. / .--.. --..-- --- -.--.-. / -- .. .- -. --. .-.. -- ... .-- .. . -.-- /.-.. --- ... --- .-- . .-. .. .-.']